Struct parquet::arrow::arrow_writer::ArrowColumnWriter
source · pub struct ArrowColumnWriter { /* private fields */ }
Expand description
Encodes an ArrowLeafColumn to an ArrowColumnChunk.
Note: This is a low-level interface for applications that require fine-grained control of encoding; see ArrowWriter for a higher-level interface.
// Example: encode each leaf column of a two-column schema on its own thread
// with an ArrowColumnWriter, then append the resulting chunks to a single
// row group of a SerializedFileWriter.
//
// The arrow schema
let schema = Arc::new(Schema::new(vec![
Field::new("i32", DataType::Int32, false),
Field::new("f32", DataType::Float32, false),
]));
// Compute the parquet schema
let parquet_schema = arrow_to_parquet_schema(schema.as_ref()).unwrap();
let props = Arc::new(WriterProperties::default());
// Create writers for each of the leaf columns
let col_writers = get_column_writers(&parquet_schema, &props, &schema).unwrap();
// Spawn a worker thread for each column
// This is for demonstration purposes; a thread pool (e.g. rayon or tokio) would be better
let mut workers: Vec<_> = col_writers
.into_iter()
.map(|mut col_writer| {
// One channel per worker: the main thread sends ArrowLeafColumn values and
// the worker thread receives them.
let (send, recv) = std::sync::mpsc::channel::<ArrowLeafColumn>();
let handle = std::thread::spawn(move || {
// Iterating the receiver ends once the sending side has been dropped
// (see `drop(send)` below). A failed write returns early via `?`.
for col in recv {
col_writer.write(&col)?;
}
// The thread's result is the Result returned by close().
col_writer.close()
});
// Keep the join handle together with the sender that feeds this worker.
(handle, send)
})
.collect();
// Create parquet writer
let root_schema = parquet_schema.root_schema_ptr();
let mut out = Vec::with_capacity(1024); // This could be a File
let mut writer = SerializedFileWriter::new(&mut out, root_schema, props.clone()).unwrap();
// Start row group
let mut row_group = writer.next_row_group().unwrap();
// Columns to encode
let to_write = vec![
Arc::new(Int32Array::from_iter_values([1, 2, 3])) as _,
Arc::new(Float32Array::from_iter_values([1., 45., -1.])) as _,
];
// Spawn work to encode columns
// Each leaf is sent to the next worker in iteration order, one leaf per worker.
// NOTE(review): this assumes get_column_writers yields writers in the same
// order that compute_leaves produces leaves across the schema fields — confirm.
let mut worker_iter = workers.iter_mut();
for (arr, field) in to_write.iter().zip(&schema.fields) {
for leaves in compute_leaves(field, arr).unwrap() {
worker_iter.next().unwrap().1.send(leaves).unwrap();
}
}
// Finish up parallel column encoding
for (handle, send) in workers {
drop(send); // Drop send side to signal termination
// First unwrap: the worker thread did not panic.
// Second unwrap: the worker's write/close calls returned Ok.
let chunk = handle.join().unwrap().unwrap();
// Chunks are appended in the same order the workers were created.
chunk.append_to_row_group(&mut row_group).unwrap();
}
row_group.close().unwrap();
let metadata = writer.close().unwrap();
// Each input array above holds three values.
assert_eq!(metadata.num_rows, 3);
Implementations§
impl ArrowColumnWriter
pub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()>
Write an ArrowLeafColumn.
pub fn close(self) -> Result<ArrowColumnChunk>
Close this column, returning the written ArrowColumnChunk.
pub fn get_estimated_total_bytes(&self) -> usize
Returns the estimated total bytes for this column writer.
Trait Implementations§
Auto Trait Implementations§
impl !Freeze for ArrowColumnWriter
impl !RefUnwindSafe for ArrowColumnWriter
impl Send for ArrowColumnWriter
impl !Sync for ArrowColumnWriter
impl Unpin for ArrowColumnWriter
impl !UnwindSafe for ArrowColumnWriter
Blanket Implementations§
impl<T> BorrowMut<T> for T
where
T: ?Sized,
fn borrow_mut(&mut self) -> &mut T
Mutably borrows from an owned value. Read more