Struct parquet::arrow::arrow_writer::ArrowColumnWriter
pub struct ArrowColumnWriter { /* private fields */ }
Encodes ArrowLeafColumn to ArrowColumnChunk.

Note: this is a low-level interface for applications that require fine-grained control of encoding; see ArrowWriter for a higher-level interface.
// Imports assumed by this example (hidden in the rendered doc example);
// paths are from the arrow and parquet crates this page documents
use std::sync::Arc;
use arrow_array::{Float32Array, Int32Array};
use arrow_schema::{DataType, Field, Schema};
use parquet::arrow::arrow_to_parquet_schema;
use parquet::arrow::arrow_writer::{compute_leaves, get_column_writers, ArrowLeafColumn};
use parquet::file::properties::WriterProperties;
use parquet::file::writer::SerializedFileWriter;

// The arrow schema
let schema = Arc::new(Schema::new(vec![
    Field::new("i32", DataType::Int32, false),
    Field::new("f32", DataType::Float32, false),
]));

// Compute the parquet schema
let parquet_schema = arrow_to_parquet_schema(schema.as_ref()).unwrap();
let props = Arc::new(WriterProperties::default());

// Create writers for each of the leaf columns
let col_writers = get_column_writers(&parquet_schema, &props, &schema).unwrap();

// Spawn a worker thread for each column
// This is for demonstration purposes; a thread-pool (e.g. rayon or tokio) would be better
let mut workers: Vec<_> = col_writers
    .into_iter()
    .map(|mut col_writer| {
        let (send, recv) = std::sync::mpsc::channel::<ArrowLeafColumn>();
        let handle = std::thread::spawn(move || {
            // Encode every leaf column sent to this worker, then close the
            // writer to produce the encoded column chunk
            for col in recv {
                col_writer.write(&col)?;
            }
            col_writer.close()
        });
        (handle, send)
    })
    .collect();

// Create parquet writer
let root_schema = parquet_schema.root_schema_ptr();
let mut out = Vec::with_capacity(1024); // This could be a File
let mut writer = SerializedFileWriter::new(&mut out, root_schema, props.clone()).unwrap();

// Start row group
let mut row_group = writer.next_row_group().unwrap();

// Columns to encode
let to_write = vec![
    Arc::new(Int32Array::from_iter_values([1, 2, 3])) as _,
    Arc::new(Float32Array::from_iter_values([1., 45., -1.])) as _,
];

// Spawn work to encode columns
let mut worker_iter = workers.iter_mut();
for (arr, field) in to_write.iter().zip(&schema.fields) {
    for leaves in compute_leaves(field, arr).unwrap() {
        worker_iter.next().unwrap().1.send(leaves).unwrap();
    }
}

// Finish up parallel column encoding
for (handle, send) in workers {
    drop(send); // Drop send side to signal termination
    let chunk = handle.join().unwrap().unwrap();
    chunk.append_to_row_group(&mut row_group).unwrap();
}
row_group.close().unwrap();

let metadata = writer.close().unwrap();
assert_eq!(metadata.num_rows, 3);
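For comparison, the higher-level ArrowWriter mentioned above drives the same machinery internally (schema conversion, leaf computation, and row-group management). Below is a minimal sketch writing the same two columns through ArrowWriter to an in-memory buffer; the buffer and the default properties (None) are illustrative choices, not requirements of the API.

use std::sync::Arc;
use arrow_array::{Float32Array, Int32Array, RecordBatch};
use parquet::arrow::ArrowWriter;

// Build a single RecordBatch holding both columns
let batch = RecordBatch::try_from_iter([
    ("i32", Arc::new(Int32Array::from_iter_values([1, 2, 3])) as _),
    ("f32", Arc::new(Float32Array::from_iter_values([1., 45., -1.])) as _),
])
.unwrap();

// ArrowWriter derives the parquet schema from the batch schema
let mut out = Vec::new();
let mut writer = ArrowWriter::try_new(&mut out, batch.schema(), None).unwrap();
writer.write(&batch).unwrap();
let metadata = writer.close().unwrap();
assert_eq!(metadata.num_rows, 3);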
Implementations

impl ArrowColumnWriter
pub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()>

Write an ArrowLeafColumn
pub fn close(self) -> Result<ArrowColumnChunk>

Close this column returning the written ArrowColumnChunk
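Taken together, write and close cover the life cycle of one leaf column: feed the writer ArrowLeafColumn values, then close it to obtain the encoded chunk. A minimal single-threaded sketch follows; the helper name and its inputs (a writer from get_column_writers and leaves from compute_leaves) are illustrative.

use parquet::arrow::arrow_writer::{ArrowColumnChunk, ArrowColumnWriter, ArrowLeafColumn};
use parquet::errors::Result;

// Illustrative helper: encode one column's leaves and return its chunk.
// `leaves` is assumed to come from compute_leaves for the column this
// writer was created for.
fn encode_column(
    mut writer: ArrowColumnWriter,
    leaves: Vec<ArrowLeafColumn>,
) -> Result<ArrowColumnChunk> {
    for leaf in &leaves {
        writer.write(leaf)?;
    }
    writer.close()
}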
pub fn memory_size(&self) -> usize

Returns the estimated total memory usage by the writer.

Unlike Self::get_estimated_total_bytes, this is an estimate of the current memory usage and not the anticipated encoded size.

This includes:

- Data buffered in encoded form
- Data buffered in un-encoded form (e.g. usize dictionary keys)

This value should be greater than or equal to Self::get_estimated_total_bytes.
pub fn get_estimated_total_bytes(&self) -> usize

Returns the estimated total encoded bytes for this column writer.

This includes:

- Data buffered in encoded form
- An estimate of how large the data buffered in un-encoded form would be once encoded

This value should be less than or equal to Self::memory_size.
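One way these two estimates might be used together, for example to decide when to close the current row group, is sketched below; the helper name and both thresholds are illustrative values, not library defaults.

use parquet::arrow::arrow_writer::ArrowColumnWriter;

// Illustrative check, assuming one writer per leaf column of the in-progress
// row group; both thresholds are arbitrary example values
fn should_flush_row_group(writers: &[ArrowColumnWriter]) -> bool {
    const TARGET_ROW_GROUP_BYTES: usize = 128 * 1024 * 1024; // target encoded size
    const MEMORY_BUDGET_BYTES: usize = 512 * 1024 * 1024; // cap on buffered memory

    // Anticipated encoded size of the data buffered so far
    let encoded: usize = writers.iter().map(|w| w.get_estimated_total_bytes()).sum();
    // Current memory usage, which should be >= the encoded estimate
    let in_memory: usize = writers.iter().map(|w| w.memory_size()).sum();

    encoded >= TARGET_ROW_GROUP_BYTES || in_memory >= MEMORY_BUDGET_BYTES
}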
Trait Implementations

Auto Trait Implementations
impl !Freeze for ArrowColumnWriter
impl !RefUnwindSafe for ArrowColumnWriter
impl Send for ArrowColumnWriter
impl !Sync for ArrowColumnWriter
impl Unpin for ArrowColumnWriter
impl !UnwindSafe for ArrowColumnWriter
Blanket Implementations

impl<T> BorrowMut<T> for T
where
    T: ?Sized,

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value.