pub struct ArrowColumnWriter { /* private fields */ }
Expand description
Encodes ArrowLeafColumn to ArrowColumnChunk
ArrowColumnWriter instances can be created using an ArrowRowGroupWriterFactory.
Note: This is a low-level interface for applications that require
fine-grained control of encoding (e.g. encoding using multiple threads);
see ArrowWriter for a higher-level interface.
§Example: Encoding two Arrow Arrays in Parallel
// The arrow schema
let schema = Arc::new(Schema::new(vec![
Field::new("i32", DataType::Int32, false),
Field::new("f32", DataType::Float32, false),
]));
// Compute the parquet schema
let props = Arc::new(WriterProperties::default());
let parquet_schema = ArrowSchemaConverter::new()
.with_coerce_types(props.coerce_types())
.convert(&schema)
.unwrap();
// Create parquet writer
let root_schema = parquet_schema.root_schema_ptr();
// write to memory in the example, but this could be a File
let mut out = Vec::with_capacity(1024);
let mut writer = SerializedFileWriter::new(&mut out, root_schema, props.clone())
.unwrap();
// Create a factory for building Arrow column writers
let row_group_factory = ArrowRowGroupWriterFactory::new(&writer, Arc::clone(&schema));
// Create column writers for the 0th row group
let col_writers = row_group_factory.create_column_writers(0).unwrap();
// Spawn a worker thread for each column
//
// Note: This is for demonstration purposes; a thread pool (e.g. rayon or tokio) would be better.
// The `map` produces an iterator of type `tuple of (thread handle, send channel)`.
let mut workers: Vec<_> = col_writers
.into_iter()
.map(|mut col_writer| {
let (send, recv) = std::sync::mpsc::channel::<ArrowLeafColumn>();
let handle = std::thread::spawn(move || {
// receive Arrays to encode via the channel
for col in recv {
col_writer.write(&col)?;
}
// once the input is complete, close the writer
// to return the newly created ArrowColumnChunk
col_writer.close()
});
(handle, send)
})
.collect();
// Start row group
let mut row_group_writer: SerializedRowGroupWriter<'_, _> = writer
.next_row_group()
.unwrap();
// Create some example input columns to encode
let to_write = vec![
Arc::new(Int32Array::from_iter_values([1, 2, 3])) as _,
Arc::new(Float32Array::from_iter_values([1., 45., -1.])) as _,
];
// Send the input columns to the workers
let mut worker_iter = workers.iter_mut();
for (arr, field) in to_write.iter().zip(&schema.fields) {
for leaves in compute_leaves(field, arr).unwrap() {
worker_iter.next().unwrap().1.send(leaves).unwrap();
}
}
// Wait for the workers to complete encoding, and append
// the resulting column chunks to the row group (and the file)
for (handle, send) in workers {
drop(send); // Drop send side to signal termination
// wait for the worker to send the completed chunk
let chunk: ArrowColumnChunk = handle.join().unwrap().unwrap();
chunk.append_to_row_group(&mut row_group_writer).unwrap();
}
// Close the row group which writes to the underlying file
row_group_writer.close().unwrap();
let metadata = writer.close().unwrap();
assert_eq!(metadata.file_metadata().num_rows(), 3);
Implementations§
Source§impl ArrowColumnWriter
impl ArrowColumnWriter
Sourcepub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()>
pub fn write(&mut self, col: &ArrowLeafColumn) -> Result<()>
Write an ArrowLeafColumn
Sourcepub fn close(self) -> Result<ArrowColumnChunk>
pub fn close(self) -> Result<ArrowColumnChunk>
Close this column returning the written ArrowColumnChunk
Sourcepub fn memory_size(&self) -> usize
pub fn memory_size(&self) -> usize
Returns the estimated total memory usage by the writer.
Unlike Self::get_estimated_total_bytes, this is an estimate
of the current memory usage and not its anticipated encoded size.
This includes:
- Data buffered in encoded form
- Data buffered in un-encoded form (e.g. usize dictionary keys)
This value should be greater than or equal to Self::get_estimated_total_bytes
Sourcepub fn get_estimated_total_bytes(&self) -> usize
pub fn get_estimated_total_bytes(&self) -> usize
Returns the estimated total encoded bytes for this column writer.
This includes:
- Data buffered in encoded form
- An estimate of how large the data buffered in un-encoded form would be once encoded
This value should be less than or equal to Self::memory_size