Struct parquet::file::properties::WriterPropertiesBuilder

source ·
pub struct WriterPropertiesBuilder { /* private fields */ }
Expand description

Builder for WriterProperties parquet writer configuration.

See example on WriterProperties

Implementations§

source§

impl WriterPropertiesBuilder

source

pub fn build(self) -> WriterProperties

Finalizes the configuration and returns immutable writer properties struct.

source

pub fn set_writer_version(self, value: WriterVersion) -> Self

Sets the WriterVersion written into the parquet metadata (defaults to PARQUET_1_0)

This value can determine what features some readers will support.

source

pub fn set_data_pagesize_limit(self, value: usize) -> Self

👎Deprecated since 41.0.0: Use set_data_page_size_limit

Sets best effort maximum size of a data page in bytes.

Note: this is a best effort limit based on value of set_write_batch_size.

source

pub fn set_data_page_size_limit(self, value: usize) -> Self

Sets best effort maximum size of a data page in bytes (defaults to 1024 * 1024).

The parquet writer will attempt to limit the sizes of each DataPage to this many bytes. Reducing this value will result in larger parquet files, but may improve the effectiveness of page index based predicate pushdown during reading.

Note: this is a best effort limit based on value of set_write_batch_size.

source

pub fn set_data_page_row_count_limit(self, value: usize) -> Self

Sets best effort maximum number of rows in a data page (defaults to 20_000).

The parquet writer will attempt to limit the number of rows in each DataPage to this value. Reducing this value will result in larger parquet files, but may improve the effectiveness of page index based predicate pushdown during reading.

Note: this is a best effort limit based on value of set_write_batch_size.

source

pub fn set_dictionary_pagesize_limit(self, value: usize) -> Self

👎Deprecated since 41.0.0: Use set_dictionary_page_size_limit

Sets best effort maximum dictionary page size, in bytes.

Note: this is a best effort limit based on value of set_write_batch_size.

source

pub fn set_dictionary_page_size_limit(self, value: usize) -> Self

Sets best effort maximum dictionary page size, in bytes (defaults to 1024 * 1024).

The parquet writer will attempt to limit the size of each DataPage used to store dictionaries to this many bytes. Reducing this value will result in larger parquet files, but may improve the effectiveness of page index based predicate pushdown during reading.

Note: this is a best effort limit based on value of set_write_batch_size.

source

pub fn set_write_batch_size(self, value: usize) -> Self

Sets write batch size (defaults to 1024).

For performance reasons, data for each column is written in batches of this size.

Additional limits such as set_data_page_row_count_limit are checked between batches, and thus the write batch size value acts as an upper-bound on the enforcement granularity of other limits.

source

pub fn set_max_row_group_size(self, value: usize) -> Self

Sets maximum number of rows in a row group (defaults to 1024 * 1024).

§Panics

If the value is set to 0.

source

pub fn set_bloom_filter_position(self, value: BloomFilterPosition) -> Self

Sets where in the final file Bloom Filters are written (defaults to AfterRowGroup).

source

pub fn set_created_by(self, value: String) -> Self

Sets “created by” property (defaults to parquet-rs version <VERSION>).

source

pub fn set_key_value_metadata(self, value: Option<Vec<KeyValue>>) -> Self

Sets “key_value_metadata” property (defaults to None).

source

pub fn set_sorting_columns(self, value: Option<Vec<SortingColumn>>) -> Self

Sets sorting order of rows in the row group if any (defaults to None).

source

pub fn set_encoding(self, value: Encoding) -> Self

Sets default encoding for all columns.

If dictionary encoding is not enabled, this is treated as the primary encoding for all columns. When dictionary encoding is enabled for a column, this value is used as the fallback encoding for that column.

§Panics

If dictionary encoding is specified, regardless of whether the dictionary encoding flag is set.

source

pub fn set_compression(self, value: Compression) -> Self

Sets default compression codec for all columns (defaults to UNCOMPRESSED).

source

pub fn set_dictionary_enabled(self, value: bool) -> Self

Sets default flag to enable/disable dictionary encoding for all columns (defaults to true).

Use this method to set dictionary encoding, instead of explicitly specifying encoding in set_encoding method.

source

pub fn set_statistics_enabled(self, value: EnabledStatistics) -> Self

Sets default statistics level for all columns (defaults to Page).

source

pub fn set_max_statistics_size(self, value: usize) -> Self

Sets default max statistics size for all columns (defaults to 4096).

Applicable only if statistics are enabled.

source

pub fn set_bloom_filter_enabled(self, value: bool) -> Self

Sets if bloom filter is enabled by default for all columns (defaults to false).

§Notes
  • If the bloom filter was already enabled, this is a no-op.

  • If the bloom filter is not already enabled, default values for ndv and fpp are used. See set_bloom_filter_ndv and set_bloom_filter_fpp to further adjust the ndv and fpp.

source

pub fn set_bloom_filter_fpp(self, value: f64) -> Self

Sets the default target bloom filter false positive probability (fpp) for all columns (defaults to 0.05).

Implicitly enables bloom writing, as if set_bloom_filter_enabled had been called.

source

pub fn set_bloom_filter_ndv(self, value: u64) -> Self

Sets default number of distinct values (ndv) for bloom filter for all columns (defaults to 1_000_000).

Implicitly enables bloom writing, as if set_bloom_filter_enabled had been called.

source

pub fn set_column_encoding(self, col: ColumnPath, value: Encoding) -> Self

Sets encoding for a specific column.

Takes precedence over Self::set_encoding.

If dictionary encoding is not enabled, this is treated as the primary encoding for this column. When dictionary encoding is enabled for this column, either through global defaults or explicitly, this value is used as the fallback encoding for this column.

§Panics

If user tries to set dictionary encoding here, regardless of dictionary encoding flag being set.

source

pub fn set_column_compression(self, col: ColumnPath, value: Compression) -> Self

Sets compression codec for a specific column.

Takes precedence over Self::set_compression.

source

pub fn set_column_dictionary_enabled(self, col: ColumnPath, value: bool) -> Self

Sets flag to enable/disable dictionary encoding for a specific column.

Takes precedence over Self::set_dictionary_enabled.

source

pub fn set_column_statistics_enabled( self, col: ColumnPath, value: EnabledStatistics, ) -> Self

Sets statistics level for a specific column.

Takes precedence over Self::set_statistics_enabled.

source

pub fn set_column_max_statistics_size( self, col: ColumnPath, value: usize, ) -> Self

Sets max size for statistics for a specific column.

Takes precedence over Self::set_max_statistics_size.

source

pub fn set_column_bloom_filter_enabled( self, col: ColumnPath, value: bool, ) -> Self

Sets whether a bloom filter should be written for a specific column.

Takes precedence over Self::set_bloom_filter_enabled.

source

pub fn set_column_bloom_filter_fpp(self, col: ColumnPath, value: f64) -> Self

Sets the false positive probability for bloom filter for a specific column.

Takes precedence over Self::set_bloom_filter_fpp.

source

pub fn set_column_bloom_filter_ndv(self, col: ColumnPath, value: u64) -> Self

Sets the number of distinct values for bloom filter for a specific column.

Takes precedence over Self::set_bloom_filter_ndv.

source

pub fn set_column_index_truncate_length(self, max_length: Option<usize>) -> Self

Sets the max length of min/max value fields when writing the column Index (defaults to None).

This can be used to prevent columns with very long values (hundreds of bytes long) from causing the parquet metadata to become huge.

§Notes

The column Index is written when Self::set_statistics_enabled is set to EnabledStatistics::Page.

  • If Some, must be greater than 0, otherwise will panic
  • If None, there’s no effective limit.
source

pub fn set_statistics_truncate_length(self, max_length: Option<usize>) -> Self

Sets the max length of min/max value fields in row group level Statistics (defaults to None).

§Notes

Row group level Statistics are written when Self::set_statistics_enabled is set to EnabledStatistics::Chunk or EnabledStatistics::Page.

  • If Some, must be greater than 0, otherwise will panic
  • If None, there’s no effective limit.

Auto Trait Implementations§

Blanket Implementations§

source§

impl<T> Any for T
where T: 'static + ?Sized,

source§

fn type_id(&self) -> TypeId

Gets the TypeId of self. Read more
source§

impl<T> Borrow<T> for T
where T: ?Sized,

source§

fn borrow(&self) -> &T

Immutably borrows from an owned value. Read more
source§

impl<T> BorrowMut<T> for T
where T: ?Sized,

source§

fn borrow_mut(&mut self) -> &mut T

Mutably borrows from an owned value. Read more
source§

impl<T> From<T> for T

source§

fn from(t: T) -> T

Returns the argument unchanged.

source§

impl<T, U> Into<U> for T
where U: From<T>,

source§

fn into(self) -> U

Calls U::from(self).

That is, this conversion is whatever the implementation of From<T> for U chooses to do.

source§

impl<T, U> TryFrom<U> for T
where U: Into<T>,

§

type Error = Infallible

The type returned in the event of a conversion error.
source§

fn try_from(value: U) -> Result<T, <T as TryFrom<U>>::Error>

Performs the conversion.
source§

impl<T, U> TryInto<U> for T
where U: TryFrom<T>,

§

type Error = <U as TryFrom<T>>::Error

The type returned in the event of a conversion error.
source§

fn try_into(self) -> Result<U, <U as TryFrom<T>>::Error>

Performs the conversion.
source§

impl<V, T> VZip<V> for T
where V: MultiLane<T>,

source§

fn vzip(self) -> V

source§

impl<T> Allocation for T
where T: RefUnwindSafe + Send + Sync,