parquet/file/
properties.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use crate::basic::{Compression, Encoding};
20use crate::compression::{CodecOptions, CodecOptionsBuilder};
21#[cfg(feature = "encryption")]
22use crate::encryption::encrypt::FileEncryptionProperties;
23use crate::file::metadata::KeyValue;
24use crate::format::SortingColumn;
25use crate::schema::types::ColumnPath;
26use std::str::FromStr;
27use std::{collections::HashMap, sync::Arc};
28
/// Default value for [`WriterProperties::data_page_size_limit`] (1 MiB, in bytes)
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
/// (same as [`DEFAULT_PAGE_SIZE`])
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`] (in rows)
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::max_statistics_size`]
#[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
/// Default value for [`WriterProperties::max_row_group_size`] (in rows)
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
/// (`None` disables truncation)
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
/// Default value for [`WriterProperties::offset_index_disabled`]
pub const DEFAULT_OFFSET_INDEX_DISABLED: bool = false;
/// Default value for [`WriterProperties::coerce_types`]
pub const DEFAULT_COERCE_TYPES: bool = false;
66
/// Version of the Parquet format that the writer targets.
///
/// This is a plain constant of this crate; it is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns the numeric form of the version: `1` for
    /// [`PARQUET_1_0`](Self::PARQUET_1_0) and `2` for
    /// [`PARQUET_2_0`](Self::PARQUET_2_0).
    pub fn as_num(&self) -> i32 {
        match *self {
            Self::PARQUET_1_0 => 1,
            Self::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a writer version from its name.
    ///
    /// Only the all-uppercase and all-lowercase spellings are recognised
    /// (e.g. `"PARQUET_1_0"` or `"parquet_1_0"`); any other input yields an
    /// `Err` carrying a message that quotes the rejected string.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        let version = match s {
            "PARQUET_1_0" | "parquet_1_0" => Self::PARQUET_1_0,
            "PARQUET_2_0" | "parquet_2_0" => Self::PARQUET_2_0,
            unknown => return Err(format!("Invalid writer version: {}", unknown)),
        };
        Ok(version)
    }
}
100
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The two variants trade writer memory usage against data locality for
/// readers; see the documentation of each variant.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write the Bloom filters of each row group right after that row group
    ///
    /// This saves memory by writing each filter as soon as it is computed, at
    /// the cost of data locality for readers
    AfterRowGroup,
    /// Write all Bloom filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
118
/// Reference counted writer properties: an [`Arc`] around [`WriterProperties`].
pub type WriterPropertiesPtr = Arc<WriterProperties>;
121
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    /// Best effort limit on the size of a data page, in bytes.
    data_page_size_limit: usize,
    /// Best effort limit on the size of a dictionary page, in bytes.
    dictionary_page_size_limit: usize,
    /// Best effort limit on the number of rows in a data page.
    data_page_row_count_limit: usize,
    /// Size of the batches in which column data is written.
    write_batch_size: usize,
    /// Maximum number of rows in a row group.
    max_row_group_size: usize,
    /// Where Bloom filters are placed in the file.
    bloom_filter_position: BloomFilterPosition,
    /// Writer version recorded in the parquet metadata.
    writer_version: WriterVersion,
    /// "Created by" string written into the file metadata.
    created_by: String,
    /// Requested offset index setting. The `offset_index_disabled()` getter
    /// reports `false` regardless of this flag whenever page level statistics
    /// are enabled by default or for any column.
    offset_index_disabled: bool,
    /// Optional key/value pairs, see [`WriterPropertiesBuilder::set_key_value_metadata`].
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    /// Column settings consulted when a column has no entry in
    /// `column_properties`, or its entry leaves the requested setting unset.
    default_column_properties: ColumnProperties,
    /// Per-column settings, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    /// Optional sorting columns, see [`WriterPropertiesBuilder::set_sorting_columns`].
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Maximum length of truncated min/max values in the column index;
    /// `None` disables truncation.
    column_index_truncate_length: Option<usize>,
    /// Maximum length of truncated min/max values in statistics;
    /// `None` disables truncation.
    statistics_truncate_length: Option<usize>,
    /// Whether type coercion is enabled, see [`WriterPropertiesBuilder::set_coerce_types`].
    coerce_types: bool,
    /// Optional file encryption settings (only with the `encryption` feature).
    #[cfg(feature = "encryption")]
    pub(crate) file_encryption_properties: Option<FileEncryptionProperties>,
}
177
178impl Default for WriterProperties {
179    fn default() -> Self {
180        Self::builder().build()
181    }
182}
183
184impl WriterProperties {
185    /// Create a new [`WriterProperties`] with the default settings
186    ///
187    /// See [`WriterProperties::builder`] for customising settings
188    pub fn new() -> Self {
189        Self::default()
190    }
191
192    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
193    /// properties.
194    pub fn builder() -> WriterPropertiesBuilder {
195        WriterPropertiesBuilder::with_defaults()
196    }
197
198    /// Returns data page size limit.
199    ///
200    /// Note: this is a best effort limit based on the write batch size
201    ///
202    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
203    pub fn data_page_size_limit(&self) -> usize {
204        self.data_page_size_limit
205    }
206
207    /// Returns dictionary page size limit.
208    ///
209    /// Note: this is a best effort limit based on the write batch size
210    ///
211    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
212    pub fn dictionary_page_size_limit(&self) -> usize {
213        self.dictionary_page_size_limit
214    }
215
216    /// Returns the maximum page row count
217    ///
218    /// Note: this is a best effort limit based on the write batch size
219    ///
220    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
221    pub fn data_page_row_count_limit(&self) -> usize {
222        self.data_page_row_count_limit
223    }
224
225    /// Returns configured batch size for writes.
226    ///
227    /// When writing a batch of data, this setting allows to split it internally into
228    /// smaller batches so we can better estimate the size of a page currently being
229    /// written.
230    ///
231    /// For more details see [`WriterPropertiesBuilder::set_write_batch_size`]
232    pub fn write_batch_size(&self) -> usize {
233        self.write_batch_size
234    }
235
236    /// Returns maximum number of rows in a row group.
237    ///
238    /// For more details see [`WriterPropertiesBuilder::set_max_row_group_size`]
239    pub fn max_row_group_size(&self) -> usize {
240        self.max_row_group_size
241    }
242
243    /// Returns bloom filter position.
244    ///
245    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
246    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
247        self.bloom_filter_position
248    }
249
250    /// Returns configured writer version.
251    ///
252    /// For more details see [`WriterPropertiesBuilder::set_writer_version`]
253    pub fn writer_version(&self) -> WriterVersion {
254        self.writer_version
255    }
256
257    /// Returns `created_by` string.
258    ///
259    /// For more details see [`WriterPropertiesBuilder::set_created_by`]
260    pub fn created_by(&self) -> &str {
261        &self.created_by
262    }
263
264    /// Returns `true` if offset index writing is disabled.
265    ///
266    /// For more details see [`WriterPropertiesBuilder::set_offset_index_disabled`]
267    pub fn offset_index_disabled(&self) -> bool {
268        // If page statistics are to be collected, then do not disable the offset indexes.
269        let default_page_stats_enabled =
270            self.default_column_properties.statistics_enabled() == Some(EnabledStatistics::Page);
271        let column_page_stats_enabled = self
272            .column_properties
273            .iter()
274            .any(|path_props| path_props.1.statistics_enabled() == Some(EnabledStatistics::Page));
275        if default_page_stats_enabled || column_page_stats_enabled {
276            return false;
277        }
278
279        self.offset_index_disabled
280    }
281
282    /// Returns `key_value_metadata` KeyValue pairs.
283    ///
284    /// For more details see [`WriterPropertiesBuilder::set_key_value_metadata`]
285    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
286        self.key_value_metadata.as_ref()
287    }
288
289    /// Returns sorting columns.
290    ///
291    /// For more details see [`WriterPropertiesBuilder::set_sorting_columns`]
292    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
293        self.sorting_columns.as_ref()
294    }
295
296    /// Returns the maximum length of truncated min/max values in the column index.
297    ///
298    /// `None` if truncation is disabled, must be greater than 0 otherwise.
299    ///
300    /// For more details see [`WriterPropertiesBuilder::set_column_index_truncate_length`]
301    pub fn column_index_truncate_length(&self) -> Option<usize> {
302        self.column_index_truncate_length
303    }
304
305    /// Returns the maximum length of truncated min/max values in [`Statistics`].
306    ///
307    /// `None` if truncation is disabled, must be greater than 0 otherwise.
308    ///
309    /// For more details see [`WriterPropertiesBuilder::set_statistics_truncate_length`]
310    ///
311    /// [`Statistics`]: crate::file::statistics::Statistics
312    pub fn statistics_truncate_length(&self) -> Option<usize> {
313        self.statistics_truncate_length
314    }
315
316    /// Returns `true` if type coercion is enabled.
317    ///
318    /// For more details see [`WriterPropertiesBuilder::set_coerce_types`]
319    pub fn coerce_types(&self) -> bool {
320        self.coerce_types
321    }
322
323    /// Returns encoding for a data page, when dictionary encoding is enabled.
324    ///
325    /// This is not configurable.
326    #[inline]
327    pub fn dictionary_data_page_encoding(&self) -> Encoding {
328        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
329        // Dictionary values are encoded using RLE_DICTIONARY encoding.
330        Encoding::RLE_DICTIONARY
331    }
332
333    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
334    ///
335    /// This is not configurable.
336    #[inline]
337    pub fn dictionary_page_encoding(&self) -> Encoding {
338        // PLAIN_DICTIONARY is deprecated in writer version 1.
339        // Dictionary is encoded using plain encoding.
340        Encoding::PLAIN
341    }
342
343    /// Returns encoding for a column, if set.
344    ///
345    /// In case when dictionary is enabled, returns fallback encoding.
346    ///
347    /// If encoding is not set, then column writer will choose the best encoding
348    /// based on the column type.
349    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
350        self.column_properties
351            .get(col)
352            .and_then(|c| c.encoding())
353            .or_else(|| self.default_column_properties.encoding())
354    }
355
356    /// Returns compression codec for a column.
357    ///
358    /// For more details see [`WriterPropertiesBuilder::set_column_compression`]
359    pub fn compression(&self, col: &ColumnPath) -> Compression {
360        self.column_properties
361            .get(col)
362            .and_then(|c| c.compression())
363            .or_else(|| self.default_column_properties.compression())
364            .unwrap_or(DEFAULT_COMPRESSION)
365    }
366
367    /// Returns `true` if dictionary encoding is enabled for a column.
368    ///
369    /// For more details see [`WriterPropertiesBuilder::set_dictionary_enabled`]
370    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
371        self.column_properties
372            .get(col)
373            .and_then(|c| c.dictionary_enabled())
374            .or_else(|| self.default_column_properties.dictionary_enabled())
375            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
376    }
377
378    /// Returns which statistics are written for a column.
379    ///
380    /// For more details see [`WriterPropertiesBuilder::set_statistics_enabled`]
381    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
382        self.column_properties
383            .get(col)
384            .and_then(|c| c.statistics_enabled())
385            .or_else(|| self.default_column_properties.statistics_enabled())
386            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
387    }
388
389    /// Returns max size for statistics.
390    ///
391    /// UNUSED
392    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
393    pub fn max_statistics_size(&self, col: &ColumnPath) -> usize {
394        #[allow(deprecated)]
395        self.column_properties
396            .get(col)
397            .and_then(|c| c.max_statistics_size())
398            .or_else(|| self.default_column_properties.max_statistics_size())
399            .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
400    }
401
402    /// Returns the [`BloomFilterProperties`] for the given column
403    ///
404    /// Returns `None` if bloom filter is disabled
405    ///
406    /// For more details see [`WriterPropertiesBuilder::set_column_bloom_filter_enabled`]
407    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
408        self.column_properties
409            .get(col)
410            .and_then(|c| c.bloom_filter_properties())
411            .or_else(|| self.default_column_properties.bloom_filter_properties())
412    }
413
414    /// Return file encryption properties
415    ///
416    /// For more details see [`WriterPropertiesBuilder::with_file_encryption_properties`]
417    #[cfg(feature = "encryption")]
418    pub fn file_encryption_properties(&self) -> Option<&FileEncryptionProperties> {
419        self.file_encryption_properties.as_ref()
420    }
421}
422
/// Builder for [`WriterProperties`] Parquet writer configuration.
///
/// See example on [`WriterProperties`]
///
/// The fields mirror those of [`WriterProperties`] one to one; `build` moves
/// each of them into the finished properties unchanged.
pub struct WriterPropertiesBuilder {
    // File level settings.
    data_page_size_limit: usize,
    dictionary_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    offset_index_disabled: bool,
    key_value_metadata: Option<Vec<KeyValue>>,
    // Settings applied to all columns, and per-column settings keyed by path.
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    // `None` disables truncation for the two lengths below.
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
    coerce_types: bool,
    // Only present when the `encryption` feature is enabled.
    #[cfg(feature = "encryption")]
    file_encryption_properties: Option<FileEncryptionProperties>,
}
446
447impl WriterPropertiesBuilder {
448    /// Returns default state of the builder.
449    fn with_defaults() -> Self {
450        Self {
451            data_page_size_limit: DEFAULT_PAGE_SIZE,
452            dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
453            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
454            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
455            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
456            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
457            writer_version: DEFAULT_WRITER_VERSION,
458            created_by: DEFAULT_CREATED_BY.to_string(),
459            offset_index_disabled: DEFAULT_OFFSET_INDEX_DISABLED,
460            key_value_metadata: None,
461            default_column_properties: Default::default(),
462            column_properties: HashMap::new(),
463            sorting_columns: None,
464            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
465            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
466            coerce_types: DEFAULT_COERCE_TYPES,
467            #[cfg(feature = "encryption")]
468            file_encryption_properties: None,
469        }
470    }
471
472    /// Finalizes the configuration and returns immutable writer properties struct.
473    pub fn build(self) -> WriterProperties {
474        WriterProperties {
475            data_page_size_limit: self.data_page_size_limit,
476            dictionary_page_size_limit: self.dictionary_page_size_limit,
477            data_page_row_count_limit: self.data_page_row_count_limit,
478            write_batch_size: self.write_batch_size,
479            max_row_group_size: self.max_row_group_size,
480            bloom_filter_position: self.bloom_filter_position,
481            writer_version: self.writer_version,
482            created_by: self.created_by,
483            offset_index_disabled: self.offset_index_disabled,
484            key_value_metadata: self.key_value_metadata,
485            default_column_properties: self.default_column_properties,
486            column_properties: self.column_properties,
487            sorting_columns: self.sorting_columns,
488            column_index_truncate_length: self.column_index_truncate_length,
489            statistics_truncate_length: self.statistics_truncate_length,
490            coerce_types: self.coerce_types,
491            #[cfg(feature = "encryption")]
492            file_encryption_properties: self.file_encryption_properties,
493        }
494    }
495
496    // ----------------------------------------------------------------------
497    // Writer properties related to a file
498
499    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`]
500    /// via [`DEFAULT_WRITER_VERSION`])
501    ///
502    /// This value can determine what features some readers will support.
503    ///
504    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
505    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
506        self.writer_version = value;
507        self
508    }
509
510    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`
511    /// via [`DEFAULT_PAGE_SIZE`]).
512    ///
513    /// The parquet writer will attempt to limit the sizes of each
514    /// `DataPage` to this many bytes. Reducing this value will result
515    /// in larger parquet files, but may improve the effectiveness of
516    /// page index based predicate pushdown during reading.
517    ///
518    /// Note: this is a best effort limit based on value of
519    /// [`set_write_batch_size`](Self::set_write_batch_size).
520    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
521        self.data_page_size_limit = value;
522        self
523    }
524
525    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`
526    /// via [`DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT`]).
527    ///
528    /// The parquet writer will attempt to limit the number of rows in
529    /// each `DataPage` to this value. Reducing this value will result
530    /// in larger parquet files, but may improve the effectiveness of
531    /// page index based predicate pushdown during reading.
532    ///
533    /// Note: this is a best effort limit based on value of
534    /// [`set_write_batch_size`](Self::set_write_batch_size).
535    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
536        self.data_page_row_count_limit = value;
537        self
538    }
539
540    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`
541    /// via [`DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT`]).
542    ///
543    /// The parquet writer will attempt to limit the size of each
544    /// `DataPage` used to store dictionaries to this many
545    /// bytes. Reducing this value will result in larger parquet
546    /// files, but may improve the effectiveness of page index based
547    /// predicate pushdown during reading.
548    ///
549    /// Note: this is a best effort limit based on value of
550    /// [`set_write_batch_size`](Self::set_write_batch_size).
551    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
552        self.dictionary_page_size_limit = value;
553        self
554    }
555
556    /// Sets write batch size (defaults to 1024 via [`DEFAULT_WRITE_BATCH_SIZE`]).
557    ///
558    /// For performance reasons, data for each column is written in
559    /// batches of this size.
560    ///
561    /// Additional limits such as such as
562    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
563    /// are checked between batches, and thus the write batch size value acts as an
564    /// upper-bound on the enforcement granularity of other limits.
565    pub fn set_write_batch_size(mut self, value: usize) -> Self {
566        self.write_batch_size = value;
567        self
568    }
569
570    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`
571    /// via [`DEFAULT_MAX_ROW_GROUP_SIZE`]).
572    ///
573    /// # Panics
574    /// If the value is set to 0.
575    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
576        assert!(value > 0, "Cannot have a 0 max row group size");
577        self.max_row_group_size = value;
578        self
579    }
580
581    /// Sets where in the final file Bloom Filters are written (defaults to  [`AfterRowGroup`]
582    /// via [`DEFAULT_BLOOM_FILTER_POSITION`])
583    ///
584    /// [`AfterRowGroup`]: BloomFilterPosition::AfterRowGroup
585    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
586        self.bloom_filter_position = value;
587        self
588    }
589
590    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>` via
591    /// [`DEFAULT_CREATED_BY`]).
592    ///
593    /// This is a string that will be written into the file metadata
594    pub fn set_created_by(mut self, value: String) -> Self {
595        self.created_by = value;
596        self
597    }
598
599    /// Sets whether the writing of offset indexes is disabled (defaults to `false` via
600    /// [`DEFAULT_OFFSET_INDEX_DISABLED`]).
601    ///
602    /// If statistics level is set to [`Page`] this setting will be overridden with `false`.
603    ///
604    /// Note: As the offset indexes are useful for accessing data by row number,
605    /// they are always written by default, regardless of whether other statistics
606    /// are enabled. Disabling this metadata may result in a degradation in read
607    /// performance, so use this option with care.
608    ///
609    /// [`Page`]: EnabledStatistics::Page
610    pub fn set_offset_index_disabled(mut self, value: bool) -> Self {
611        self.offset_index_disabled = value;
612        self
613    }
614
615    /// Sets "key_value_metadata" property (defaults to `None`).
616    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
617        self.key_value_metadata = value;
618        self
619    }
620
621    /// Sets sorting order of rows in the row group if any (defaults to `None`).
622    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
623        self.sorting_columns = value;
624        self
625    }
626
627    /// Sets the max length of min/max value fields when writing the column
628    /// [`Index`] (defaults to `Some(64)` via [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`]).
629    ///
630    /// This can be used to prevent columns with very long values (hundreds of
631    /// bytes long) from causing the parquet metadata to become huge.
632    ///
633    /// # Notes
634    ///
635    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
636    /// set to [`EnabledStatistics::Page`].
637    ///
638    /// * If `Some`, must be greater than 0, otherwise will panic
639    /// * If `None`, there's no effective limit.
640    ///
641    /// [`Index`]: crate::file::page_index::index::Index
642    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
643        if let Some(value) = max_length {
644            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
645        }
646
647        self.column_index_truncate_length = max_length;
648        self
649    }
650
651    /// Sets the max length of min/max value fields in row group and data page header
652    /// [`Statistics`] (defaults to `None` (no limit) via [`DEFAULT_STATISTICS_TRUNCATE_LENGTH`]).
653    ///
654    /// # Notes
655    /// Row group [`Statistics`] are written when [`Self::set_statistics_enabled`] is
656    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`]. Data page header
657    /// [`Statistics`] are written when [`Self::set_statistics_enabled`] is set to
658    /// [`EnabledStatistics::Page`].
659    ///
660    /// * If `Some`, must be greater than 0, otherwise will panic
661    /// * If `None`, there's no effective limit.
662    ///
663    /// # See also
664    /// Truncation of Page Index statistics is controlled separately via
665    /// [`WriterPropertiesBuilder::set_column_index_truncate_length`]
666    ///
667    /// [`Statistics`]: crate::file::statistics::Statistics
668    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
669        if let Some(value) = max_length {
670            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
671        }
672
673        self.statistics_truncate_length = max_length;
674        self
675    }
676
677    /// Should the writer coerce types to parquet native types (defaults to `false` via
678    /// [`DEFAULT_COERCE_TYPES`]).
679    ///
680    /// Leaving this option the default `false` will ensure the exact same data
681    /// written to parquet using this library will be read.
682    ///
683    /// Setting this option to `true` will result in parquet files that can be
684    /// read by more readers, but potentially lose information in the process.
685    ///
686    /// * Types such as [`DataType::Date64`], which have no direct corresponding
687    ///   Parquet type, may be stored with lower precision.
688    ///
689    /// * The internal field names of `List` and `Map` types will be renamed if
690    ///   necessary to match what is required by the newest Parquet specification.
691    ///
692    /// See [`ArrowToParquetSchemaConverter::with_coerce_types`] for more details
693    ///
694    /// [`DataType::Date64`]: arrow_schema::DataType::Date64
695    /// [`ArrowToParquetSchemaConverter::with_coerce_types`]: crate::arrow::ArrowSchemaConverter::with_coerce_types
696    pub fn set_coerce_types(mut self, coerce_types: bool) -> Self {
697        self.coerce_types = coerce_types;
698        self
699    }
700
701    /// Sets FileEncryptionProperties (defaults to `None`)
702    #[cfg(feature = "encryption")]
703    pub fn with_file_encryption_properties(
704        mut self,
705        file_encryption_properties: FileEncryptionProperties,
706    ) -> Self {
707        self.file_encryption_properties = Some(file_encryption_properties);
708        self
709    }
710
711    // ----------------------------------------------------------------------
712    // Setters for any column (global)
713
714    /// Sets default encoding for all columns.
715    ///
716    /// If dictionary is not enabled, this is treated as a primary encoding for all
717    /// columns. In case when dictionary is enabled for any column, this value is
718    /// considered to be a fallback encoding for that column.
719    ///
720    /// # Panics
721    ///
722    /// if dictionary encoding is specified, regardless of dictionary
723    /// encoding flag being set.
724    pub fn set_encoding(mut self, value: Encoding) -> Self {
725        self.default_column_properties.set_encoding(value);
726        self
727    }
728
729    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`] via
730    /// [`DEFAULT_COMPRESSION`]).
731    ///
732    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
733    pub fn set_compression(mut self, value: Compression) -> Self {
734        self.default_column_properties.set_compression(value);
735        self
736    }
737
738    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`
739    /// via [`DEFAULT_DICTIONARY_ENABLED`]).
740    ///
741    /// Use this method to set dictionary encoding, instead of explicitly specifying
742    /// encoding in `set_encoding` method.
743    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
744        self.default_column_properties.set_dictionary_enabled(value);
745        self
746    }
747
748    /// Sets default statistics level for all columns (defaults to [`Page`] via
749    /// [`DEFAULT_STATISTICS_ENABLED`]).
750    ///
751    /// [`Page`]: EnabledStatistics::Page
752    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
753        self.default_column_properties.set_statistics_enabled(value);
754        self
755    }
756
757    /// Sets default max statistics size for all columns (defaults to `4096` via
758    /// [`DEFAULT_MAX_STATISTICS_SIZE`]).
759    ///
760    /// Applicable only if statistics are enabled.
761    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
762    pub fn set_max_statistics_size(mut self, value: usize) -> Self {
763        #[allow(deprecated)]
764        self.default_column_properties
765            .set_max_statistics_size(value);
766        self
767    }
768
769    /// Sets if bloom filter should be written for all columns (defaults to `false`).
770    ///
771    /// # Notes
772    ///
773    /// * If the bloom filter is enabled previously then it is a no-op.
774    ///
775    /// * If the bloom filter is not enabled, default values for ndv and fpp
776    ///   value are used used. See [`set_bloom_filter_ndv`] and
777    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
778    ///
779    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
780    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
781    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
782        self.default_column_properties
783            .set_bloom_filter_enabled(value);
784        self
785    }
786
787    /// Sets the default target bloom filter false positive probability (fpp)
788    /// for all columns (defaults to `0.05` via [`DEFAULT_BLOOM_FILTER_FPP`]).
789    ///
790    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
791    /// been called.
792    ///
793    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
794    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
795        self.default_column_properties.set_bloom_filter_fpp(value);
796        self
797    }
798
799    /// Sets default number of distinct values (ndv) for bloom filter for all
800    /// columns (defaults to `1_000_000` via [`DEFAULT_BLOOM_FILTER_NDV`]).
801    ///
802    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
803    /// been called.
804    ///
805    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
806    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
807        self.default_column_properties.set_bloom_filter_ndv(value);
808        self
809    }
810
811    // ----------------------------------------------------------------------
812    // Setters for a specific column
813
814    /// Helper method to get existing or new mutable reference of column properties.
815    #[inline]
816    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
817        self.column_properties.entry(col).or_default()
818    }
819
820    /// Sets encoding for a specific column.
821    ///
822    /// Takes precedence over [`Self::set_encoding`].
823    ///
824    /// If dictionary is not enabled, this is treated as a primary encoding for this
825    /// column. In case when dictionary is enabled for this column, either through
826    /// global defaults or explicitly, this value is considered to be a fallback
827    /// encoding for this column.
828    ///
829    /// # Panics
830    /// If user tries to set dictionary encoding here, regardless of dictionary
831    /// encoding flag being set.
832    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
833        self.get_mut_props(col).set_encoding(value);
834        self
835    }
836
837    /// Sets compression codec for a specific column.
838    ///
839    /// Takes precedence over [`Self::set_compression`].
840    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
841        self.get_mut_props(col).set_compression(value);
842        self
843    }
844
845    /// Sets flag to enable/disable dictionary encoding for a specific column.
846    ///
847    /// Takes precedence over [`Self::set_dictionary_enabled`].
848    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
849        self.get_mut_props(col).set_dictionary_enabled(value);
850        self
851    }
852
853    /// Sets statistics level for a specific column
854    ///
855    /// Takes precedence over [`Self::set_statistics_enabled`].
856    pub fn set_column_statistics_enabled(
857        mut self,
858        col: ColumnPath,
859        value: EnabledStatistics,
860    ) -> Self {
861        self.get_mut_props(col).set_statistics_enabled(value);
862        self
863    }
864
865    /// Sets max size for statistics for a specific column.
866    ///
867    /// Takes precedence over [`Self::set_max_statistics_size`].
868    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
869    pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self {
870        #[allow(deprecated)]
871        self.get_mut_props(col).set_max_statistics_size(value);
872        self
873    }
874
875    /// Sets whether a bloom filter should be written for a specific column.
876    ///
877    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
878    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
879        self.get_mut_props(col).set_bloom_filter_enabled(value);
880        self
881    }
882
883    /// Sets the false positive probability for bloom filter for a specific column.
884    ///
885    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
886    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
887        self.get_mut_props(col).set_bloom_filter_fpp(value);
888        self
889    }
890
891    /// Sets the number of distinct values for bloom filter for a specific column.
892    ///
893    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
894    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
895        self.get_mut_props(col).set_bloom_filter_ndv(value);
896        self
897    }
898}
899
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
///
/// The level can be parsed from a string (`none`, `chunk` or `page`, in any
/// ASCII casing) through the [`FromStr`] implementation.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each page and row group. The more row groups and the more
    /// pages written, the more statistics will be stored.
    Page,
}

impl FromStr for EnabledStatistics {
    type Err = String;

    /// Parses a statistics level from its name, ignoring ASCII case.
    ///
    /// `none`, `chunk` and `page` are accepted in any casing, so the names
    /// printed by the `Debug` implementation (`None`, `Chunk`, `Page`) parse
    /// back to the same variant.
    ///
    /// # Errors
    ///
    /// Returns `Invalid statistics arg: <input>` for any other string. No
    /// trimming is performed, so surrounding whitespace is rejected.
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s.eq_ignore_ascii_case("none") {
            Ok(Self::None)
        } else if s.eq_ignore_ascii_case("chunk") {
            Ok(Self::Chunk)
        } else if s.eq_ignore_ascii_case("page") {
            Ok(Self::Page)
        } else {
            Err(format!("Invalid statistics arg: {s}"))
        }
    }
}
939
impl Default for EnabledStatistics {
    /// Returns [`DEFAULT_STATISTICS_ENABLED`], so the enum default and the
    /// writer-properties default cannot drift apart.
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
945
/// Controls the bloom filter to be computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability. This should always be between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp: the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting it to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance to greatly reduce space usage.
    pub fpp: f64,
    /// Number of distinct values. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However, it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For a very small ndv value it is probably not worth using a bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}
973
974impl Default for BloomFilterProperties {
975    fn default() -> Self {
976        BloomFilterProperties {
977            fpp: DEFAULT_BLOOM_FILTER_FPP,
978            ndv: DEFAULT_BLOOM_FILTER_NDV,
979        }
980    }
981}
982
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
///
/// The derived `Default` leaves every field unset (`None`).
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    /// Encoding requested for the column; never a dictionary encoding, because
    /// `set_encoding` panics on those.
    encoding: Option<Encoding>,
    /// Compression codec requested for the column.
    codec: Option<Compression>,
    /// Whether dictionary encoding was explicitly enabled or disabled.
    dictionary_enabled: Option<bool>,
    /// Statistics level requested for the column.
    statistics_enabled: Option<EnabledStatistics>,
    /// Maximum statistics size; deprecated and unused.
    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
    max_statistics_size: Option<usize>,
    /// bloom filter related properties; `Some` means the bloom filter is enabled
    bloom_filter_properties: Option<BloomFilterProperties>,
}
998
999impl ColumnProperties {
1000    /// Sets encoding for this column.
1001    ///
1002    /// If dictionary is not enabled, this is treated as a primary encoding for a column.
1003    /// In case when dictionary is enabled for a column, this value is considered to
1004    /// be a fallback encoding.
1005    ///
1006    /// Panics if user tries to set dictionary encoding here, regardless of dictionary
1007    /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
1008    /// for a column.
1009    fn set_encoding(&mut self, value: Encoding) {
1010        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
1011            panic!("Dictionary encoding can not be used as fallback encoding");
1012        }
1013        self.encoding = Some(value);
1014    }
1015
1016    /// Sets compression codec for this column.
1017    fn set_compression(&mut self, value: Compression) {
1018        self.codec = Some(value);
1019    }
1020
1021    /// Sets whether dictionary encoding is enabled for this column.
1022    fn set_dictionary_enabled(&mut self, enabled: bool) {
1023        self.dictionary_enabled = Some(enabled);
1024    }
1025
1026    /// Sets the statistics level for this column.
1027    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
1028        self.statistics_enabled = Some(enabled);
1029    }
1030
1031    /// Sets max size for statistics for this column.
1032    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
1033    #[allow(deprecated)]
1034    fn set_max_statistics_size(&mut self, value: usize) {
1035        self.max_statistics_size = Some(value);
1036    }
1037
1038    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
1039    /// otherwise it is a no-op.
1040    /// If `value` is `false`, resets bloom filter properties to `None`.
1041    fn set_bloom_filter_enabled(&mut self, value: bool) {
1042        if value && self.bloom_filter_properties.is_none() {
1043            self.bloom_filter_properties = Some(Default::default())
1044        } else if !value {
1045            self.bloom_filter_properties = None
1046        }
1047    }
1048
1049    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
1050    /// bloom filter if not previously enabled.
1051    ///
1052    /// # Panics
1053    ///
1054    /// Panics if the `value` is not between 0 and 1 exclusive
1055    fn set_bloom_filter_fpp(&mut self, value: f64) {
1056        assert!(
1057            value > 0. && value < 1.0,
1058            "fpp must be between 0 and 1 exclusive, got {value}"
1059        );
1060
1061        self.bloom_filter_properties
1062            .get_or_insert_with(Default::default)
1063            .fpp = value;
1064    }
1065
1066    /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
1067    /// enables bloom filter if not previously enabled.
1068    fn set_bloom_filter_ndv(&mut self, value: u64) {
1069        self.bloom_filter_properties
1070            .get_or_insert_with(Default::default)
1071            .ndv = value;
1072    }
1073
1074    /// Returns optional encoding for this column.
1075    fn encoding(&self) -> Option<Encoding> {
1076        self.encoding
1077    }
1078
1079    /// Returns optional compression codec for this column.
1080    fn compression(&self) -> Option<Compression> {
1081        self.codec
1082    }
1083
1084    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
1085    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
1086    /// been provided.
1087    fn dictionary_enabled(&self) -> Option<bool> {
1088        self.dictionary_enabled
1089    }
1090
1091    /// Returns optional statistics level requested for this column. If result is `None`,
1092    /// then no setting has been provided.
1093    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
1094        self.statistics_enabled
1095    }
1096
1097    /// Returns optional max size in bytes for statistics.
1098    #[deprecated(since = "54.0.0", note = "Unused; will be removed in 56.0.0")]
1099    fn max_statistics_size(&self) -> Option<usize> {
1100        #[allow(deprecated)]
1101        self.max_statistics_size
1102    }
1103
1104    /// Returns the bloom filter properties, or `None` if not enabled
1105    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
1106        self.bloom_filter_properties.as_ref()
1107    }
1108}
1109
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

/// Whether bloom filters are read when
/// [`ReaderPropertiesBuilder::set_read_bloom_filter`] was never called; applied
/// by [`ReaderPropertiesBuilder::build`].
const DEFAULT_READ_BLOOM_FILTER: bool = false;
1114
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    /// Codec options produced by the builder's `CodecOptionsBuilder`.
    codec_options: CodecOptions,
    /// Whether bloom filters are read; `false` unless set on the builder.
    read_bloom_filter: bool,
}
1137
1138impl ReaderProperties {
1139    /// Returns builder for reader properties with default values.
1140    pub fn builder() -> ReaderPropertiesBuilder {
1141        ReaderPropertiesBuilder::with_defaults()
1142    }
1143
1144    /// Returns codec options.
1145    pub(crate) fn codec_options(&self) -> &CodecOptions {
1146        &self.codec_options
1147    }
1148
1149    /// Returns whether to read bloom filter
1150    pub(crate) fn read_bloom_filter(&self) -> bool {
1151        self.read_bloom_filter
1152    }
1153}
1154
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    /// Accumulates the codec options handed to [`ReaderProperties`] on build.
    codec_options_builder: CodecOptionsBuilder,
    /// Explicit bloom filter choice; `None` means "use the default" at build time.
    read_bloom_filter: Option<bool>,
}
1161
1162/// Reader properties builder.
1163impl ReaderPropertiesBuilder {
1164    /// Returns default state of the builder.
1165    fn with_defaults() -> Self {
1166        Self {
1167            codec_options_builder: CodecOptionsBuilder::default(),
1168            read_bloom_filter: None,
1169        }
1170    }
1171
1172    /// Finalizes the configuration and returns immutable reader properties struct.
1173    pub fn build(self) -> ReaderProperties {
1174        ReaderProperties {
1175            codec_options: self.codec_options_builder.build(),
1176            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
1177        }
1178    }
1179
1180    /// Enable/disable backward compatible LZ4.
1181    ///
1182    /// If backward compatible LZ4 is enable, on LZ4_HADOOP error it will fallback
1183    /// to the older versions LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
1184    /// with files generated by older versions of this library, and LZ4_RAW, for backward
1185    /// compatibility with files generated by older versions of parquet-cpp.
1186    ///
1187    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
1188    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
1189        self.codec_options_builder = self
1190            .codec_options_builder
1191            .set_backward_compatible_lz4(value);
1192        self
1193    }
1194
1195    /// Enable/disable reading bloom filter
1196    ///
1197    /// If reading bloom filter is enabled, bloom filter will be read from the file.
1198    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
1199    ///
1200    /// By default bloom filter is set to be read.
1201    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
1202        self.read_bloom_filter = Some(value);
1203        self
1204    }
1205}
1206
#[cfg(test)]
mod tests {
    use super::*;

    /// The numeric writer versions are 1 and 2.
    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    /// Default writer properties report the `DEFAULT_*` constants.
    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert!(props
            .bloom_filter_properties(&ColumnPath::from("col"))
            .is_none());
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        // The per-column setter rejects dictionary encodings even when the
        // dictionary flag is on.
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        // ... and equally when the dictionary flag is off.
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    /// Column "a" has no overrides and sees the global settings; column "col"
    /// sees its own overrides.
    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
        assert_eq!(props.data_page_size_limit(), 10);
        assert_eq!(props.dictionary_page_size_limit(), 20);
        assert_eq!(props.write_batch_size(), 30);
        assert_eq!(props.max_row_group_size(), 40);
        assert_eq!(props.created_by(), "default");
        assert_eq!(
            props.key_value_metadata(),
            Some(&vec![
                KeyValue::new("key".to_string(), "value".to_string(),)
            ])
        );

        assert_eq!(
            props.encoding(&ColumnPath::from("a")),
            Some(Encoding::DELTA_BINARY_PACKED)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("a")),
            Compression::GZIP(Default::default())
        );
        assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("a")),
            EnabledStatistics::None
        );

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::SNAPPY
        );
        assert!(props.dictionary_enabled(&ColumnPath::from("col")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            EnabledStatistics::Chunk
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
        );
    }

    /// A column with only some overrides falls back to the global settings (or
    /// the crate defaults) for everything else.
    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    /// Setting only ndv or only fpp enables the bloom filter and leaves the
    /// other value at its default.
    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    /// Default reader properties enable backward compatible LZ4 and do not
    /// read bloom filters.
    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
    }

    /// Values set on the reader builder are reflected in the built properties.
    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        // Bloom filter reading was not configured above, so the default applies.
        assert!(!props.read_bloom_filter());

        // An explicit choice overrides the default.
        let props = ReaderProperties::builder()
            .set_read_bloom_filter(true)
            .build();
        assert!(props.read_bloom_filter());
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        //test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }
}
parquet/file/properties.rs

parquet/file/
properties.rs