parquet/file/properties.rs

// Licensed to the Apache Software Foundation (ASF) under one
// or more contributor license agreements.  See the NOTICE file
// distributed with this work for additional information
// regarding copyright ownership.  The ASF licenses this file
// to you under the Apache License, Version 2.0 (the
// "License"); you may not use this file except in compliance
// with the License.  You may obtain a copy of the License at
//
//   http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing,
// software distributed under the License is distributed on an
// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
// KIND, either express or implied.  See the License for the
// specific language governing permissions and limitations
// under the License.

//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
use std::str::FromStr;
use std::{collections::HashMap, sync::Arc};

use crate::basic::{Compression, Encoding};
use crate::compression::{CodecOptions, CodecOptionsBuilder};
use crate::file::metadata::KeyValue;
use crate::format::SortingColumn;
use crate::schema::types::ColumnPath;

/// Default value for [`WriterProperties::data_page_size_limit`]
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`]
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::max_statistics_size`]
pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
/// Default value for [`WriterProperties::max_row_group_size`]
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`]
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default value for [`WriterProperties::statistics_truncate_length`]
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;

/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
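///
/// # Example
///
/// A minimal sketch of parsing a version from a string and converting it to its
/// numeric form (the chosen version is illustrative):
///
/// ```rust
/// use parquet::file::properties::WriterVersion;
///
/// let version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
/// assert_eq!(version, WriterVersion::PARQUET_2_0);
/// assert_eq!(version.as_num(), 2);
/// ```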
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns writer version as `i32`.
    pub fn as_num(&self) -> i32 {
        match self {
            WriterVersion::PARQUET_1_0 => 1,
            WriterVersion::PARQUET_2_0 => 2,
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "PARQUET_1_0" | "parquet_1_0" => Ok(WriterVersion::PARQUET_1_0),
            "PARQUET_2_0" | "parquet_2_0" => Ok(WriterVersion::PARQUET_2_0),
            _ => Err(format!("Invalid writer version: {}", s)),
        }
    }
}

/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
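///
/// # Example
///
/// A minimal sketch of selecting the position via the builder (writing the
/// filters at the end of the file here is only an example):
///
/// ```rust
/// use parquet::file::properties::{BloomFilterPosition, WriterProperties};
///
/// let props = WriterProperties::builder()
///     .set_bloom_filter_position(BloomFilterPosition::End)
///     .build();
/// assert_eq!(props.bloom_filter_position(), BloomFilterPosition::End);
/// ```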
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing them as soon as they are computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}

/// Reference counted writer properties.
pub type WriterPropertiesPtr = Arc<WriterProperties>;

/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #    basic::{Compression, Encoding},
/// #    file::properties::*,
/// #    schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    data_page_size_limit: usize,
    dictionary_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
}

impl Default for WriterProperties {
    fn default() -> Self {
        Self::builder().build()
    }
}

impl WriterProperties {
    /// Create a new [`WriterProperties`] with the default settings
    ///
    /// See [`WriterProperties::builder`] for customising settings
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
    /// properties.
    pub fn builder() -> WriterPropertiesBuilder {
        WriterPropertiesBuilder::with_defaults()
    }

    /// Returns data page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    #[deprecated(since = "41.0.0", note = "Use data_page_size_limit")]
    pub fn data_pagesize_limit(&self) -> usize {
        self.data_page_size_limit
    }

    /// Returns data page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
    pub fn data_page_size_limit(&self) -> usize {
        self.data_page_size_limit
    }

    /// Returns dictionary page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    #[deprecated(since = "41.0.0", note = "Use dictionary_page_size_limit")]
    pub fn dictionary_pagesize_limit(&self) -> usize {
        self.dictionary_page_size_limit
    }

    /// Returns dictionary page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
    pub fn dictionary_page_size_limit(&self) -> usize {
        self.dictionary_page_size_limit
    }

    /// Returns the maximum page row count
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
    pub fn data_page_row_count_limit(&self) -> usize {
        self.data_page_row_count_limit
    }

    /// Returns configured batch size for writes.
    ///
    /// When writing a batch of data, this setting allows splitting it internally into
    /// smaller batches so that the size of the page currently being written can be
    /// estimated more accurately.
    pub fn write_batch_size(&self) -> usize {
        self.write_batch_size
    }

    /// Returns maximum number of rows in a row group.
    pub fn max_row_group_size(&self) -> usize {
        self.max_row_group_size
    }

    /// Returns where in the final file Bloom Filters are written.
    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
        self.bloom_filter_position
    }

    /// Returns configured writer version.
    pub fn writer_version(&self) -> WriterVersion {
        self.writer_version
    }

    /// Returns `created_by` string.
    pub fn created_by(&self) -> &str {
        &self.created_by
    }

    /// Returns `key_value_metadata` KeyValue pairs.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns sorting columns.
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the maximum length of truncated min/max values in the column index.
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    pub fn column_index_truncate_length(&self) -> Option<usize> {
        self.column_index_truncate_length
    }

    /// Returns the maximum length of truncated min/max values in statistics.
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    pub fn statistics_truncate_length(&self) -> Option<usize> {
        self.statistics_truncate_length
    }

    /// Returns encoding for a data page, when dictionary encoding is enabled.
    /// This is not configurable.
    #[inline]
    pub fn dictionary_data_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
        // Dictionary values are encoded using RLE_DICTIONARY encoding.
        Encoding::RLE_DICTIONARY
    }

    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
    /// This is not configurable.
    #[inline]
    pub fn dictionary_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY is deprecated in writer version 1.
        // Dictionary is encoded using plain encoding.
        Encoding::PLAIN
    }

    /// Returns encoding for a column, if set.
    /// If dictionary encoding is enabled, this returns the fallback encoding.
    ///
    /// If no encoding is set, the column writer will choose the best encoding
    /// based on the column type.
    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
        self.column_properties
            .get(col)
            .and_then(|c| c.encoding())
            .or_else(|| self.default_column_properties.encoding())
    }

    /// Returns compression codec for a column.
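    ///
    /// A column-specific codec, if set, takes precedence over the default; a minimal
    /// sketch of the lookup order (the column names are illustrative):
    ///
    /// ```rust
    /// use parquet::basic::Compression;
    /// use parquet::file::properties::WriterProperties;
    /// use parquet::schema::types::ColumnPath;
    ///
    /// let props = WriterProperties::builder()
    ///     .set_compression(Compression::SNAPPY)
    ///     .set_column_compression(ColumnPath::from("col"), Compression::UNCOMPRESSED)
    ///     .build();
    ///
    /// // The per-column setting wins for "col"; everything else gets the default.
    /// assert_eq!(props.compression(&ColumnPath::from("col")), Compression::UNCOMPRESSED);
    /// assert_eq!(props.compression(&ColumnPath::from("other")), Compression::SNAPPY);
    /// ```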
    pub fn compression(&self, col: &ColumnPath) -> Compression {
        self.column_properties
            .get(col)
            .and_then(|c| c.compression())
            .or_else(|| self.default_column_properties.compression())
            .unwrap_or(DEFAULT_COMPRESSION)
    }

    /// Returns `true` if dictionary encoding is enabled for a column.
    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
        self.column_properties
            .get(col)
            .and_then(|c| c.dictionary_enabled())
            .or_else(|| self.default_column_properties.dictionary_enabled())
            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
    }

    /// Returns which statistics are written for a column.
    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
        self.column_properties
            .get(col)
            .and_then(|c| c.statistics_enabled())
            .or_else(|| self.default_column_properties.statistics_enabled())
            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
    }

    /// Returns max size for statistics.
    /// Only applicable if statistics are enabled.
    pub fn max_statistics_size(&self, col: &ColumnPath) -> usize {
        self.column_properties
            .get(col)
            .and_then(|c| c.max_statistics_size())
            .or_else(|| self.default_column_properties.max_statistics_size())
            .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
    }

    /// Returns the [`BloomFilterProperties`] for the given column
    ///
    /// Returns `None` if bloom filter is disabled
    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
        self.column_properties
            .get(col)
            .and_then(|c| c.bloom_filter_properties())
            .or_else(|| self.default_column_properties.bloom_filter_properties())
    }
}

/// Builder for [`WriterProperties`] parquet writer configuration.
///
/// See example on [`WriterProperties`]
pub struct WriterPropertiesBuilder {
    data_page_size_limit: usize,
    dictionary_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
}

impl WriterPropertiesBuilder {
    /// Returns default state of the builder.
    fn with_defaults() -> Self {
        Self {
            data_page_size_limit: DEFAULT_PAGE_SIZE,
            dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
            writer_version: DEFAULT_WRITER_VERSION,
            created_by: DEFAULT_CREATED_BY.to_string(),
            key_value_metadata: None,
            default_column_properties: Default::default(),
            column_properties: HashMap::new(),
            sorting_columns: None,
            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
        }
    }

    /// Finalizes the configuration and returns immutable writer properties struct.
    pub fn build(self) -> WriterProperties {
        WriterProperties {
            data_page_size_limit: self.data_page_size_limit,
            dictionary_page_size_limit: self.dictionary_page_size_limit,
            data_page_row_count_limit: self.data_page_row_count_limit,
            write_batch_size: self.write_batch_size,
            max_row_group_size: self.max_row_group_size,
            bloom_filter_position: self.bloom_filter_position,
            writer_version: self.writer_version,
            created_by: self.created_by,
            key_value_metadata: self.key_value_metadata,
            default_column_properties: self.default_column_properties,
            column_properties: self.column_properties,
            sorting_columns: self.sorting_columns,
            column_index_truncate_length: self.column_index_truncate_length,
            statistics_truncate_length: self.statistics_truncate_length,
        }
    }

    // ----------------------------------------------------------------------
    // Writer properties related to a file

    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`])
    ///
    /// This value can determine what features some readers will support.
    ///
    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
        self.writer_version = value;
        self
    }

    /// Sets best effort maximum size of a data page in bytes.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    #[deprecated(since = "41.0.0", note = "Use set_data_page_size_limit")]
    pub fn set_data_pagesize_limit(mut self, value: usize) -> Self {
        self.data_page_size_limit = value;
        self
    }

    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`).
    ///
    /// The parquet writer will attempt to limit the sizes of each
    /// `DataPage` to this many bytes. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
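    ///
    /// # Example
    ///
    /// A minimal sketch (the 1 MiB limit shown is just the default made explicit):
    ///
    /// ```rust
    /// use parquet::file::properties::WriterProperties;
    ///
    /// let props = WriterProperties::builder()
    ///     .set_data_page_size_limit(1024 * 1024)
    ///     .build();
    /// assert_eq!(props.data_page_size_limit(), 1024 * 1024);
    /// ```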
    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
        self.data_page_size_limit = value;
        self
    }

    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`).
    ///
    /// The parquet writer will attempt to limit the number of rows in
    /// each `DataPage` to this value. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
        self.data_page_row_count_limit = value;
        self
    }

    /// Sets best effort maximum dictionary page size, in bytes.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    #[deprecated(since = "41.0.0", note = "Use set_dictionary_page_size_limit")]
    pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self {
        self.dictionary_page_size_limit = value;
        self
    }

    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`).
    ///
    /// The parquet writer will attempt to limit the size of each
    /// `DataPage` used to store dictionaries to this many
    /// bytes. Reducing this value will result in larger parquet
    /// files, but may improve the effectiveness of page index based
    /// predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
        self.dictionary_page_size_limit = value;
        self
    }

    /// Sets write batch size (defaults to 1024).
    ///
    /// For performance reasons, data for each column is written in
    /// batches of this size.
    ///
    /// Additional limits such as
    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
    /// are checked between batches, so the write batch size acts as an
    /// upper bound on the enforcement granularity of other limits.
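    ///
    /// # Example
    ///
    /// A minimal sketch (the batch size of 2048 is arbitrary):
    ///
    /// ```rust
    /// use parquet::file::properties::WriterProperties;
    ///
    /// let props = WriterProperties::builder()
    ///     .set_write_batch_size(2048)
    ///     .build();
    /// assert_eq!(props.write_batch_size(), 2048);
    /// ```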
    pub fn set_write_batch_size(mut self, value: usize) -> Self {
        self.write_batch_size = value;
        self
    }

    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`).
    ///
    /// # Panics
    /// If the value is set to 0.
    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
        assert!(value > 0, "Cannot have a 0 max row group size");
        self.max_row_group_size = value;
        self
    }

    /// Sets where in the final file Bloom Filters are written (default `AfterRowGroup`)
    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
        self.bloom_filter_position = value;
        self
    }

    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
    pub fn set_created_by(mut self, value: String) -> Self {
        self.created_by = value;
        self
    }

    /// Sets "key_value_metadata" property (defaults to `None`).
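    ///
    /// # Example
    ///
    /// A minimal sketch (the key/value pair is illustrative):
    ///
    /// ```rust
    /// use parquet::file::metadata::KeyValue;
    /// use parquet::file::properties::WriterProperties;
    ///
    /// let props = WriterProperties::builder()
    ///     .set_key_value_metadata(Some(vec![KeyValue::new(
    ///         "app".to_string(),
    ///         "example".to_string(),
    ///     )]))
    ///     .build();
    /// assert_eq!(props.key_value_metadata().unwrap().len(), 1);
    /// ```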
    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
        self.key_value_metadata = value;
        self
    }

    /// Sets sorting order of rows in the row group if any (defaults to `None`).
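    ///
    /// # Example
    ///
    /// A minimal sketch declaring that row groups are sorted by the first leaf
    /// column in ascending order (the field values are illustrative; this is
    /// metadata only and the writer does not verify the data is actually sorted):
    ///
    /// ```rust
    /// use parquet::file::properties::WriterProperties;
    /// use parquet::format::SortingColumn;
    ///
    /// let sorting = SortingColumn {
    ///     column_idx: 0,
    ///     descending: false,
    ///     nulls_first: false,
    /// };
    /// let props = WriterProperties::builder()
    ///     .set_sorting_columns(Some(vec![sorting]))
    ///     .build();
    /// assert_eq!(props.sorting_columns().unwrap().len(), 1);
    /// ```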
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.sorting_columns = value;
        self
    }

    // ----------------------------------------------------------------------
    // Setters for any column (global)

    /// Sets default encoding for all columns.
    ///
    /// If dictionary encoding is not enabled, this is treated as the primary encoding
    /// for all columns. If dictionary encoding is enabled for a column, this value is
    /// used as the fallback encoding for that column.
    ///
    /// # Panics
    ///
    /// If a dictionary encoding (`PLAIN_DICTIONARY` or `RLE_DICTIONARY`) is specified,
    /// regardless of whether the dictionary encoding flag is set.
    pub fn set_encoding(mut self, value: Encoding) -> Self {
        self.default_column_properties.set_encoding(value);
        self
    }

    /// Sets default compression codec for all columns (defaults to [`UNCOMPRESSED`]).
    ///
    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.default_column_properties.set_compression(value);
        self
    }

    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`).
    ///
    /// Use this method to set dictionary encoding, instead of explicitly specifying
    /// encoding in `set_encoding` method.
    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
        self.default_column_properties.set_dictionary_enabled(value);
        self
    }

    /// Sets default statistics level for all columns (defaults to [`Page`]).
    ///
    /// [`Page`]: EnabledStatistics::Page
    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
        self.default_column_properties.set_statistics_enabled(value);
        self
    }

    /// Sets default max statistics size for all columns (defaults to `4096`).
    ///
    /// Applicable only if statistics are enabled.
    pub fn set_max_statistics_size(mut self, value: usize) -> Self {
        self.default_column_properties
            .set_max_statistics_size(value);
        self
    }

    /// Sets whether bloom filters are enabled by default for all columns (defaults to `false`).
    ///
    /// # Notes
    ///
    /// * If the bloom filter was previously enabled, this is a no-op.
    ///
    /// * If the bloom filter was not previously enabled, the default values for ndv
    ///   and fpp are used. See [`set_bloom_filter_ndv`] and [`set_bloom_filter_fpp`]
    ///   to further adjust the ndv and fpp.
    ///
    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
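    ///
    /// # Example
    ///
    /// A minimal sketch enabling bloom filters globally with the default fpp and ndv
    /// (the column name used in the lookup is illustrative):
    ///
    /// ```rust
    /// use parquet::file::properties::{WriterProperties, DEFAULT_BLOOM_FILTER_FPP, DEFAULT_BLOOM_FILTER_NDV};
    /// use parquet::schema::types::ColumnPath;
    ///
    /// let props = WriterProperties::builder()
    ///     .set_bloom_filter_enabled(true)
    ///     .build();
    /// let bf = props.bloom_filter_properties(&ColumnPath::from("col")).unwrap();
    /// assert_eq!(bf.fpp, DEFAULT_BLOOM_FILTER_FPP);
    /// assert_eq!(bf.ndv, DEFAULT_BLOOM_FILTER_NDV);
    /// ```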
    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
        self.default_column_properties
            .set_bloom_filter_enabled(value);
        self
    }

    /// Sets the default target bloom filter false positive probability (fpp)
    /// for all columns (defaults to `0.05`).
    ///
    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
    /// been called.
    ///
    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
        self.default_column_properties.set_bloom_filter_fpp(value);
        self
    }

    /// Sets default number of distinct values (ndv) for bloom filter for all
    /// columns (defaults to `1_000_000`).
    ///
    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
    /// been called.
    ///
    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
        self.default_column_properties.set_bloom_filter_ndv(value);
        self
    }

    // ----------------------------------------------------------------------
    // Setters for a specific column

    /// Helper method to get existing or new mutable reference of column properties.
    #[inline]
    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
        self.column_properties.entry(col).or_default()
    }

    /// Sets encoding for a specific column.
    ///
    /// Takes precedence over [`Self::set_encoding`].
    ///
    /// If dictionary encoding is not enabled, this is treated as the primary encoding
    /// for this column. If dictionary encoding is enabled for this column, either
    /// through global defaults or explicitly, this value is used as the fallback
    /// encoding for this column.
    ///
    /// # Panics
    /// If a dictionary encoding is specified, regardless of whether the dictionary
    /// encoding flag is set.
    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
        self.get_mut_props(col).set_encoding(value);
        self
    }

    /// Sets compression codec for a specific column.
    ///
    /// Takes precedence over [`Self::set_compression`].
    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
        self.get_mut_props(col).set_compression(value);
        self
    }

    /// Sets flag to enable/disable dictionary encoding for a specific column.
    ///
    /// Takes precedence over [`Self::set_dictionary_enabled`].
    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_dictionary_enabled(value);
        self
    }

    /// Sets statistics level for a specific column.
    ///
    /// Takes precedence over [`Self::set_statistics_enabled`].
    pub fn set_column_statistics_enabled(
        mut self,
        col: ColumnPath,
        value: EnabledStatistics,
    ) -> Self {
        self.get_mut_props(col).set_statistics_enabled(value);
        self
    }

    /// Sets max size for statistics for a specific column.
    ///
    /// Takes precedence over [`Self::set_max_statistics_size`].
    pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self {
        self.get_mut_props(col).set_max_statistics_size(value);
        self
    }

    /// Sets whether a bloom filter should be written for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_bloom_filter_enabled(value);
        self
    }

    /// Sets the false positive probability for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
        self.get_mut_props(col).set_bloom_filter_fpp(value);
        self
    }

    /// Sets the number of distinct values for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
        self.get_mut_props(col).set_bloom_filter_ndv(value);
        self
    }

    /// Sets the max length of min/max value fields when writing the column
    /// [`Index`] (defaults to `None`).
    ///
    /// This can be used to prevent columns with very long values (hundreds of
    /// bytes long) from causing the parquet metadata to become huge.
    ///
    /// # Notes
    ///
    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// [`Index`]: crate::file::page_index::index::Index
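    ///
    /// # Example
    ///
    /// A minimal sketch (the 64-byte limit shown is just the default made explicit):
    ///
    /// ```rust
    /// use parquet::file::properties::WriterProperties;
    ///
    /// let props = WriterProperties::builder()
    ///     .set_column_index_truncate_length(Some(64))
    ///     .build();
    /// assert_eq!(props.column_index_truncate_length(), Some(64));
    /// ```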
    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.column_index_truncate_length = max_length;
        self
    }

    /// Sets the max length of min/max value fields in row group level
    /// [`Statistics`] (defaults to `None`).
    ///
    /// # Notes
    /// Row group level [`Statistics`] are written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.statistics_truncate_length = max_length;
        self
    }
}

/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
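///
/// # Example
///
/// A minimal sketch of restricting statistics to the chunk level (the choice of
/// level and the column name are illustrative):
///
/// ```rust
/// use parquet::file::properties::{EnabledStatistics, WriterProperties};
/// use parquet::schema::types::ColumnPath;
///
/// let props = WriterProperties::builder()
///     .set_statistics_enabled(EnabledStatistics::Chunk)
///     .build();
/// assert_eq!(
///     props.statistics_enabled(&ColumnPath::from("col")),
///     EnabledStatistics::Chunk
/// );
/// ```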
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each row group. The more row groups written, the more
    /// statistics will be stored.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// Setting this option will store one set of statistics for each relevant
    /// column for each page and row group. The more row groups and the more
    /// pages written, the more statistics will be stored.
    Page,
}

impl FromStr for EnabledStatistics {
    type Err = String;

    fn from_str(s: &str) -> Result<Self, Self::Err> {
        match s {
            "NONE" | "none" => Ok(EnabledStatistics::None),
            "CHUNK" | "chunk" => Ok(EnabledStatistics::Chunk),
            "PAGE" | "page" => Ok(EnabledStatistics::Page),
            _ => Err(format!("Invalid statistics arg: {}", s)),
        }
    }
}

impl Default for EnabledStatistics {
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}

/// Controls the bloom filter to be computed by the writer.
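///
/// # Example
///
/// A minimal sketch showing how these properties are derived from the builder
/// settings (the fpp/ndv values and column name are illustrative):
///
/// ```rust
/// use parquet::file::properties::{BloomFilterProperties, WriterProperties};
/// use parquet::schema::types::ColumnPath;
///
/// let props = WriterProperties::builder()
///     .set_bloom_filter_fpp(0.01)
///     .set_bloom_filter_ndv(10_000)
///     .build();
/// assert_eq!(
///     props.bloom_filter_properties(&ColumnPath::from("col")),
///     Some(&BloomFilterProperties { fpp: 0.01, ndv: 10_000 })
/// );
/// ```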
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability; should always be between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp: the
    /// smaller the fpp, the more memory and disk space is required, so setting it to a reasonable value
    /// such as 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting it to a very small number diminishes the value of the filter itself, as the bitset size
    /// becomes even larger than just storing the whole value. You should also set `ndv` if it can
    /// be known in advance, in order to greatly reduce space usage.
    pub fpp: f64,
    /// Number of distinct values; should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Bloom filters are most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to the number of rows. However, disk size can be reduced if a smaller
    /// number of distinct values is known in advance. For very small ndv values it is probably
    /// not worth using a bloom filter at all.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}

impl Default for BloomFilterProperties {
    fn default() -> Self {
        BloomFilterProperties {
            fpp: DEFAULT_BLOOM_FILTER_FPP,
            ndv: DEFAULT_BLOOM_FILTER_NDV,
        }
    }
}

/// Container for column properties that can be changed as part of the writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    encoding: Option<Encoding>,
    codec: Option<Compression>,
    dictionary_enabled: Option<bool>,
    statistics_enabled: Option<EnabledStatistics>,
    max_statistics_size: Option<usize>,
    /// bloom filter related properties
    bloom_filter_properties: Option<BloomFilterProperties>,
}

impl ColumnProperties {
    /// Sets encoding for this column.
    ///
    /// If dictionary encoding is not enabled, this is treated as the primary encoding
    /// for the column. If dictionary encoding is enabled for the column, this value
    /// is used as the fallback encoding.
    ///
    /// Panics if a dictionary encoding is specified here, regardless of whether the
    /// dictionary encoding flag is set. Use the `set_dictionary_enabled` method to
    /// enable dictionary encoding for a column.
    fn set_encoding(&mut self, value: Encoding) {
        if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
            panic!("Dictionary encoding can not be used as fallback encoding");
        }
        self.encoding = Some(value);
    }

    /// Sets compression codec for this column.
    fn set_compression(&mut self, value: Compression) {
        self.codec = Some(value);
    }

    /// Sets whether or not dictionary encoding is enabled for this column.
    fn set_dictionary_enabled(&mut self, enabled: bool) {
        self.dictionary_enabled = Some(enabled);
    }

    /// Sets whether or not statistics are enabled for this column.
    fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
        self.statistics_enabled = Some(enabled);
    }

    /// Sets max size for statistics for this column.
    fn set_max_statistics_size(&mut self, value: usize) {
        self.max_statistics_size = Some(value);
    }

    /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
    /// otherwise it is a no-op.
    /// If `value` is `false`, resets bloom filter properties to `None`.
    fn set_bloom_filter_enabled(&mut self, value: bool) {
        if value && self.bloom_filter_properties.is_none() {
            self.bloom_filter_properties = Some(Default::default())
        } else if !value {
            self.bloom_filter_properties = None
        }
    }

    /// Sets the false positive probability for bloom filter for this column, and implicitly enables
    /// bloom filter if not previously enabled.
    ///
    /// # Panics
    ///
    /// Panics if the `value` is not between 0 and 1 exclusive
    fn set_bloom_filter_fpp(&mut self, value: f64) {
        assert!(
            value > 0. && value < 1.0,
            "fpp must be between 0 and 1 exclusive, got {value}"
        );

        self.bloom_filter_properties
            .get_or_insert_with(Default::default)
            .fpp = value;
    }

    /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
    /// enables bloom filter if not previously enabled.
    fn set_bloom_filter_ndv(&mut self, value: u64) {
        self.bloom_filter_properties
            .get_or_insert_with(Default::default)
            .ndv = value;
    }

    /// Returns optional encoding for this column.
    fn encoding(&self) -> Option<Encoding> {
        self.encoding
    }

    /// Returns optional compression codec for this column.
    fn compression(&self) -> Option<Compression> {
        self.codec
    }

    /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
    /// disabled then returns `Some(false)`. If result is `None`, then no setting has
    /// been provided.
    fn dictionary_enabled(&self) -> Option<bool> {
        self.dictionary_enabled
    }

    /// Returns the statistics level for this column, if set. If the result is `None`,
    /// then no setting has been provided.
    fn statistics_enabled(&self) -> Option<EnabledStatistics> {
        self.statistics_enabled
    }

    /// Returns optional max size in bytes for statistics.
    fn max_statistics_size(&self) -> Option<usize> {
        self.max_statistics_size
    }

    /// Returns the bloom filter properties, or `None` if not enabled
    fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
        self.bloom_filter_properties.as_ref()
    }
}

/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

const DEFAULT_READ_BLOOM_FILTER: bool = false;

/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    codec_options: CodecOptions,
    read_bloom_filter: bool,
}

impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether to read bloom filter
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }
}

/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    codec_options_builder: CodecOptionsBuilder,
    read_bloom_filter: Option<bool>,
}

/// Reader properties builder.
impl ReaderPropertiesBuilder {
    /// Returns default state of the builder.
    fn with_defaults() -> Self {
        Self {
            codec_options_builder: CodecOptionsBuilder::default(),
            read_bloom_filter: None,
        }
    }

    /// Finalizes the configuration and returns immutable reader properties struct.
    pub fn build(self) -> ReaderProperties {
        ReaderProperties {
            codec_options: self.codec_options_builder.build(),
            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
        }
    }

    /// Enable/disable backward compatible LZ4.
    ///
    /// If backward compatible LZ4 is enabled, on an LZ4_HADOOP error the reader will
    /// fall back to older LZ4 algorithms: LZ4_FRAME, for backward compatibility with
    /// files generated by older versions of this library, and LZ4_RAW, for backward
    /// compatibility with files generated by older versions of parquet-cpp.
    ///
    /// If backward compatible LZ4 is disabled, an LZ4_HADOOP error is returned as-is.
    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
        self.codec_options_builder = self
            .codec_options_builder
            .set_backward_compatible_lz4(value);
        self
    }

    /// Enable/disable reading bloom filters.
    ///
    /// If enabled, bloom filters will be read from the file.
    /// If disabled, bloom filters will not be read from the file.
    ///
    /// By default, bloom filters are not read.
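    ///
    /// # Example
    ///
    /// A minimal sketch opting in to reading bloom filters (whether that pays off
    /// depends on the file and the queries being run):
    ///
    /// ```rust
    /// use parquet::file::properties::ReaderProperties;
    ///
    /// let props = ReaderProperties::builder()
    ///     .set_read_bloom_filter(true)
    ///     .build();
    /// ```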
    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
        self.read_bloom_filter = Some(value);
        self
    }
}

#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert_eq!(
            props.max_statistics_size(&ColumnPath::from("col")),
            DEFAULT_MAX_STATISTICS_SIZE
        );
        assert!(props
            .bloom_filter_properties(&ColumnPath::from("col"))
            .is_none());
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            .set_max_statistics_size(50)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_max_statistics_size(ColumnPath::from("col"), 123)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
        assert_eq!(props.data_page_size_limit(), 10);
        assert_eq!(props.dictionary_page_size_limit(), 20);
        assert_eq!(props.write_batch_size(), 30);
        assert_eq!(props.max_row_group_size(), 40);
        assert_eq!(props.created_by(), "default");
        assert_eq!(
            props.key_value_metadata(),
            Some(&vec![
                KeyValue::new("key".to_string(), "value".to_string(),)
            ])
        );

        assert_eq!(
            props.encoding(&ColumnPath::from("a")),
            Some(Encoding::DELTA_BINARY_PACKED)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("a")),
            Compression::GZIP(Default::default())
        );
        assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("a")),
            EnabledStatistics::None
        );
        assert_eq!(props.max_statistics_size(&ColumnPath::from("a")), 50);

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::SNAPPY
        );
        assert!(props.dictionary_enabled(&ColumnPath::from("col")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            EnabledStatistics::Chunk
        );
        assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123);
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
        );
    }

    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        // test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }
}
1377}