parquet/file/properties.rs
1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements. See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership. The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License. You may obtain a copy of the License at
8//
9// http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied. See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Configuration via [`WriterProperties`] and [`ReaderProperties`]
19use std::str::FromStr;
20use std::{collections::HashMap, sync::Arc};
21
22use crate::basic::{Compression, Encoding};
23use crate::compression::{CodecOptions, CodecOptionsBuilder};
24use crate::file::metadata::KeyValue;
25use crate::format::SortingColumn;
26use crate::schema::types::ColumnPath;
27
/// Default value for [`WriterProperties::data_page_size_limit`] (1 MiB)
pub const DEFAULT_PAGE_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::write_batch_size`]
pub const DEFAULT_WRITE_BATCH_SIZE: usize = 1024;
/// Default value for [`WriterProperties::writer_version`]
pub const DEFAULT_WRITER_VERSION: WriterVersion = WriterVersion::PARQUET_1_0;
/// Default value for [`WriterProperties::compression`]
pub const DEFAULT_COMPRESSION: Compression = Compression::UNCOMPRESSED;
/// Default value for [`WriterProperties::dictionary_enabled`]
pub const DEFAULT_DICTIONARY_ENABLED: bool = true;
/// Default value for [`WriterProperties::dictionary_page_size_limit`] (same as data pages)
pub const DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT: usize = DEFAULT_PAGE_SIZE;
/// Default value for [`WriterProperties::data_page_row_count_limit`]
pub const DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT: usize = 20_000;
/// Default value for [`WriterProperties::statistics_enabled`]
pub const DEFAULT_STATISTICS_ENABLED: EnabledStatistics = EnabledStatistics::Page;
/// Default value for [`WriterProperties::max_statistics_size`]
pub const DEFAULT_MAX_STATISTICS_SIZE: usize = 4096;
/// Default value for [`WriterProperties::max_row_group_size`], in rows
pub const DEFAULT_MAX_ROW_GROUP_SIZE: usize = 1024 * 1024;
/// Default value for [`WriterProperties::bloom_filter_position`]
pub const DEFAULT_BLOOM_FILTER_POSITION: BloomFilterPosition = BloomFilterPosition::AfterRowGroup;
/// Default value for [`WriterProperties::created_by`], derived from this crate's version
pub const DEFAULT_CREATED_BY: &str = concat!("parquet-rs version ", env!("CARGO_PKG_VERSION"));
/// Default value for [`WriterProperties::column_index_truncate_length`]
pub const DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH: Option<usize> = Some(64);
/// Default value for [`BloomFilterProperties::fpp`]
pub const DEFAULT_BLOOM_FILTER_FPP: f64 = 0.05;
/// Default value for [`BloomFilterProperties::ndv`]
pub const DEFAULT_BLOOM_FILTER_NDV: u64 = 1_000_000_u64;
/// Default values for [`WriterProperties::statistics_truncate_length`] (`None` = no truncation)
pub const DEFAULT_STATISTICS_TRUNCATE_LENGTH: Option<usize> = None;
60
/// Parquet writer version.
///
/// Basic constant, which is not part of the Thrift definition.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[allow(non_camel_case_types)]
pub enum WriterVersion {
    /// Parquet format version 1.0
    PARQUET_1_0,
    /// Parquet format version 2.0
    PARQUET_2_0,
}

impl WriterVersion {
    /// Returns writer version as `i32`.
    pub fn as_num(&self) -> i32 {
        // Each format version maps to its numeric counterpart.
        if matches!(self, WriterVersion::PARQUET_1_0) {
            1
        } else {
            2
        }
    }
}

impl FromStr for WriterVersion {
    type Err = String;

    /// Parses a version from its name; only the exact upper- or
    /// lower-case spellings are accepted (mixed case is rejected).
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s == "PARQUET_1_0" || s == "parquet_1_0" {
            Ok(WriterVersion::PARQUET_1_0)
        } else if s == "PARQUET_2_0" || s == "parquet_2_0" {
            Ok(WriterVersion::PARQUET_2_0)
        } else {
            Err(format!("Invalid writer version: {}", s))
        }
    }
}
94
/// Where in the file [`ArrowWriter`](crate::arrow::arrow_writer::ArrowWriter) should
/// write Bloom filters
///
/// Basic constant, which is not part of the Thrift definition.
///
/// The default is [`BloomFilterPosition::AfterRowGroup`]
/// (see [`DEFAULT_BLOOM_FILTER_POSITION`]).
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum BloomFilterPosition {
    /// Write Bloom Filters of each row group right after the row group
    ///
    /// This saves memory by writing it as soon as it is computed, at the cost
    /// of data locality for readers
    AfterRowGroup,
    /// Write Bloom Filters at the end of the file
    ///
    /// This allows better data locality for readers, at the cost of memory usage
    /// for writers.
    End,
}
112
/// Reference counted writer properties, cheap to clone and share between
/// writers and threads.
pub type WriterPropertiesPtr = Arc<WriterProperties>;
115
/// Configuration settings for writing parquet files.
///
/// Use [`Self::builder`] to create a [`WriterPropertiesBuilder`] to change settings.
///
/// # Example
///
/// ```rust
/// # use parquet::{
/// #     basic::{Compression, Encoding},
/// #     file::properties::*,
/// #     schema::types::ColumnPath,
/// # };
/// #
/// // Create properties with default configuration.
/// let props = WriterProperties::default();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = WriterProperties::builder()
///     .set_writer_version(WriterVersion::PARQUET_1_0)
///     .set_encoding(Encoding::PLAIN)
///     .set_column_encoding(ColumnPath::from("col1"), Encoding::DELTA_BINARY_PACKED)
///     .set_compression(Compression::SNAPPY)
///     .build();
///
/// assert_eq!(props.writer_version(), WriterVersion::PARQUET_1_0);
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col1")),
///     Some(Encoding::DELTA_BINARY_PACKED)
/// );
/// assert_eq!(
///     props.encoding(&ColumnPath::from("col2")),
///     Some(Encoding::PLAIN)
/// );
/// ```
#[derive(Debug, Clone)]
pub struct WriterProperties {
    // Best-effort maximum size of a data page, in bytes.
    data_page_size_limit: usize,
    // Best-effort maximum size of a dictionary page, in bytes.
    dictionary_page_size_limit: usize,
    // Best-effort maximum number of rows in a data page.
    data_page_row_count_limit: usize,
    // Rows written per internal batch; limits are checked between batches.
    write_batch_size: usize,
    // Maximum number of rows in a row group.
    max_row_group_size: usize,
    // Where Bloom Filters are written in the final file.
    bloom_filter_position: BloomFilterPosition,
    // Format version recorded in the file metadata.
    writer_version: WriterVersion,
    // "created by" string recorded in the file metadata.
    created_by: String,
    // Optional application-defined key/value metadata.
    pub(crate) key_value_metadata: Option<Vec<KeyValue>>,
    // Fallback settings used when a column has no specific override.
    default_column_properties: ColumnProperties,
    // Per-column overrides, keyed by column path.
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    // Optional declared sort order of rows in each row group.
    sorting_columns: Option<Vec<SortingColumn>>,
    // Max length of min/max values in the column index; `None` disables truncation.
    column_index_truncate_length: Option<usize>,
    // Max length of min/max values in statistics; `None` disables truncation.
    statistics_truncate_length: Option<usize>,
}
167
168impl Default for WriterProperties {
169 fn default() -> Self {
170 Self::builder().build()
171 }
172}
173
impl WriterProperties {
    /// Create a new [`WriterProperties`] with the default settings
    ///
    /// See [`WriterProperties::builder`] for customising settings
    pub fn new() -> Self {
        Self::default()
    }

    /// Returns a new default [`WriterPropertiesBuilder`] for creating writer
    /// properties.
    pub fn builder() -> WriterPropertiesBuilder {
        WriterPropertiesBuilder::with_defaults()
    }

    /// Returns data page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    #[deprecated(since = "41.0.0", note = "Use data_page_size_limit")]
    pub fn data_pagesize_limit(&self) -> usize {
        self.data_page_size_limit
    }

    /// Returns data page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_size_limit`]
    pub fn data_page_size_limit(&self) -> usize {
        self.data_page_size_limit
    }

    /// Returns dictionary page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    #[deprecated(since = "41.0.0", note = "Use dictionary_page_size_limit")]
    pub fn dictionary_pagesize_limit(&self) -> usize {
        self.dictionary_page_size_limit
    }

    /// Returns dictionary page size limit.
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_dictionary_page_size_limit`]
    pub fn dictionary_page_size_limit(&self) -> usize {
        self.dictionary_page_size_limit
    }

    /// Returns the maximum page row count
    ///
    /// Note: this is a best effort limit based on the write batch size
    ///
    /// For more details see [`WriterPropertiesBuilder::set_data_page_row_count_limit`]
    pub fn data_page_row_count_limit(&self) -> usize {
        self.data_page_row_count_limit
    }

    /// Returns configured batch size for writes.
    ///
    /// When writing a batch of data, this setting allows to split it internally into
    /// smaller batches so we can better estimate the size of a page currently being
    /// written.
    pub fn write_batch_size(&self) -> usize {
        self.write_batch_size
    }

    /// Returns maximum number of rows in a row group.
    pub fn max_row_group_size(&self) -> usize {
        self.max_row_group_size
    }

    /// Returns where in the final file Bloom Filters are written.
    ///
    /// For more details see [`WriterPropertiesBuilder::set_bloom_filter_position`]
    pub fn bloom_filter_position(&self) -> BloomFilterPosition {
        self.bloom_filter_position
    }

    /// Returns configured writer version.
    pub fn writer_version(&self) -> WriterVersion {
        self.writer_version
    }

    /// Returns `created_by` string.
    pub fn created_by(&self) -> &str {
        &self.created_by
    }

    /// Returns `key_value_metadata` KeyValue pairs.
    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
        self.key_value_metadata.as_ref()
    }

    /// Returns sorting columns.
    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
        self.sorting_columns.as_ref()
    }

    /// Returns the maximum length of truncated min/max values in the column index.
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    pub fn column_index_truncate_length(&self) -> Option<usize> {
        self.column_index_truncate_length
    }

    /// Returns the maximum length of truncated min/max values in statistics.
    ///
    /// `None` if truncation is disabled, must be greater than 0 otherwise.
    pub fn statistics_truncate_length(&self) -> Option<usize> {
        self.statistics_truncate_length
    }

    /// Returns encoding for a data page, when dictionary encoding is enabled.
    /// This is not configurable.
    #[inline]
    pub fn dictionary_data_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY encoding is deprecated in writer version 1.
        // Dictionary values are encoded using RLE_DICTIONARY encoding.
        Encoding::RLE_DICTIONARY
    }

    /// Returns encoding for dictionary page, when dictionary encoding is enabled.
    /// This is not configurable.
    #[inline]
    pub fn dictionary_page_encoding(&self) -> Encoding {
        // PLAIN_DICTIONARY is deprecated in writer version 1.
        // Dictionary is encoded using plain encoding.
        Encoding::PLAIN
    }

    /// Returns encoding for a column, if set.
    /// In case when dictionary is enabled, returns fallback encoding.
    ///
    /// If encoding is not set, then column writer will choose the best encoding
    /// based on the column type.
    pub fn encoding(&self, col: &ColumnPath) -> Option<Encoding> {
        // Per-column override first, then the global default (which may be unset).
        self.column_properties
            .get(col)
            .and_then(|c| c.encoding())
            .or_else(|| self.default_column_properties.encoding())
    }

    /// Returns compression codec for a column.
    ///
    /// Falls back to [`DEFAULT_COMPRESSION`] when neither the column nor the
    /// global default specifies a codec.
    pub fn compression(&self, col: &ColumnPath) -> Compression {
        self.column_properties
            .get(col)
            .and_then(|c| c.compression())
            .or_else(|| self.default_column_properties.compression())
            .unwrap_or(DEFAULT_COMPRESSION)
    }

    /// Returns `true` if dictionary encoding is enabled for a column.
    pub fn dictionary_enabled(&self, col: &ColumnPath) -> bool {
        self.column_properties
            .get(col)
            .and_then(|c| c.dictionary_enabled())
            .or_else(|| self.default_column_properties.dictionary_enabled())
            .unwrap_or(DEFAULT_DICTIONARY_ENABLED)
    }

    /// Returns which statistics are written for a column.
    pub fn statistics_enabled(&self, col: &ColumnPath) -> EnabledStatistics {
        self.column_properties
            .get(col)
            .and_then(|c| c.statistics_enabled())
            .or_else(|| self.default_column_properties.statistics_enabled())
            .unwrap_or(DEFAULT_STATISTICS_ENABLED)
    }

    /// Returns max size for statistics.
    /// Only applicable if statistics are enabled.
    pub fn max_statistics_size(&self, col: &ColumnPath) -> usize {
        self.column_properties
            .get(col)
            .and_then(|c| c.max_statistics_size())
            .or_else(|| self.default_column_properties.max_statistics_size())
            .unwrap_or(DEFAULT_MAX_STATISTICS_SIZE)
    }

    /// Returns the [`BloomFilterProperties`] for the given column
    ///
    /// Returns `None` if bloom filter is disabled
    pub fn bloom_filter_properties(&self, col: &ColumnPath) -> Option<&BloomFilterProperties> {
        self.column_properties
            .get(col)
            .and_then(|c| c.bloom_filter_properties())
            .or_else(|| self.default_column_properties.bloom_filter_properties())
    }
}
361
/// Builder for [`WriterProperties`] parquet writer configuration.
///
/// Fields mirror [`WriterProperties`]; [`Self::build`] moves them into the
/// finished configuration unchanged.
///
/// See example on [`WriterProperties`]
pub struct WriterPropertiesBuilder {
    data_page_size_limit: usize,
    dictionary_page_size_limit: usize,
    data_page_row_count_limit: usize,
    write_batch_size: usize,
    max_row_group_size: usize,
    bloom_filter_position: BloomFilterPosition,
    writer_version: WriterVersion,
    created_by: String,
    key_value_metadata: Option<Vec<KeyValue>>,
    default_column_properties: ColumnProperties,
    column_properties: HashMap<ColumnPath, ColumnProperties>,
    sorting_columns: Option<Vec<SortingColumn>>,
    column_index_truncate_length: Option<usize>,
    statistics_truncate_length: Option<usize>,
}
381
impl WriterPropertiesBuilder {
    /// Returns default state of the builder.
    fn with_defaults() -> Self {
        Self {
            data_page_size_limit: DEFAULT_PAGE_SIZE,
            dictionary_page_size_limit: DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT,
            data_page_row_count_limit: DEFAULT_DATA_PAGE_ROW_COUNT_LIMIT,
            write_batch_size: DEFAULT_WRITE_BATCH_SIZE,
            max_row_group_size: DEFAULT_MAX_ROW_GROUP_SIZE,
            bloom_filter_position: DEFAULT_BLOOM_FILTER_POSITION,
            writer_version: DEFAULT_WRITER_VERSION,
            created_by: DEFAULT_CREATED_BY.to_string(),
            key_value_metadata: None,
            default_column_properties: Default::default(),
            column_properties: HashMap::new(),
            sorting_columns: None,
            column_index_truncate_length: DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH,
            statistics_truncate_length: DEFAULT_STATISTICS_TRUNCATE_LENGTH,
        }
    }

    /// Finalizes the configuration and returns immutable writer properties struct.
    pub fn build(self) -> WriterProperties {
        WriterProperties {
            data_page_size_limit: self.data_page_size_limit,
            dictionary_page_size_limit: self.dictionary_page_size_limit,
            data_page_row_count_limit: self.data_page_row_count_limit,
            write_batch_size: self.write_batch_size,
            max_row_group_size: self.max_row_group_size,
            bloom_filter_position: self.bloom_filter_position,
            writer_version: self.writer_version,
            created_by: self.created_by,
            key_value_metadata: self.key_value_metadata,
            default_column_properties: self.default_column_properties,
            column_properties: self.column_properties,
            sorting_columns: self.sorting_columns,
            column_index_truncate_length: self.column_index_truncate_length,
            statistics_truncate_length: self.statistics_truncate_length,
        }
    }

    // ----------------------------------------------------------------------
    // Writer properties related to a file

    /// Sets the `WriterVersion` written into the parquet metadata (defaults to [`PARQUET_1_0`])
    ///
    /// This value can determine what features some readers will support.
    ///
    /// [`PARQUET_1_0`]: [WriterVersion::PARQUET_1_0]
    pub fn set_writer_version(mut self, value: WriterVersion) -> Self {
        self.writer_version = value;
        self
    }

    /// Sets best effort maximum size of a data page in bytes.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    #[deprecated(since = "41.0.0", note = "Use set_data_page_size_limit")]
    pub fn set_data_pagesize_limit(mut self, value: usize) -> Self {
        self.data_page_size_limit = value;
        self
    }

    /// Sets best effort maximum size of a data page in bytes (defaults to `1024 * 1024`).
    ///
    /// The parquet writer will attempt to limit the sizes of each
    /// `DataPage` to this many bytes. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_data_page_size_limit(mut self, value: usize) -> Self {
        self.data_page_size_limit = value;
        self
    }

    /// Sets best effort maximum number of rows in a data page (defaults to `20_000`).
    ///
    /// The parquet writer will attempt to limit the number of rows in
    /// each `DataPage` to this value. Reducing this value will result
    /// in larger parquet files, but may improve the effectiveness of
    /// page index based predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_data_page_row_count_limit(mut self, value: usize) -> Self {
        self.data_page_row_count_limit = value;
        self
    }

    /// Sets best effort maximum dictionary page size, in bytes.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    #[deprecated(since = "41.0.0", note = "Use set_dictionary_page_size_limit")]
    pub fn set_dictionary_pagesize_limit(mut self, value: usize) -> Self {
        self.dictionary_page_size_limit = value;
        self
    }

    /// Sets best effort maximum dictionary page size, in bytes (defaults to `1024 * 1024`).
    ///
    /// The parquet writer will attempt to limit the size of each
    /// `DataPage` used to store dictionaries to this many
    /// bytes. Reducing this value will result in larger parquet
    /// files, but may improve the effectiveness of page index based
    /// predicate pushdown during reading.
    ///
    /// Note: this is a best effort limit based on value of
    /// [`set_write_batch_size`](Self::set_write_batch_size).
    pub fn set_dictionary_page_size_limit(mut self, value: usize) -> Self {
        self.dictionary_page_size_limit = value;
        self
    }

    /// Sets write batch size (defaults to 1024).
    ///
    /// For performance reasons, data for each column is written in
    /// batches of this size.
    ///
    /// Additional limits such as
    /// [`set_data_page_row_count_limit`](Self::set_data_page_row_count_limit)
    /// are checked between batches, and thus the write batch size value acts as an
    /// upper-bound on the enforcement granularity of other limits.
    pub fn set_write_batch_size(mut self, value: usize) -> Self {
        self.write_batch_size = value;
        self
    }

    /// Sets maximum number of rows in a row group (defaults to `1024 * 1024`).
    ///
    /// # Panics
    /// If the value is set to 0.
    pub fn set_max_row_group_size(mut self, value: usize) -> Self {
        assert!(value > 0, "Cannot have a 0 max row group size");
        self.max_row_group_size = value;
        self
    }

    /// Sets where in the final file Bloom Filters are written (default `AfterRowGroup`)
    pub fn set_bloom_filter_position(mut self, value: BloomFilterPosition) -> Self {
        self.bloom_filter_position = value;
        self
    }

    /// Sets "created by" property (defaults to `parquet-rs version <VERSION>`).
    pub fn set_created_by(mut self, value: String) -> Self {
        self.created_by = value;
        self
    }

    /// Sets "key_value_metadata" property (defaults to `None`).
    pub fn set_key_value_metadata(mut self, value: Option<Vec<KeyValue>>) -> Self {
        self.key_value_metadata = value;
        self
    }

    /// Sets sorting order of rows in the row group if any (defaults to `None`).
    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
        self.sorting_columns = value;
        self
    }

    // ----------------------------------------------------------------------
    // Setters for any column (global)

    /// Sets default encoding for all columns.
    ///
    /// If dictionary is not enabled, this is treated as a primary encoding for all
    /// columns. In case when dictionary is enabled for any column, this value is
    /// considered to be a fallback encoding for that column.
    ///
    /// # Panics
    ///
    /// if dictionary encoding is specified, regardless of dictionary
    /// encoding flag being set.
    pub fn set_encoding(mut self, value: Encoding) -> Self {
        self.default_column_properties.set_encoding(value);
        self
    }

    /// Sets default compression codec for all columns (default to [`UNCOMPRESSED`]).
    ///
    /// [`UNCOMPRESSED`]: Compression::UNCOMPRESSED
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.default_column_properties.set_compression(value);
        self
    }

    /// Sets default flag to enable/disable dictionary encoding for all columns (defaults to `true`).
    ///
    /// Use this method to set dictionary encoding, instead of explicitly specifying
    /// encoding in `set_encoding` method.
    pub fn set_dictionary_enabled(mut self, value: bool) -> Self {
        self.default_column_properties.set_dictionary_enabled(value);
        self
    }

    /// Sets default statistics level for all columns (defaults to [`Page`]).
    ///
    /// [`Page`]: EnabledStatistics::Page
    pub fn set_statistics_enabled(mut self, value: EnabledStatistics) -> Self {
        self.default_column_properties.set_statistics_enabled(value);
        self
    }

    /// Sets default max statistics size for all columns (defaults to `4096`).
    ///
    /// Applicable only if statistics are enabled.
    pub fn set_max_statistics_size(mut self, value: usize) -> Self {
        self.default_column_properties
            .set_max_statistics_size(value);
        self
    }

    /// Sets if bloom filter is enabled by default for all columns (defaults to `false`).
    ///
    /// # Notes
    ///
    /// * If the bloom filter is enabled previously then it is a no-op.
    ///
    /// * If the bloom filter is not enabled, default values for ndv and fpp
    ///   are used. See [`set_bloom_filter_ndv`] and
    ///   [`set_bloom_filter_fpp`] to further adjust the ndv and fpp.
    ///
    /// [`set_bloom_filter_ndv`]: Self::set_bloom_filter_ndv
    /// [`set_bloom_filter_fpp`]: Self::set_bloom_filter_fpp
    pub fn set_bloom_filter_enabled(mut self, value: bool) -> Self {
        self.default_column_properties
            .set_bloom_filter_enabled(value);
        self
    }

    /// Sets the default target bloom filter false positive probability (fpp)
    /// for all columns (defaults to `0.05`).
    ///
    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
    /// been called.
    ///
    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
    pub fn set_bloom_filter_fpp(mut self, value: f64) -> Self {
        self.default_column_properties.set_bloom_filter_fpp(value);
        self
    }

    /// Sets default number of distinct values (ndv) for bloom filter for all
    /// columns (defaults to `1_000_000`).
    ///
    /// Implicitly enables bloom writing, as if [`set_bloom_filter_enabled`] had
    /// been called.
    ///
    /// [`set_bloom_filter_enabled`]: Self::set_bloom_filter_enabled
    pub fn set_bloom_filter_ndv(mut self, value: u64) -> Self {
        self.default_column_properties.set_bloom_filter_ndv(value);
        self
    }

    // ----------------------------------------------------------------------
    // Setters for a specific column

    /// Helper method to get existing or new mutable reference of column properties.
    #[inline]
    fn get_mut_props(&mut self, col: ColumnPath) -> &mut ColumnProperties {
        self.column_properties.entry(col).or_default()
    }

    /// Sets encoding for a specific column.
    ///
    /// Takes precedence over [`Self::set_encoding`].
    ///
    /// If dictionary is not enabled, this is treated as a primary encoding for this
    /// column. In case when dictionary is enabled for this column, either through
    /// global defaults or explicitly, this value is considered to be a fallback
    /// encoding for this column.
    ///
    /// # Panics
    /// If user tries to set dictionary encoding here, regardless of dictionary
    /// encoding flag being set.
    pub fn set_column_encoding(mut self, col: ColumnPath, value: Encoding) -> Self {
        self.get_mut_props(col).set_encoding(value);
        self
    }

    /// Sets compression codec for a specific column.
    ///
    /// Takes precedence over [`Self::set_compression`].
    pub fn set_column_compression(mut self, col: ColumnPath, value: Compression) -> Self {
        self.get_mut_props(col).set_compression(value);
        self
    }

    /// Sets flag to enable/disable dictionary encoding for a specific column.
    ///
    /// Takes precedence over [`Self::set_dictionary_enabled`].
    pub fn set_column_dictionary_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_dictionary_enabled(value);
        self
    }

    /// Sets statistics level for a specific column.
    ///
    /// Takes precedence over [`Self::set_statistics_enabled`].
    pub fn set_column_statistics_enabled(
        mut self,
        col: ColumnPath,
        value: EnabledStatistics,
    ) -> Self {
        self.get_mut_props(col).set_statistics_enabled(value);
        self
    }

    /// Sets max size for statistics for a specific column.
    ///
    /// Takes precedence over [`Self::set_max_statistics_size`].
    pub fn set_column_max_statistics_size(mut self, col: ColumnPath, value: usize) -> Self {
        self.get_mut_props(col).set_max_statistics_size(value);
        self
    }

    /// Sets whether a bloom filter should be written for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_enabled`].
    pub fn set_column_bloom_filter_enabled(mut self, col: ColumnPath, value: bool) -> Self {
        self.get_mut_props(col).set_bloom_filter_enabled(value);
        self
    }

    /// Sets the false positive probability for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_fpp`].
    pub fn set_column_bloom_filter_fpp(mut self, col: ColumnPath, value: f64) -> Self {
        self.get_mut_props(col).set_bloom_filter_fpp(value);
        self
    }

    /// Sets the number of distinct values for bloom filter for a specific column.
    ///
    /// Takes precedence over [`Self::set_bloom_filter_ndv`].
    pub fn set_column_bloom_filter_ndv(mut self, col: ColumnPath, value: u64) -> Self {
        self.get_mut_props(col).set_bloom_filter_ndv(value);
        self
    }

    /// Sets the max length of min/max value fields when writing the column
    /// [`Index`] (defaults to [`DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH`], i.e. `Some(64)`).
    ///
    /// This can be used to prevent columns with very long values (hundreds of
    /// bytes long) from causing the parquet metadata to become huge.
    ///
    /// # Notes
    ///
    /// The column [`Index`] is written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// [`Index`]: crate::file::page_index::index::Index
    pub fn set_column_index_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 column index truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.column_index_truncate_length = max_length;
        self
    }

    /// Sets the max length of min/max value fields in row group level
    /// [`Statistics`] (defaults to `None`).
    ///
    /// # Notes
    /// Row group level [`Statistics`] are written when [`Self::set_statistics_enabled`] is
    /// set to [`EnabledStatistics::Chunk`] or [`EnabledStatistics::Page`].
    ///
    /// * If `Some`, must be greater than 0, otherwise will panic
    /// * If `None`, there's no effective limit.
    ///
    /// [`Statistics`]: crate::file::statistics::Statistics
    pub fn set_statistics_truncate_length(mut self, max_length: Option<usize>) -> Self {
        if let Some(value) = max_length {
            assert!(value > 0, "Cannot have a 0 statistics truncate length. If you wish to disable min/max value truncation, set it to `None`.");
        }

        self.statistics_truncate_length = max_length;
        self
    }
}
771
/// Controls the level of statistics to be computed by the writer and stored in
/// the parquet file.
///
/// Enabling statistics makes the resulting Parquet file larger and requires
/// more time to read the parquet footer.
///
/// Statistics can be used to improve query performance by pruning row groups
/// and pages during query execution if the query engine supports evaluating the
/// predicate using the statistics.
#[derive(Debug, Clone, Copy, Eq, PartialEq)]
pub enum EnabledStatistics {
    /// Compute no statistics.
    None,
    /// Compute column chunk-level statistics but not page-level.
    ///
    /// One set of statistics is stored per relevant column per row group, so
    /// the amount of stored statistics grows with the number of row groups.
    Chunk,
    /// Compute page-level and column chunk-level statistics.
    ///
    /// One set of statistics is stored per relevant column per page and per
    /// row group, so the amount of stored statistics grows with both the
    /// number of pages and the number of row groups.
    Page,
}

impl FromStr for EnabledStatistics {
    type Err = String;

    /// Parses a statistics level from its name; only the exact upper- or
    /// lower-case spellings are accepted (mixed case is rejected).
    fn from_str(s: &str) -> Result<Self, Self::Err> {
        if s == "NONE" || s == "none" {
            Ok(EnabledStatistics::None)
        } else if s == "CHUNK" || s == "chunk" {
            Ok(EnabledStatistics::Chunk)
        } else if s == "PAGE" || s == "page" {
            Ok(EnabledStatistics::Page)
        } else {
            Err(format!("Invalid statistics arg: {}", s))
        }
    }
}
811
impl Default for EnabledStatistics {
    // The crate-wide default is page-level statistics; see DEFAULT_STATISTICS_ENABLED.
    fn default() -> Self {
        DEFAULT_STATISTICS_ENABLED
    }
}
817
/// Controls the bloom filter to be computed by the writer.
#[derive(Debug, Clone, PartialEq)]
pub struct BloomFilterProperties {
    /// False positive probability, should be always between 0 and 1 exclusive. Defaults to [`DEFAULT_BLOOM_FILTER_FPP`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_fpp`].
    ///
    /// The bloom filter data structure is a trade-off between disk and memory space versus fpp, the
    /// smaller the fpp, the more memory and disk space is required, thus setting it to a reasonable value
    /// e.g. 0.1, 0.05, or 0.001 is recommended.
    ///
    /// Setting to a very small number diminishes the value of the filter itself, as the bitset size is
    /// even larger than just storing the whole value. You are also expected to set `ndv` if it can
    /// be known in advance in order to largely reduce space usage.
    pub fpp: f64,
    /// Number of distinct values, should be non-negative to be meaningful. Defaults to [`DEFAULT_BLOOM_FILTER_NDV`].
    ///
    /// You should set this value by calling [`WriterPropertiesBuilder::set_bloom_filter_ndv`].
    ///
    /// Usage of bloom filter is most beneficial for columns with large cardinality, so a good heuristic
    /// is to set ndv to number of rows. However it can reduce disk size if you know in advance a smaller
    /// number of distinct values. For very small ndv value it is probably not worth it to use bloom filter
    /// anyway.
    ///
    /// Increasing this value (without increasing fpp) will result in an increase in disk or memory size.
    pub ndv: u64,
}
845
846impl Default for BloomFilterProperties {
847 fn default() -> Self {
848 BloomFilterProperties {
849 fpp: DEFAULT_BLOOM_FILTER_FPP,
850 ndv: DEFAULT_BLOOM_FILTER_NDV,
851 }
852 }
853}
854
/// Container for column properties that can be changed as part of writer.
///
/// If a field is `None`, it means that no specific value has been set for this column,
/// so some subsequent or default value must be used.
#[derive(Debug, Clone, Default, PartialEq)]
struct ColumnProperties {
    // Fallback encoding (or the primary one when dictionary is disabled).
    encoding: Option<Encoding>,
    // Compression codec for this column.
    codec: Option<Compression>,
    // Whether dictionary encoding is enabled for this column.
    dictionary_enabled: Option<bool>,
    // Which statistics (none / chunk-level / page-level) to compute.
    statistics_enabled: Option<EnabledStatistics>,
    // Cap, in bytes, on the size of statistics for this column.
    max_statistics_size: Option<usize>,
    /// bloom filter related properties
    bloom_filter_properties: Option<BloomFilterProperties>,
}
869
870impl ColumnProperties {
871 /// Sets encoding for this column.
872 ///
873 /// If dictionary is not enabled, this is treated as a primary encoding for a column.
874 /// In case when dictionary is enabled for a column, this value is considered to
875 /// be a fallback encoding.
876 ///
877 /// Panics if user tries to set dictionary encoding here, regardless of dictionary
878 /// encoding flag being set. Use `set_dictionary_enabled` method to enable dictionary
879 /// for a column.
880 fn set_encoding(&mut self, value: Encoding) {
881 if value == Encoding::PLAIN_DICTIONARY || value == Encoding::RLE_DICTIONARY {
882 panic!("Dictionary encoding can not be used as fallback encoding");
883 }
884 self.encoding = Some(value);
885 }
886
887 /// Sets compression codec for this column.
888 fn set_compression(&mut self, value: Compression) {
889 self.codec = Some(value);
890 }
891
892 /// Sets whether or not dictionary encoding is enabled for this column.
893 fn set_dictionary_enabled(&mut self, enabled: bool) {
894 self.dictionary_enabled = Some(enabled);
895 }
896
897 /// Sets whether or not statistics are enabled for this column.
898 fn set_statistics_enabled(&mut self, enabled: EnabledStatistics) {
899 self.statistics_enabled = Some(enabled);
900 }
901
902 /// Sets max size for statistics for this column.
903 fn set_max_statistics_size(&mut self, value: usize) {
904 self.max_statistics_size = Some(value);
905 }
906
907 /// If `value` is `true`, sets bloom filter properties to default values if not previously set,
908 /// otherwise it is a no-op.
909 /// If `value` is `false`, resets bloom filter properties to `None`.
910 fn set_bloom_filter_enabled(&mut self, value: bool) {
911 if value && self.bloom_filter_properties.is_none() {
912 self.bloom_filter_properties = Some(Default::default())
913 } else if !value {
914 self.bloom_filter_properties = None
915 }
916 }
917
918 /// Sets the false positive probability for bloom filter for this column, and implicitly enables
919 /// bloom filter if not previously enabled.
920 ///
921 /// # Panics
922 ///
923 /// Panics if the `value` is not between 0 and 1 exclusive
924 fn set_bloom_filter_fpp(&mut self, value: f64) {
925 assert!(
926 value > 0. && value < 1.0,
927 "fpp must be between 0 and 1 exclusive, got {value}"
928 );
929
930 self.bloom_filter_properties
931 .get_or_insert_with(Default::default)
932 .fpp = value;
933 }
934
935 /// Sets the number of distinct (unique) values for bloom filter for this column, and implicitly
936 /// enables bloom filter if not previously enabled.
937 fn set_bloom_filter_ndv(&mut self, value: u64) {
938 self.bloom_filter_properties
939 .get_or_insert_with(Default::default)
940 .ndv = value;
941 }
942
943 /// Returns optional encoding for this column.
944 fn encoding(&self) -> Option<Encoding> {
945 self.encoding
946 }
947
948 /// Returns optional compression codec for this column.
949 fn compression(&self) -> Option<Compression> {
950 self.codec
951 }
952
953 /// Returns `Some(true)` if dictionary encoding is enabled for this column, if
954 /// disabled then returns `Some(false)`. If result is `None`, then no setting has
955 /// been provided.
956 fn dictionary_enabled(&self) -> Option<bool> {
957 self.dictionary_enabled
958 }
959
960 /// Returns `Some(true)` if statistics are enabled for this column, if disabled then
961 /// returns `Some(false)`. If result is `None`, then no setting has been provided.
962 fn statistics_enabled(&self) -> Option<EnabledStatistics> {
963 self.statistics_enabled
964 }
965
966 /// Returns optional max size in bytes for statistics.
967 fn max_statistics_size(&self) -> Option<usize> {
968 self.max_statistics_size
969 }
970
971 /// Returns the bloom filter properties, or `None` if not enabled
972 fn bloom_filter_properties(&self) -> Option<&BloomFilterProperties> {
973 self.bloom_filter_properties.as_ref()
974 }
975}
976
/// Reference counted reader properties.
pub type ReaderPropertiesPtr = Arc<ReaderProperties>;

/// Default value for [`ReaderPropertiesBuilder::set_read_bloom_filter`]
const DEFAULT_READ_BLOOM_FILTER: bool = false;
981
/// Configuration settings for reading parquet files.
///
/// All properties are immutable and `Send` + `Sync`.
/// Use [`ReaderPropertiesBuilder`] to assemble these properties.
///
/// # Example
///
/// ```rust
/// use parquet::file::properties::ReaderProperties;
///
/// // Create properties with default configuration.
/// let props = ReaderProperties::builder().build();
///
/// // Use properties builder to set certain options and assemble the configuration.
/// let props = ReaderProperties::builder()
///     .set_backward_compatible_lz4(false)
///     .build();
/// ```
pub struct ReaderProperties {
    // Options handed to the decompression codecs.
    codec_options: CodecOptions,
    // Whether bloom filters are read from the file.
    read_bloom_filter: bool,
}
1004
impl ReaderProperties {
    /// Returns builder for reader properties with default values.
    pub fn builder() -> ReaderPropertiesBuilder {
        ReaderPropertiesBuilder::with_defaults()
    }

    /// Returns codec options.
    pub(crate) fn codec_options(&self) -> &CodecOptions {
        &self.codec_options
    }

    /// Returns whether bloom filters should be read from the file.
    pub(crate) fn read_bloom_filter(&self) -> bool {
        self.read_bloom_filter
    }
}
1021
/// Builder for parquet file reader configuration. See example on
/// [`ReaderProperties`]
pub struct ReaderPropertiesBuilder {
    // Accumulates codec (decompression) settings; finalized in `build`.
    codec_options_builder: CodecOptionsBuilder,
    // `None` means "use the default" (`DEFAULT_READ_BLOOM_FILTER`).
    read_bloom_filter: Option<bool>,
}
1028
/// Reader properties builder.
impl ReaderPropertiesBuilder {
    /// Returns default state of the builder.
    fn with_defaults() -> Self {
        Self {
            codec_options_builder: CodecOptionsBuilder::default(),
            read_bloom_filter: None,
        }
    }

    /// Finalizes the configuration and returns immutable reader properties struct.
    pub fn build(self) -> ReaderProperties {
        ReaderProperties {
            codec_options: self.codec_options_builder.build(),
            // Unset falls back to `DEFAULT_READ_BLOOM_FILTER` (`false`).
            read_bloom_filter: self.read_bloom_filter.unwrap_or(DEFAULT_READ_BLOOM_FILTER),
        }
    }

    /// Enable/disable backward compatible LZ4.
    ///
    /// If backward compatible LZ4 is enabled, on LZ4_HADOOP error it will fallback
    /// to the older LZ4 algorithms. That is LZ4_FRAME, for backward compatibility
    /// with files generated by older versions of this library, and LZ4_RAW, for backward
    /// compatibility with files generated by older versions of parquet-cpp.
    ///
    /// If backward compatible LZ4 is disabled, on LZ4_HADOOP error it will return the error.
    pub fn set_backward_compatible_lz4(mut self, value: bool) -> Self {
        self.codec_options_builder = self
            .codec_options_builder
            .set_backward_compatible_lz4(value);
        self
    }

    /// Enable/disable reading bloom filter
    ///
    /// If reading bloom filter is enabled, bloom filter will be read from the file.
    /// If reading bloom filter is disabled, bloom filter will not be read from the file.
    ///
    /// By default the bloom filter is NOT read ([`DEFAULT_READ_BLOOM_FILTER`] is
    /// `false`); call this with `true` to opt in.
    pub fn set_read_bloom_filter(mut self, value: bool) -> Self {
        self.read_bloom_filter = Some(value);
        self
    }
}
1073
#[cfg(test)]
mod tests {
    use super::*;

    #[test]
    fn test_writer_version() {
        assert_eq!(WriterVersion::PARQUET_1_0.as_num(), 1);
        assert_eq!(WriterVersion::PARQUET_2_0.as_num(), 2);
    }

    #[test]
    fn test_writer_properties_default_settings() {
        let props = WriterProperties::default();
        assert_eq!(props.data_page_size_limit(), DEFAULT_PAGE_SIZE);
        assert_eq!(
            props.dictionary_page_size_limit(),
            DEFAULT_DICTIONARY_PAGE_SIZE_LIMIT
        );
        assert_eq!(props.write_batch_size(), DEFAULT_WRITE_BATCH_SIZE);
        assert_eq!(props.max_row_group_size(), DEFAULT_MAX_ROW_GROUP_SIZE);
        assert_eq!(props.bloom_filter_position(), DEFAULT_BLOOM_FILTER_POSITION);
        assert_eq!(props.writer_version(), DEFAULT_WRITER_VERSION);
        assert_eq!(props.created_by(), DEFAULT_CREATED_BY);
        assert_eq!(props.key_value_metadata(), None);
        // Column lookups on an unconfigured column fall back to the globals.
        assert_eq!(props.encoding(&ColumnPath::from("col")), None);
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            DEFAULT_COMPRESSION
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            DEFAULT_STATISTICS_ENABLED
        );
        assert_eq!(
            props.max_statistics_size(&ColumnPath::from("col")),
            DEFAULT_MAX_STATISTICS_SIZE
        );
        // Bloom filters are opt-in, so no properties by default.
        assert!(props
            .bloom_filter_properties(&ColumnPath::from("col"))
            .is_none());
    }

    #[test]
    fn test_writer_properties_dictionary_encoding() {
        // dictionary encoding is not configurable, and it should be the same for both
        // writer version 1 and 2.
        for version in &[WriterVersion::PARQUET_1_0, WriterVersion::PARQUET_2_0] {
            let props = WriterProperties::builder()
                .set_writer_version(*version)
                .build();
            assert_eq!(props.dictionary_page_encoding(), Encoding::PLAIN);
            assert_eq!(
                props.dictionary_data_page_encoding(),
                Encoding::RLE_DICTIONARY
            );
        }
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_plain_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::PLAIN_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_rle_dictionary_is_fallback() {
        // Should panic when user specifies dictionary encoding as fallback encoding.
        WriterProperties::builder()
            .set_encoding(Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_enabled() {
        // The panic fires regardless of the dictionary flag being enabled.
        WriterProperties::builder()
            .set_dictionary_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    #[should_panic(expected = "Dictionary encoding can not be used as fallback encoding")]
    fn test_writer_properties_panic_when_dictionary_is_disabled() {
        // ... and also when the dictionary flag is disabled.
        WriterProperties::builder()
            .set_dictionary_enabled(false)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE_DICTIONARY)
            .build();
    }

    #[test]
    fn test_writer_properties_builder() {
        let props = WriterProperties::builder()
            // file settings
            .set_writer_version(WriterVersion::PARQUET_2_0)
            .set_data_page_size_limit(10)
            .set_dictionary_page_size_limit(20)
            .set_write_batch_size(30)
            .set_max_row_group_size(40)
            .set_created_by("default".to_owned())
            .set_key_value_metadata(Some(vec![KeyValue::new(
                "key".to_string(),
                "value".to_string(),
            )]))
            // global column settings
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_dictionary_enabled(false)
            .set_statistics_enabled(EnabledStatistics::None)
            .set_max_statistics_size(50)
            // specific column settings
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .set_column_compression(ColumnPath::from("col"), Compression::SNAPPY)
            .set_column_dictionary_enabled(ColumnPath::from("col"), true)
            .set_column_statistics_enabled(ColumnPath::from("col"), EnabledStatistics::Chunk)
            .set_column_max_statistics_size(ColumnPath::from("col"), 123)
            .set_column_bloom_filter_enabled(ColumnPath::from("col"), true)
            .set_column_bloom_filter_ndv(ColumnPath::from("col"), 100_u64)
            .set_column_bloom_filter_fpp(ColumnPath::from("col"), 0.1)
            .build();

        assert_eq!(props.writer_version(), WriterVersion::PARQUET_2_0);
        assert_eq!(props.data_page_size_limit(), 10);
        assert_eq!(props.dictionary_page_size_limit(), 20);
        assert_eq!(props.write_batch_size(), 30);
        assert_eq!(props.max_row_group_size(), 40);
        assert_eq!(props.created_by(), "default");
        assert_eq!(
            props.key_value_metadata(),
            Some(&vec![
                KeyValue::new("key".to_string(), "value".to_string(),)
            ])
        );

        // An unconfigured column ("a") picks up the global settings.
        assert_eq!(
            props.encoding(&ColumnPath::from("a")),
            Some(Encoding::DELTA_BINARY_PACKED)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("a")),
            Compression::GZIP(Default::default())
        );
        assert!(!props.dictionary_enabled(&ColumnPath::from("a")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("a")),
            EnabledStatistics::None
        );
        assert_eq!(props.max_statistics_size(&ColumnPath::from("a")), 50);

        // The configured column ("col") uses its own overrides.
        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::SNAPPY
        );
        assert!(props.dictionary_enabled(&ColumnPath::from("col")));
        assert_eq!(
            props.statistics_enabled(&ColumnPath::from("col")),
            EnabledStatistics::Chunk
        );
        assert_eq!(props.max_statistics_size(&ColumnPath::from("col")), 123);
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties { fpp: 0.1, ndv: 100 })
        );
    }

    #[test]
    fn test_writer_properties_builder_partial_defaults() {
        let props = WriterProperties::builder()
            .set_encoding(Encoding::DELTA_BINARY_PACKED)
            .set_compression(Compression::GZIP(Default::default()))
            .set_bloom_filter_enabled(true)
            .set_column_encoding(ColumnPath::from("col"), Encoding::RLE)
            .build();

        assert_eq!(
            props.encoding(&ColumnPath::from("col")),
            Some(Encoding::RLE)
        );
        assert_eq!(
            props.compression(&ColumnPath::from("col")),
            Compression::GZIP(Default::default())
        );
        assert_eq!(
            props.dictionary_enabled(&ColumnPath::from("col")),
            DEFAULT_DICTIONARY_ENABLED
        );
        // Enabling the bloom filter without fpp/ndv installs the defaults.
        assert_eq!(
            props.bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_writer_properties_bloom_filter_ndv_fpp_set() {
        assert_eq!(
            WriterProperties::builder()
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            None
        );
        // Setting ndv alone implicitly enables the filter with default fpp.
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_ndv(100)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.05,
                ndv: 100
            })
        );
        // Setting fpp alone implicitly enables the filter with default ndv.
        assert_eq!(
            WriterProperties::builder()
                .set_bloom_filter_fpp(0.1)
                .build()
                .bloom_filter_properties(&ColumnPath::from("col")),
            Some(&BloomFilterProperties {
                fpp: 0.1,
                ndv: 1_000_000_u64
            })
        );
    }

    #[test]
    fn test_reader_properties_default_settings() {
        let props = ReaderProperties::builder().build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(true)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
        // Reading bloom filters is disabled by default.
        assert!(!props.read_bloom_filter());
    }

    #[test]
    fn test_reader_properties_builder() {
        let props = ReaderProperties::builder()
            .set_backward_compatible_lz4(false)
            .build();

        let codec_options = CodecOptionsBuilder::default()
            .set_backward_compatible_lz4(false)
            .build();

        assert_eq!(props.codec_options(), &codec_options);
    }

    #[test]
    fn test_parse_writerversion() {
        let mut writer_version = "PARQUET_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);
        writer_version = "PARQUET_2_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_2_0);

        // test lowercase
        writer_version = "parquet_1_0".parse::<WriterVersion>().unwrap();
        assert_eq!(writer_version, WriterVersion::PARQUET_1_0);

        // test invalid version
        match "PARQUET_-1_0".parse::<WriterVersion>() {
            Ok(_) => panic!("Should not be able to parse PARQUET_-1_0"),
            Err(e) => {
                assert_eq!(e, "Invalid writer version: PARQUET_-1_0");
            }
        }
    }

    #[test]
    fn test_parse_enabledstatistics() {
        let mut enabled_statistics = "NONE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);
        enabled_statistics = "CHUNK".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Chunk);
        enabled_statistics = "PAGE".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::Page);

        // test lowercase
        enabled_statistics = "none".parse::<EnabledStatistics>().unwrap();
        assert_eq!(enabled_statistics, EnabledStatistics::None);

        //test invalid statistics
        match "ChunkAndPage".parse::<EnabledStatistics>() {
            Ok(_) => panic!("Should not be able to parse ChunkAndPage"),
            Err(e) => {
                assert_eq!(e, "Invalid statistics arg: ChunkAndPage");
            }
        }
    }
}