parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Users should use these structures to interact with Parquet metadata.
21//!
22//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
23//!   file footer.
24//!
25//! * [`FileMetaData`]: File level metadata such as schema, row counts and
26//!   version.
27//!
//! * [`RowGroupMetaData`]: Metadata for each Row Group within a File, such as
//!   location and number of rows, and column chunks.
30//!
31//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
32//!   within a Row Group including encoding and compression information,
33//!   number of values, statistics, etc.
34//!
35//! # APIs for working with Parquet Metadata
36//!
37//! The Parquet readers and writers in this crate handle reading and writing
38//! metadata into parquet files. To work with metadata directly,
39//! the following APIs are available:
40//!
41//! * [`ParquetMetaDataReader`] for reading metadata from an I/O source (sync and async)
42//! * [`ParquetMetaDataPushDecoder`] for decoding from bytes without I/O
43//! * [`ParquetMetaDataWriter`] for writing.
44//!
45//! # Examples
46//!
47//! Please see [`external_metadata.rs`]
48//!
49//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
50//!
51//! # Metadata Encodings and Structures
52//!
53//! There are three different encodings of Parquet Metadata in this crate:
54//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
//!    [parquet.thrift]
57//!
58//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
59//!    from [parquet.thrift]. These structures are low level and mirror
60//!    the thrift definitions.
61//!
62//! 3. [`file::metadata`] (this module): Easier to use Rust structures
63//!    with a more idiomatic API. Note that, confusingly, some but not all
64//!    of these structures have the same name as the [`format`] structures.
65//!
66//! [`file::metadata`]: crate::file::metadata
67//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
68//!
69//! Graphically, this is how the different structures relate to each other:
70//!
71//! ```text
72//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
73//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
74//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
75//!                            └──────────────┘     │         └───────────────────────┘ │
76//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
77//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
78//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
79//!                                     ...         │                   ...             │
80//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
81//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
82//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
83//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
84//!
85//!                          format::meta structures          file::metadata structures
86//!
87//!                         * Same name, different struct
88//! ```
89mod footer_tail;
90mod memory;
91mod options;
92mod parser;
93mod push_decoder;
94pub(crate) mod reader;
95pub(crate) mod thrift;
96mod writer;
97
98use crate::basic::{EncodingMask, PageType};
99#[cfg(feature = "encryption")]
100use crate::encryption::decrypt::FileDecryptor;
101#[cfg(feature = "encryption")]
102use crate::file::column_crypto_metadata::ColumnCryptoMetaData;
103pub(crate) use crate::file::metadata::memory::HeapSize;
104#[cfg(feature = "encryption")]
105use crate::file::metadata::thrift::encryption::EncryptionAlgorithm;
106use crate::file::page_index::column_index::{ByteArrayColumnIndex, PrimitiveColumnIndex};
107use crate::file::page_index::{column_index::ColumnIndexMetaData, offset_index::PageLocation};
108use crate::file::statistics::Statistics;
109use crate::geospatial::statistics as geo_statistics;
110use crate::schema::types::{
111    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
112    Type as SchemaType,
113};
114use crate::thrift_struct;
115use crate::{
116    basic::BoundaryOrder,
117    errors::{ParquetError, Result},
118};
119use crate::{
120    basic::{ColumnOrder, Compression, Encoding, Type},
121    parquet_thrift::{
122        ElementType, FieldType, ReadThrift, ThriftCompactInputProtocol,
123        ThriftCompactOutputProtocol, WriteThrift, WriteThriftField,
124    },
125};
126use crate::{
127    data_type::private::ParquetValueType, file::page_index::offset_index::OffsetIndexMetaData,
128};
129
130pub use footer_tail::FooterTail;
131pub use options::{ParquetMetaDataOptions, ParquetStatisticsPolicy};
132pub use push_decoder::ParquetMetaDataPushDecoder;
133pub use reader::{PageIndexPolicy, ParquetMetaDataReader};
134use std::io::Write;
135use std::ops::Range;
136use std::sync::Arc;
137pub use writer::ParquetMetaDataWriter;
138pub(crate) use writer::ThriftMetadataWriter;
139
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`ColumnIndex`] holds statistics about all the pages in a
/// particular column chunk.
///
/// The outer `Vec` is indexed by row group and the inner `Vec` by column:
/// `column_index[row_group_number][column_number]` holds the
/// [`ColumnIndex`] corresponding to column `column_number` of row group
/// `row_group_number`.
///
/// For example `column_index[2][3]` holds the [`ColumnIndex`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`ColumnIndex`]: crate::file::page_index::column_index::ColumnIndexMetaData
pub type ParquetColumnIndex = Vec<Vec<ColumnIndexMetaData>>;
157
/// [`OffsetIndexMetaData`] for each column chunk of each row group.
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// The outer `Vec` is indexed by row group and the inner `Vec` by column:
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number`.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
/// [`OffsetIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
170
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Row group metadata, one entry per row group in the file
    row_groups: Vec<RowGroupMetaData>,
    /// Page level index for each page in each column chunk.
    ///
    /// `None` until a column index has been set on this instance.
    column_index: Option<ParquetColumnIndex>,
    /// Offset index for each page in each column chunk.
    ///
    /// `None` until an offset index has been set on this instance.
    offset_index: Option<ParquetOffsetIndex>,
    /// Optional file decryptor.
    ///
    /// Only present when the `encryption` feature is enabled. Boxed, so an
    /// absent decryptor costs only a pointer-sized field.
    #[cfg(feature = "encryption")]
    file_decryptor: Option<Box<FileDecryptor>>,
}
202
203impl ParquetMetaData {
204    /// Creates Parquet metadata from file metadata and a list of row
205    /// group metadata
206    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
207        ParquetMetaData {
208            file_metadata,
209            row_groups,
210            column_index: None,
211            offset_index: None,
212            #[cfg(feature = "encryption")]
213            file_decryptor: None,
214        }
215    }
216
217    /// Adds [`FileDecryptor`] to this metadata instance to enable decryption of
218    /// encrypted data.
219    #[cfg(feature = "encryption")]
220    pub(crate) fn with_file_decryptor(&mut self, file_decryptor: Option<FileDecryptor>) {
221        self.file_decryptor = file_decryptor.map(Box::new);
222    }
223
224    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
225    pub fn into_builder(self) -> ParquetMetaDataBuilder {
226        self.into()
227    }
228
229    /// Returns file metadata as reference.
230    pub fn file_metadata(&self) -> &FileMetaData {
231        &self.file_metadata
232    }
233
234    /// Returns file decryptor as reference.
235    #[cfg(feature = "encryption")]
236    pub(crate) fn file_decryptor(&self) -> Option<&FileDecryptor> {
237        self.file_decryptor.as_deref()
238    }
239
240    /// Returns number of row groups in this file.
241    pub fn num_row_groups(&self) -> usize {
242        self.row_groups.len()
243    }
244
245    /// Returns row group metadata for `i`th position.
246    /// Position should be less than number of row groups `num_row_groups`.
247    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
248        &self.row_groups[i]
249    }
250
251    /// Returns slice of row groups in this file.
252    pub fn row_groups(&self) -> &[RowGroupMetaData] {
253        &self.row_groups
254    }
255
256    /// Returns the column index for this file if loaded
257    ///
258    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
259    /// [ArrowReaderOptions::with_page_index] was set to false.
260    ///
261    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
262    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
263        self.column_index.as_ref()
264    }
265
266    /// Returns offset indexes in this file, if loaded
267    ///
268    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
269    /// [ArrowReaderOptions::with_page_index] was set to false.
270    ///
271    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
272    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
273        self.offset_index.as_ref()
274    }
275
276    /// Estimate of the bytes allocated to store `ParquetMetadata`
277    ///
278    /// # Notes:
279    ///
280    /// 1. Includes size of self
281    ///
282    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
283    ///    [`RowGroupMetaData`].
284    ///
285    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
286    ///    means `memory_size` will over estimate the memory size if such pointers
287    ///    are shared.
288    ///
289    /// 4. Does not include any allocator overheads
290    pub fn memory_size(&self) -> usize {
291        #[cfg(feature = "encryption")]
292        let encryption_size = self.file_decryptor.heap_size();
293        #[cfg(not(feature = "encryption"))]
294        let encryption_size = 0usize;
295
296        std::mem::size_of::<Self>()
297            + self.file_metadata.heap_size()
298            + self.row_groups.heap_size()
299            + self.column_index.heap_size()
300            + self.offset_index.heap_size()
301            + encryption_size
302    }
303
304    /// Override the column index
305    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
306        self.column_index = index;
307    }
308
309    /// Override the offset index
310    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
311        self.offset_index = index;
312    }
313}
314
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// The builder is a thin wrapper around a [`ParquetMetaData`] value that it
/// modifies and finally hands back from `build`.
///
/// # Example creating a new [`ParquetMetaData`]
///
/// ```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);
353
354impl ParquetMetaDataBuilder {
355    /// Create a new builder from a file metadata, with no row groups
356    pub fn new(file_meta_data: FileMetaData) -> Self {
357        Self(ParquetMetaData::new(file_meta_data, vec![]))
358    }
359
360    /// Create a new builder from an existing ParquetMetaData
361    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
362        Self(metadata)
363    }
364
365    /// Adds a row group to the metadata
366    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
367        self.0.row_groups.push(row_group);
368        self
369    }
370
371    /// Sets all the row groups to the specified list
372    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
373        self.0.row_groups = row_groups;
374        self
375    }
376
377    /// Takes ownership of the row groups in this builder, and clears the list
378    /// of row groups.
379    ///
380    /// This can be used for more efficient creation of a new ParquetMetaData
381    /// from an existing one.
382    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
383        std::mem::take(&mut self.0.row_groups)
384    }
385
386    /// Return a reference to the current row groups
387    pub fn row_groups(&self) -> &[RowGroupMetaData] {
388        &self.0.row_groups
389    }
390
391    /// Sets the column index
392    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
393        self.0.column_index = column_index;
394        self
395    }
396
397    /// Returns the current column index from the builder, replacing it with `None`
398    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
399        std::mem::take(&mut self.0.column_index)
400    }
401
402    /// Return a reference to the current column index, if any
403    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
404        self.0.column_index.as_ref()
405    }
406
407    /// Sets the offset index
408    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
409        self.0.offset_index = offset_index;
410        self
411    }
412
413    /// Returns the current offset index from the builder, replacing it with `None`
414    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
415        std::mem::take(&mut self.0.offset_index)
416    }
417
418    /// Return a reference to the current offset index, if any
419    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
420        self.0.offset_index.as_ref()
421    }
422
423    /// Sets the file decryptor needed to decrypt this metadata.
424    #[cfg(feature = "encryption")]
425    pub(crate) fn set_file_decryptor(mut self, file_decryptor: Option<FileDecryptor>) -> Self {
426        self.0.with_file_decryptor(file_decryptor);
427        self
428    }
429
430    /// Creates a new ParquetMetaData from the builder
431    pub fn build(self) -> ParquetMetaData {
432        let Self(metadata) = self;
433        metadata
434    }
435}
436
437impl From<ParquetMetaData> for ParquetMetaDataBuilder {
438    fn from(meta_data: ParquetMetaData) -> Self {
439        Self(meta_data)
440    }
441}
442
thrift_struct!(
/// A key-value pair for [`FileMetaData`].
pub struct KeyValue {
  /// The key; always present.
  1: required string key
  /// The value stored under `key`; may be absent.
  2: optional string value
}
);
450
451impl KeyValue {
452    /// Create a new key value pair
453    pub fn new<F2>(key: String, value: F2) -> KeyValue
454    where
455        F2: Into<Option<String>>,
456    {
457        KeyValue {
458            key,
459            value: value.into(),
460        }
461    }
462}
463
thrift_struct!(
/// Page encoding statistics for a column chunk: a page type, an encoding, and a count.
pub struct PageEncodingStats {
  1: required PageType page_type;
  2: required Encoding encoding;
  3: required i32 count;
}
);
472
/// Internal representation of the page encoding stats in the [`ColumnChunkMetaData`].
///
/// This is not publicly exposed, with different getters defined for each variant.
/// The stats are held in exactly one of two forms: the complete list, or a
/// condensed mask.
#[derive(Debug, Clone, PartialEq)]
enum ParquetPageEncodingStats {
    /// The full array of stats as defined in the Parquet spec.
    Full(Vec<PageEncodingStats>),
    /// A condensed version of only page encodings seen.
    Mask(EncodingMask),
}
482
/// Reference counted pointer (an [`Arc`]) for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
485
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of this file.
    version: i32,
    /// Number of rows in the file.
    num_rows: i64,
    /// String message for the application that wrote this file, if any.
    created_by: Option<String>,
    /// Optional list of key-value pairs attached to the file.
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Shared pointer to the schema descriptor of this file.
    schema_descr: SchemaDescPtr,
    /// Column (sort) order per column, by position; `None` when not available.
    column_orders: Option<Vec<ColumnOrder>>,
    /// Only present when the `encryption` feature is enabled.
    // NOTE(review): presumably the algorithm used to encrypt the file; confirm
    // against the code that calls `with_encryption_algorithm`.
    #[cfg(feature = "encryption")]
    encryption_algorithm: Option<Box<EncryptionAlgorithm>>,
    /// Only present when the `encryption` feature is enabled.
    // NOTE(review): presumably key metadata for the footer signing key; confirm
    // against the code that calls `with_footer_signing_key_metadata`.
    #[cfg(feature = "encryption")]
    footer_signing_key_metadata: Option<Vec<u8>>,
}
502
503impl FileMetaData {
504    /// Creates new file metadata.
505    pub fn new(
506        version: i32,
507        num_rows: i64,
508        created_by: Option<String>,
509        key_value_metadata: Option<Vec<KeyValue>>,
510        schema_descr: SchemaDescPtr,
511        column_orders: Option<Vec<ColumnOrder>>,
512    ) -> Self {
513        FileMetaData {
514            version,
515            num_rows,
516            created_by,
517            key_value_metadata,
518            schema_descr,
519            column_orders,
520            #[cfg(feature = "encryption")]
521            encryption_algorithm: None,
522            #[cfg(feature = "encryption")]
523            footer_signing_key_metadata: None,
524        }
525    }
526
527    #[cfg(feature = "encryption")]
528    pub(crate) fn with_encryption_algorithm(
529        mut self,
530        encryption_algorithm: Option<EncryptionAlgorithm>,
531    ) -> Self {
532        self.encryption_algorithm = encryption_algorithm.map(Box::new);
533        self
534    }
535
536    #[cfg(feature = "encryption")]
537    pub(crate) fn with_footer_signing_key_metadata(
538        mut self,
539        footer_signing_key_metadata: Option<Vec<u8>>,
540    ) -> Self {
541        self.footer_signing_key_metadata = footer_signing_key_metadata;
542        self
543    }
544
545    /// Returns version of this file.
546    pub fn version(&self) -> i32 {
547        self.version
548    }
549
550    /// Returns number of rows in the file.
551    pub fn num_rows(&self) -> i64 {
552        self.num_rows
553    }
554
555    /// String message for application that wrote this file.
556    ///
557    /// This should have the following format:
558    /// `<application> version <application version> (build <application build hash>)`.
559    ///
560    /// ```shell
561    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
562    /// ```
563    pub fn created_by(&self) -> Option<&str> {
564        self.created_by.as_deref()
565    }
566
567    /// Returns key_value_metadata of this file.
568    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
569        self.key_value_metadata.as_ref()
570    }
571
572    /// Returns Parquet [`Type`] that describes schema in this file.
573    ///
574    /// [`Type`]: crate::schema::types::Type
575    pub fn schema(&self) -> &SchemaType {
576        self.schema_descr.root_schema()
577    }
578
579    /// Returns a reference to schema descriptor.
580    pub fn schema_descr(&self) -> &SchemaDescriptor {
581        &self.schema_descr
582    }
583
584    /// Returns reference counted clone for schema descriptor.
585    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
586        self.schema_descr.clone()
587    }
588
589    /// Column (sort) order used for `min` and `max` values of each column in this file.
590    ///
591    /// Each column order corresponds to one column, determined by its position in the
592    /// list, matching the position of the column in the schema.
593    ///
594    /// When `None` is returned, there are no column orders available, and each column
595    /// should be assumed to have undefined (legacy) column order.
596    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
597        self.column_orders.as_ref()
598    }
599
600    /// Returns column order for `i`th column in this file.
601    /// If column orders are not available, returns undefined (legacy) column order.
602    pub fn column_order(&self, i: usize) -> ColumnOrder {
603        self.column_orders
604            .as_ref()
605            .map(|data| data[i])
606            .unwrap_or(ColumnOrder::UNDEFINED)
607    }
608}
609
thrift_struct!(
/// Sort order within a RowGroup of a leaf column
pub struct SortingColumn {
  /// The ordinal position of the column (in this row group)
  1: required i32 column_idx

  /// If true, indicates this column is sorted in descending order.
  2: required bool descending

  /// If true, nulls will come before non-null values, otherwise,
  /// nulls go at the end.
  3: required bool nulls_first
}
);
624
/// Reference counted pointer (an [`Arc`]) for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
627
/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows,
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Column chunk metadata, one entry per column in this row group.
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group.
    num_rows: i64,
    /// Sort ordering of the rows in this row group, if any.
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total byte size of all uncompressed column data in this row group.
    total_byte_size: i64,
    /// Shared pointer to the schema descriptor for this row group.
    schema_descr: SchemaDescPtr,
    /// File offset of this row group, if known.
    ///
    /// Stored explicitly because it cannot be inferred from the file offset of
    /// the first column: there may be empty columns in the row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
644
645impl RowGroupMetaData {
646    /// Returns builder for row group metadata.
647    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
648        RowGroupMetaDataBuilder::new(schema_descr)
649    }
650
651    /// Number of columns in this row group.
652    pub fn num_columns(&self) -> usize {
653        self.columns.len()
654    }
655
656    /// Returns column chunk metadata for `i`th column.
657    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
658        &self.columns[i]
659    }
660
661    /// Returns slice of column chunk metadata.
662    pub fn columns(&self) -> &[ColumnChunkMetaData] {
663        &self.columns
664    }
665
666    /// Returns mutable slice of column chunk metadata.
667    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
668        &mut self.columns
669    }
670
671    /// Number of rows in this row group.
672    pub fn num_rows(&self) -> i64 {
673        self.num_rows
674    }
675
676    /// Returns the sort ordering of the rows in this RowGroup if any
677    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
678        self.sorting_columns.as_ref()
679    }
680
681    /// Total byte size of all uncompressed column data in this row group.
682    pub fn total_byte_size(&self) -> i64 {
683        self.total_byte_size
684    }
685
686    /// Total size of all compressed column data in this row group.
687    pub fn compressed_size(&self) -> i64 {
688        self.columns.iter().map(|c| c.total_compressed_size).sum()
689    }
690
691    /// Returns reference to a schema descriptor.
692    pub fn schema_descr(&self) -> &SchemaDescriptor {
693        self.schema_descr.as_ref()
694    }
695
696    /// Returns reference counted clone of schema descriptor.
697    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
698        self.schema_descr.clone()
699    }
700
701    /// Returns ordinal position of this row group in file.
702    ///
703    /// For example if this is the first row group in the file, this will return 0.
704    /// If this is the second row group in the file, this will return 1.
705    #[inline(always)]
706    pub fn ordinal(&self) -> Option<i16> {
707        self.ordinal
708    }
709
710    /// Returns file offset of this row group in file.
711    #[inline(always)]
712    pub fn file_offset(&self) -> Option<i64> {
713        self.file_offset
714    }
715
716    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
717    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
718        RowGroupMetaDataBuilder(self)
719    }
720}
721
/// Builder for row group metadata.
///
/// Wraps a [`RowGroupMetaData`] value that the setters modify in place.
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
724
725impl RowGroupMetaDataBuilder {
726    /// Creates new builder from schema descriptor.
727    fn new(schema_descr: SchemaDescPtr) -> Self {
728        Self(RowGroupMetaData {
729            columns: Vec::with_capacity(schema_descr.num_columns()),
730            schema_descr,
731            file_offset: None,
732            num_rows: 0,
733            sorting_columns: None,
734            total_byte_size: 0,
735            ordinal: None,
736        })
737    }
738
739    /// Sets number of rows in this row group.
740    pub fn set_num_rows(mut self, value: i64) -> Self {
741        self.0.num_rows = value;
742        self
743    }
744
745    /// Sets the sorting order for columns
746    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
747        self.0.sorting_columns = value;
748        self
749    }
750
751    /// Sets total size in bytes for this row group.
752    pub fn set_total_byte_size(mut self, value: i64) -> Self {
753        self.0.total_byte_size = value;
754        self
755    }
756
757    /// Takes ownership of the the column metadata in this builder, and clears
758    /// the list of columns.
759    ///
760    /// This can be used for more efficient creation of a new RowGroupMetaData
761    /// from an existing one.
762    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
763        std::mem::take(&mut self.0.columns)
764    }
765
766    /// Sets column metadata for this row group.
767    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
768        self.0.columns = value;
769        self
770    }
771
772    /// Adds a column metadata to this row group
773    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
774        self.0.columns.push(value);
775        self
776    }
777
778    /// Sets ordinal for this row group.
779    pub fn set_ordinal(mut self, value: i16) -> Self {
780        self.0.ordinal = Some(value);
781        self
782    }
783
784    /// Sets file offset for this row group.
785    pub fn set_file_offset(mut self, value: i64) -> Self {
786        self.0.file_offset = Some(value);
787        self
788    }
789
790    /// Builds row group metadata.
791    pub fn build(self) -> Result<RowGroupMetaData> {
792        if self.0.schema_descr.num_columns() != self.0.columns.len() {
793            return Err(general_err!(
794                "Column length mismatch: {} != {}",
795                self.0.schema_descr.num_columns(),
796                self.0.columns.len()
797            ));
798        }
799
800        Ok(self.0)
801    }
802
803    /// Build row group metadata without validation.
804    pub(super) fn build_unchecked(self) -> RowGroupMetaData {
805        self.0
806    }
807}
808
/// Metadata for a column chunk.
///
/// As described in the module documentation, a column chunk is a primitive
/// leaf within a Row Group, and its metadata includes encoding and
/// compression information, number of values, statistics, etc.
///
/// NOTE(review): the field notes below are based only on the names and types
/// visible in this definition; confirm them against the accessors.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    // Descriptor of the column this chunk belongs to (shared pointer).
    column_descr: ColumnDescPtr,
    // Encodings, stored as a mask rather than a list.
    encodings: EncodingMask,
    file_path: Option<String>,
    file_offset: i64,
    num_values: i64,
    compression: Compression,
    // Summed over all columns by `RowGroupMetaData::compressed_size`.
    total_compressed_size: i64,
    total_uncompressed_size: i64,
    // Page locations; the index and dictionary page offsets are optional.
    data_page_offset: i64,
    index_page_offset: Option<i64>,
    dictionary_page_offset: Option<i64>,
    statistics: Option<Statistics>,
    // Boxed, so an absent value costs only a pointer-sized field.
    geo_statistics: Option<Box<geo_statistics::GeospatialStatistics>>,
    // Either the full list of page encoding stats or a condensed mask.
    encoding_stats: Option<ParquetPageEncodingStats>,
    // Offset/length pairs: presumably the locations of the bloom filter,
    // offset index and column index within the file. TODO confirm.
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    unencoded_byte_array_data_bytes: Option<i64>,
    // Histograms of repetition and definition levels, when available.
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
    // The two fields below only exist when the `encryption` feature is enabled.
    #[cfg(feature = "encryption")]
    column_crypto_metadata: Option<Box<ColumnCryptoMetaData>>,
    #[cfg(feature = "encryption")]
    encrypted_column_metadata: Option<Vec<u8>>,
}
840
/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of values with level 0, `vec[1]` is the
/// number of values with level 1, and so on.
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a new, zero-filled level histogram.
    ///
    /// Length will be `max_level + 1`.
    ///
    /// Returns `None` when `max_level <= 0` (a histogram is not necessary when
    /// the only possible level is 0).
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }

    /// Returns a reference to the histogram's values.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Returns the inner vector, consuming self.
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the histogram value at the given index.
    ///
    /// The value at index `i` is the number of values with level `i`. For example,
    /// `get(1)` returns the number of values with level 1.
    ///
    /// Returns `None` if the index is out of bounds.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds the values from the other histogram to this histogram.
    ///
    /// # Panics
    /// If the histograms have different lengths.
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// Returns the length of the histogram.
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// Returns `true` if the histogram has no entries.
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets the values of all histogram levels to 0, keeping the length unchanged.
    pub fn reset(&mut self) {
        self.inner.fill(0);
    }

    /// Updates the histogram by counting each of the provided levels.
    ///
    /// Works for either repetition or definition levels: every entry of
    /// `levels` increments the counter stored at that index.
    ///
    /// # Panics
    /// If any level is negative, or is not smaller than the length of the
    /// histogram (i.e. is greater than the `max_level` supplied to
    /// [`Self::try_new`]).
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            // A negative level casts to a huge `usize` and therefore also
            // fails the bounds check below.
            self.inner[level as usize] += 1;
        }
    }
}
928
929impl From<Vec<i64>> for LevelHistogram {
930    fn from(inner: Vec<i64>) -> Self {
931        Self { inner }
932    }
933}
934
935impl From<LevelHistogram> for Vec<i64> {
936    fn from(value: LevelHistogram) -> Self {
937        value.into_inner()
938    }
939}
940
941impl HeapSize for LevelHistogram {
942    fn heap_size(&self) -> usize {
943        self.inner.heap_size()
944    }
945}
946
947/// Represents common operations for a column chunk.
948impl ColumnChunkMetaData {
949    /// Returns builder for column chunk metadata.
950    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
951        ColumnChunkMetaDataBuilder::new(column_descr)
952    }
953
954    /// File where the column chunk is stored.
955    ///
956    /// If not set, assumed to belong to the same file as the metadata.
957    /// This path is relative to the current file.
958    pub fn file_path(&self) -> Option<&str> {
959        self.file_path.as_deref()
960    }
961
962    /// Byte offset of `ColumnMetaData` in `file_path()`.
963    ///
964    /// Note that the meaning of this field has been inconsistent between implementations
965    /// so its use has since been deprecated in the Parquet specification. Modern implementations
966    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
967    /// `ColumnChunk` struct.
968    pub fn file_offset(&self) -> i64 {
969        self.file_offset
970    }
971
972    /// Type of this column. Must be primitive.
973    pub fn column_type(&self) -> Type {
974        self.column_descr.physical_type()
975    }
976
977    /// Path (or identifier) of this column.
978    pub fn column_path(&self) -> &ColumnPath {
979        self.column_descr.path()
980    }
981
982    /// Descriptor for this column.
983    pub fn column_descr(&self) -> &ColumnDescriptor {
984        self.column_descr.as_ref()
985    }
986
987    /// Reference counted clone of descriptor for this column.
988    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
989        self.column_descr.clone()
990    }
991
992    /// All encodings used for this column.
993    pub fn encodings(&self) -> impl Iterator<Item = Encoding> {
994        self.encodings.encodings()
995    }
996
997    /// All encodings used for this column, returned as a bitmask.
998    pub fn encodings_mask(&self) -> &EncodingMask {
999        &self.encodings
1000    }
1001
1002    /// Total number of values in this column chunk.
1003    pub fn num_values(&self) -> i64 {
1004        self.num_values
1005    }
1006
1007    /// Compression for this column.
1008    pub fn compression(&self) -> Compression {
1009        self.compression
1010    }
1011
1012    /// Returns the total compressed data size of this column chunk.
1013    pub fn compressed_size(&self) -> i64 {
1014        self.total_compressed_size
1015    }
1016
1017    /// Returns the total uncompressed data size of this column chunk.
1018    pub fn uncompressed_size(&self) -> i64 {
1019        self.total_uncompressed_size
1020    }
1021
1022    /// Returns the offset for the column data.
1023    pub fn data_page_offset(&self) -> i64 {
1024        self.data_page_offset
1025    }
1026
1027    /// Returns the offset for the index page.
1028    pub fn index_page_offset(&self) -> Option<i64> {
1029        self.index_page_offset
1030    }
1031
1032    /// Returns the offset for the dictionary page, if any.
1033    pub fn dictionary_page_offset(&self) -> Option<i64> {
1034        self.dictionary_page_offset
1035    }
1036
1037    /// Returns the offset and length in bytes of the column chunk within the file
1038    pub fn byte_range(&self) -> (u64, u64) {
1039        let col_start = match self.dictionary_page_offset() {
1040            Some(dictionary_page_offset) => dictionary_page_offset,
1041            None => self.data_page_offset(),
1042        };
1043        let col_len = self.compressed_size();
1044        assert!(
1045            col_start >= 0 && col_len >= 0,
1046            "column start and length should not be negative"
1047        );
1048        (col_start as u64, col_len as u64)
1049    }
1050
1051    /// Returns statistics that are set for this column chunk,
1052    /// or `None` if no statistics are available.
1053    pub fn statistics(&self) -> Option<&Statistics> {
1054        self.statistics.as_ref()
1055    }
1056
1057    /// Returns geospatial statistics that are set for this column chunk,
1058    /// or `None` if no geospatial statistics are available.
1059    pub fn geo_statistics(&self) -> Option<&geo_statistics::GeospatialStatistics> {
1060        self.geo_statistics.as_deref()
1061    }
1062
1063    /// Returns the page encoding statistics, or `None` if no page encoding statistics
1064    /// are available (or they were converted to a mask).
1065    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
1066        match self.encoding_stats.as_ref() {
1067            Some(ParquetPageEncodingStats::Full(stats)) => Some(stats),
1068            _ => None,
1069        }
1070    }
1071
1072    /// Returns the page encoding statistics reduced to a bitmask, or `None` if statistics are
1073    /// not available (or they were left in their original form).
1074    ///
1075    /// The [`PageEncodingStats`] struct was added to the Parquet specification specifically to
1076    /// enable fast determination of whether all pages in a column chunk are dictionary encoded
1077    /// (see <https://github.com/apache/parquet-format/pull/16>).
1078    /// Decoding the full page encoding statistics, however, can be very costly, and is not
1079    /// necessary to support the aforementioned use case. As an alternative, this crate can
1080    /// instead distill the list of `PageEncodingStats` down to a bitmask of just the encodings
1081    /// used for data pages
1082    /// (see [`ParquetMetaDataOptions::set_encoding_stats_as_mask`]).
1083    /// To test for an all-dictionary-encoded chunk one could use this bitmask in the following way:
1084    ///
1085    /// ```rust
1086    /// use parquet::basic::Encoding;
1087    /// use parquet::file::metadata::ColumnChunkMetaData;
1088    /// // test if all data pages in the column chunk are dictionary encoded
1089    /// fn is_all_dictionary_encoded(col_meta: &ColumnChunkMetaData) -> bool {
1090    ///     // check that dictionary encoding was used
1091    ///     col_meta.dictionary_page_offset().is_some()
1092    ///         && col_meta.page_encoding_stats_mask().is_some_and(|mask| {
1093    ///             // mask should only have one bit set, either for PLAIN_DICTIONARY or
1094    ///             // RLE_DICTIONARY
1095    ///             mask.is_only(Encoding::PLAIN_DICTIONARY) || mask.is_only(Encoding::RLE_DICTIONARY)
1096    ///         })
1097    /// }
1098    /// ```
1099    pub fn page_encoding_stats_mask(&self) -> Option<&EncodingMask> {
1100        match self.encoding_stats.as_ref() {
1101            Some(ParquetPageEncodingStats::Mask(stats)) => Some(stats),
1102            _ => None,
1103        }
1104    }
1105
1106    /// Returns the offset for the bloom filter.
1107    pub fn bloom_filter_offset(&self) -> Option<i64> {
1108        self.bloom_filter_offset
1109    }
1110
1111    /// Returns the offset for the bloom filter.
1112    pub fn bloom_filter_length(&self) -> Option<i32> {
1113        self.bloom_filter_length
1114    }
1115
1116    /// Returns the offset for the column index.
1117    pub fn column_index_offset(&self) -> Option<i64> {
1118        self.column_index_offset
1119    }
1120
1121    /// Returns the offset for the column index length.
1122    pub fn column_index_length(&self) -> Option<i32> {
1123        self.column_index_length
1124    }
1125
1126    /// Returns the range for the offset index if any
1127    pub(crate) fn column_index_range(&self) -> Option<Range<u64>> {
1128        let offset = u64::try_from(self.column_index_offset?).ok()?;
1129        let length = u64::try_from(self.column_index_length?).ok()?;
1130        Some(offset..(offset + length))
1131    }
1132
1133    /// Returns the offset for the offset index.
1134    pub fn offset_index_offset(&self) -> Option<i64> {
1135        self.offset_index_offset
1136    }
1137
1138    /// Returns the offset for the offset index length.
1139    pub fn offset_index_length(&self) -> Option<i32> {
1140        self.offset_index_length
1141    }
1142
1143    /// Returns the range for the offset index if any
1144    pub(crate) fn offset_index_range(&self) -> Option<Range<u64>> {
1145        let offset = u64::try_from(self.offset_index_offset?).ok()?;
1146        let length = u64::try_from(self.offset_index_length?).ok()?;
1147        Some(offset..(offset + length))
1148    }
1149
1150    /// Returns the number of bytes of variable length data after decoding.
1151    ///
1152    /// Only set for BYTE_ARRAY columns. This field may not be set by older
1153    /// writers.
1154    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
1155        self.unencoded_byte_array_data_bytes
1156    }
1157
1158    /// Returns the repetition level histogram.
1159    ///
1160    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
1161    /// `vec[0]` indicates how many rows the page contains.
1162    /// This field may not be set by older writers.
1163    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
1164        self.repetition_level_histogram.as_ref()
1165    }
1166
1167    /// Returns the definition level histogram.
1168    ///
1169    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
1170    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
1171    /// This field may not be set by older writers.
1172    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
1173        self.definition_level_histogram.as_ref()
1174    }
1175
1176    /// Returns the encryption metadata for this column chunk.
1177    #[cfg(feature = "encryption")]
1178    pub fn crypto_metadata(&self) -> Option<&ColumnCryptoMetaData> {
1179        self.column_crypto_metadata.as_deref()
1180    }
1181
1182    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
1183    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
1184        ColumnChunkMetaDataBuilder::from(self)
1185    }
1186}
1187
1188/// Builder for [`ColumnChunkMetaData`]
1189///
1190/// This builder is used to create a new column chunk metadata or modify an
1191/// existing one.
1192///
1193/// # Example
1194/// ```no_run
1195/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
1196/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
1197/// let column_chunk_metadata = get_column_chunk_metadata();
1198/// // create a new builder from existing column chunk metadata
1199/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
1200/// // clear the statistics:
1201/// let column_chunk_metadata: ColumnChunkMetaData = builder
1202///   .clear_statistics()
1203///   .build()
1204///   .unwrap();
1205/// ```
1206pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1207
1208impl ColumnChunkMetaDataBuilder {
1209    /// Creates new column chunk metadata builder.
1210    ///
1211    /// See also [`ColumnChunkMetaData::builder`]
1212    fn new(column_descr: ColumnDescPtr) -> Self {
1213        Self(ColumnChunkMetaData {
1214            column_descr,
1215            encodings: Default::default(),
1216            file_path: None,
1217            file_offset: 0,
1218            num_values: 0,
1219            compression: Compression::UNCOMPRESSED,
1220            total_compressed_size: 0,
1221            total_uncompressed_size: 0,
1222            data_page_offset: 0,
1223            index_page_offset: None,
1224            dictionary_page_offset: None,
1225            statistics: None,
1226            geo_statistics: None,
1227            encoding_stats: None,
1228            bloom_filter_offset: None,
1229            bloom_filter_length: None,
1230            offset_index_offset: None,
1231            offset_index_length: None,
1232            column_index_offset: None,
1233            column_index_length: None,
1234            unencoded_byte_array_data_bytes: None,
1235            repetition_level_histogram: None,
1236            definition_level_histogram: None,
1237            #[cfg(feature = "encryption")]
1238            column_crypto_metadata: None,
1239            #[cfg(feature = "encryption")]
1240            encrypted_column_metadata: None,
1241        })
1242    }
1243
1244    /// Sets list of encodings for this column chunk.
1245    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
1246        self.0.encodings = EncodingMask::new_from_encodings(encodings.iter());
1247        self
1248    }
1249
1250    /// Sets the encodings mask for this column chunk.
1251    pub fn set_encodings_mask(mut self, encodings: EncodingMask) -> Self {
1252        self.0.encodings = encodings;
1253        self
1254    }
1255
1256    /// Sets optional file path for this column chunk.
1257    pub fn set_file_path(mut self, value: String) -> Self {
1258        self.0.file_path = Some(value);
1259        self
1260    }
1261
1262    /// Sets number of values.
1263    pub fn set_num_values(mut self, value: i64) -> Self {
1264        self.0.num_values = value;
1265        self
1266    }
1267
1268    /// Sets compression.
1269    pub fn set_compression(mut self, value: Compression) -> Self {
1270        self.0.compression = value;
1271        self
1272    }
1273
1274    /// Sets total compressed size in bytes.
1275    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
1276        self.0.total_compressed_size = value;
1277        self
1278    }
1279
1280    /// Sets total uncompressed size in bytes.
1281    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
1282        self.0.total_uncompressed_size = value;
1283        self
1284    }
1285
1286    /// Sets data page offset in bytes.
1287    pub fn set_data_page_offset(mut self, value: i64) -> Self {
1288        self.0.data_page_offset = value;
1289        self
1290    }
1291
1292    /// Sets optional dictionary page offset in bytes.
1293    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
1294        self.0.dictionary_page_offset = value;
1295        self
1296    }
1297
1298    /// Sets optional index page offset in bytes.
1299    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
1300        self.0.index_page_offset = value;
1301        self
1302    }
1303
1304    /// Sets statistics for this column chunk.
1305    pub fn set_statistics(mut self, value: Statistics) -> Self {
1306        self.0.statistics = Some(value);
1307        self
1308    }
1309
1310    /// Sets geospatial statistics for this column chunk.
1311    pub fn set_geo_statistics(mut self, value: Box<geo_statistics::GeospatialStatistics>) -> Self {
1312        self.0.geo_statistics = Some(value);
1313        self
1314    }
1315
1316    /// Clears the statistics for this column chunk.
1317    pub fn clear_statistics(mut self) -> Self {
1318        self.0.statistics = None;
1319        self
1320    }
1321
1322    /// Sets page encoding stats for this column chunk.
1323    ///
1324    /// This will overwrite any existing stats, either `Vec` based or bitmask.
1325    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
1326        self.0.encoding_stats = Some(ParquetPageEncodingStats::Full(value));
1327        self
1328    }
1329
1330    /// Sets page encoding stats mask for this column chunk.
1331    ///
1332    /// This will overwrite any existing stats, either `Vec` based or bitmask.
1333    pub fn set_page_encoding_stats_mask(mut self, value: EncodingMask) -> Self {
1334        self.0.encoding_stats = Some(ParquetPageEncodingStats::Mask(value));
1335        self
1336    }
1337
1338    /// Clears the page encoding stats for this column chunk.
1339    pub fn clear_page_encoding_stats(mut self) -> Self {
1340        self.0.encoding_stats = None;
1341        self
1342    }
1343
1344    /// Sets optional bloom filter offset in bytes.
1345    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
1346        self.0.bloom_filter_offset = value;
1347        self
1348    }
1349
1350    /// Sets optional bloom filter length in bytes.
1351    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
1352        self.0.bloom_filter_length = value;
1353        self
1354    }
1355
1356    /// Sets optional offset index offset in bytes.
1357    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
1358        self.0.offset_index_offset = value;
1359        self
1360    }
1361
1362    /// Sets optional offset index length in bytes.
1363    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
1364        self.0.offset_index_length = value;
1365        self
1366    }
1367
1368    /// Sets optional column index offset in bytes.
1369    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
1370        self.0.column_index_offset = value;
1371        self
1372    }
1373
1374    /// Sets optional column index length in bytes.
1375    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
1376        self.0.column_index_length = value;
1377        self
1378    }
1379
1380    /// Sets optional length of variable length data in bytes.
1381    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
1382        self.0.unencoded_byte_array_data_bytes = value;
1383        self
1384    }
1385
1386    /// Sets optional repetition level histogram
1387    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1388        self.0.repetition_level_histogram = value;
1389        self
1390    }
1391
1392    /// Sets optional repetition level histogram
1393    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
1394        self.0.definition_level_histogram = value;
1395        self
1396    }
1397
1398    #[cfg(feature = "encryption")]
1399    /// Set the encryption metadata for an encrypted column
1400    pub fn set_column_crypto_metadata(mut self, value: Option<ColumnCryptoMetaData>) -> Self {
1401        self.0.column_crypto_metadata = value.map(Box::new);
1402        self
1403    }
1404
1405    #[cfg(feature = "encryption")]
1406    /// Set the encryption metadata for an encrypted column
1407    pub fn set_encrypted_column_metadata(mut self, value: Option<Vec<u8>>) -> Self {
1408        self.0.encrypted_column_metadata = value;
1409        self
1410    }
1411
1412    /// Builds column chunk metadata.
1413    pub fn build(self) -> Result<ColumnChunkMetaData> {
1414        Ok(self.0)
1415    }
1416}
1417
1418/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
1419///
1420/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
1421/// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
1422pub struct ColumnIndexBuilder {
1423    column_type: Type,
1424    null_pages: Vec<bool>,
1425    min_values: Vec<Vec<u8>>,
1426    max_values: Vec<Vec<u8>>,
1427    null_counts: Vec<i64>,
1428    boundary_order: BoundaryOrder,
1429    /// contains the concatenation of the histograms of all pages
1430    repetition_level_histograms: Option<Vec<i64>>,
1431    /// contains the concatenation of the histograms of all pages
1432    definition_level_histograms: Option<Vec<i64>>,
1433    /// Is the information in the builder valid?
1434    ///
1435    /// Set to `false` if any entry in the page doesn't have statistics for
1436    /// some reason, so statistics for that page won't be written to the file.
1437    /// This might happen if the page is entirely null, or
1438    /// is a floating point column without any non-nan values
1439    /// e.g. <https://github.com/apache/parquet-format/pull/196>
1440    valid: bool,
1441}
1442
1443impl ColumnIndexBuilder {
1444    /// Creates a new column index builder.
1445    pub fn new(column_type: Type) -> Self {
1446        ColumnIndexBuilder {
1447            column_type,
1448            null_pages: Vec::new(),
1449            min_values: Vec::new(),
1450            max_values: Vec::new(),
1451            null_counts: Vec::new(),
1452            boundary_order: BoundaryOrder::UNORDERED,
1453            repetition_level_histograms: None,
1454            definition_level_histograms: None,
1455            valid: true,
1456        }
1457    }
1458
1459    /// Append statistics for the next page
1460    pub fn append(
1461        &mut self,
1462        null_page: bool,
1463        min_value: Vec<u8>,
1464        max_value: Vec<u8>,
1465        null_count: i64,
1466    ) {
1467        self.null_pages.push(null_page);
1468        self.min_values.push(min_value);
1469        self.max_values.push(max_value);
1470        self.null_counts.push(null_count);
1471    }
1472
1473    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1474    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1475    ///
1476    /// [`ColumnIndex`]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
1477    pub fn append_histograms(
1478        &mut self,
1479        repetition_level_histogram: &Option<LevelHistogram>,
1480        definition_level_histogram: &Option<LevelHistogram>,
1481    ) {
1482        if !self.valid {
1483            return;
1484        }
1485        if let Some(rep_lvl_hist) = repetition_level_histogram {
1486            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1487            hist.reserve(rep_lvl_hist.len());
1488            hist.extend(rep_lvl_hist.values());
1489        }
1490        if let Some(def_lvl_hist) = definition_level_histogram {
1491            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1492            hist.reserve(def_lvl_hist.len());
1493            hist.extend(def_lvl_hist.values());
1494        }
1495    }
1496
1497    /// Set the boundary order of the column index
1498    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1499        self.boundary_order = boundary_order;
1500    }
1501
1502    /// Mark this column index as invalid
1503    pub fn to_invalid(&mut self) {
1504        self.valid = false;
1505    }
1506
1507    /// Is the information in the builder valid?
1508    pub fn valid(&self) -> bool {
1509        self.valid
1510    }
1511
1512    /// Build and get the column index
1513    ///
1514    /// Note: callers should check [`Self::valid`] before calling this method
1515    pub fn build(self) -> Result<ColumnIndexMetaData> {
1516        Ok(match self.column_type {
1517            Type::BOOLEAN => {
1518                let index = self.build_page_index()?;
1519                ColumnIndexMetaData::BOOLEAN(index)
1520            }
1521            Type::INT32 => {
1522                let index = self.build_page_index()?;
1523                ColumnIndexMetaData::INT32(index)
1524            }
1525            Type::INT64 => {
1526                let index = self.build_page_index()?;
1527                ColumnIndexMetaData::INT64(index)
1528            }
1529            Type::INT96 => {
1530                let index = self.build_page_index()?;
1531                ColumnIndexMetaData::INT96(index)
1532            }
1533            Type::FLOAT => {
1534                let index = self.build_page_index()?;
1535                ColumnIndexMetaData::FLOAT(index)
1536            }
1537            Type::DOUBLE => {
1538                let index = self.build_page_index()?;
1539                ColumnIndexMetaData::DOUBLE(index)
1540            }
1541            Type::BYTE_ARRAY => {
1542                let index = self.build_byte_array_index()?;
1543                ColumnIndexMetaData::BYTE_ARRAY(index)
1544            }
1545            Type::FIXED_LEN_BYTE_ARRAY => {
1546                let index = self.build_byte_array_index()?;
1547                ColumnIndexMetaData::FIXED_LEN_BYTE_ARRAY(index)
1548            }
1549        })
1550    }
1551
1552    fn build_page_index<T>(self) -> Result<PrimitiveColumnIndex<T>>
1553    where
1554        T: ParquetValueType,
1555    {
1556        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1557        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1558
1559        PrimitiveColumnIndex::try_new(
1560            self.null_pages,
1561            self.boundary_order,
1562            Some(self.null_counts),
1563            self.repetition_level_histograms,
1564            self.definition_level_histograms,
1565            min_values,
1566            max_values,
1567        )
1568    }
1569
1570    fn build_byte_array_index(self) -> Result<ByteArrayColumnIndex> {
1571        let min_values: Vec<&[u8]> = self.min_values.iter().map(|v| v.as_slice()).collect();
1572        let max_values: Vec<&[u8]> = self.max_values.iter().map(|v| v.as_slice()).collect();
1573
1574        ByteArrayColumnIndex::try_new(
1575            self.null_pages,
1576            self.boundary_order,
1577            Some(self.null_counts),
1578            self.repetition_level_histograms,
1579            self.definition_level_histograms,
1580            min_values,
1581            max_values,
1582        )
1583    }
1584}
1585
1586impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1587    fn from(value: ColumnChunkMetaData) -> Self {
1588        ColumnChunkMetaDataBuilder(value)
1589    }
1590}
1591
1592/// Builder for offset index, part of the Parquet [PageIndex].
1593///
1594/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
1595pub struct OffsetIndexBuilder {
1596    offset_array: Vec<i64>,
1597    compressed_page_size_array: Vec<i32>,
1598    first_row_index_array: Vec<i64>,
1599    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
1600    current_first_row_index: i64,
1601}
1602
1603impl Default for OffsetIndexBuilder {
1604    fn default() -> Self {
1605        Self::new()
1606    }
1607}
1608
1609impl OffsetIndexBuilder {
1610    /// Creates a new offset index builder.
1611    pub fn new() -> Self {
1612        OffsetIndexBuilder {
1613            offset_array: Vec::new(),
1614            compressed_page_size_array: Vec::new(),
1615            first_row_index_array: Vec::new(),
1616            unencoded_byte_array_data_bytes_array: None,
1617            current_first_row_index: 0,
1618        }
1619    }
1620
1621    /// Append the row count of the next page.
1622    pub fn append_row_count(&mut self, row_count: i64) {
1623        let current_page_row_index = self.current_first_row_index;
1624        self.first_row_index_array.push(current_page_row_index);
1625        self.current_first_row_index += row_count;
1626    }
1627
1628    /// Append the offset and size of the next page.
1629    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1630        self.offset_array.push(offset);
1631        self.compressed_page_size_array.push(compressed_page_size);
1632    }
1633
1634    /// Append the unencoded byte array data bytes of the next page.
1635    pub fn append_unencoded_byte_array_data_bytes(
1636        &mut self,
1637        unencoded_byte_array_data_bytes: Option<i64>,
1638    ) {
1639        if let Some(val) = unencoded_byte_array_data_bytes {
1640            self.unencoded_byte_array_data_bytes_array
1641                .get_or_insert(Vec::new())
1642                .push(val);
1643        }
1644    }
1645
1646    /// Build and get the thrift metadata of offset index
1647    pub fn build(self) -> OffsetIndexMetaData {
1648        let locations = self
1649            .offset_array
1650            .iter()
1651            .zip(self.compressed_page_size_array.iter())
1652            .zip(self.first_row_index_array.iter())
1653            .map(|((offset, size), row_index)| PageLocation {
1654                offset: *offset,
1655                compressed_page_size: *size,
1656                first_row_index: *row_index,
1657            })
1658            .collect::<Vec<_>>();
1659        OffsetIndexMetaData {
1660            page_locations: locations,
1661            unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes_array,
1662        }
1663    }
1664}
1665
1666#[cfg(test)]
1667mod tests {
1668    use super::*;
1669    use crate::basic::{PageType, SortOrder};
1670    use crate::file::metadata::thrift::tests::{read_column_chunk, read_row_group};
1671
1672    #[test]
1673    fn test_row_group_metadata_thrift_conversion() {
1674        let schema_descr = get_test_schema_descr();
1675
1676        let mut columns = vec![];
1677        for ptr in schema_descr.columns() {
1678            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
1679            columns.push(column);
1680        }
1681        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
1682            .set_num_rows(1000)
1683            .set_total_byte_size(2000)
1684            .set_column_metadata(columns)
1685            .set_ordinal(1)
1686            .build()
1687            .unwrap();
1688
1689        let mut buf = Vec::new();
1690        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1691        row_group_meta.write_thrift(&mut writer).unwrap();
1692
1693        let row_group_res = read_row_group(&mut buf, schema_descr).unwrap();
1694
1695        assert_eq!(row_group_res, row_group_meta);
1696    }
1697
1698    #[test]
1699    fn test_row_group_metadata_thrift_conversion_empty() {
1700        let schema_descr = get_test_schema_descr();
1701
1702        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();
1703
1704        assert!(row_group_meta.is_err());
1705        if let Err(e) = row_group_meta {
1706            assert_eq!(
1707                format!("{e}"),
1708                "Parquet error: Column length mismatch: 2 != 0"
1709            );
1710        }
1711    }
1712
1713    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
1714    #[test]
1715    fn test_row_group_metadata_thrift_corrupted() {
1716        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
1717            SchemaType::group_type_builder("schema")
1718                .with_fields(vec![
1719                    Arc::new(
1720                        SchemaType::primitive_type_builder("a", Type::INT32)
1721                            .build()
1722                            .unwrap(),
1723                    ),
1724                    Arc::new(
1725                        SchemaType::primitive_type_builder("b", Type::INT32)
1726                            .build()
1727                            .unwrap(),
1728                    ),
1729                ])
1730                .build()
1731                .unwrap(),
1732        )));
1733
1734        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
1735            SchemaType::group_type_builder("schema")
1736                .with_fields(vec![
1737                    Arc::new(
1738                        SchemaType::primitive_type_builder("a", Type::INT32)
1739                            .build()
1740                            .unwrap(),
1741                    ),
1742                    Arc::new(
1743                        SchemaType::primitive_type_builder("b", Type::INT32)
1744                            .build()
1745                            .unwrap(),
1746                    ),
1747                    Arc::new(
1748                        SchemaType::primitive_type_builder("c", Type::INT32)
1749                            .build()
1750                            .unwrap(),
1751                    ),
1752                ])
1753                .build()
1754                .unwrap(),
1755        )));
1756
1757        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
1758            .set_num_rows(1000)
1759            .set_total_byte_size(2000)
1760            .set_column_metadata(vec![
1761                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
1762                    .build()
1763                    .unwrap(),
1764                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
1765                    .build()
1766                    .unwrap(),
1767            ])
1768            .set_ordinal(1)
1769            .build()
1770            .unwrap();
1771        let mut buf = Vec::new();
1772        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1773        row_group_meta_2cols.write_thrift(&mut writer).unwrap();
1774
1775        let err = read_row_group(&mut buf, schema_descr_3cols)
1776            .unwrap_err()
1777            .to_string();
1778        assert_eq!(
1779            err,
1780            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
1781        );
1782    }
1783
1784    #[test]
1785    fn test_column_chunk_metadata_thrift_conversion() {
1786        let column_descr = get_test_schema_descr().column(0);
1787        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1788            .set_encodings_mask(EncodingMask::new_from_encodings(
1789                [Encoding::PLAIN, Encoding::RLE].iter(),
1790            ))
1791            .set_file_path("file_path".to_owned())
1792            .set_num_values(1000)
1793            .set_compression(Compression::SNAPPY)
1794            .set_total_compressed_size(2000)
1795            .set_total_uncompressed_size(3000)
1796            .set_data_page_offset(4000)
1797            .set_dictionary_page_offset(Some(5000))
1798            .set_page_encoding_stats(vec![
1799                PageEncodingStats {
1800                    page_type: PageType::DATA_PAGE,
1801                    encoding: Encoding::PLAIN,
1802                    count: 3,
1803                },
1804                PageEncodingStats {
1805                    page_type: PageType::DATA_PAGE,
1806                    encoding: Encoding::RLE,
1807                    count: 5,
1808                },
1809            ])
1810            .set_bloom_filter_offset(Some(6000))
1811            .set_bloom_filter_length(Some(25))
1812            .set_offset_index_offset(Some(7000))
1813            .set_offset_index_length(Some(25))
1814            .set_column_index_offset(Some(8000))
1815            .set_column_index_length(Some(25))
1816            .set_unencoded_byte_array_data_bytes(Some(2000))
1817            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1818            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1819            .build()
1820            .unwrap();
1821
1822        let mut buf = Vec::new();
1823        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1824        col_metadata.write_thrift(&mut writer).unwrap();
1825        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1826
1827        assert_eq!(col_chunk_res, col_metadata);
1828    }
1829
1830    #[test]
1831    fn test_column_chunk_metadata_thrift_conversion_empty() {
1832        let column_descr = get_test_schema_descr().column(0);
1833
1834        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1835            .build()
1836            .unwrap();
1837
1838        let mut buf = Vec::new();
1839        let mut writer = ThriftCompactOutputProtocol::new(&mut buf);
1840        col_metadata.write_thrift(&mut writer).unwrap();
1841        let col_chunk_res = read_column_chunk(&mut buf, column_descr).unwrap();
1842
1843        assert_eq!(col_chunk_res, col_metadata);
1844    }
1845
1846    #[test]
1847    fn test_compressed_size() {
1848        let schema_descr = get_test_schema_descr();
1849
1850        let mut columns = vec![];
1851        for column_descr in schema_descr.columns() {
1852            let column = ColumnChunkMetaData::builder(column_descr.clone())
1853                .set_total_compressed_size(500)
1854                .set_total_uncompressed_size(700)
1855                .build()
1856                .unwrap();
1857            columns.push(column);
1858        }
1859        let row_group_meta = RowGroupMetaData::builder(schema_descr)
1860            .set_num_rows(1000)
1861            .set_column_metadata(columns)
1862            .build()
1863            .unwrap();
1864
1865        let compressed_size_res: i64 = row_group_meta.compressed_size();
1866        let compressed_size_exp: i64 = 1000;
1867
1868        assert_eq!(compressed_size_res, compressed_size_exp);
1869    }
1870
1871    #[test]
1872    fn test_memory_size() {
1873        let schema_descr = get_test_schema_descr();
1874
1875        let columns = schema_descr
1876            .columns()
1877            .iter()
1878            .map(|column_descr| {
1879                ColumnChunkMetaData::builder(column_descr.clone())
1880                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
1881                    .build()
1882            })
1883            .collect::<Result<Vec<_>>>()
1884            .unwrap();
1885        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
1886            .set_num_rows(1000)
1887            .set_column_metadata(columns)
1888            .build()
1889            .unwrap();
1890        let row_group_meta = vec![row_group_meta];
1891
1892        let version = 2;
1893        let num_rows = 1000;
1894        let created_by = Some(String::from("test harness"));
1895        let key_value_metadata = Some(vec![KeyValue::new(
1896            String::from("Foo"),
1897            Some(String::from("bar")),
1898        )]);
1899        let column_orders = Some(vec![
1900            ColumnOrder::UNDEFINED,
1901            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
1902        ]);
1903        let file_metadata = FileMetaData::new(
1904            version,
1905            num_rows,
1906            created_by,
1907            key_value_metadata,
1908            schema_descr.clone(),
1909            column_orders,
1910        );
1911
1912        // Now, add in Exact Statistics
1913        let columns_with_stats = schema_descr
1914            .columns()
1915            .iter()
1916            .map(|column_descr| {
1917                ColumnChunkMetaData::builder(column_descr.clone())
1918                    .set_statistics(Statistics::new::<i32>(
1919                        Some(0),
1920                        Some(100),
1921                        None,
1922                        None,
1923                        false,
1924                    ))
1925                    .build()
1926            })
1927            .collect::<Result<Vec<_>>>()
1928            .unwrap();
1929
1930        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
1931            .set_num_rows(1000)
1932            .set_column_metadata(columns_with_stats)
1933            .build()
1934            .unwrap();
1935        let row_group_meta_with_stats = vec![row_group_meta_with_stats];
1936
1937        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
1938            .set_row_groups(row_group_meta_with_stats)
1939            .build();
1940
1941        #[cfg(not(feature = "encryption"))]
1942        let base_expected_size = 2766;
1943        #[cfg(feature = "encryption")]
1944        let base_expected_size = 2934;
1945
1946        assert_eq!(parquet_meta.memory_size(), base_expected_size);
1947
1948        let mut column_index = ColumnIndexBuilder::new(Type::BOOLEAN);
1949        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
1950        let column_index = column_index.build().unwrap();
1951        let native_index = match column_index {
1952            ColumnIndexMetaData::BOOLEAN(index) => index,
1953            _ => panic!("wrong type of column index"),
1954        };
1955
1956        // Now, add in OffsetIndex
1957        let mut offset_index = OffsetIndexBuilder::new();
1958        offset_index.append_row_count(1);
1959        offset_index.append_offset_and_size(2, 3);
1960        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
1961        offset_index.append_row_count(1);
1962        offset_index.append_offset_and_size(2, 3);
1963        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
1964        let offset_index = offset_index.build();
1965
1966        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
1967            .set_row_groups(row_group_meta)
1968            .set_column_index(Some(vec![vec![ColumnIndexMetaData::BOOLEAN(native_index)]]))
1969            .set_offset_index(Some(vec![vec![offset_index]]))
1970            .build();
1971
1972        #[cfg(not(feature = "encryption"))]
1973        let bigger_expected_size = 3192;
1974        #[cfg(feature = "encryption")]
1975        let bigger_expected_size = 3360;
1976
1977        // more set fields means more memory usage
1978        assert!(bigger_expected_size > base_expected_size);
1979        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
1980    }
1981
1982    #[test]
1983    #[cfg(feature = "encryption")]
1984    fn test_memory_size_with_decryptor() {
1985        use crate::encryption::decrypt::FileDecryptionProperties;
1986        use crate::file::metadata::thrift::encryption::AesGcmV1;
1987
1988        let schema_descr = get_test_schema_descr();
1989
1990        let columns = schema_descr
1991            .columns()
1992            .iter()
1993            .map(|column_descr| ColumnChunkMetaData::builder(column_descr.clone()).build())
1994            .collect::<Result<Vec<_>>>()
1995            .unwrap();
1996        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
1997            .set_num_rows(1000)
1998            .set_column_metadata(columns)
1999            .build()
2000            .unwrap();
2001        let row_group_meta = vec![row_group_meta];
2002
2003        let version = 2;
2004        let num_rows = 1000;
2005        let aad_file_unique = vec![1u8; 8];
2006        let aad_prefix = vec![2u8; 8];
2007        let encryption_algorithm = EncryptionAlgorithm::AES_GCM_V1(AesGcmV1 {
2008            aad_prefix: Some(aad_prefix.clone()),
2009            aad_file_unique: Some(aad_file_unique.clone()),
2010            supply_aad_prefix: Some(true),
2011        });
2012        let footer_key_metadata = Some(vec![3u8; 8]);
2013        let file_metadata =
2014            FileMetaData::new(version, num_rows, None, None, schema_descr.clone(), None)
2015                .with_encryption_algorithm(Some(encryption_algorithm))
2016                .with_footer_signing_key_metadata(footer_key_metadata.clone());
2017
2018        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
2019            .set_row_groups(row_group_meta.clone())
2020            .build();
2021
2022        let base_expected_size = 2058;
2023        assert_eq!(parquet_meta_data.memory_size(), base_expected_size);
2024
2025        let footer_key = "0123456789012345".as_bytes();
2026        let column_key = "1234567890123450".as_bytes();
2027        let mut decryption_properties_builder =
2028            FileDecryptionProperties::builder(footer_key.to_vec())
2029                .with_aad_prefix(aad_prefix.clone());
2030        for column in schema_descr.columns() {
2031            decryption_properties_builder = decryption_properties_builder
2032                .with_column_key(&column.path().string(), column_key.to_vec());
2033        }
2034        let decryption_properties = decryption_properties_builder.build().unwrap();
2035        let decryptor = FileDecryptor::new(
2036            &decryption_properties,
2037            footer_key_metadata.as_deref(),
2038            aad_file_unique,
2039            aad_prefix,
2040        )
2041        .unwrap();
2042
2043        let parquet_meta_data = ParquetMetaDataBuilder::new(file_metadata.clone())
2044            .set_row_groups(row_group_meta.clone())
2045            .set_file_decryptor(Some(decryptor))
2046            .build();
2047
2048        let expected_size_with_decryptor = 3072;
2049        assert!(expected_size_with_decryptor > base_expected_size);
2050
2051        assert_eq!(
2052            parquet_meta_data.memory_size(),
2053            expected_size_with_decryptor
2054        );
2055    }
2056
2057    /// Returns sample schema descriptor so we can create column metadata.
2058    fn get_test_schema_descr() -> SchemaDescPtr {
2059        let schema = SchemaType::group_type_builder("schema")
2060            .with_fields(vec![
2061                Arc::new(
2062                    SchemaType::primitive_type_builder("a", Type::INT32)
2063                        .build()
2064                        .unwrap(),
2065                ),
2066                Arc::new(
2067                    SchemaType::primitive_type_builder("b", Type::INT32)
2068                        .build()
2069                        .unwrap(),
2070                ),
2071            ])
2072            .build()
2073            .unwrap();
2074
2075        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
2076    }
2077}
parquet/file/metadata/mod.rs

parquet/file/metadata/
mod.rs