parquet/file/metadata/
mod.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet metadata API
19//!
20//! Most users should use these structures to interact with Parquet metadata.
21//! The [crate::format] module contains lower level structures generated from the
22//! Parquet thrift definition.
23//!
24//! * [`ParquetMetaData`]: Top level metadata container, read from the Parquet
25//!   file footer.
26//!
27//! * [`FileMetaData`]: File level metadata such as schema, row counts and
28//!   version.
29//!
30//! * [`RowGroupMetaData`]: Metadata for each Row Group with a File, such as
31//!   location and number of rows, and column chunks.
32//!
33//! * [`ColumnChunkMetaData`]: Metadata for each column chunk (primitive leaf)
34//!   within a Row Group including encoding and compression information,
35//!   number of values, statistics, etc.
36//!
37//! # APIs for working with Parquet Metadata
38//!
39//! The Parquet readers and writers in this crate handle reading and writing
40//! metadata into parquet files. To work with metadata directly,
41//! the following APIs are available:
42//!
43//! * [`ParquetMetaDataReader`] for reading
44//! * [`ParquetMetaDataWriter`] for writing.
45//!
46//! [`ParquetMetaDataReader`]: https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataReader.html
47//! [`ParquetMetaDataWriter`]: https://docs.rs/parquet/latest/parquet/file/metadata/struct.ParquetMetaDataWriter.html
48//!
49//! # Examples
50//!
51//! Please see [`external_metadata.rs`]
52//!
53//! [`external_metadata.rs`]: https://github.com/apache/arrow-rs/tree/master/parquet/examples/external_metadata.rs
54//!
55//! # Metadata Encodings and Structures
56//!
57//! There are three different encodings of Parquet Metadata in this crate:
58//!
//! 1. `bytes`: encoded with the Thrift `TCompactProtocol` as defined in
60//!    [parquet.thrift]
61//!
62//! 2. [`format`]: Rust structures automatically generated by the thrift compiler
63//!    from [parquet.thrift]. These structures are low level and mirror
64//!    the thrift definitions.
65//!
66//! 3. [`file::metadata`] (this module): Easier to use Rust structures
67//!    with a more idiomatic API. Note that, confusingly, some but not all
68//!    of these structures have the same name as the [`format`] structures.
69//!
70//! [`format`]: crate::format
71//! [`file::metadata`]: crate::file::metadata
72//! [parquet.thrift]:  https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
73//!
74//! Graphically, this is how the different structures relate to each other:
75//!
76//! ```text
77//!                          ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─         ┌ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─
78//!                            ┌──────────────┐     │         ┌───────────────────────┐ │
79//!                          │ │ ColumnIndex  │              ││    ParquetMetaData    │
80//!                            └──────────────┘     │         └───────────────────────┘ │
81//! ┌──────────────┐         │ ┌────────────────┐            │┌───────────────────────┐
82//! │   ..0x24..   │ ◀────▶    │  OffsetIndex   │   │ ◀────▶  │    ParquetMetaData    │ │
83//! └──────────────┘         │ └────────────────┘            │└───────────────────────┘
84//!                                     ...         │                   ...             │
85//!                          │ ┌──────────────────┐          │ ┌──────────────────┐
86//! bytes                      │  FileMetaData*   │ │          │  FileMetaData*   │     │
87//! (thrift encoded)         │ └──────────────────┘          │ └──────────────────┘
88//!                           ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘         ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ─ ┘
89//!
90//!                          format::meta structures          file::metadata structures
91//!
92//!                         * Same name, different struct
93//! ```
94mod memory;
95pub(crate) mod reader;
96mod writer;
97
98use std::ops::Range;
99use std::sync::Arc;
100
101use crate::format::{
102    BoundaryOrder, ColumnChunk, ColumnIndex, ColumnMetaData, OffsetIndex, PageLocation, RowGroup,
103    SizeStatistics, SortingColumn,
104};
105
106use crate::basic::{ColumnOrder, Compression, Encoding, Type};
107use crate::errors::{ParquetError, Result};
108pub(crate) use crate::file::metadata::memory::HeapSize;
109use crate::file::page_encoding_stats::{self, PageEncodingStats};
110use crate::file::page_index::index::Index;
111use crate::file::page_index::offset_index::OffsetIndexMetaData;
112use crate::file::statistics::{self, Statistics};
113use crate::schema::types::{
114    ColumnDescPtr, ColumnDescriptor, ColumnPath, SchemaDescPtr, SchemaDescriptor,
115    Type as SchemaType,
116};
117pub use reader::ParquetMetaDataReader;
118pub use writer::ParquetMetaDataWriter;
119pub(crate) use writer::ThriftMetadataWriter;
120
/// Page level statistics for each column chunk of each row group.
///
/// This structure is an in-memory representation of multiple [`ColumnIndex`]
/// structures in a parquet file footer, as described in the Parquet [PageIndex
/// documentation]. Each [`Index`] holds statistics about all the pages in a
/// particular column chunk.
///
/// `column_index[row_group_number][column_number]` holds the
/// [`Index`] corresponding to column `column_number` of row group
/// `row_group_number` (both indexes are 0-based).
///
/// For example `column_index[2][3]` holds the [`Index`] for the fourth
/// column in the third row group of the parquet file.
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetColumnIndex = Vec<Vec<Index>>;
137
/// [`OffsetIndexMetaData`] for each data page of each row group of each column
///
/// This structure is the parsed representation of the [`OffsetIndex`] from the
/// Parquet file footer, as described in the Parquet [PageIndex documentation].
///
/// `offset_index[row_group_number][column_number]` holds
/// the [`OffsetIndexMetaData`] corresponding to column
/// `column_number` of row group `row_group_number` (both indexes are 0-based).
///
/// [PageIndex documentation]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub type ParquetOffsetIndex = Vec<Vec<OffsetIndexMetaData>>;
149
/// Parsed metadata for a single Parquet file
///
/// This structure is stored in the footer of Parquet files, in the format
/// defined by [`parquet.thrift`].
///
/// # Overview
/// The fields of this structure are:
/// * [`FileMetaData`]: Information about the overall file (such as the schema) (See [`Self::file_metadata`])
/// * [`RowGroupMetaData`]: Information about each Row Group (see [`Self::row_groups`])
/// * [`ParquetColumnIndex`] and [`ParquetOffsetIndex`]: Optional "Page Index" structures (see [`Self::column_index`] and [`Self::offset_index`])
///
/// This structure is read by the various readers in this crate or can be read
/// directly from a file using the [`ParquetMetaDataReader`] struct.
///
/// See the [`ParquetMetaDataBuilder`] to create and modify this structure.
///
/// [`parquet.thrift`]: https://github.com/apache/parquet-format/blob/master/src/main/thrift/parquet.thrift
#[derive(Debug, Clone, PartialEq)]
pub struct ParquetMetaData {
    /// File level metadata
    file_metadata: FileMetaData,
    /// Row group metadata, in the order the row groups appear in the file
    row_groups: Vec<RowGroupMetaData>,
    /// Page level "ColumnIndex" statistics, `None` unless explicitly loaded
    column_index: Option<ParquetColumnIndex>,
    /// Page level "OffsetIndex" locations, `None` unless explicitly loaded
    offset_index: Option<ParquetOffsetIndex>,
}
178
179impl ParquetMetaData {
180    /// Creates Parquet metadata from file metadata and a list of row
181    /// group metadata
182    pub fn new(file_metadata: FileMetaData, row_groups: Vec<RowGroupMetaData>) -> Self {
183        ParquetMetaData {
184            file_metadata,
185            row_groups,
186            column_index: None,
187            offset_index: None,
188        }
189    }
190
191    /// Creates Parquet metadata from file metadata, a list of row
192    /// group metadata, and the column index structures.
193    #[deprecated(note = "Use ParquetMetaDataBuilder")]
194    pub fn new_with_page_index(
195        file_metadata: FileMetaData,
196        row_groups: Vec<RowGroupMetaData>,
197        column_index: Option<ParquetColumnIndex>,
198        offset_index: Option<ParquetOffsetIndex>,
199    ) -> Self {
200        ParquetMetaDataBuilder::new(file_metadata)
201            .set_row_groups(row_groups)
202            .set_column_index(column_index)
203            .set_offset_index(offset_index)
204            .build()
205    }
206
207    /// Convert this ParquetMetaData into a [`ParquetMetaDataBuilder`]
208    pub fn into_builder(self) -> ParquetMetaDataBuilder {
209        self.into()
210    }
211
212    /// Returns file metadata as reference.
213    pub fn file_metadata(&self) -> &FileMetaData {
214        &self.file_metadata
215    }
216
217    /// Returns number of row groups in this file.
218    pub fn num_row_groups(&self) -> usize {
219        self.row_groups.len()
220    }
221
222    /// Returns row group metadata for `i`th position.
223    /// Position should be less than number of row groups `num_row_groups`.
224    pub fn row_group(&self, i: usize) -> &RowGroupMetaData {
225        &self.row_groups[i]
226    }
227
228    /// Returns slice of row groups in this file.
229    pub fn row_groups(&self) -> &[RowGroupMetaData] {
230        &self.row_groups
231    }
232
233    /// Returns page indexes in this file.
234    #[deprecated(note = "Use Self::column_index")]
235    pub fn page_indexes(&self) -> Option<&ParquetColumnIndex> {
236        self.column_index.as_ref()
237    }
238
239    /// Returns the column index for this file if loaded
240    ///
241    /// Returns `None` if the parquet file does not have a `ColumnIndex` or
242    /// [ArrowReaderOptions::with_page_index] was set to false.
243    ///
244    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
245    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
246        self.column_index.as_ref()
247    }
248
249    /// Returns the offset index for this file if loaded
250    #[deprecated(note = "Use Self::offset_index")]
251    pub fn offset_indexes(&self) -> Option<&ParquetOffsetIndex> {
252        self.offset_index.as_ref()
253    }
254
255    /// Returns offset indexes in this file, if loaded
256    ///
257    /// Returns `None` if the parquet file does not have a `OffsetIndex` or
258    /// [ArrowReaderOptions::with_page_index] was set to false.
259    ///
260    /// [ArrowReaderOptions::with_page_index]: https://docs.rs/parquet/latest/parquet/arrow/arrow_reader/struct.ArrowReaderOptions.html#method.with_page_index
261    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
262        self.offset_index.as_ref()
263    }
264
265    /// Estimate of the bytes allocated to store `ParquetMetadata`
266    ///
267    /// # Notes:
268    ///
269    /// 1. Includes size of self
270    ///
271    /// 2. Includes heap memory for sub fields such as [`FileMetaData`] and
272    ///    [`RowGroupMetaData`].
273    ///
274    /// 3. Includes memory from shared pointers (e.g. [`SchemaDescPtr`]). This
275    ///    means `memory_size` will over estimate the memory size if such pointers
276    ///    are shared.
277    ///
278    /// 4. Does not include any allocator overheads
279    pub fn memory_size(&self) -> usize {
280        std::mem::size_of::<Self>()
281            + self.file_metadata.heap_size()
282            + self.row_groups.heap_size()
283            + self.column_index.heap_size()
284            + self.offset_index.heap_size()
285    }
286
287    /// Override the column index
288    pub(crate) fn set_column_index(&mut self, index: Option<ParquetColumnIndex>) {
289        self.column_index = index;
290    }
291
292    /// Override the offset index
293    pub(crate) fn set_offset_index(&mut self, index: Option<ParquetOffsetIndex>) {
294        self.offset_index = index;
295    }
296}
297
/// A builder for creating / manipulating [`ParquetMetaData`]
///
/// The builder is consumed by [`Self::build`], which produces the final
/// [`ParquetMetaData`].
///
/// # Example creating a new [`ParquetMetaData`]
///
///```no_run
/// # use parquet::file::metadata::{FileMetaData, ParquetMetaData, ParquetMetaDataBuilder, RowGroupMetaData, RowGroupMetaDataBuilder};
/// # fn get_file_metadata() -> FileMetaData { unimplemented!(); }
/// // Create a new builder given the file metadata
/// let file_metadata = get_file_metadata();
/// // Create a row group
/// let row_group = RowGroupMetaData::builder(file_metadata.schema_descr_ptr())
///    .set_num_rows(100)
///    // ... (A real row group needs more than just the number of rows)
///    .build()
///    .unwrap();
/// // Create the final metadata
/// let metadata: ParquetMetaData = ParquetMetaDataBuilder::new(file_metadata)
///   .add_row_group(row_group)
///   .build();
/// ```
///
/// # Example modifying an existing [`ParquetMetaData`]
/// ```no_run
/// # use parquet::file::metadata::ParquetMetaData;
/// # fn load_metadata() -> ParquetMetaData { unimplemented!(); }
/// // Modify the metadata so only the last RowGroup remains
/// let metadata: ParquetMetaData = load_metadata();
/// let mut builder = metadata.into_builder();
///
/// // Take existing row groups to modify
/// let mut row_groups = builder.take_row_groups();
/// let last_row_group = row_groups.pop().unwrap();
///
/// let metadata = builder
///   .add_row_group(last_row_group)
///   .build();
/// ```
pub struct ParquetMetaDataBuilder(ParquetMetaData);
336
337impl ParquetMetaDataBuilder {
338    /// Create a new builder from a file metadata, with no row groups
339    pub fn new(file_meta_data: FileMetaData) -> Self {
340        Self(ParquetMetaData::new(file_meta_data, vec![]))
341    }
342
343    /// Create a new builder from an existing ParquetMetaData
344    pub fn new_from_metadata(metadata: ParquetMetaData) -> Self {
345        Self(metadata)
346    }
347
348    /// Adds a row group to the metadata
349    pub fn add_row_group(mut self, row_group: RowGroupMetaData) -> Self {
350        self.0.row_groups.push(row_group);
351        self
352    }
353
354    /// Sets all the row groups to the specified list
355    pub fn set_row_groups(mut self, row_groups: Vec<RowGroupMetaData>) -> Self {
356        self.0.row_groups = row_groups;
357        self
358    }
359
360    /// Takes ownership of the row groups in this builder, and clears the list
361    /// of row groups.
362    ///
363    /// This can be used for more efficient creation of a new ParquetMetaData
364    /// from an existing one.
365    pub fn take_row_groups(&mut self) -> Vec<RowGroupMetaData> {
366        std::mem::take(&mut self.0.row_groups)
367    }
368
369    /// Return a reference to the current row groups
370    pub fn row_groups(&self) -> &[RowGroupMetaData] {
371        &self.0.row_groups
372    }
373
374    /// Sets the column index
375    pub fn set_column_index(mut self, column_index: Option<ParquetColumnIndex>) -> Self {
376        self.0.column_index = column_index;
377        self
378    }
379
380    /// Returns the current column index from the builder, replacing it with `None`
381    pub fn take_column_index(&mut self) -> Option<ParquetColumnIndex> {
382        std::mem::take(&mut self.0.column_index)
383    }
384
385    /// Return a reference to the current column index, if any
386    pub fn column_index(&self) -> Option<&ParquetColumnIndex> {
387        self.0.column_index.as_ref()
388    }
389
390    /// Sets the offset index
391    pub fn set_offset_index(mut self, offset_index: Option<ParquetOffsetIndex>) -> Self {
392        self.0.offset_index = offset_index;
393        self
394    }
395
396    /// Returns the current offset index from the builder, replacing it with `None`
397    pub fn take_offset_index(&mut self) -> Option<ParquetOffsetIndex> {
398        std::mem::take(&mut self.0.offset_index)
399    }
400
401    /// Return a reference to the current offset index, if any
402    pub fn offset_index(&self) -> Option<&ParquetOffsetIndex> {
403        self.0.offset_index.as_ref()
404    }
405
406    /// Creates a new ParquetMetaData from the builder
407    pub fn build(self) -> ParquetMetaData {
408        let Self(metadata) = self;
409        metadata
410    }
411}
412
impl From<ParquetMetaData> for ParquetMetaDataBuilder {
    /// Wraps an existing [`ParquetMetaData`] in a builder without copying it.
    fn from(meta_data: ParquetMetaData) -> Self {
        Self(meta_data)
    }
}
418
/// A key-value pair for [`FileMetaData`].
///
/// Re-exported from the thrift-generated [`crate::format`] module.
pub type KeyValue = crate::format::KeyValue;

/// Reference counted pointer for [`FileMetaData`].
pub type FileMetaDataPtr = Arc<FileMetaData>;
424
/// File level metadata for a Parquet file.
///
/// Includes the version of the file, metadata, number of rows, schema, and column orders
#[derive(Debug, Clone, PartialEq)]
pub struct FileMetaData {
    /// Version of the Parquet format this file was written with
    version: i32,
    /// Total number of rows across all row groups
    num_rows: i64,
    /// Free-form string identifying the application that wrote the file
    created_by: Option<String>,
    /// Optional application defined key/value pairs
    key_value_metadata: Option<Vec<KeyValue>>,
    /// Parsed schema descriptor, shared via `Arc`
    schema_descr: SchemaDescPtr,
    /// Per-column sort orders used for statistics, if recorded
    column_orders: Option<Vec<ColumnOrder>>,
}
437
438impl FileMetaData {
439    /// Creates new file metadata.
440    pub fn new(
441        version: i32,
442        num_rows: i64,
443        created_by: Option<String>,
444        key_value_metadata: Option<Vec<KeyValue>>,
445        schema_descr: SchemaDescPtr,
446        column_orders: Option<Vec<ColumnOrder>>,
447    ) -> Self {
448        FileMetaData {
449            version,
450            num_rows,
451            created_by,
452            key_value_metadata,
453            schema_descr,
454            column_orders,
455        }
456    }
457
458    /// Returns version of this file.
459    pub fn version(&self) -> i32 {
460        self.version
461    }
462
463    /// Returns number of rows in the file.
464    pub fn num_rows(&self) -> i64 {
465        self.num_rows
466    }
467
468    /// String message for application that wrote this file.
469    ///
470    /// This should have the following format:
471    /// `<application> version <application version> (build <application build hash>)`.
472    ///
473    /// ```shell
474    /// parquet-mr version 1.8.0 (build 0fda28af84b9746396014ad6a415b90592a98b3b)
475    /// ```
476    pub fn created_by(&self) -> Option<&str> {
477        self.created_by.as_deref()
478    }
479
480    /// Returns key_value_metadata of this file.
481    pub fn key_value_metadata(&self) -> Option<&Vec<KeyValue>> {
482        self.key_value_metadata.as_ref()
483    }
484
485    /// Returns Parquet [`Type`] that describes schema in this file.
486    ///
487    /// [`Type`]: crate::schema::types::Type
488    pub fn schema(&self) -> &SchemaType {
489        self.schema_descr.root_schema()
490    }
491
492    /// Returns a reference to schema descriptor.
493    pub fn schema_descr(&self) -> &SchemaDescriptor {
494        &self.schema_descr
495    }
496
497    /// Returns reference counted clone for schema descriptor.
498    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
499        self.schema_descr.clone()
500    }
501
502    /// Column (sort) order used for `min` and `max` values of each column in this file.
503    ///
504    /// Each column order corresponds to one column, determined by its position in the
505    /// list, matching the position of the column in the schema.
506    ///
507    /// When `None` is returned, there are no column orders available, and each column
508    /// should be assumed to have undefined (legacy) column order.
509    pub fn column_orders(&self) -> Option<&Vec<ColumnOrder>> {
510        self.column_orders.as_ref()
511    }
512
513    /// Returns column order for `i`th column in this file.
514    /// If column orders are not available, returns undefined (legacy) column order.
515    pub fn column_order(&self, i: usize) -> ColumnOrder {
516        self.column_orders
517            .as_ref()
518            .map(|data| data[i])
519            .unwrap_or(ColumnOrder::UNDEFINED)
520    }
521}
522
/// Reference counted pointer for [`RowGroupMetaData`].
pub type RowGroupMetaDataPtr = Arc<RowGroupMetaData>;
525
/// Metadata for a row group
///
/// Includes [`ColumnChunkMetaData`] for each column in the row group, the number of rows
/// the total byte size of the row group, and the [`SchemaDescriptor`] for the row group.
#[derive(Debug, Clone, PartialEq)]
pub struct RowGroupMetaData {
    /// Metadata for each column chunk, in schema order
    columns: Vec<ColumnChunkMetaData>,
    /// Number of rows in this row group
    num_rows: i64,
    /// Sort order of the rows, if declared by the writer
    sorting_columns: Option<Vec<SortingColumn>>,
    /// Total uncompressed byte size of all column data
    total_byte_size: i64,
    /// Schema descriptor shared with the rest of the file metadata
    schema_descr: SchemaDescPtr,
    /// We can't infer from file offset of first column since there may be empty columns in row group.
    file_offset: Option<i64>,
    /// Ordinal position of this row group in file
    ordinal: Option<i16>,
}
542
543impl RowGroupMetaData {
544    /// Returns builder for row group metadata.
545    pub fn builder(schema_descr: SchemaDescPtr) -> RowGroupMetaDataBuilder {
546        RowGroupMetaDataBuilder::new(schema_descr)
547    }
548
549    /// Number of columns in this row group.
550    pub fn num_columns(&self) -> usize {
551        self.columns.len()
552    }
553
554    /// Returns column chunk metadata for `i`th column.
555    pub fn column(&self, i: usize) -> &ColumnChunkMetaData {
556        &self.columns[i]
557    }
558
559    /// Returns slice of column chunk metadata.
560    pub fn columns(&self) -> &[ColumnChunkMetaData] {
561        &self.columns
562    }
563
564    /// Returns mutable slice of column chunk metadata.
565    pub fn columns_mut(&mut self) -> &mut [ColumnChunkMetaData] {
566        &mut self.columns
567    }
568
569    /// Number of rows in this row group.
570    pub fn num_rows(&self) -> i64 {
571        self.num_rows
572    }
573
574    /// Returns the sort ordering of the rows in this RowGroup if any
575    pub fn sorting_columns(&self) -> Option<&Vec<SortingColumn>> {
576        self.sorting_columns.as_ref()
577    }
578
579    /// Total byte size of all uncompressed column data in this row group.
580    pub fn total_byte_size(&self) -> i64 {
581        self.total_byte_size
582    }
583
584    /// Total size of all compressed column data in this row group.
585    pub fn compressed_size(&self) -> i64 {
586        self.columns.iter().map(|c| c.total_compressed_size).sum()
587    }
588
589    /// Returns reference to a schema descriptor.
590    pub fn schema_descr(&self) -> &SchemaDescriptor {
591        self.schema_descr.as_ref()
592    }
593
594    /// Returns reference counted clone of schema descriptor.
595    pub fn schema_descr_ptr(&self) -> SchemaDescPtr {
596        self.schema_descr.clone()
597    }
598
599    /// Returns ordinal position of this row group in file.
600    ///
601    /// For example if this is the first row group in the file, this will return 0.
602    /// If this is the second row group in the file, this will return 1.
603    #[inline(always)]
604    pub fn ordinal(&self) -> Option<i16> {
605        self.ordinal
606    }
607
608    /// Returns file offset of this row group in file.
609    #[inline(always)]
610    pub fn file_offset(&self) -> Option<i64> {
611        self.file_offset
612    }
613
614    /// Method to convert from Thrift.
615    pub fn from_thrift(schema_descr: SchemaDescPtr, mut rg: RowGroup) -> Result<RowGroupMetaData> {
616        if schema_descr.num_columns() != rg.columns.len() {
617            return Err(general_err!(
618                "Column count mismatch. Schema has {} columns while Row Group has {}",
619                schema_descr.num_columns(),
620                rg.columns.len()
621            ));
622        }
623        let total_byte_size = rg.total_byte_size;
624        let num_rows = rg.num_rows;
625        let mut columns = vec![];
626        for (c, d) in rg.columns.drain(0..).zip(schema_descr.columns()) {
627            let cc = ColumnChunkMetaData::from_thrift(d.clone(), c)?;
628            columns.push(cc);
629        }
630        let sorting_columns = rg.sorting_columns;
631        Ok(RowGroupMetaData {
632            columns,
633            num_rows,
634            sorting_columns,
635            total_byte_size,
636            schema_descr,
637            file_offset: rg.file_offset,
638            ordinal: rg.ordinal,
639        })
640    }
641
642    /// Method to convert to Thrift.
643    pub fn to_thrift(&self) -> RowGroup {
644        RowGroup {
645            columns: self.columns().iter().map(|v| v.to_thrift()).collect(),
646            total_byte_size: self.total_byte_size,
647            num_rows: self.num_rows,
648            sorting_columns: self.sorting_columns().cloned(),
649            file_offset: self.file_offset(),
650            total_compressed_size: Some(self.compressed_size()),
651            ordinal: self.ordinal,
652        }
653    }
654
655    /// Converts this [`RowGroupMetaData`] into a [`RowGroupMetaDataBuilder`]
656    pub fn into_builder(self) -> RowGroupMetaDataBuilder {
657        RowGroupMetaDataBuilder(self)
658    }
659}
660
/// Builder for row group metadata.
///
/// Created via [`RowGroupMetaData::builder`]; consumed by [`Self::build`].
pub struct RowGroupMetaDataBuilder(RowGroupMetaData);
663
664impl RowGroupMetaDataBuilder {
665    /// Creates new builder from schema descriptor.
666    fn new(schema_descr: SchemaDescPtr) -> Self {
667        Self(RowGroupMetaData {
668            columns: Vec::with_capacity(schema_descr.num_columns()),
669            schema_descr,
670            file_offset: None,
671            num_rows: 0,
672            sorting_columns: None,
673            total_byte_size: 0,
674            ordinal: None,
675        })
676    }
677
678    /// Sets number of rows in this row group.
679    pub fn set_num_rows(mut self, value: i64) -> Self {
680        self.0.num_rows = value;
681        self
682    }
683
684    /// Sets the sorting order for columns
685    pub fn set_sorting_columns(mut self, value: Option<Vec<SortingColumn>>) -> Self {
686        self.0.sorting_columns = value;
687        self
688    }
689
690    /// Sets total size in bytes for this row group.
691    pub fn set_total_byte_size(mut self, value: i64) -> Self {
692        self.0.total_byte_size = value;
693        self
694    }
695
696    /// Takes ownership of the the column metadata in this builder, and clears
697    /// the list of columns.
698    ///
699    /// This can be used for more efficient creation of a new RowGroupMetaData
700    /// from an existing one.
701    pub fn take_columns(&mut self) -> Vec<ColumnChunkMetaData> {
702        std::mem::take(&mut self.0.columns)
703    }
704
705    /// Sets column metadata for this row group.
706    pub fn set_column_metadata(mut self, value: Vec<ColumnChunkMetaData>) -> Self {
707        self.0.columns = value;
708        self
709    }
710
711    /// Adds a column metadata to this row group
712    pub fn add_column_metadata(mut self, value: ColumnChunkMetaData) -> Self {
713        self.0.columns.push(value);
714        self
715    }
716
717    /// Sets ordinal for this row group.
718    pub fn set_ordinal(mut self, value: i16) -> Self {
719        self.0.ordinal = Some(value);
720        self
721    }
722
723    /// Sets file offset for this row group.
724    pub fn set_file_offset(mut self, value: i64) -> Self {
725        self.0.file_offset = Some(value);
726        self
727    }
728
729    /// Builds row group metadata.
730    pub fn build(self) -> Result<RowGroupMetaData> {
731        if self.0.schema_descr.num_columns() != self.0.columns.len() {
732            return Err(general_err!(
733                "Column length mismatch: {} != {}",
734                self.0.schema_descr.num_columns(),
735                self.0.columns.len()
736            ));
737        }
738
739        Ok(self.0)
740    }
741}
742
/// Metadata for a column chunk.
#[derive(Debug, Clone, PartialEq)]
pub struct ColumnChunkMetaData {
    /// Descriptor for this column (path, type, levels)
    column_descr: ColumnDescPtr,
    /// Encodings used for pages in this chunk
    encodings: Vec<Encoding>,
    /// File where the chunk is stored; `None` means the same file as the metadata
    file_path: Option<String>,
    /// Byte offset of the `ColumnMetaData` in `file_path` (deprecated in the spec)
    file_offset: i64,
    /// Total number of values in this chunk
    num_values: i64,
    /// Compression codec used for this chunk
    compression: Compression,
    /// Total compressed size, in bytes
    total_compressed_size: i64,
    /// Total uncompressed size, in bytes
    total_uncompressed_size: i64,
    /// Byte offset of the first data page
    data_page_offset: i64,
    /// Byte offset of the index page, if present
    index_page_offset: Option<i64>,
    /// Byte offset of the dictionary page, if present
    dictionary_page_offset: Option<i64>,
    /// Statistics (min/max/null count) for this chunk, if present
    statistics: Option<Statistics>,
    /// Per-encoding page counts, if present
    encoding_stats: Option<Vec<PageEncodingStats>>,
    /// Location of this chunk's bloom filter in the file, if any
    bloom_filter_offset: Option<i64>,
    bloom_filter_length: Option<i32>,
    /// Location of this chunk's `OffsetIndex` in the file, if any
    offset_index_offset: Option<i64>,
    offset_index_length: Option<i32>,
    /// Location of this chunk's `ColumnIndex` in the file, if any
    column_index_offset: Option<i64>,
    column_index_length: Option<i32>,
    // Fields below mirror the thrift `SizeStatistics` struct — presumably
    // populated from it during conversion; confirm against `from_thrift`.
    unencoded_byte_array_data_bytes: Option<i64>,
    repetition_level_histogram: Option<LevelHistogram>,
    definition_level_histogram: Option<LevelHistogram>,
}
769
/// Histograms for repetition and definition levels.
///
/// Each histogram is a vector of length `max_level + 1`. The value at index `i` is the number of
/// values at level `i`.
///
/// For example, `vec[0]` is the number of rows with level 0, `vec[1]` is the
/// number of rows with level 1, and so on.
///
#[derive(Debug, Clone, PartialEq, Eq, Hash, Default)]
pub struct LevelHistogram {
    /// `inner[i]` is the number of values at level `i`
    inner: Vec<i64>,
}

impl LevelHistogram {
    /// Creates a new level histogram data.
    ///
    /// Length will be `max_level + 1`.
    ///
    /// Returns `None` when `max_level == 0` (because histograms are not necessary in this case)
    pub fn try_new(max_level: i16) -> Option<Self> {
        if max_level > 0 {
            Some(Self {
                inner: vec![0; max_level as usize + 1],
            })
        } else {
            None
        }
    }

    /// Returns a reference to the histogram's values.
    pub fn values(&self) -> &[i64] {
        &self.inner
    }

    /// Return the inner vector, consuming self
    pub fn into_inner(self) -> Vec<i64> {
        self.inner
    }

    /// Returns the histogram value at the given index.
    ///
    /// The value of `i` is the number of values with level `i`. For example,
    /// `get(1)` returns the number of values with level 1.
    ///
    /// Returns `None` if the index is out of bounds.
    pub fn get(&self, index: usize) -> Option<i64> {
        self.inner.get(index).copied()
    }

    /// Adds the values from the other histogram to this histogram
    ///
    /// # Panics
    /// If the histograms have different lengths
    pub fn add(&mut self, other: &Self) {
        assert_eq!(self.len(), other.len());
        for (dst, src) in self.inner.iter_mut().zip(other.inner.iter()) {
            *dst += src;
        }
    }

    /// return the length of the histogram
    pub fn len(&self) -> usize {
        self.inner.len()
    }

    /// returns if the histogram is empty
    pub fn is_empty(&self) -> bool {
        self.inner.is_empty()
    }

    /// Sets the values of all histogram levels to 0.
    pub fn reset(&mut self) {
        // `fill` zeroes in place without reallocating
        self.inner.fill(0);
    }

    /// Updates histogram values using provided repetition or definition levels
    ///
    /// # Panics
    /// If any level is greater than the `max_level` supplied to
    /// [`Self::try_new`] (i.e. indexes past the end of the histogram)
    pub fn update_from_levels(&mut self, levels: &[i16]) {
        for &level in levels {
            self.inner[level as usize] += 1;
        }
    }
}
857
858impl From<Vec<i64>> for LevelHistogram {
859    fn from(inner: Vec<i64>) -> Self {
860        Self { inner }
861    }
862}
863
864impl From<LevelHistogram> for Vec<i64> {
865    fn from(value: LevelHistogram) -> Self {
866        value.into_inner()
867    }
868}
869
impl HeapSize for LevelHistogram {
    fn heap_size(&self) -> usize {
        // Only the heap allocation behind the inner Vec is counted; the struct
        // itself is accounted for by whatever embeds it.
        self.inner.heap_size()
    }
}
875
/// Represents common operations for a column chunk.
impl ColumnChunkMetaData {
    /// Returns builder for column chunk metadata.
    pub fn builder(column_descr: ColumnDescPtr) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::new(column_descr)
    }

    /// File where the column chunk is stored.
    ///
    /// If not set, assumed to belong to the same file as the metadata.
    /// This path is relative to the current file.
    pub fn file_path(&self) -> Option<&str> {
        self.file_path.as_deref()
    }

    /// Byte offset of `ColumnMetaData` in `file_path()`.
    ///
    /// Note that the meaning of this field has been inconsistent between implementations
    /// so its use has since been deprecated in the Parquet specification. Modern implementations
    /// will set this to `0` to indicate that the `ColumnMetaData` is solely contained in the
    /// `ColumnChunk` struct.
    pub fn file_offset(&self) -> i64 {
        self.file_offset
    }

    /// Type of this column. Must be primitive.
    pub fn column_type(&self) -> Type {
        self.column_descr.physical_type()
    }

    /// Path (or identifier) of this column.
    pub fn column_path(&self) -> &ColumnPath {
        self.column_descr.path()
    }

    /// Descriptor for this column.
    pub fn column_descr(&self) -> &ColumnDescriptor {
        self.column_descr.as_ref()
    }

    /// Reference counted clone of descriptor for this column.
    pub fn column_descr_ptr(&self) -> ColumnDescPtr {
        self.column_descr.clone()
    }

    /// All encodings used for this column.
    pub fn encodings(&self) -> &Vec<Encoding> {
        &self.encodings
    }

    /// Total number of values in this column chunk.
    pub fn num_values(&self) -> i64 {
        self.num_values
    }

    /// Compression for this column.
    pub fn compression(&self) -> Compression {
        self.compression
    }

    /// Returns the total compressed data size of this column chunk.
    pub fn compressed_size(&self) -> i64 {
        self.total_compressed_size
    }

    /// Returns the total uncompressed data size of this column chunk.
    pub fn uncompressed_size(&self) -> i64 {
        self.total_uncompressed_size
    }

    /// Returns the offset for the column data.
    pub fn data_page_offset(&self) -> i64 {
        self.data_page_offset
    }

    /// Returns the offset for the index page.
    pub fn index_page_offset(&self) -> Option<i64> {
        self.index_page_offset
    }

    /// Returns the offset for the dictionary page, if any.
    pub fn dictionary_page_offset(&self) -> Option<i64> {
        self.dictionary_page_offset
    }

    /// Returns the offset and length in bytes of the column chunk within the file
    ///
    /// The chunk starts at the dictionary page when present, otherwise at the
    /// first data page.
    ///
    /// # Panics
    /// If the stored start offset or compressed length is negative.
    pub fn byte_range(&self) -> (u64, u64) {
        let col_start = match self.dictionary_page_offset() {
            Some(dictionary_page_offset) => dictionary_page_offset,
            None => self.data_page_offset(),
        };
        let col_len = self.compressed_size();
        assert!(
            col_start >= 0 && col_len >= 0,
            "column start and length should not be negative"
        );
        (col_start as u64, col_len as u64)
    }

    /// Returns statistics that are set for this column chunk,
    /// or `None` if no statistics are available.
    pub fn statistics(&self) -> Option<&Statistics> {
        self.statistics.as_ref()
    }

    /// Returns the page encoding stats for this column chunk,
    /// or `None` if no page encoding stats are available.
    pub fn page_encoding_stats(&self) -> Option<&Vec<PageEncodingStats>> {
        self.encoding_stats.as_ref()
    }

    /// Returns the offset for the bloom filter.
    pub fn bloom_filter_offset(&self) -> Option<i64> {
        self.bloom_filter_offset
    }

    /// Returns the length of the bloom filter in bytes.
    pub fn bloom_filter_length(&self) -> Option<i32> {
        self.bloom_filter_length
    }

    /// Returns the offset for the column index.
    pub fn column_index_offset(&self) -> Option<i64> {
        self.column_index_offset
    }

    /// Returns the length of the column index in bytes.
    pub fn column_index_length(&self) -> Option<i32> {
        self.column_index_length
    }

    /// Returns the byte range of the column index, if both offset and length
    /// are present and representable as `usize`.
    pub(crate) fn column_index_range(&self) -> Option<Range<usize>> {
        let offset = usize::try_from(self.column_index_offset?).ok()?;
        let length = usize::try_from(self.column_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the offset for the offset index.
    pub fn offset_index_offset(&self) -> Option<i64> {
        self.offset_index_offset
    }

    /// Returns the length of the offset index in bytes.
    pub fn offset_index_length(&self) -> Option<i32> {
        self.offset_index_length
    }

    /// Returns the byte range of the offset index, if both offset and length
    /// are present and representable as `usize`.
    pub(crate) fn offset_index_range(&self) -> Option<Range<usize>> {
        let offset = usize::try_from(self.offset_index_offset?).ok()?;
        let length = usize::try_from(self.offset_index_length?).ok()?;
        Some(offset..(offset + length))
    }

    /// Returns the number of bytes of variable length data after decoding.
    ///
    /// Only set for BYTE_ARRAY columns. This field may not be set by older
    /// writers.
    pub fn unencoded_byte_array_data_bytes(&self) -> Option<i64> {
        self.unencoded_byte_array_data_bytes
    }

    /// Returns the repetition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at repetition level `i`. For example,
    /// `vec[0]` indicates how many rows the page contains.
    /// This field may not be set by older writers.
    pub fn repetition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.repetition_level_histogram.as_ref()
    }

    /// Returns the definition level histogram.
    ///
    /// The returned value `vec[i]` is how many values are at definition level `i`. For example,
    /// `vec[max_definition_level]` indicates how many non-null values are present in the page.
    /// This field may not be set by older writers.
    pub fn definition_level_histogram(&self) -> Option<&LevelHistogram> {
        self.definition_level_histogram.as_ref()
    }

    /// Method to convert from Thrift.
    ///
    /// # Errors
    /// Returns an error if `cc` carries no embedded `ColumnMetaData`, or if any
    /// of the thrift enum fields (type, encodings, codec, encoding stats) fail
    /// to convert to their typed equivalents.
    pub fn from_thrift(column_descr: ColumnDescPtr, cc: ColumnChunk) -> Result<Self> {
        if cc.meta_data.is_none() {
            return Err(general_err!("Expected to have column metadata"));
        }
        let mut col_metadata: ColumnMetaData = cc.meta_data.unwrap();
        let column_type = Type::try_from(col_metadata.type_)?;
        let encodings = col_metadata
            .encodings
            .drain(0..)
            .map(Encoding::try_from)
            .collect::<Result<_>>()?;
        let compression = Compression::try_from(col_metadata.codec)?;
        let file_path = cc.file_path;
        let file_offset = cc.file_offset;
        let num_values = col_metadata.num_values;
        let total_compressed_size = col_metadata.total_compressed_size;
        let total_uncompressed_size = col_metadata.total_uncompressed_size;
        let data_page_offset = col_metadata.data_page_offset;
        let index_page_offset = col_metadata.index_page_offset;
        let dictionary_page_offset = col_metadata.dictionary_page_offset;
        let statistics = statistics::from_thrift(column_type, col_metadata.statistics)?;
        // Encoding stats are optional; the first conversion error aborts.
        let encoding_stats = col_metadata
            .encoding_stats
            .as_ref()
            .map(|vec| {
                vec.iter()
                    .map(page_encoding_stats::try_from_thrift)
                    .collect::<Result<_>>()
            })
            .transpose()?;
        let bloom_filter_offset = col_metadata.bloom_filter_offset;
        let bloom_filter_length = col_metadata.bloom_filter_length;
        let offset_index_offset = cc.offset_index_offset;
        let offset_index_length = cc.offset_index_length;
        let column_index_offset = cc.column_index_offset;
        let column_index_length = cc.column_index_length;
        // Size statistics may be absent (e.g. when written by older writers).
        let (
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        ) = if let Some(size_stats) = col_metadata.size_statistics {
            (
                size_stats.unencoded_byte_array_data_bytes,
                size_stats.repetition_level_histogram,
                size_stats.definition_level_histogram,
            )
        } else {
            (None, None, None)
        };

        let repetition_level_histogram = repetition_level_histogram.map(LevelHistogram::from);
        let definition_level_histogram = definition_level_histogram.map(LevelHistogram::from);

        let result = ColumnChunkMetaData {
            column_descr,
            encodings,
            file_path,
            file_offset,
            num_values,
            compression,
            total_compressed_size,
            total_uncompressed_size,
            data_page_offset,
            index_page_offset,
            dictionary_page_offset,
            statistics,
            encoding_stats,
            bloom_filter_offset,
            bloom_filter_length,
            offset_index_offset,
            offset_index_length,
            column_index_offset,
            column_index_length,
            unencoded_byte_array_data_bytes,
            repetition_level_histogram,
            definition_level_histogram,
        };
        Ok(result)
    }

    /// Method to convert to Thrift.
    ///
    /// The embedded `ColumnMetaData` is always stored inline; encryption
    /// related fields are left unset.
    pub fn to_thrift(&self) -> ColumnChunk {
        let column_metadata = self.to_column_metadata_thrift();

        ColumnChunk {
            file_path: self.file_path().map(|s| s.to_owned()),
            file_offset: self.file_offset,
            meta_data: Some(column_metadata),
            offset_index_offset: self.offset_index_offset,
            offset_index_length: self.offset_index_length,
            column_index_offset: self.column_index_offset,
            column_index_length: self.column_index_length,
            crypto_metadata: None,
            encrypted_column_metadata: None,
        }
    }

    /// Method to convert to Thrift `ColumnMetaData`
    pub fn to_column_metadata_thrift(&self) -> ColumnMetaData {
        // Only emit a SizeStatistics struct when at least one of its fields is
        // actually set.
        let size_statistics = if self.unencoded_byte_array_data_bytes.is_some()
            || self.repetition_level_histogram.is_some()
            || self.definition_level_histogram.is_some()
        {
            let repetition_level_histogram = self
                .repetition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            let definition_level_histogram = self
                .definition_level_histogram
                .as_ref()
                .map(|hist| hist.clone().into_inner());

            Some(SizeStatistics {
                unencoded_byte_array_data_bytes: self.unencoded_byte_array_data_bytes,
                repetition_level_histogram,
                definition_level_histogram,
            })
        } else {
            None
        };

        ColumnMetaData {
            type_: self.column_type().into(),
            encodings: self.encodings().iter().map(|&v| v.into()).collect(),
            path_in_schema: self.column_path().as_ref().to_vec(),
            codec: self.compression.into(),
            num_values: self.num_values,
            total_uncompressed_size: self.total_uncompressed_size,
            total_compressed_size: self.total_compressed_size,
            key_value_metadata: None,
            data_page_offset: self.data_page_offset,
            index_page_offset: self.index_page_offset,
            dictionary_page_offset: self.dictionary_page_offset,
            statistics: statistics::to_thrift(self.statistics.as_ref()),
            encoding_stats: self
                .encoding_stats
                .as_ref()
                .map(|vec| vec.iter().map(page_encoding_stats::to_thrift).collect()),
            bloom_filter_offset: self.bloom_filter_offset,
            bloom_filter_length: self.bloom_filter_length,
            size_statistics,
        }
    }

    /// Converts this [`ColumnChunkMetaData`] into a [`ColumnChunkMetaDataBuilder`]
    pub fn into_builder(self) -> ColumnChunkMetaDataBuilder {
        ColumnChunkMetaDataBuilder::from(self)
    }
}
1208
/// Builder for [`ColumnChunkMetaData`]
///
/// This builder is used to create a new column chunk metadata or modify an
/// existing one.
///
/// # Example
/// ```no_run
/// # use parquet::file::metadata::{ColumnChunkMetaData, ColumnChunkMetaDataBuilder};
/// # fn get_column_chunk_metadata() -> ColumnChunkMetaData { unimplemented!(); }
/// let column_chunk_metadata = get_column_chunk_metadata();
/// // create a new builder from existing column chunk metadata
/// let builder = ColumnChunkMetaDataBuilder::from(column_chunk_metadata);
/// // clear the statistics:
/// let column_chunk_metadata: ColumnChunkMetaData = builder
///   .clear_statistics()
///   .build()
///   .unwrap();
/// ```
// Newtype over the metadata itself: the setters mutate the wrapped value in
// place and `build` unwraps it.
pub struct ColumnChunkMetaDataBuilder(ColumnChunkMetaData);
1228
impl ColumnChunkMetaDataBuilder {
    /// Creates new column chunk metadata builder.
    ///
    /// See also [`ColumnChunkMetaData::builder`]
    fn new(column_descr: ColumnDescPtr) -> Self {
        // Start from an all-default chunk; callers populate fields via setters.
        Self(ColumnChunkMetaData {
            column_descr,
            encodings: Vec::new(),
            file_path: None,
            file_offset: 0,
            num_values: 0,
            compression: Compression::UNCOMPRESSED,
            total_compressed_size: 0,
            total_uncompressed_size: 0,
            data_page_offset: 0,
            index_page_offset: None,
            dictionary_page_offset: None,
            statistics: None,
            encoding_stats: None,
            bloom_filter_offset: None,
            bloom_filter_length: None,
            offset_index_offset: None,
            offset_index_length: None,
            column_index_offset: None,
            column_index_length: None,
            unencoded_byte_array_data_bytes: None,
            repetition_level_histogram: None,
            definition_level_histogram: None,
        })
    }

    /// Sets list of encodings for this column chunk.
    pub fn set_encodings(mut self, encodings: Vec<Encoding>) -> Self {
        self.0.encodings = encodings;
        self
    }

    /// Sets optional file path for this column chunk.
    pub fn set_file_path(mut self, value: String) -> Self {
        self.0.file_path = Some(value);
        self
    }

    /// Sets file offset in bytes.
    ///
    /// This field was meant to provide an alternate to storing `ColumnMetadata` directly in
    /// the `ColumnChunkMetadata`. However, most Parquet readers assume the `ColumnMetadata`
    /// is stored inline and ignore this field.
    #[deprecated(
        since = "53.0.0",
        note = "The Parquet specification requires this field to be 0"
    )]
    pub fn set_file_offset(mut self, value: i64) -> Self {
        self.0.file_offset = value;
        self
    }

    /// Sets number of values.
    pub fn set_num_values(mut self, value: i64) -> Self {
        self.0.num_values = value;
        self
    }

    /// Sets compression.
    pub fn set_compression(mut self, value: Compression) -> Self {
        self.0.compression = value;
        self
    }

    /// Sets total compressed size in bytes.
    pub fn set_total_compressed_size(mut self, value: i64) -> Self {
        self.0.total_compressed_size = value;
        self
    }

    /// Sets total uncompressed size in bytes.
    pub fn set_total_uncompressed_size(mut self, value: i64) -> Self {
        self.0.total_uncompressed_size = value;
        self
    }

    /// Sets data page offset in bytes.
    pub fn set_data_page_offset(mut self, value: i64) -> Self {
        self.0.data_page_offset = value;
        self
    }

    /// Sets optional dictionary page offset in bytes.
    pub fn set_dictionary_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.dictionary_page_offset = value;
        self
    }

    /// Sets optional index page offset in bytes.
    pub fn set_index_page_offset(mut self, value: Option<i64>) -> Self {
        self.0.index_page_offset = value;
        self
    }

    /// Sets statistics for this column chunk.
    pub fn set_statistics(mut self, value: Statistics) -> Self {
        self.0.statistics = Some(value);
        self
    }

    /// Clears the statistics for this column chunk.
    pub fn clear_statistics(mut self) -> Self {
        self.0.statistics = None;
        self
    }

    /// Sets page encoding stats for this column chunk.
    pub fn set_page_encoding_stats(mut self, value: Vec<PageEncodingStats>) -> Self {
        self.0.encoding_stats = Some(value);
        self
    }

    /// Clears the page encoding stats for this column chunk.
    pub fn clear_page_encoding_stats(mut self) -> Self {
        self.0.encoding_stats = None;
        self
    }

    /// Sets optional bloom filter offset in bytes.
    pub fn set_bloom_filter_offset(mut self, value: Option<i64>) -> Self {
        self.0.bloom_filter_offset = value;
        self
    }

    /// Sets optional bloom filter length in bytes.
    pub fn set_bloom_filter_length(mut self, value: Option<i32>) -> Self {
        self.0.bloom_filter_length = value;
        self
    }

    /// Sets optional offset index offset in bytes.
    pub fn set_offset_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.offset_index_offset = value;
        self
    }

    /// Sets optional offset index length in bytes.
    pub fn set_offset_index_length(mut self, value: Option<i32>) -> Self {
        self.0.offset_index_length = value;
        self
    }

    /// Sets optional column index offset in bytes.
    pub fn set_column_index_offset(mut self, value: Option<i64>) -> Self {
        self.0.column_index_offset = value;
        self
    }

    /// Sets optional column index length in bytes.
    pub fn set_column_index_length(mut self, value: Option<i32>) -> Self {
        self.0.column_index_length = value;
        self
    }

    /// Sets optional length of variable length data in bytes.
    pub fn set_unencoded_byte_array_data_bytes(mut self, value: Option<i64>) -> Self {
        self.0.unencoded_byte_array_data_bytes = value;
        self
    }

    /// Sets optional repetition level histogram
    pub fn set_repetition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.repetition_level_histogram = value;
        self
    }

    /// Sets optional definition level histogram
    pub fn set_definition_level_histogram(mut self, value: Option<LevelHistogram>) -> Self {
        self.0.definition_level_histogram = value;
        self
    }

    /// Builds column chunk metadata.
    // NOTE(review): currently infallible; `Result` appears to be kept for API
    // compatibility / future validation — confirm before relying on it.
    pub fn build(self) -> Result<ColumnChunkMetaData> {
        Ok(self.0)
    }
}
1411
/// Builder for Parquet [`ColumnIndex`], part of the Parquet [PageIndex]
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct ColumnIndexBuilder {
    /// one entry per page: `true` if the page is all-null
    null_pages: Vec<bool>,
    /// one entry per page: min value bytes
    min_values: Vec<Vec<u8>>,
    /// one entry per page: max value bytes
    max_values: Vec<Vec<u8>>,
    /// one entry per page: number of nulls
    null_counts: Vec<i64>,
    /// ordering of the per-page min/max values
    boundary_order: BoundaryOrder,
    /// contains the concatenation of the histograms of all pages
    repetition_level_histograms: Option<Vec<i64>>,
    /// contains the concatenation of the histograms of all pages
    definition_level_histograms: Option<Vec<i64>>,
    /// Is the information in the builder valid?
    ///
    /// Set to `false` if any entry in the page doesn't have statistics for
    /// some reason, so statistics for that page won't be written to the file.
    /// This might happen if the page is entirely null, or
    /// is a floating point column without any non-nan values
    /// e.g. <https://github.com/apache/parquet-format/pull/196>
    valid: bool,
}
1434
impl Default for ColumnIndexBuilder {
    fn default() -> Self {
        // Delegate to `new` so the two construction paths cannot diverge.
        Self::new()
    }
}
1440
1441impl ColumnIndexBuilder {
1442    /// Creates a new column index builder.
1443    pub fn new() -> Self {
1444        ColumnIndexBuilder {
1445            null_pages: Vec::new(),
1446            min_values: Vec::new(),
1447            max_values: Vec::new(),
1448            null_counts: Vec::new(),
1449            boundary_order: BoundaryOrder::UNORDERED,
1450            repetition_level_histograms: None,
1451            definition_level_histograms: None,
1452            valid: true,
1453        }
1454    }
1455
1456    /// Append statistics for the next page
1457    pub fn append(
1458        &mut self,
1459        null_page: bool,
1460        min_value: Vec<u8>,
1461        max_value: Vec<u8>,
1462        null_count: i64,
1463    ) {
1464        self.null_pages.push(null_page);
1465        self.min_values.push(min_value);
1466        self.max_values.push(max_value);
1467        self.null_counts.push(null_count);
1468    }
1469
1470    /// Append the given page-level histograms to the [`ColumnIndex`] histograms.
1471    /// Does nothing if the `ColumnIndexBuilder` is not in the `valid` state.
1472    pub fn append_histograms(
1473        &mut self,
1474        repetition_level_histogram: &Option<LevelHistogram>,
1475        definition_level_histogram: &Option<LevelHistogram>,
1476    ) {
1477        if !self.valid {
1478            return;
1479        }
1480        if let Some(ref rep_lvl_hist) = repetition_level_histogram {
1481            let hist = self.repetition_level_histograms.get_or_insert(Vec::new());
1482            hist.reserve(rep_lvl_hist.len());
1483            hist.extend(rep_lvl_hist.values());
1484        }
1485        if let Some(ref def_lvl_hist) = definition_level_histogram {
1486            let hist = self.definition_level_histograms.get_or_insert(Vec::new());
1487            hist.reserve(def_lvl_hist.len());
1488            hist.extend(def_lvl_hist.values());
1489        }
1490    }
1491
1492    /// Set the boundary order of the column index
1493    pub fn set_boundary_order(&mut self, boundary_order: BoundaryOrder) {
1494        self.boundary_order = boundary_order;
1495    }
1496
1497    /// Mark this column index as invalid
1498    pub fn to_invalid(&mut self) {
1499        self.valid = false;
1500    }
1501
1502    /// Is the information in the builder valid?
1503    pub fn valid(&self) -> bool {
1504        self.valid
1505    }
1506
1507    /// Build and get the thrift metadata of column index
1508    ///
1509    /// Note: callers should check [`Self::valid`] before calling this method
1510    pub fn build_to_thrift(self) -> ColumnIndex {
1511        ColumnIndex::new(
1512            self.null_pages,
1513            self.min_values,
1514            self.max_values,
1515            self.boundary_order,
1516            self.null_counts,
1517            self.repetition_level_histograms,
1518            self.definition_level_histograms,
1519        )
1520    }
1521}
1522
1523impl From<ColumnChunkMetaData> for ColumnChunkMetaDataBuilder {
1524    fn from(value: ColumnChunkMetaData) -> Self {
1525        ColumnChunkMetaDataBuilder(value)
1526    }
1527}
1528
/// Builder for offset index, part of the Parquet [PageIndex].
///
/// [PageIndex]: https://github.com/apache/parquet-format/blob/master/PageIndex.md
pub struct OffsetIndexBuilder {
    /// one entry per page: offset as passed to `append_offset_and_size`
    offset_array: Vec<i64>,
    /// one entry per page: compressed page size in bytes
    compressed_page_size_array: Vec<i32>,
    /// one entry per page: index of the page's first row
    first_row_index_array: Vec<i64>,
    /// per-page unencoded byte counts; `None` until the first value is appended
    unencoded_byte_array_data_bytes_array: Option<Vec<i64>>,
    /// running row total, used as the next page's first row index
    current_first_row_index: i64,
}
1539
impl Default for OffsetIndexBuilder {
    fn default() -> Self {
        // Delegate to `new` so the two construction paths cannot diverge.
        Self::new()
    }
}
1545
1546impl OffsetIndexBuilder {
1547    /// Creates a new offset index builder.
1548    pub fn new() -> Self {
1549        OffsetIndexBuilder {
1550            offset_array: Vec::new(),
1551            compressed_page_size_array: Vec::new(),
1552            first_row_index_array: Vec::new(),
1553            unencoded_byte_array_data_bytes_array: None,
1554            current_first_row_index: 0,
1555        }
1556    }
1557
1558    /// Append the row count of the next page.
1559    pub fn append_row_count(&mut self, row_count: i64) {
1560        let current_page_row_index = self.current_first_row_index;
1561        self.first_row_index_array.push(current_page_row_index);
1562        self.current_first_row_index += row_count;
1563    }
1564
1565    /// Append the offset and size of the next page.
1566    pub fn append_offset_and_size(&mut self, offset: i64, compressed_page_size: i32) {
1567        self.offset_array.push(offset);
1568        self.compressed_page_size_array.push(compressed_page_size);
1569    }
1570
1571    /// Append the unencoded byte array data bytes of the next page.
1572    pub fn append_unencoded_byte_array_data_bytes(
1573        &mut self,
1574        unencoded_byte_array_data_bytes: Option<i64>,
1575    ) {
1576        if let Some(val) = unencoded_byte_array_data_bytes {
1577            self.unencoded_byte_array_data_bytes_array
1578                .get_or_insert(Vec::new())
1579                .push(val);
1580        }
1581    }
1582
1583    /// Build and get the thrift metadata of offset index
1584    pub fn build_to_thrift(self) -> OffsetIndex {
1585        let locations = self
1586            .offset_array
1587            .iter()
1588            .zip(self.compressed_page_size_array.iter())
1589            .zip(self.first_row_index_array.iter())
1590            .map(|((offset, size), row_index)| PageLocation::new(*offset, *size, *row_index))
1591            .collect::<Vec<_>>();
1592        OffsetIndex::new(locations, self.unencoded_byte_array_data_bytes_array)
1593    }
1594}
1595
1596#[cfg(test)]
1597mod tests {
1598    use super::*;
1599    use crate::basic::{PageType, SortOrder};
1600    use crate::file::page_index::index::NativeIndex;
1601
1602    #[test]
1603    fn test_row_group_metadata_thrift_conversion() {
1604        let schema_descr = get_test_schema_descr();
1605
1606        let mut columns = vec![];
1607        for ptr in schema_descr.columns() {
1608            let column = ColumnChunkMetaData::builder(ptr.clone()).build().unwrap();
1609            columns.push(column);
1610        }
1611        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
1612            .set_num_rows(1000)
1613            .set_total_byte_size(2000)
1614            .set_column_metadata(columns)
1615            .set_ordinal(1)
1616            .build()
1617            .unwrap();
1618
1619        let row_group_exp = row_group_meta.to_thrift();
1620        let row_group_res = RowGroupMetaData::from_thrift(schema_descr, row_group_exp.clone())
1621            .unwrap()
1622            .to_thrift();
1623
1624        assert_eq!(row_group_res, row_group_exp);
1625    }
1626
1627    #[test]
1628    fn test_row_group_metadata_thrift_conversion_empty() {
1629        let schema_descr = get_test_schema_descr();
1630
1631        let row_group_meta = RowGroupMetaData::builder(schema_descr).build();
1632
1633        assert!(row_group_meta.is_err());
1634        if let Err(e) = row_group_meta {
1635            assert_eq!(
1636                format!("{e}"),
1637                "Parquet error: Column length mismatch: 2 != 0"
1638            );
1639        }
1640    }
1641
1642    /// Test reading a corrupted Parquet file with 3 columns in its schema but only 2 in its row group
1643    #[test]
1644    fn test_row_group_metadata_thrift_corrupted() {
1645        let schema_descr_2cols = Arc::new(SchemaDescriptor::new(Arc::new(
1646            SchemaType::group_type_builder("schema")
1647                .with_fields(vec![
1648                    Arc::new(
1649                        SchemaType::primitive_type_builder("a", Type::INT32)
1650                            .build()
1651                            .unwrap(),
1652                    ),
1653                    Arc::new(
1654                        SchemaType::primitive_type_builder("b", Type::INT32)
1655                            .build()
1656                            .unwrap(),
1657                    ),
1658                ])
1659                .build()
1660                .unwrap(),
1661        )));
1662
1663        let schema_descr_3cols = Arc::new(SchemaDescriptor::new(Arc::new(
1664            SchemaType::group_type_builder("schema")
1665                .with_fields(vec![
1666                    Arc::new(
1667                        SchemaType::primitive_type_builder("a", Type::INT32)
1668                            .build()
1669                            .unwrap(),
1670                    ),
1671                    Arc::new(
1672                        SchemaType::primitive_type_builder("b", Type::INT32)
1673                            .build()
1674                            .unwrap(),
1675                    ),
1676                    Arc::new(
1677                        SchemaType::primitive_type_builder("c", Type::INT32)
1678                            .build()
1679                            .unwrap(),
1680                    ),
1681                ])
1682                .build()
1683                .unwrap(),
1684        )));
1685
1686        let row_group_meta_2cols = RowGroupMetaData::builder(schema_descr_2cols.clone())
1687            .set_num_rows(1000)
1688            .set_total_byte_size(2000)
1689            .set_column_metadata(vec![
1690                ColumnChunkMetaData::builder(schema_descr_2cols.column(0))
1691                    .build()
1692                    .unwrap(),
1693                ColumnChunkMetaData::builder(schema_descr_2cols.column(1))
1694                    .build()
1695                    .unwrap(),
1696            ])
1697            .set_ordinal(1)
1698            .build()
1699            .unwrap();
1700
1701        let err =
1702            RowGroupMetaData::from_thrift(schema_descr_3cols, row_group_meta_2cols.to_thrift())
1703                .unwrap_err()
1704                .to_string();
1705        assert_eq!(
1706            err,
1707            "Parquet error: Column count mismatch. Schema has 3 columns while Row Group has 2"
1708        );
1709    }
1710
1711    #[test]
1712    fn test_column_chunk_metadata_thrift_conversion() {
1713        let column_descr = get_test_schema_descr().column(0);
1714
1715        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1716            .set_encodings(vec![Encoding::PLAIN, Encoding::RLE])
1717            .set_file_path("file_path".to_owned())
1718            .set_num_values(1000)
1719            .set_compression(Compression::SNAPPY)
1720            .set_total_compressed_size(2000)
1721            .set_total_uncompressed_size(3000)
1722            .set_data_page_offset(4000)
1723            .set_dictionary_page_offset(Some(5000))
1724            .set_page_encoding_stats(vec![
1725                PageEncodingStats {
1726                    page_type: PageType::DATA_PAGE,
1727                    encoding: Encoding::PLAIN,
1728                    count: 3,
1729                },
1730                PageEncodingStats {
1731                    page_type: PageType::DATA_PAGE,
1732                    encoding: Encoding::RLE,
1733                    count: 5,
1734                },
1735            ])
1736            .set_bloom_filter_offset(Some(6000))
1737            .set_bloom_filter_length(Some(25))
1738            .set_offset_index_offset(Some(7000))
1739            .set_offset_index_length(Some(25))
1740            .set_column_index_offset(Some(8000))
1741            .set_column_index_length(Some(25))
1742            .set_unencoded_byte_array_data_bytes(Some(2000))
1743            .set_repetition_level_histogram(Some(LevelHistogram::from(vec![100, 100])))
1744            .set_definition_level_histogram(Some(LevelHistogram::from(vec![0, 200])))
1745            .build()
1746            .unwrap();
1747
1748        let col_chunk_res =
1749            ColumnChunkMetaData::from_thrift(column_descr, col_metadata.to_thrift()).unwrap();
1750
1751        assert_eq!(col_chunk_res, col_metadata);
1752    }
1753
1754    #[test]
1755    fn test_column_chunk_metadata_thrift_conversion_empty() {
1756        let column_descr = get_test_schema_descr().column(0);
1757
1758        let col_metadata = ColumnChunkMetaData::builder(column_descr.clone())
1759            .build()
1760            .unwrap();
1761
1762        let col_chunk_exp = col_metadata.to_thrift();
1763        let col_chunk_res = ColumnChunkMetaData::from_thrift(column_descr, col_chunk_exp.clone())
1764            .unwrap()
1765            .to_thrift();
1766
1767        assert_eq!(col_chunk_res, col_chunk_exp);
1768    }
1769
1770    #[test]
1771    fn test_compressed_size() {
1772        let schema_descr = get_test_schema_descr();
1773
1774        let mut columns = vec![];
1775        for column_descr in schema_descr.columns() {
1776            let column = ColumnChunkMetaData::builder(column_descr.clone())
1777                .set_total_compressed_size(500)
1778                .set_total_uncompressed_size(700)
1779                .build()
1780                .unwrap();
1781            columns.push(column);
1782        }
1783        let row_group_meta = RowGroupMetaData::builder(schema_descr)
1784            .set_num_rows(1000)
1785            .set_column_metadata(columns)
1786            .build()
1787            .unwrap();
1788
1789        let compressed_size_res: i64 = row_group_meta.compressed_size();
1790        let compressed_size_exp: i64 = 1000;
1791
1792        assert_eq!(compressed_size_res, compressed_size_exp);
1793    }
1794
    #[test]
    fn test_memory_size() {
        // Verifies that ParquetMetaData::memory_size grows as more metadata
        // is populated. The exact byte counts asserted below are tied to the
        // current struct layouts and the exact allocations made here; they
        // must be updated whenever the metadata types change.
        let schema_descr = get_test_schema_descr();

        // Column chunks whose statistics carry no min/max values.
        let columns = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(None, None, None, None, false))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();
        let row_group_meta = RowGroupMetaData::builder(schema_descr.clone())
            .set_num_rows(1000)
            .set_column_metadata(columns)
            .build()
            .unwrap();
        let row_group_meta = vec![row_group_meta];

        // File-level metadata shared by both ParquetMetaData values below.
        let version = 2;
        let num_rows = 1000;
        let created_by = Some(String::from("test harness"));
        let key_value_metadata = Some(vec![KeyValue::new(
            String::from("Foo"),
            Some(String::from("bar")),
        )]);
        let column_orders = Some(vec![
            ColumnOrder::UNDEFINED,
            ColumnOrder::TYPE_DEFINED_ORDER(SortOrder::UNSIGNED),
        ]);
        let file_metadata = FileMetaData::new(
            version,
            num_rows,
            created_by,
            key_value_metadata,
            schema_descr.clone(),
            column_orders,
        );

        // Now, add in Exact Statistics
        let columns_with_stats = schema_descr
            .columns()
            .iter()
            .map(|column_descr| {
                ColumnChunkMetaData::builder(column_descr.clone())
                    .set_statistics(Statistics::new::<i32>(
                        Some(0),
                        Some(100),
                        None,
                        None,
                        false,
                    ))
                    .build()
            })
            .collect::<Result<Vec<_>>>()
            .unwrap();

        let row_group_meta_with_stats = RowGroupMetaData::builder(schema_descr)
            .set_num_rows(1000)
            .set_column_metadata(columns_with_stats)
            .build()
            .unwrap();
        let row_group_meta_with_stats = vec![row_group_meta_with_stats];

        // Baseline: metadata with row groups only (no page indexes).
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata.clone())
            .set_row_groups(row_group_meta_with_stats)
            .build();
        let base_expected_size = 2312;

        assert_eq!(parquet_meta.memory_size(), base_expected_size);

        // Build a one-page boolean column index to attach below.
        let mut column_index = ColumnIndexBuilder::new();
        column_index.append(false, vec![1u8], vec![2u8, 3u8], 4);
        let column_index = column_index.build_to_thrift();
        let native_index = NativeIndex::<bool>::try_new(column_index).unwrap();

        // Now, add in OffsetIndex
        let mut offset_index = OffsetIndexBuilder::new();
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        offset_index.append_row_count(1);
        offset_index.append_offset_and_size(2, 3);
        offset_index.append_unencoded_byte_array_data_bytes(Some(10));
        let offset_index = offset_index.build_to_thrift();

        // Same file metadata plus column and offset indexes: must report a
        // strictly larger memory footprint than the baseline.
        let parquet_meta = ParquetMetaDataBuilder::new(file_metadata)
            .set_row_groups(row_group_meta)
            .set_column_index(Some(vec![vec![Index::BOOLEAN(native_index)]]))
            .set_offset_index(Some(vec![vec![
                OffsetIndexMetaData::try_new(offset_index).unwrap()
            ]]))
            .build();

        let bigger_expected_size = 2816;
        // more set fields means more memory usage
        assert!(bigger_expected_size > base_expected_size);
        assert_eq!(parquet_meta.memory_size(), bigger_expected_size);
    }
1896
1897    /// Returns sample schema descriptor so we can create column metadata.
1898    fn get_test_schema_descr() -> SchemaDescPtr {
1899        let schema = SchemaType::group_type_builder("schema")
1900            .with_fields(vec![
1901                Arc::new(
1902                    SchemaType::primitive_type_builder("a", Type::INT32)
1903                        .build()
1904                        .unwrap(),
1905                ),
1906                Arc::new(
1907                    SchemaType::primitive_type_builder("b", Type::INT32)
1908                        .build()
1909                        .unwrap(),
1910                ),
1911            ])
1912            .build()
1913            .unwrap();
1914
1915        Arc::new(SchemaDescriptor::new(Arc::new(schema)))
1916    }
1917}