parquet/schema/
types.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Contains structs and methods to build Parquet schema and schema descriptors.
19
20use std::{collections::HashMap, fmt, sync::Arc};
21
22use crate::file::metadata::HeapSize;
23use crate::format::SchemaElement;
24
25use crate::basic::{
26    ColumnOrder, ConvertedType, LogicalType, Repetition, SortOrder, TimeUnit, Type as PhysicalType,
27};
28use crate::errors::{ParquetError, Result};
29
30// ----------------------------------------------------------------------
31// Parquet Type definitions
32
/// Shared pointer to a schema node: type alias for `Arc<Type>`.
pub type TypePtr = Arc<Type>;
/// Shared pointer to a schema descriptor: type alias for `Arc<SchemaDescriptor>`.
pub type SchemaDescPtr = Arc<SchemaDescriptor>;
/// Shared pointer to a leaf column descriptor: type alias for `Arc<ColumnDescriptor>`.
pub type ColumnDescPtr = Arc<ColumnDescriptor>;
39
/// Representation of a Parquet type.
///
/// Used to describe primitive leaf fields and structs, including top-level schema.
///
/// Note that the top-level schema is represented using [`Type::GroupType`] whose
/// repetition is `None` (see [`Type::is_schema`]).
///
/// Instances are normally created through [`Type::primitive_type_builder`] and
/// [`Type::group_type_builder`].
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
    /// Represents a primitive leaf field.
    PrimitiveType {
        /// Basic information about the type (name, repetition, annotations, id).
        basic_info: BasicTypeInfo,
        /// Physical type of this primitive type.
        physical_type: PhysicalType,
        /// Length of this type, as given to [`PrimitiveTypeBuilder::with_length`].
        /// Holds `-1` when the builder's default was left untouched.
        type_length: i32,
        /// Scale of this type, as given to [`PrimitiveTypeBuilder::with_scale`].
        /// Holds `-1` when the builder's default was left untouched.
        scale: i32,
        /// Precision of this type, as given to [`PrimitiveTypeBuilder::with_precision`].
        /// Holds `-1` when the builder's default was left untouched.
        precision: i32,
    },
    /// Represents a group of fields (similar to struct).
    GroupType {
        /// Basic information about the type (name, repetition, annotations, id).
        basic_info: BasicTypeInfo,
        /// Fields of this group type, i.e. its direct child nodes.
        fields: Vec<TypePtr>,
    },
}
69
70impl HeapSize for Type {
71    fn heap_size(&self) -> usize {
72        match self {
73            Type::PrimitiveType { basic_info, .. } => basic_info.heap_size(),
74            Type::GroupType { basic_info, fields } => basic_info.heap_size() + fields.heap_size(),
75        }
76    }
77}
78
79impl Type {
80    /// Creates primitive type builder with provided field name and physical type.
81    pub fn primitive_type_builder(name: &str, physical_type: PhysicalType) -> PrimitiveTypeBuilder {
82        PrimitiveTypeBuilder::new(name, physical_type)
83    }
84
85    /// Creates group type builder with provided column name.
86    pub fn group_type_builder(name: &str) -> GroupTypeBuilder {
87        GroupTypeBuilder::new(name)
88    }
89
90    /// Returns [`BasicTypeInfo`] information about the type.
91    pub fn get_basic_info(&self) -> &BasicTypeInfo {
92        match *self {
93            Type::PrimitiveType { ref basic_info, .. } => basic_info,
94            Type::GroupType { ref basic_info, .. } => basic_info,
95        }
96    }
97
98    /// Returns this type's field name.
99    pub fn name(&self) -> &str {
100        self.get_basic_info().name()
101    }
102
103    /// Gets the fields from this group type.
104    /// Note that this will panic if called on a non-group type.
105    // TODO: should we return `&[&Type]` here?
106    pub fn get_fields(&self) -> &[TypePtr] {
107        match *self {
108            Type::GroupType { ref fields, .. } => &fields[..],
109            _ => panic!("Cannot call get_fields() on a non-group type"),
110        }
111    }
112
113    /// Gets physical type of this primitive type.
114    /// Note that this will panic if called on a non-primitive type.
115    pub fn get_physical_type(&self) -> PhysicalType {
116        match *self {
117            Type::PrimitiveType {
118                basic_info: _,
119                physical_type,
120                ..
121            } => physical_type,
122            _ => panic!("Cannot call get_physical_type() on a non-primitive type"),
123        }
124    }
125
126    /// Gets precision of this primitive type.
127    /// Note that this will panic if called on a non-primitive type.
128    pub fn get_precision(&self) -> i32 {
129        match *self {
130            Type::PrimitiveType { precision, .. } => precision,
131            _ => panic!("Cannot call get_precision() on non-primitive type"),
132        }
133    }
134
135    /// Gets scale of this primitive type.
136    /// Note that this will panic if called on a non-primitive type.
137    pub fn get_scale(&self) -> i32 {
138        match *self {
139            Type::PrimitiveType { scale, .. } => scale,
140            _ => panic!("Cannot call get_scale() on non-primitive type"),
141        }
142    }
143
144    /// Checks if `sub_type` schema is part of current schema.
145    /// This method can be used to check if projected columns are part of the root schema.
146    pub fn check_contains(&self, sub_type: &Type) -> bool {
147        // Names match, and repetitions match or not set for both
148        let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name()
149            && (self.is_schema() && sub_type.is_schema()
150                || !self.is_schema()
151                    && !sub_type.is_schema()
152                    && self.get_basic_info().repetition()
153                        == sub_type.get_basic_info().repetition());
154
155        match *self {
156            Type::PrimitiveType { .. } if basic_match && sub_type.is_primitive() => {
157                self.get_physical_type() == sub_type.get_physical_type()
158            }
159            Type::GroupType { .. } if basic_match && sub_type.is_group() => {
160                // build hashmap of name -> TypePtr
161                let mut field_map = HashMap::new();
162                for field in self.get_fields() {
163                    field_map.insert(field.name(), field);
164                }
165
166                for field in sub_type.get_fields() {
167                    if !field_map
168                        .get(field.name())
169                        .map(|tpe| tpe.check_contains(field))
170                        .unwrap_or(false)
171                    {
172                        return false;
173                    }
174                }
175                true
176            }
177            _ => false,
178        }
179    }
180
181    /// Returns `true` if this type is a primitive type, `false` otherwise.
182    pub fn is_primitive(&self) -> bool {
183        matches!(*self, Type::PrimitiveType { .. })
184    }
185
186    /// Returns `true` if this type is a group type, `false` otherwise.
187    pub fn is_group(&self) -> bool {
188        matches!(*self, Type::GroupType { .. })
189    }
190
191    /// Returns `true` if this type is the top-level schema type (message type).
192    pub fn is_schema(&self) -> bool {
193        match *self {
194            Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(),
195            _ => false,
196        }
197    }
198
199    /// Returns `true` if this type is repeated or optional.
200    /// If this type doesn't have repetition defined, we treat it as required.
201    pub fn is_optional(&self) -> bool {
202        self.get_basic_info().has_repetition()
203            && self.get_basic_info().repetition() != Repetition::REQUIRED
204    }
205}
206
/// A builder for primitive types. All attributes are optional
/// except the name and physical type.
/// Note that if not specified explicitly, `Repetition::OPTIONAL` is used.
pub struct PrimitiveTypeBuilder<'a> {
    // Field name, borrowed until `build` copies it into the resulting type.
    name: &'a str,
    // Repetition of the field; starts as `Repetition::OPTIONAL`.
    repetition: Repetition,
    // Physical type supplied at construction.
    physical_type: PhysicalType,
    // Legacy converted type annotation; starts as `ConvertedType::NONE`.
    converted_type: ConvertedType,
    // Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    // Type length; starts as `-1`.
    length: i32,
    // Decimal precision; starts as `-1`.
    precision: i32,
    // Decimal scale; starts as `-1`.
    scale: i32,
    // Optional field id.
    id: Option<i32>,
}
221
222impl<'a> PrimitiveTypeBuilder<'a> {
223    /// Creates new primitive type builder with provided field name and physical type.
224    pub fn new(name: &'a str, physical_type: PhysicalType) -> Self {
225        Self {
226            name,
227            repetition: Repetition::OPTIONAL,
228            physical_type,
229            converted_type: ConvertedType::NONE,
230            logical_type: None,
231            length: -1,
232            precision: -1,
233            scale: -1,
234            id: None,
235        }
236    }
237
238    /// Sets [`Repetition`] for this field and returns itself.
239    pub fn with_repetition(self, repetition: Repetition) -> Self {
240        Self { repetition, ..self }
241    }
242
243    /// Sets [`ConvertedType`] for this field and returns itself.
244    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
245        Self {
246            converted_type,
247            ..self
248        }
249    }
250
251    /// Sets [`LogicalType`] for this field and returns itself.
252    /// If only the logical type is populated for a primitive type, the converted type
253    /// will be automatically populated, and can thus be omitted.
254    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
255        Self {
256            logical_type,
257            ..self
258        }
259    }
260
261    /// Sets type length and returns itself.
262    /// This is only applied to FIXED_LEN_BYTE_ARRAY and INT96 (INTERVAL) types, because
263    /// they maintain fixed size underlying byte array.
264    /// By default, value is `0`.
265    pub fn with_length(self, length: i32) -> Self {
266        Self { length, ..self }
267    }
268
269    /// Sets precision for Parquet DECIMAL physical type and returns itself.
270    /// By default, it equals to `0` and used only for decimal context.
271    pub fn with_precision(self, precision: i32) -> Self {
272        Self { precision, ..self }
273    }
274
275    /// Sets scale for Parquet DECIMAL physical type and returns itself.
276    /// By default, it equals to `0` and used only for decimal context.
277    pub fn with_scale(self, scale: i32) -> Self {
278        Self { scale, ..self }
279    }
280
281    /// Sets optional field id and returns itself.
282    pub fn with_id(self, id: Option<i32>) -> Self {
283        Self { id, ..self }
284    }
285
286    /// Creates a new `PrimitiveType` instance from the collected attributes.
287    /// Returns `Err` in case of any building conditions are not met.
288    pub fn build(self) -> Result<Type> {
289        let mut basic_info = BasicTypeInfo {
290            name: String::from(self.name),
291            repetition: Some(self.repetition),
292            converted_type: self.converted_type,
293            logical_type: self.logical_type.clone(),
294            id: self.id,
295        };
296
297        // Check length before logical type, since it is used for logical type validation.
298        if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 {
299            return Err(general_err!(
300                "Invalid FIXED_LEN_BYTE_ARRAY length: {} for field '{}'",
301                self.length,
302                self.name
303            ));
304        }
305
306        if let Some(logical_type) = &self.logical_type {
307            // If a converted type is populated, check that it is consistent with
308            // its logical type
309            if self.converted_type != ConvertedType::NONE {
310                if ConvertedType::from(self.logical_type.clone()) != self.converted_type {
311                    return Err(general_err!(
312                        "Logical type {:?} is incompatible with converted type {} for field '{}'",
313                        logical_type,
314                        self.converted_type,
315                        self.name
316                    ));
317                }
318            } else {
319                // Populate the converted type for backwards compatibility
320                basic_info.converted_type = self.logical_type.clone().into();
321            }
322            // Check that logical type and physical type are compatible
323            match (logical_type, self.physical_type) {
324                (LogicalType::Map, _) | (LogicalType::List, _) => {
325                    return Err(general_err!(
326                        "{:?} cannot be applied to a primitive type for field '{}'",
327                        logical_type,
328                        self.name
329                    ));
330                }
331                (LogicalType::Enum, PhysicalType::BYTE_ARRAY) => {}
332                (LogicalType::Decimal { scale, precision }, _) => {
333                    // Check that scale and precision are consistent with legacy values
334                    if *scale != self.scale {
335                        return Err(general_err!(
336                            "DECIMAL logical type scale {} must match self.scale {} for field '{}'",
337                            scale,
338                            self.scale,
339                            self.name
340                        ));
341                    }
342                    if *precision != self.precision {
343                        return Err(general_err!(
344                            "DECIMAL logical type precision {} must match self.precision {} for field '{}'",
345                            precision,
346                            self.precision,
347                            self.name
348                        ));
349                    }
350                    self.check_decimal_precision_scale()?;
351                }
352                (LogicalType::Date, PhysicalType::INT32) => {}
353                (
354                    LogicalType::Time {
355                        unit: TimeUnit::MILLIS(_),
356                        ..
357                    },
358                    PhysicalType::INT32,
359                ) => {}
360                (LogicalType::Time { unit, .. }, PhysicalType::INT64) => {
361                    if *unit == TimeUnit::MILLIS(Default::default()) {
362                        return Err(general_err!(
363                            "Cannot use millisecond unit on INT64 type for field '{}'",
364                            self.name
365                        ));
366                    }
367                }
368                (LogicalType::Timestamp { .. }, PhysicalType::INT64) => {}
369                (LogicalType::Integer { bit_width, .. }, PhysicalType::INT32)
370                    if *bit_width <= 32 => {}
371                (LogicalType::Integer { bit_width, .. }, PhysicalType::INT64)
372                    if *bit_width == 64 => {}
373                // Null type
374                (LogicalType::Unknown, PhysicalType::INT32) => {}
375                (LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
376                (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
377                (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}
378                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 16 => {}
379                (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
380                    return Err(general_err!(
381                        "UUID cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(16) field",
382                        self.name
383                    ))
384                }
385                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY)
386                    if self.length == 2 => {}
387                (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
388                    return Err(general_err!(
389                        "FLOAT16 cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(2) field",
390                        self.name
391                    ))
392                }
393                (a, b) => {
394                    return Err(general_err!(
395                        "Cannot annotate {:?} from {} for field '{}'",
396                        a,
397                        b,
398                        self.name
399                    ))
400                }
401            }
402        }
403
404        match self.converted_type {
405            ConvertedType::NONE => {}
406            ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => {
407                if self.physical_type != PhysicalType::BYTE_ARRAY {
408                    return Err(general_err!(
409                        "{} cannot annotate field '{}' because it is not a BYTE_ARRAY field",
410                        self.converted_type,
411                        self.name
412                    ));
413                }
414            }
415            ConvertedType::DECIMAL => {
416                self.check_decimal_precision_scale()?;
417            }
418            ConvertedType::DATE
419            | ConvertedType::TIME_MILLIS
420            | ConvertedType::UINT_8
421            | ConvertedType::UINT_16
422            | ConvertedType::UINT_32
423            | ConvertedType::INT_8
424            | ConvertedType::INT_16
425            | ConvertedType::INT_32 => {
426                if self.physical_type != PhysicalType::INT32 {
427                    return Err(general_err!(
428                        "{} cannot annotate field '{}' because it is not a INT32 field",
429                        self.converted_type,
430                        self.name
431                    ));
432                }
433            }
434            ConvertedType::TIME_MICROS
435            | ConvertedType::TIMESTAMP_MILLIS
436            | ConvertedType::TIMESTAMP_MICROS
437            | ConvertedType::UINT_64
438            | ConvertedType::INT_64 => {
439                if self.physical_type != PhysicalType::INT64 {
440                    return Err(general_err!(
441                        "{} cannot annotate field '{}' because it is not a INT64 field",
442                        self.converted_type,
443                        self.name
444                    ));
445                }
446            }
447            ConvertedType::INTERVAL => {
448                if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 {
449                    return Err(general_err!(
450                        "INTERVAL cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(12) field",
451                        self.name
452                    ));
453                }
454            }
455            ConvertedType::ENUM => {
456                if self.physical_type != PhysicalType::BYTE_ARRAY {
457                    return Err(general_err!(
458                        "ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field",
459                        self.name
460                    ));
461                }
462            }
463            _ => {
464                return Err(general_err!(
465                    "{} cannot be applied to primitive field '{}'",
466                    self.converted_type,
467                    self.name
468                ));
469            }
470        }
471
472        Ok(Type::PrimitiveType {
473            basic_info,
474            physical_type: self.physical_type,
475            type_length: self.length,
476            scale: self.scale,
477            precision: self.precision,
478        })
479    }
480
481    #[inline]
482    fn check_decimal_precision_scale(&self) -> Result<()> {
483        match self.physical_type {
484            PhysicalType::INT32
485            | PhysicalType::INT64
486            | PhysicalType::BYTE_ARRAY
487            | PhysicalType::FIXED_LEN_BYTE_ARRAY => (),
488            _ => {
489                return Err(general_err!(
490                    "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
491                ));
492            }
493        }
494
495        // Precision is required and must be a non-zero positive integer.
496        if self.precision < 1 {
497            return Err(general_err!(
498                "Invalid DECIMAL precision: {}",
499                self.precision
500            ));
501        }
502
503        // Scale must be zero or a positive integer less than the precision.
504        if self.scale < 0 {
505            return Err(general_err!("Invalid DECIMAL scale: {}", self.scale));
506        }
507
508        if self.scale > self.precision {
509            return Err(general_err!(
510                "Invalid DECIMAL: scale ({}) cannot be greater than precision \
511             ({})",
512                self.scale,
513                self.precision
514            ));
515        }
516
517        // Check precision and scale based on physical type limitations.
518        match self.physical_type {
519            PhysicalType::INT32 => {
520                if self.precision > 9 {
521                    return Err(general_err!(
522                        "Cannot represent INT32 as DECIMAL with precision {}",
523                        self.precision
524                    ));
525                }
526            }
527            PhysicalType::INT64 => {
528                if self.precision > 18 {
529                    return Err(general_err!(
530                        "Cannot represent INT64 as DECIMAL with precision {}",
531                        self.precision
532                    ));
533                }
534            }
535            PhysicalType::FIXED_LEN_BYTE_ARRAY => {
536                let max_precision = (2f64.powi(8 * self.length - 1) - 1f64).log10().floor() as i32;
537
538                if self.precision > max_precision {
539                    return Err(general_err!(
540                        "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \
541                        precision {}. The max precision can only be {}",
542                        self.length,
543                        self.precision,
544                        max_precision
545                    ));
546                }
547            }
548            _ => (), // For BYTE_ARRAY precision is not limited
549        }
550
551        Ok(())
552    }
553}
554
/// A builder for group types. All attributes are optional except the name.
/// Note that if not specified explicitly, `None` is used as the repetition of the group,
/// which means it is a root (message) type.
pub struct GroupTypeBuilder<'a> {
    // Field name, borrowed until `build` copies it into the resulting type.
    name: &'a str,
    // Repetition of the group; `None` until `with_repetition` is called.
    repetition: Option<Repetition>,
    // Legacy converted type annotation; starts as `ConvertedType::NONE`.
    converted_type: ConvertedType,
    // Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    // Child nodes of the group; starts empty.
    fields: Vec<TypePtr>,
    // Optional field id.
    id: Option<i32>,
}
566
567impl<'a> GroupTypeBuilder<'a> {
568    /// Creates new group type builder with provided field name.
569    pub fn new(name: &'a str) -> Self {
570        Self {
571            name,
572            repetition: None,
573            converted_type: ConvertedType::NONE,
574            logical_type: None,
575            fields: Vec::new(),
576            id: None,
577        }
578    }
579
580    /// Sets [`Repetition`] for this field and returns itself.
581    pub fn with_repetition(mut self, repetition: Repetition) -> Self {
582        self.repetition = Some(repetition);
583        self
584    }
585
586    /// Sets [`ConvertedType`] for this field and returns itself.
587    pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
588        Self {
589            converted_type,
590            ..self
591        }
592    }
593
594    /// Sets [`LogicalType`] for this field and returns itself.
595    pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
596        Self {
597            logical_type,
598            ..self
599        }
600    }
601
602    /// Sets a list of fields that should be child nodes of this field.
603    /// Returns updated self.
604    pub fn with_fields(self, fields: Vec<TypePtr>) -> Self {
605        Self { fields, ..self }
606    }
607
608    /// Sets optional field id and returns itself.
609    pub fn with_id(self, id: Option<i32>) -> Self {
610        Self { id, ..self }
611    }
612
613    /// Creates a new `GroupType` instance from the gathered attributes.
614    pub fn build(self) -> Result<Type> {
615        let mut basic_info = BasicTypeInfo {
616            name: String::from(self.name),
617            repetition: self.repetition,
618            converted_type: self.converted_type,
619            logical_type: self.logical_type.clone(),
620            id: self.id,
621        };
622        // Populate the converted type if only the logical type is populated
623        if self.logical_type.is_some() && self.converted_type == ConvertedType::NONE {
624            basic_info.converted_type = self.logical_type.into();
625        }
626        Ok(Type::GroupType {
627            basic_info,
628            fields: self.fields,
629        })
630    }
631}
632
/// Basic type info. This contains the information shared by primitive and group
/// types: the name of the type, its repetition, the converted and logical type
/// annotations, and the optional field id.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BasicTypeInfo {
    // Field name.
    name: String,
    // Repetition; `None` for a group built without one (the root message type).
    repetition: Option<Repetition>,
    // Legacy converted type annotation.
    converted_type: ConvertedType,
    // Logical type annotation, if any.
    logical_type: Option<LogicalType>,
    // Optional field id.
    id: Option<i32>,
}
643
644impl HeapSize for BasicTypeInfo {
645    fn heap_size(&self) -> usize {
646        // no heap allocations in any other subfield
647        self.name.heap_size()
648    }
649}
650
651impl BasicTypeInfo {
652    /// Returns field name.
653    pub fn name(&self) -> &str {
654        &self.name
655    }
656
657    /// Returns `true` if type has repetition field set, `false` otherwise.
658    /// This is mostly applied to group type, because primitive type always has
659    /// repetition set.
660    pub fn has_repetition(&self) -> bool {
661        self.repetition.is_some()
662    }
663
664    /// Returns [`Repetition`] value for the type.
665    pub fn repetition(&self) -> Repetition {
666        assert!(self.repetition.is_some());
667        self.repetition.unwrap()
668    }
669
670    /// Returns [`ConvertedType`] value for the type.
671    pub fn converted_type(&self) -> ConvertedType {
672        self.converted_type
673    }
674
675    /// Returns [`LogicalType`] value for the type.
676    pub fn logical_type(&self) -> Option<LogicalType> {
677        // Unlike ConvertedType, LogicalType cannot implement Copy, thus we clone it
678        self.logical_type.clone()
679    }
680
681    /// Returns `true` if id is set, `false` otherwise.
682    pub fn has_id(&self) -> bool {
683        self.id.is_some()
684    }
685
686    /// Returns id value for the type.
687    pub fn id(&self) -> i32 {
688        assert!(self.id.is_some());
689        self.id.unwrap()
690    }
691}
692
693// ----------------------------------------------------------------------
694// Parquet descriptor definitions
695
/// Represents the location of a column in a Parquet schema
///
/// The path is stored as an ordered list of field names, from the outermost
/// field to the column itself.
///
/// # Example: refer to column named `'my_column'`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// let column_path = ColumnPath::from("my_column");
/// ```
///
/// # Example: refer to column named `c` in a nested struct `{a: {b: {c: ...}}}`
/// ```
/// # use parquet::schema::types::ColumnPath;
/// // form path 'a.b.c'
/// let column_path = ColumnPath::from(vec![
///   String::from("a"),
///   String::from("b"),
///   String::from("c")
/// ]);
/// ```
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
    // Path components in order; `string()` joins them with `.`.
    parts: Vec<String>,
}
718
719impl HeapSize for ColumnPath {
720    fn heap_size(&self) -> usize {
721        self.parts.heap_size()
722    }
723}
724
725impl ColumnPath {
726    /// Creates new column path from vector of field names.
727    pub fn new(parts: Vec<String>) -> Self {
728        ColumnPath { parts }
729    }
730
731    /// Returns string representation of this column path.
732    /// ```rust
733    /// use parquet::schema::types::ColumnPath;
734    ///
735    /// let path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c".to_string()]);
736    /// assert_eq!(&path.string(), "a.b.c");
737    /// ```
738    pub fn string(&self) -> String {
739        self.parts.join(".")
740    }
741
742    /// Appends more components to end of column path.
743    /// ```rust
744    /// use parquet::schema::types::ColumnPath;
745    ///
746    /// let mut path = ColumnPath::new(vec!["a".to_string(), "b".to_string(), "c"
747    /// .to_string()]);
748    /// assert_eq!(&path.string(), "a.b.c");
749    ///
750    /// path.append(vec!["d".to_string(), "e".to_string()]);
751    /// assert_eq!(&path.string(), "a.b.c.d.e");
752    /// ```
753    pub fn append(&mut self, mut tail: Vec<String>) {
754        self.parts.append(&mut tail);
755    }
756
757    /// Returns a slice of path components.
758    pub fn parts(&self) -> &[String] {
759        &self.parts
760    }
761}
762
763impl fmt::Display for ColumnPath {
764    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
765        write!(f, "{:?}", self.string())
766    }
767}
768
769impl From<Vec<String>> for ColumnPath {
770    fn from(parts: Vec<String>) -> Self {
771        ColumnPath { parts }
772    }
773}
774
775impl From<&str> for ColumnPath {
776    fn from(single_path: &str) -> Self {
777        let s = String::from(single_path);
778        ColumnPath::from(s)
779    }
780}
781
782impl From<String> for ColumnPath {
783    fn from(single_path: String) -> Self {
784        let v = vec![single_path];
785        ColumnPath { parts: v }
786    }
787}
788
789impl AsRef<[String]> for ColumnPath {
790    fn as_ref(&self) -> &[String] {
791        &self.parts
792    }
793}
794
/// Descriptor for a leaf-level primitive column.
///
/// Holds the column's primitive type and path, and also includes the maximum
/// definition and repetition levels required to re-assemble nested data.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
    /// The "leaf" primitive type of this column
    primitive_type: TypePtr,

    /// The maximum definition level for this column
    max_def_level: i16,

    /// The maximum repetition level for this column
    max_rep_level: i16,

    /// The path of this column. For instance, "a.b.c.d".
    path: ColumnPath,
}
813
814impl HeapSize for ColumnDescriptor {
815    fn heap_size(&self) -> usize {
816        self.primitive_type.heap_size() + self.path.heap_size()
817    }
818}
819
820impl ColumnDescriptor {
821    /// Creates new descriptor for leaf-level column.
822    pub fn new(
823        primitive_type: TypePtr,
824        max_def_level: i16,
825        max_rep_level: i16,
826        path: ColumnPath,
827    ) -> Self {
828        Self {
829            primitive_type,
830            max_def_level,
831            max_rep_level,
832            path,
833        }
834    }
835
836    /// Returns maximum definition level for this column.
837    #[inline]
838    pub fn max_def_level(&self) -> i16 {
839        self.max_def_level
840    }
841
842    /// Returns maximum repetition level for this column.
843    #[inline]
844    pub fn max_rep_level(&self) -> i16 {
845        self.max_rep_level
846    }
847
848    /// Returns [`ColumnPath`] for this column.
849    pub fn path(&self) -> &ColumnPath {
850        &self.path
851    }
852
853    /// Returns self type [`Type`] for this leaf column.
854    pub fn self_type(&self) -> &Type {
855        self.primitive_type.as_ref()
856    }
857
858    /// Returns self type [`TypePtr`]  for this leaf
859    /// column.
860    pub fn self_type_ptr(&self) -> TypePtr {
861        self.primitive_type.clone()
862    }
863
864    /// Returns column name.
865    pub fn name(&self) -> &str {
866        self.primitive_type.name()
867    }
868
869    /// Returns [`ConvertedType`] for this column.
870    pub fn converted_type(&self) -> ConvertedType {
871        self.primitive_type.get_basic_info().converted_type()
872    }
873
874    /// Returns [`LogicalType`] for this column.
875    pub fn logical_type(&self) -> Option<LogicalType> {
876        self.primitive_type.get_basic_info().logical_type()
877    }
878
879    /// Returns physical type for this column.
880    /// Note that it will panic if called on a non-primitive type.
881    pub fn physical_type(&self) -> PhysicalType {
882        match self.primitive_type.as_ref() {
883            Type::PrimitiveType { physical_type, .. } => *physical_type,
884            _ => panic!("Expected primitive type!"),
885        }
886    }
887
888    /// Returns type length for this column.
889    /// Note that it will panic if called on a non-primitive type.
890    pub fn type_length(&self) -> i32 {
891        match self.primitive_type.as_ref() {
892            Type::PrimitiveType { type_length, .. } => *type_length,
893            _ => panic!("Expected primitive type!"),
894        }
895    }
896
897    /// Returns type precision for this column.
898    /// Note that it will panic if called on a non-primitive type.
899    pub fn type_precision(&self) -> i32 {
900        match self.primitive_type.as_ref() {
901            Type::PrimitiveType { precision, .. } => *precision,
902            _ => panic!("Expected primitive type!"),
903        }
904    }
905
906    /// Returns type scale for this column.
907    /// Note that it will panic if called on a non-primitive type.
908    pub fn type_scale(&self) -> i32 {
909        match self.primitive_type.as_ref() {
910            Type::PrimitiveType { scale, .. } => *scale,
911            _ => panic!("Expected primitive type!"),
912        }
913    }
914
915    /// Returns the sort order for this column
916    pub fn sort_order(&self) -> SortOrder {
917        ColumnOrder::get_sort_order(
918            self.logical_type(),
919            self.converted_type(),
920            self.physical_type(),
921        )
922    }
923}
924
925/// Schema of a Parquet file.
926///
927/// Encapsulates the file's schema ([`Type`]) and [`ColumnDescriptor`]s for
928/// each primitive (leaf) column.
929#[derive(PartialEq)]
930pub struct SchemaDescriptor {
931    /// The top-level logical schema (the "message" type).
932    ///
933    /// This must be a [`Type::GroupType`] where each field is a root
934    /// column type in the schema.
935    schema: TypePtr,
936
937    /// The descriptors for the physical type of each leaf column in this schema
938    ///
939    /// Constructed from `schema` in DFS order.
940    leaves: Vec<ColumnDescPtr>,
941
942    /// Mapping from a leaf column's index to the root column index that it
943    /// comes from.
944    ///
945    /// For instance: the leaf `a.b.c.d` would have a link back to `a`:
946    /// ```text
947    /// -- a  <-----+
948    /// -- -- b     |
949    /// -- -- -- c  |
950    /// -- -- -- -- d
951    /// ```
952    leaf_to_base: Vec<usize>,
953}
954
955impl fmt::Debug for SchemaDescriptor {
956    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
957        // Skip leaves and leaf_to_base as they only a cache information already found in `schema`
958        f.debug_struct("SchemaDescriptor")
959            .field("schema", &self.schema)
960            .finish()
961    }
962}
963
964// Need to implement HeapSize in this module as the fields are private
965impl HeapSize for SchemaDescriptor {
966    fn heap_size(&self) -> usize {
967        self.schema.heap_size() + self.leaves.heap_size() + self.leaf_to_base.heap_size()
968    }
969}
970
971impl SchemaDescriptor {
972    /// Creates new schema descriptor from Parquet schema.
973    pub fn new(tp: TypePtr) -> Self {
974        assert!(tp.is_group(), "SchemaDescriptor should take a GroupType");
975        let mut leaves = vec![];
976        let mut leaf_to_base = Vec::new();
977        for (root_idx, f) in tp.get_fields().iter().enumerate() {
978            let mut path = vec![];
979            build_tree(f, root_idx, 0, 0, &mut leaves, &mut leaf_to_base, &mut path);
980        }
981
982        Self {
983            schema: tp,
984            leaves,
985            leaf_to_base,
986        }
987    }
988
989    /// Returns [`ColumnDescriptor`] for a field position.
990    pub fn column(&self, i: usize) -> ColumnDescPtr {
991        assert!(
992            i < self.leaves.len(),
993            "Index out of bound: {} not in [0, {})",
994            i,
995            self.leaves.len()
996        );
997        self.leaves[i].clone()
998    }
999
1000    /// Returns slice of [`ColumnDescriptor`].
1001    pub fn columns(&self) -> &[ColumnDescPtr] {
1002        &self.leaves
1003    }
1004
1005    /// Returns number of leaf-level columns.
1006    pub fn num_columns(&self) -> usize {
1007        self.leaves.len()
1008    }
1009
1010    /// Returns column root [`Type`] for a leaf position.
1011    pub fn get_column_root(&self, i: usize) -> &Type {
1012        let result = self.column_root_of(i);
1013        result.as_ref()
1014    }
1015
1016    /// Returns column root [`Type`] pointer for a leaf position.
1017    pub fn get_column_root_ptr(&self, i: usize) -> TypePtr {
1018        let result = self.column_root_of(i);
1019        result.clone()
1020    }
1021
1022    /// Returns the index of the root column for a field position
1023    pub fn get_column_root_idx(&self, leaf: usize) -> usize {
1024        assert!(
1025            leaf < self.leaves.len(),
1026            "Index out of bound: {} not in [0, {})",
1027            leaf,
1028            self.leaves.len()
1029        );
1030
1031        *self
1032            .leaf_to_base
1033            .get(leaf)
1034            .unwrap_or_else(|| panic!("Expected a value for index {leaf} but found None"))
1035    }
1036
1037    fn column_root_of(&self, i: usize) -> &TypePtr {
1038        &self.schema.get_fields()[self.get_column_root_idx(i)]
1039    }
1040
1041    /// Returns schema as [`Type`].
1042    pub fn root_schema(&self) -> &Type {
1043        self.schema.as_ref()
1044    }
1045
1046    /// Returns schema as [`TypePtr`] for cheap cloning.
1047    pub fn root_schema_ptr(&self) -> TypePtr {
1048        self.schema.clone()
1049    }
1050
1051    /// Returns schema name.
1052    pub fn name(&self) -> &str {
1053        self.schema.name()
1054    }
1055}
1056
1057fn build_tree<'a>(
1058    tp: &'a TypePtr,
1059    root_idx: usize,
1060    mut max_rep_level: i16,
1061    mut max_def_level: i16,
1062    leaves: &mut Vec<ColumnDescPtr>,
1063    leaf_to_base: &mut Vec<usize>,
1064    path_so_far: &mut Vec<&'a str>,
1065) {
1066    assert!(tp.get_basic_info().has_repetition());
1067
1068    path_so_far.push(tp.name());
1069    match tp.get_basic_info().repetition() {
1070        Repetition::OPTIONAL => {
1071            max_def_level += 1;
1072        }
1073        Repetition::REPEATED => {
1074            max_def_level += 1;
1075            max_rep_level += 1;
1076        }
1077        _ => {}
1078    }
1079
1080    match tp.as_ref() {
1081        Type::PrimitiveType { .. } => {
1082            let mut path: Vec<String> = vec![];
1083            path.extend(path_so_far.iter().copied().map(String::from));
1084            leaves.push(Arc::new(ColumnDescriptor::new(
1085                tp.clone(),
1086                max_def_level,
1087                max_rep_level,
1088                ColumnPath::new(path),
1089            )));
1090            leaf_to_base.push(root_idx);
1091        }
1092        Type::GroupType { ref fields, .. } => {
1093            for f in fields {
1094                build_tree(
1095                    f,
1096                    root_idx,
1097                    max_rep_level,
1098                    max_def_level,
1099                    leaves,
1100                    leaf_to_base,
1101                    path_so_far,
1102                );
1103                path_so_far.pop();
1104            }
1105        }
1106    }
1107}
1108
1109/// Method to convert from Thrift.
1110pub fn from_thrift(elements: &[SchemaElement]) -> Result<TypePtr> {
1111    let mut index = 0;
1112    let mut schema_nodes = Vec::new();
1113    while index < elements.len() {
1114        let t = from_thrift_helper(elements, index)?;
1115        index = t.0;
1116        schema_nodes.push(t.1);
1117    }
1118    if schema_nodes.len() != 1 {
1119        return Err(general_err!(
1120            "Expected exactly one root node, but found {}",
1121            schema_nodes.len()
1122        ));
1123    }
1124
1125    Ok(schema_nodes.remove(0))
1126}
1127
1128/// Constructs a new Type from the `elements`, starting at index `index`.
1129/// The first result is the starting index for the next Type after this one. If it is
1130/// equal to `elements.len()`, then this Type is the last one.
1131/// The second result is the result Type.
1132fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize, TypePtr)> {
1133    // Whether or not the current node is root (message type).
1134    // There is only one message type node in the schema tree.
1135    let is_root_node = index == 0;
1136
1137    if index >= elements.len() {
1138        return Err(general_err!(
1139            "Index out of bound, index = {}, len = {}",
1140            index,
1141            elements.len()
1142        ));
1143    }
1144    let element = &elements[index];
1145    let converted_type = ConvertedType::try_from(element.converted_type)?;
1146    // LogicalType is only present in v2 Parquet files. ConvertedType is always
1147    // populated, regardless of the version of the file (v1 or v2).
1148    let logical_type = element
1149        .logical_type
1150        .as_ref()
1151        .map(|value| LogicalType::from(value.clone()));
1152    let field_id = elements[index].field_id;
1153    match elements[index].num_children {
1154        // From parquet-format:
1155        //   The children count is used to construct the nested relationship.
1156        //   This field is not set when the element is a primitive type
1157        // Sometimes parquet-cpp sets num_children field to 0 for primitive types, so we
1158        // have to handle this case too.
1159        None | Some(0) => {
1160            // primitive type
1161            if elements[index].repetition_type.is_none() {
1162                return Err(general_err!(
1163                    "Repetition level must be defined for a primitive type"
1164                ));
1165            }
1166            let repetition = Repetition::try_from(elements[index].repetition_type.unwrap())?;
1167            if let Some(type_) = elements[index].type_ {
1168                let physical_type = PhysicalType::try_from(type_)?;
1169                let length = elements[index].type_length.unwrap_or(-1);
1170                let scale = elements[index].scale.unwrap_or(-1);
1171                let precision = elements[index].precision.unwrap_or(-1);
1172                let name = &elements[index].name;
1173                let builder = Type::primitive_type_builder(name, physical_type)
1174                    .with_repetition(repetition)
1175                    .with_converted_type(converted_type)
1176                    .with_logical_type(logical_type)
1177                    .with_length(length)
1178                    .with_precision(precision)
1179                    .with_scale(scale)
1180                    .with_id(field_id);
1181                Ok((index + 1, Arc::new(builder.build()?)))
1182            } else {
1183                let mut builder = Type::group_type_builder(&elements[index].name)
1184                    .with_converted_type(converted_type)
1185                    .with_logical_type(logical_type)
1186                    .with_id(field_id);
1187                if !is_root_node {
1188                    // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1189                    // REPEATED for root node.
1190                    //
1191                    // We only set repetition for group types that are not top-level message
1192                    // type. According to parquet-format:
1193                    //   Root of the schema does not have a repetition_type.
1194                    //   All other types must have one.
1195                    builder = builder.with_repetition(repetition);
1196                }
1197                Ok((index + 1, Arc::new(builder.build().unwrap())))
1198            }
1199        }
1200        Some(n) => {
1201            let repetition = elements[index]
1202                .repetition_type
1203                .map(Repetition::try_from)
1204                .transpose()?;
1205
1206            let mut fields = vec![];
1207            let mut next_index = index + 1;
1208            for _ in 0..n {
1209                let child_result = from_thrift_helper(elements, next_index)?;
1210                next_index = child_result.0;
1211                fields.push(child_result.1);
1212            }
1213
1214            let mut builder = Type::group_type_builder(&elements[index].name)
1215                .with_converted_type(converted_type)
1216                .with_logical_type(logical_type)
1217                .with_fields(fields)
1218                .with_id(field_id);
1219            if let Some(rep) = repetition {
1220                // Sometimes parquet-cpp and parquet-mr set repetition level REQUIRED or
1221                // REPEATED for root node.
1222                //
1223                // We only set repetition for group types that are not top-level message
1224                // type. According to parquet-format:
1225                //   Root of the schema does not have a repetition_type.
1226                //   All other types must have one.
1227                if !is_root_node {
1228                    builder = builder.with_repetition(rep);
1229                }
1230            }
1231            Ok((next_index, Arc::new(builder.build().unwrap())))
1232        }
1233    }
1234}
1235
1236/// Method to convert to Thrift.
1237pub fn to_thrift(schema: &Type) -> Result<Vec<SchemaElement>> {
1238    if !schema.is_group() {
1239        return Err(general_err!("Root schema must be Group type"));
1240    }
1241    let mut elements: Vec<SchemaElement> = Vec::new();
1242    to_thrift_helper(schema, &mut elements);
1243    Ok(elements)
1244}
1245
1246/// Constructs list of `SchemaElement` from the schema using depth-first traversal.
1247/// Here we assume that schema is always valid and starts with group type.
1248fn to_thrift_helper(schema: &Type, elements: &mut Vec<SchemaElement>) {
1249    match *schema {
1250        Type::PrimitiveType {
1251            ref basic_info,
1252            physical_type,
1253            type_length,
1254            scale,
1255            precision,
1256        } => {
1257            let element = SchemaElement {
1258                type_: Some(physical_type.into()),
1259                type_length: if type_length >= 0 {
1260                    Some(type_length)
1261                } else {
1262                    None
1263                },
1264                repetition_type: Some(basic_info.repetition().into()),
1265                name: basic_info.name().to_owned(),
1266                num_children: None,
1267                converted_type: basic_info.converted_type().into(),
1268                scale: if scale >= 0 { Some(scale) } else { None },
1269                precision: if precision >= 0 {
1270                    Some(precision)
1271                } else {
1272                    None
1273                },
1274                field_id: if basic_info.has_id() {
1275                    Some(basic_info.id())
1276                } else {
1277                    None
1278                },
1279                logical_type: basic_info.logical_type().map(|value| value.into()),
1280            };
1281
1282            elements.push(element);
1283        }
1284        Type::GroupType {
1285            ref basic_info,
1286            ref fields,
1287        } => {
1288            let repetition = if basic_info.has_repetition() {
1289                Some(basic_info.repetition().into())
1290            } else {
1291                None
1292            };
1293
1294            let element = SchemaElement {
1295                type_: None,
1296                type_length: None,
1297                repetition_type: repetition,
1298                name: basic_info.name().to_owned(),
1299                num_children: Some(fields.len() as i32),
1300                converted_type: basic_info.converted_type().into(),
1301                scale: None,
1302                precision: None,
1303                field_id: if basic_info.has_id() {
1304                    Some(basic_info.id())
1305                } else {
1306                    None
1307                },
1308                logical_type: basic_info.logical_type().map(|value| value.into()),
1309            };
1310
1311            elements.push(element);
1312
1313            // Add child elements for a group
1314            for field in fields {
1315                to_thrift_helper(field, elements);
1316            }
1317        }
1318    }
1319}
1320
1321#[cfg(test)]
1322mod tests {
1323    use super::*;
1324
1325    use crate::schema::parser::parse_message_type;
1326
1327    // TODO: add tests for v2 types
1328
1329    #[test]
1330    fn test_primitive_type() {
1331        let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1332            .with_logical_type(Some(LogicalType::Integer {
1333                bit_width: 32,
1334                is_signed: true,
1335            }))
1336            .with_id(Some(0))
1337            .build();
1338        assert!(result.is_ok());
1339
1340        if let Ok(tp) = result {
1341            assert!(tp.is_primitive());
1342            assert!(!tp.is_group());
1343            let basic_info = tp.get_basic_info();
1344            assert_eq!(basic_info.repetition(), Repetition::OPTIONAL);
1345            assert_eq!(
1346                basic_info.logical_type(),
1347                Some(LogicalType::Integer {
1348                    bit_width: 32,
1349                    is_signed: true
1350                })
1351            );
1352            assert_eq!(basic_info.converted_type(), ConvertedType::INT_32);
1353            assert_eq!(basic_info.id(), 0);
1354            match tp {
1355                Type::PrimitiveType { physical_type, .. } => {
1356                    assert_eq!(physical_type, PhysicalType::INT32);
1357                }
1358                _ => panic!(),
1359            }
1360        }
1361
1362        // Test illegal inputs with logical type
1363        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1364            .with_repetition(Repetition::REPEATED)
1365            .with_logical_type(Some(LogicalType::Integer {
1366                is_signed: true,
1367                bit_width: 8,
1368            }))
1369            .build();
1370        assert!(result.is_err());
1371        if let Err(e) = result {
1372            assert_eq!(
1373                format!("{e}"),
1374                "Parquet error: Cannot annotate Integer { bit_width: 8, is_signed: true } from INT64 for field 'foo'"
1375            );
1376        }
1377
1378        // Test illegal inputs with converted type
1379        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1380            .with_repetition(Repetition::REPEATED)
1381            .with_converted_type(ConvertedType::BSON)
1382            .build();
1383        assert!(result.is_err());
1384        if let Err(e) = result {
1385            assert_eq!(
1386                format!("{e}"),
1387                "Parquet error: BSON cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1388            );
1389        }
1390
1391        result = Type::primitive_type_builder("foo", PhysicalType::INT96)
1392            .with_repetition(Repetition::REQUIRED)
1393            .with_converted_type(ConvertedType::DECIMAL)
1394            .with_precision(-1)
1395            .with_scale(-1)
1396            .build();
1397        assert!(result.is_err());
1398        if let Err(e) = result {
1399            assert_eq!(
1400                format!("{e}"),
1401                "Parquet error: DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
1402            );
1403        }
1404
1405        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1406            .with_repetition(Repetition::REQUIRED)
1407            .with_logical_type(Some(LogicalType::Decimal {
1408                scale: 32,
1409                precision: 12,
1410            }))
1411            .with_precision(-1)
1412            .with_scale(-1)
1413            .build();
1414        assert!(result.is_err());
1415        if let Err(e) = result {
1416            assert_eq!(
1417                format!("{e}"),
1418                "Parquet error: DECIMAL logical type scale 32 must match self.scale -1 for field 'foo'"
1419            );
1420        }
1421
1422        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1423            .with_repetition(Repetition::REQUIRED)
1424            .with_converted_type(ConvertedType::DECIMAL)
1425            .with_precision(-1)
1426            .with_scale(-1)
1427            .build();
1428        assert!(result.is_err());
1429        if let Err(e) = result {
1430            assert_eq!(
1431                format!("{e}"),
1432                "Parquet error: Invalid DECIMAL precision: -1"
1433            );
1434        }
1435
1436        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1437            .with_repetition(Repetition::REQUIRED)
1438            .with_converted_type(ConvertedType::DECIMAL)
1439            .with_precision(0)
1440            .with_scale(-1)
1441            .build();
1442        assert!(result.is_err());
1443        if let Err(e) = result {
1444            assert_eq!(
1445                format!("{e}"),
1446                "Parquet error: Invalid DECIMAL precision: 0"
1447            );
1448        }
1449
1450        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1451            .with_repetition(Repetition::REQUIRED)
1452            .with_converted_type(ConvertedType::DECIMAL)
1453            .with_precision(1)
1454            .with_scale(-1)
1455            .build();
1456        assert!(result.is_err());
1457        if let Err(e) = result {
1458            assert_eq!(format!("{e}"), "Parquet error: Invalid DECIMAL scale: -1");
1459        }
1460
1461        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1462            .with_repetition(Repetition::REQUIRED)
1463            .with_converted_type(ConvertedType::DECIMAL)
1464            .with_precision(1)
1465            .with_scale(2)
1466            .build();
1467        assert!(result.is_err());
1468        if let Err(e) = result {
1469            assert_eq!(
1470                format!("{e}"),
1471                "Parquet error: Invalid DECIMAL: scale (2) cannot be greater than precision (1)"
1472            );
1473        }
1474
1475        // It is OK if precision == scale
1476        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1477            .with_repetition(Repetition::REQUIRED)
1478            .with_converted_type(ConvertedType::DECIMAL)
1479            .with_precision(1)
1480            .with_scale(1)
1481            .build();
1482        assert!(result.is_ok());
1483
1484        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1485            .with_repetition(Repetition::REQUIRED)
1486            .with_converted_type(ConvertedType::DECIMAL)
1487            .with_precision(18)
1488            .with_scale(2)
1489            .build();
1490        assert!(result.is_err());
1491        if let Err(e) = result {
1492            assert_eq!(
1493                format!("{e}"),
1494                "Parquet error: Cannot represent INT32 as DECIMAL with precision 18"
1495            );
1496        }
1497
1498        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1499            .with_repetition(Repetition::REQUIRED)
1500            .with_converted_type(ConvertedType::DECIMAL)
1501            .with_precision(32)
1502            .with_scale(2)
1503            .build();
1504        assert!(result.is_err());
1505        if let Err(e) = result {
1506            assert_eq!(
1507                format!("{e}"),
1508                "Parquet error: Cannot represent INT64 as DECIMAL with precision 32"
1509            );
1510        }
1511
1512        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1513            .with_repetition(Repetition::REQUIRED)
1514            .with_converted_type(ConvertedType::DECIMAL)
1515            .with_length(5)
1516            .with_precision(12)
1517            .with_scale(2)
1518            .build();
1519        assert!(result.is_err());
1520        if let Err(e) = result {
1521            assert_eq!(
1522                format!("{e}"),
1523                "Parquet error: Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12. The max precision can only be 11"
1524            );
1525        }
1526
1527        result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1528            .with_repetition(Repetition::REQUIRED)
1529            .with_converted_type(ConvertedType::UINT_8)
1530            .build();
1531        assert!(result.is_err());
1532        if let Err(e) = result {
1533            assert_eq!(
1534                format!("{e}"),
1535                "Parquet error: UINT_8 cannot annotate field 'foo' because it is not a INT32 field"
1536            );
1537        }
1538
1539        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1540            .with_repetition(Repetition::REQUIRED)
1541            .with_converted_type(ConvertedType::TIME_MICROS)
1542            .build();
1543        assert!(result.is_err());
1544        if let Err(e) = result {
1545            assert_eq!(
1546                format!("{e}"),
1547                "Parquet error: TIME_MICROS cannot annotate field 'foo' because it is not a INT64 field"
1548            );
1549        }
1550
1551        result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1552            .with_repetition(Repetition::REQUIRED)
1553            .with_converted_type(ConvertedType::INTERVAL)
1554            .build();
1555        assert!(result.is_err());
1556        if let Err(e) = result {
1557            assert_eq!(
1558                format!("{e}"),
1559                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1560            );
1561        }
1562
1563        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1564            .with_repetition(Repetition::REQUIRED)
1565            .with_converted_type(ConvertedType::INTERVAL)
1566            .with_length(1)
1567            .build();
1568        assert!(result.is_err());
1569        if let Err(e) = result {
1570            assert_eq!(
1571                format!("{e}"),
1572                "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1573            );
1574        }
1575
1576        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1577            .with_repetition(Repetition::REQUIRED)
1578            .with_converted_type(ConvertedType::ENUM)
1579            .build();
1580        assert!(result.is_err());
1581        if let Err(e) = result {
1582            assert_eq!(
1583                format!("{e}"),
1584                "Parquet error: ENUM cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1585            );
1586        }
1587
1588        result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1589            .with_repetition(Repetition::REQUIRED)
1590            .with_converted_type(ConvertedType::MAP)
1591            .build();
1592        assert!(result.is_err());
1593        if let Err(e) = result {
1594            assert_eq!(
1595                format!("{e}"),
1596                "Parquet error: MAP cannot be applied to primitive field 'foo'"
1597            );
1598        }
1599
1600        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1601            .with_repetition(Repetition::REQUIRED)
1602            .with_converted_type(ConvertedType::DECIMAL)
1603            .with_length(-1)
1604            .build();
1605        assert!(result.is_err());
1606        if let Err(e) = result {
1607            assert_eq!(
1608                format!("{e}"),
1609                "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'"
1610            );
1611        }
1612
1613        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1614            .with_repetition(Repetition::REQUIRED)
1615            .with_logical_type(Some(LogicalType::Float16))
1616            .with_length(2)
1617            .build();
1618        assert!(result.is_ok());
1619
1620        // Can't be other than FIXED_LEN_BYTE_ARRAY for physical type
1621        result = Type::primitive_type_builder("foo", PhysicalType::FLOAT)
1622            .with_repetition(Repetition::REQUIRED)
1623            .with_logical_type(Some(LogicalType::Float16))
1624            .with_length(2)
1625            .build();
1626        assert!(result.is_err());
1627        if let Err(e) = result {
1628            assert_eq!(
1629                format!("{e}"),
1630                "Parquet error: Cannot annotate Float16 from FLOAT for field 'foo'"
1631            );
1632        }
1633
1634        // Must have length 2
1635        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1636            .with_repetition(Repetition::REQUIRED)
1637            .with_logical_type(Some(LogicalType::Float16))
1638            .with_length(4)
1639            .build();
1640        assert!(result.is_err());
1641        if let Err(e) = result {
1642            assert_eq!(
1643                format!("{e}"),
1644                "Parquet error: FLOAT16 cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(2) field"
1645            );
1646        }
1647
1648        // Must have length 16
1649        result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1650            .with_repetition(Repetition::REQUIRED)
1651            .with_logical_type(Some(LogicalType::Uuid))
1652            .with_length(15)
1653            .build();
1654        assert!(result.is_err());
1655        if let Err(e) = result {
1656            assert_eq!(
1657                format!("{e}"),
1658                "Parquet error: UUID cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(16) field"
1659            );
1660        }
1661    }
1662
/// Builds a repeated, `List`-annotated group with two primitive children and checks
/// the basic info and the child ordering reported by the resulting type.
#[test]
fn test_group_type() {
    let int_child = Type::primitive_type_builder("f1", PhysicalType::INT32)
        .with_converted_type(ConvertedType::INT_32)
        .with_id(Some(0))
        .build();
    assert!(int_child.is_ok());
    let utf8_child = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY)
        .with_converted_type(ConvertedType::UTF8)
        .with_id(Some(1))
        .build();
    assert!(utf8_child.is_ok());

    let children = vec![int_child, utf8_child]
        .into_iter()
        .map(|child| Arc::new(child.unwrap()))
        .collect::<Vec<_>>();

    let built = Type::group_type_builder("foo")
        .with_repetition(Repetition::REPEATED)
        .with_logical_type(Some(LogicalType::List))
        .with_fields(children)
        .with_id(Some(1))
        .build();
    assert!(built.is_ok());

    let group = built.unwrap();
    let info = group.get_basic_info();

    // Kind of the node.
    assert!(group.is_group());
    assert!(!group.is_primitive());

    // Basic info; only the logical type was set, yet the converted type reads LIST.
    assert_eq!(info.repetition(), Repetition::REPEATED);
    assert_eq!(info.logical_type(), Some(LogicalType::List));
    assert_eq!(info.converted_type(), ConvertedType::LIST);
    assert_eq!(info.id(), 1);

    // Children keep their insertion order.
    let child_names: Vec<&str> = group.get_fields().iter().map(|f| f.name()).collect();
    assert_eq!(child_names, ["f1", "f2"]);
}
1698
/// Runs the column descriptor checks and reports any error from type construction
/// in the failure message.
#[test]
fn test_column_descriptor() {
    if let Err(err) = test_column_descriptor_helper() {
        panic!("Expected result to be OK but got err:\n {}", err);
    }
}
1708
1709    fn test_column_descriptor_helper() -> Result<()> {
1710        let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY)
1711            .with_converted_type(ConvertedType::UTF8)
1712            .build()?;
1713
1714        let descr = ColumnDescriptor::new(Arc::new(tp), 4, 1, ColumnPath::from("name"));
1715
1716        assert_eq!(descr.path(), &ColumnPath::from("name"));
1717        assert_eq!(descr.converted_type(), ConvertedType::UTF8);
1718        assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY);
1719        assert_eq!(descr.max_def_level(), 4);
1720        assert_eq!(descr.max_rep_level(), 1);
1721        assert_eq!(descr.name(), "name");
1722        assert_eq!(descr.type_length(), -1);
1723        assert_eq!(descr.type_precision(), -1);
1724        assert_eq!(descr.type_scale(), -1);
1725
1726        Ok(())
1727    }
1728
/// Runs the schema descriptor checks and reports any error from type construction
/// in the failure message.
#[test]
fn test_schema_descriptor() {
    if let Err(err) = test_schema_descriptor_helper() {
        panic!("Expected result to be OK but got err:\n {}", err);
    }
}
1738
1739    // A helper fn to avoid handling the results from type creation
1740    fn test_schema_descriptor_helper() -> Result<()> {
1741        let mut fields = vec![];
1742
1743        let inta = Type::primitive_type_builder("a", PhysicalType::INT32)
1744            .with_repetition(Repetition::REQUIRED)
1745            .with_converted_type(ConvertedType::INT_32)
1746            .build()?;
1747        fields.push(Arc::new(inta));
1748        let intb = Type::primitive_type_builder("b", PhysicalType::INT64)
1749            .with_converted_type(ConvertedType::INT_64)
1750            .build()?;
1751        fields.push(Arc::new(intb));
1752        let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY)
1753            .with_repetition(Repetition::REPEATED)
1754            .with_converted_type(ConvertedType::UTF8)
1755            .build()?;
1756        fields.push(Arc::new(intc));
1757
1758        // 3-level list encoding
1759        let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64)
1760            .with_repetition(Repetition::REQUIRED)
1761            .with_converted_type(ConvertedType::INT_64)
1762            .build()?;
1763        let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?;
1764        let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32)
1765            .with_repetition(Repetition::REPEATED)
1766            .with_converted_type(ConvertedType::INT_32)
1767            .build()?;
1768        let list = Type::group_type_builder("records")
1769            .with_repetition(Repetition::REPEATED)
1770            .with_converted_type(ConvertedType::LIST)
1771            .with_fields(vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)])
1772            .build()?;
1773        let bag = Type::group_type_builder("bag")
1774            .with_repetition(Repetition::OPTIONAL)
1775            .with_fields(vec![Arc::new(list)])
1776            .build()?;
1777        fields.push(Arc::new(bag));
1778
1779        let schema = Type::group_type_builder("schema")
1780            .with_repetition(Repetition::REPEATED)
1781            .with_fields(fields)
1782            .build()?;
1783        let descr = SchemaDescriptor::new(Arc::new(schema));
1784
1785        let nleaves = 6;
1786        assert_eq!(descr.num_columns(), nleaves);
1787
1788        //                             mdef mrep
1789        // required int32 a            0    0
1790        // optional int64 b            1    0
1791        // repeated byte_array c       1    1
1792        // optional group bag          1    0
1793        //   repeated group records    2    1
1794        //     required int64 item1    2    1
1795        //     optional boolean item2  3    1
1796        //     repeated int32 item3    3    2
1797        let ex_max_def_levels = [0, 1, 1, 2, 3, 3];
1798        let ex_max_rep_levels = [0, 0, 1, 1, 1, 2];
1799
1800        for i in 0..nleaves {
1801            let col = descr.column(i);
1802            assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{i}");
1803            assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{i}");
1804        }
1805
1806        assert_eq!(descr.column(0).path().string(), "a");
1807        assert_eq!(descr.column(1).path().string(), "b");
1808        assert_eq!(descr.column(2).path().string(), "c");
1809        assert_eq!(descr.column(3).path().string(), "bag.records.item1");
1810        assert_eq!(descr.column(4).path().string(), "bag.records.item2");
1811        assert_eq!(descr.column(5).path().string(), "bag.records.item3");
1812
1813        assert_eq!(descr.get_column_root(0).name(), "a");
1814        assert_eq!(descr.get_column_root(3).name(), "bag");
1815        assert_eq!(descr.get_column_root(4).name(), "bag");
1816        assert_eq!(descr.get_column_root(5).name(), "bag");
1817
1818        Ok(())
1819    }
1820
/// Parses a schema with a required leaf, an optional struct and an optional list,
/// then checks the max definition/repetition level computed for each leaf column.
#[test]
fn test_schema_build_tree_def_rep_levels() {
    let message_type = "
    message spark_schema {
      REQUIRED INT32 a;
      OPTIONAL group b {
        OPTIONAL INT32 _1;
        OPTIONAL INT32 _2;
      }
      OPTIONAL group c (LIST) {
        REPEATED group list {
          OPTIONAL INT32 element;
        }
      }
    }
    ";
    let parsed = parse_message_type(message_type).expect("should parse schema");
    let descr = SchemaDescriptor::new(Arc::new(parsed));

    // (max definition level, max repetition level) per leaf, in schema order.
    let expected_levels = [
        (0, 0), // required int32 a
        (2, 0), // optional int32 b._1
        (2, 0), // optional int32 b._2
        (3, 1), // repeated optional int32 c.list.element
    ];
    for (idx, &(max_def, max_rep)) in expected_levels.iter().enumerate() {
        let leaf = descr.column(idx);
        assert_eq!(leaf.max_def_level(), max_def);
        assert_eq!(leaf.max_rep_level(), max_rep);
    }
}
1852
/// Asking a group type for its physical type must panic with the documented message.
#[test]
#[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")]
fn test_get_physical_type_panic() {
    let builder = Type::group_type_builder("records").with_repetition(Repetition::REPEATED);
    let group = builder.build().unwrap();
    // The returned value is irrelevant; the call itself is expected to panic.
    let _ = group.get_physical_type();
}
1862
/// A primitive type reports the physical type it was built with.
#[test]
fn test_get_physical_type_primitive() {
    // Builds a primitive field "f" and reads its physical type back.
    let physical_type_of = |physical: PhysicalType| {
        Type::primitive_type_builder("f", physical)
            .build()
            .unwrap()
            .get_physical_type()
    };

    assert_eq!(physical_type_of(PhysicalType::INT64), PhysicalType::INT64);
    assert_eq!(
        physical_type_of(PhysicalType::BYTE_ARRAY),
        PhysicalType::BYTE_ARRAY
    );
}
1875
/// Exercises `check_contains` between two primitive types: it holds for matching
/// name, physical type and repetition, regardless of the converted type.
#[test]
fn test_check_contains_primitive_primitive() {
    // Primitive with the given name and physical type, everything else left at the
    // builder's defaults.
    let plain = |name: &'static str, physical: PhysicalType| {
        Type::primitive_type_builder(name, physical)
            .build()
            .unwrap()
    };
    // INT32 field "f" carrying the given converted type.
    let annotated = |converted: ConvertedType| {
        Type::primitive_type_builder("f", PhysicalType::INT32)
            .with_converted_type(converted)
            .build()
            .unwrap()
    };
    // INT32 field "f" with an explicit repetition.
    let repeated_as = |repetition: Repetition| {
        Type::primitive_type_builder("f", PhysicalType::INT32)
            .with_repetition(repetition)
            .build()
            .unwrap()
    };

    // OK
    let (lhs, rhs) = (
        plain("f", PhysicalType::INT32),
        plain("f", PhysicalType::INT32),
    );
    assert!(lhs.check_contains(&rhs));

    // OK: different logical type does not affect check_contains
    let (lhs, rhs) = (
        annotated(ConvertedType::UINT_8),
        annotated(ConvertedType::UINT_16),
    );
    assert!(lhs.check_contains(&rhs));

    // KO: different name
    let (lhs, rhs) = (
        plain("f1", PhysicalType::INT32),
        plain("f2", PhysicalType::INT32),
    );
    assert!(!lhs.check_contains(&rhs));

    // KO: different type
    let (lhs, rhs) = (
        plain("f", PhysicalType::INT32),
        plain("f", PhysicalType::INT64),
    );
    assert!(!lhs.check_contains(&rhs));

    // KO: different repetition
    let (lhs, rhs) = (
        repeated_as(Repetition::REQUIRED),
        repeated_as(Repetition::OPTIONAL),
    );
    assert!(!lhs.check_contains(&rhs));
}
1927
1928    // function to create a new group type for testing
1929    fn test_new_group_type(name: &str, repetition: Repetition, types: Vec<Type>) -> Type {
1930        Type::group_type_builder(name)
1931            .with_repetition(repetition)
1932            .with_fields(types.into_iter().map(Arc::new).collect())
1933            .build()
1934            .unwrap()
1935    }
1936
/// Exercises `check_contains` between two group types: the name and repetition must
/// match, and every field of the argument must be contained in the receiver.
#[test]
fn test_check_contains_group_group() {
    // Primitive leaf with builder defaults.
    let leaf = |name: &'static str, physical: PhysicalType| {
        Type::primitive_type_builder(name, physical)
            .build()
            .unwrap()
    };
    // Group without fields and without an explicit repetition.
    let empty_group = |name: &'static str| Type::group_type_builder(name).build().unwrap();
    // Repeated group "f" holding the given children.
    let repeated_f = |children: Vec<Type>| test_new_group_type("f", Repetition::REPEATED, children);

    // OK: should match okay with empty fields
    let (outer, inner) = (empty_group("f"), empty_group("f"));
    assert!(outer.check_contains(&inner));
    assert!(!outer.is_optional());

    // OK: fields match
    let outer = repeated_f(vec![
        leaf("f1", PhysicalType::INT32),
        leaf("f2", PhysicalType::INT64),
    ]);
    let inner = repeated_f(vec![
        leaf("f1", PhysicalType::INT32),
        leaf("f2", PhysicalType::INT64),
    ]);
    assert!(outer.check_contains(&inner));

    // OK: subset of fields
    let outer = repeated_f(vec![
        leaf("f1", PhysicalType::INT32),
        leaf("f2", PhysicalType::INT64),
    ]);
    let inner = repeated_f(vec![leaf("f2", PhysicalType::INT64)]);
    assert!(outer.check_contains(&inner));

    // KO: different name
    let (outer, inner) = (empty_group("f1"), empty_group("f2"));
    assert!(!outer.check_contains(&inner));

    // KO: different repetition
    let outer = Type::group_type_builder("f")
        .with_repetition(Repetition::OPTIONAL)
        .build()
        .unwrap();
    let inner = Type::group_type_builder("f")
        .with_repetition(Repetition::REPEATED)
        .build()
        .unwrap();
    assert!(!outer.check_contains(&inner));

    // KO: different fields (same field names, but "f2" has another physical type)
    let outer = repeated_f(vec![
        leaf("f1", PhysicalType::INT32),
        leaf("f2", PhysicalType::INT64),
    ]);
    let inner = repeated_f(vec![
        leaf("f1", PhysicalType::INT32),
        leaf("f2", PhysicalType::BOOLEAN),
    ]);
    assert!(!outer.check_contains(&inner));

    // KO: different fields (the argument has a field "f3" the receiver lacks)
    let outer = repeated_f(vec![
        leaf("f1", PhysicalType::INT32),
        leaf("f2", PhysicalType::INT64),
    ]);
    let inner = repeated_f(vec![leaf("f3", PhysicalType::INT32)]);
    assert!(!outer.check_contains(&inner));
}
2059
/// Exercises `check_contains` across group and primitive types, plus a nested case
/// where containment holds in one direction only.
#[test]
fn test_check_contains_group_primitive() {
    // Primitive leaf with builder defaults.
    let leaf = |name: &'static str, physical: PhysicalType| {
        Type::primitive_type_builder(name, physical)
            .build()
            .unwrap()
    };
    // Repeated group with the given name and children.
    let repeated =
        |name: &'static str, children: Vec<Type>| test_new_group_type(name, Repetition::REPEATED, children);

    // KO: should not match
    let group = Type::group_type_builder("f").build().unwrap();
    let primitive = leaf("f", PhysicalType::INT64);
    assert!(!group.check_contains(&primitive));
    assert!(!primitive.check_contains(&group));

    // KO: should not match when primitive field is part of group type
    let group = repeated("f", vec![leaf("f1", PhysicalType::INT32)]);
    let primitive = leaf("f1", PhysicalType::INT32);
    assert!(!group.check_contains(&primitive));
    assert!(!primitive.check_contains(&group));

    // OK: match nested types
    let wide = repeated(
        "a",
        vec![
            repeated("b", vec![leaf("c", PhysicalType::INT32)]),
            leaf("d", PhysicalType::INT64),
            leaf("e", PhysicalType::BOOLEAN),
        ],
    );
    let narrow = repeated(
        "a",
        vec![repeated("b", vec![leaf("c", PhysicalType::INT32)])],
    );
    assert!(wide.check_contains(&narrow)); // should match
    assert!(!narrow.check_contains(&wide)); // should fail
}
2118
/// Converting a schema whose root is a primitive type to thrift must fail with the
/// "Root schema must be Group type" error.
#[test]
fn test_schema_type_thrift_conversion_err() {
    let primitive_root = Type::primitive_type_builder("col", PhysicalType::INT32)
        .build()
        .unwrap();

    let conversion = to_thrift(&primitive_root);
    assert!(conversion.is_err());
    if let Err(err) = conversion {
        assert_eq!(
            err.to_string(),
            "Parquet error: Root schema must be Group type"
        );
    }
}
2133
/// Round-trips a schema with nested lists, maps and structs through the thrift
/// representation and checks that the result equals the parsed original.
#[test]
fn test_schema_type_thrift_conversion() {
    let message_type = "
    message conversions {
      REQUIRED INT64 id;
      OPTIONAL FIXED_LEN_BYTE_ARRAY (2) f16 (FLOAT16);
      OPTIONAL group int_array_Array (LIST) {
        REPEATED group list {
          OPTIONAL group element (LIST) {
            REPEATED group list {
              OPTIONAL INT32 element;
            }
          }
        }
      }
      OPTIONAL group int_map (MAP) {
        REPEATED group map (MAP_KEY_VALUE) {
          REQUIRED BYTE_ARRAY key (UTF8);
          OPTIONAL INT32 value;
        }
      }
      OPTIONAL group int_Map_Array (LIST) {
        REPEATED group list {
          OPTIONAL group g (MAP) {
            REPEATED group map (MAP_KEY_VALUE) {
              REQUIRED BYTE_ARRAY key (UTF8);
              OPTIONAL group value {
                OPTIONAL group H {
                  OPTIONAL group i (LIST) {
                    REPEATED group list {
                      OPTIONAL DOUBLE element;
                    }
                  }
                }
              }
            }
          }
        }
      }
      OPTIONAL group nested_struct {
        OPTIONAL INT32 A;
        OPTIONAL group b (LIST) {
          REPEATED group list {
            REQUIRED FIXED_LEN_BYTE_ARRAY (16) element;
          }
        }
      }
    }
    ";
    let expected = Arc::new(parse_message_type(message_type).unwrap());

    // Type -> thrift elements -> Type
    let elements = to_thrift(&expected).unwrap();
    let round_tripped = from_thrift(&elements).unwrap();

    assert_eq!(round_tripped, expected);
}
2188
/// Round-trips a schema with DECIMAL-annotated columns through the thrift
/// representation and checks that the result equals the parsed original.
#[test]
fn test_schema_type_thrift_conversion_decimal() {
    let message_type = "
    message decimals {
      OPTIONAL INT32 field0;
      OPTIONAL INT64 field1 (DECIMAL (18, 2));
      OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18));
      OPTIONAL BYTE_ARRAY field3 (DECIMAL (9));
    }
    ";
    let expected = Arc::new(parse_message_type(message_type).unwrap());

    // Type -> thrift elements -> Type
    let elements = to_thrift(&expected).unwrap();
    let round_tripped = from_thrift(&elements).unwrap();

    assert_eq!(round_tripped, expected);
}
2204
// Checks conversion from thrift when `num_children` is `Some(0)` rather than `None`:
// the resulting schema must still equal the one the elements were generated from.
#[test]
fn test_schema_from_thrift_with_num_children_set() {
    // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
    let message_type = "
    message schema {
      OPTIONAL BYTE_ARRAY id (UTF8);
      OPTIONAL BYTE_ARRAY name (UTF8);
      OPTIONAL BYTE_ARRAY message (UTF8);
      OPTIONAL INT32 type (UINT_8);
      OPTIONAL INT64 author_time (TIMESTAMP_MILLIS);
      OPTIONAL INT64 __index_level_0__;
    }
    ";

    let expected = Arc::new(parse_message_type(message_type).unwrap());
    let mut elements = to_thrift(&expected).unwrap();

    // Replace every missing child count with an explicit zero.
    for element in elements
        .iter_mut()
        .filter(|element| element.num_children.is_none())
    {
        element.num_children = Some(0);
    }

    let converted = from_thrift(&elements).unwrap();
    assert_eq!(converted, expected);
}
2233
// Sometimes parquet-cpp sets a repetition type on the root node, which is against
// the format definition. Reading such a schema must still yield the same schema as
// one whose root carries no repetition.
#[test]
fn test_schema_from_thrift_root_has_repetition() {
    // schema definition written by parquet-cpp version 1.3.2-SNAPSHOT
    let message_type = "
    message schema {
      OPTIONAL BYTE_ARRAY a (UTF8);
      OPTIONAL INT32 b (UINT_8);
    }
    ";

    let expected = Arc::new(parse_message_type(message_type).unwrap());
    let mut elements = to_thrift(&expected).unwrap();

    // Force a repetition onto the root element.
    let root = &mut elements[0];
    root.repetition_type = Some(Repetition::REQUIRED.into());

    let converted = from_thrift(&elements).unwrap();
    assert_eq!(converted, expected);
}
2253
/// An empty message (a root group without fields) must convert from thrift back to
/// the same schema, even when the root element carries a repetition.
#[test]
fn test_schema_from_thrift_group_has_no_child() {
    let message_type = "message schema {}";

    let expected = Arc::new(parse_message_type(message_type).unwrap());
    let mut elements = to_thrift(&expected).unwrap();

    // Force a repetition onto the root element.
    let root = &mut elements[0];
    root.repetition_type = Some(Repetition::REQUIRED.into());

    let converted = from_thrift(&elements).unwrap();
    assert_eq!(converted, expected);
}
2265}