1use std::{collections::HashMap, fmt, sync::Arc};
21
22use crate::file::metadata::HeapSize;
23use crate::format::SchemaElement;
24
25use crate::basic::{
26 ColumnOrder, ConvertedType, LogicalType, Repetition, SortOrder, TimeUnit, Type as PhysicalType,
27};
28use crate::errors::{ParquetError, Result};
29
/// Shared, reference-counted pointer to a schema [`Type`] node.
pub type TypePtr = Arc<Type>;
/// Shared, reference-counted pointer to a [`SchemaDescriptor`].
pub type SchemaDescPtr = Arc<SchemaDescriptor>;
/// Shared, reference-counted pointer to a [`ColumnDescriptor`].
pub type ColumnDescPtr = Arc<ColumnDescriptor>;
39
/// A node of a schema tree: either a primitive leaf or a group holding child nodes.
///
/// Instances are normally created through [`Type::primitive_type_builder`] and
/// [`Type::group_type_builder`].
#[derive(Clone, Debug, PartialEq)]
pub enum Type {
    /// Leaf node that carries a physical type.
    PrimitiveType {
        /// Name, repetition, converted/logical type and optional id of the node.
        basic_info: BasicTypeInfo,
        /// Physical type of the values.
        physical_type: PhysicalType,
        /// Type length; the primitive builder defaults this to `-1` and requires a
        /// non-negative value for `FIXED_LEN_BYTE_ARRAY`.
        type_length: i32,
        /// Decimal scale; the primitive builder defaults this to `-1`.
        scale: i32,
        /// Decimal precision; the primitive builder defaults this to `-1`.
        precision: i32,
    },
    /// Inner node that owns an ordered list of child nodes.
    GroupType {
        /// Name, repetition, converted/logical type and optional id of the node.
        basic_info: BasicTypeInfo,
        /// Child nodes, in declaration order.
        fields: Vec<TypePtr>,
    },
}
69
70impl HeapSize for Type {
71 fn heap_size(&self) -> usize {
72 match self {
73 Type::PrimitiveType { basic_info, .. } => basic_info.heap_size(),
74 Type::GroupType { basic_info, fields } => basic_info.heap_size() + fields.heap_size(),
75 }
76 }
77}
78
79impl Type {
80 pub fn primitive_type_builder(name: &str, physical_type: PhysicalType) -> PrimitiveTypeBuilder {
82 PrimitiveTypeBuilder::new(name, physical_type)
83 }
84
85 pub fn group_type_builder(name: &str) -> GroupTypeBuilder {
87 GroupTypeBuilder::new(name)
88 }
89
90 pub fn get_basic_info(&self) -> &BasicTypeInfo {
92 match *self {
93 Type::PrimitiveType { ref basic_info, .. } => basic_info,
94 Type::GroupType { ref basic_info, .. } => basic_info,
95 }
96 }
97
98 pub fn name(&self) -> &str {
100 self.get_basic_info().name()
101 }
102
103 pub fn get_fields(&self) -> &[TypePtr] {
107 match *self {
108 Type::GroupType { ref fields, .. } => &fields[..],
109 _ => panic!("Cannot call get_fields() on a non-group type"),
110 }
111 }
112
113 pub fn get_physical_type(&self) -> PhysicalType {
116 match *self {
117 Type::PrimitiveType {
118 basic_info: _,
119 physical_type,
120 ..
121 } => physical_type,
122 _ => panic!("Cannot call get_physical_type() on a non-primitive type"),
123 }
124 }
125
126 pub fn get_precision(&self) -> i32 {
129 match *self {
130 Type::PrimitiveType { precision, .. } => precision,
131 _ => panic!("Cannot call get_precision() on non-primitive type"),
132 }
133 }
134
135 pub fn get_scale(&self) -> i32 {
138 match *self {
139 Type::PrimitiveType { scale, .. } => scale,
140 _ => panic!("Cannot call get_scale() on non-primitive type"),
141 }
142 }
143
144 pub fn check_contains(&self, sub_type: &Type) -> bool {
147 let basic_match = self.get_basic_info().name() == sub_type.get_basic_info().name()
149 && (self.is_schema() && sub_type.is_schema()
150 || !self.is_schema()
151 && !sub_type.is_schema()
152 && self.get_basic_info().repetition()
153 == sub_type.get_basic_info().repetition());
154
155 match *self {
156 Type::PrimitiveType { .. } if basic_match && sub_type.is_primitive() => {
157 self.get_physical_type() == sub_type.get_physical_type()
158 }
159 Type::GroupType { .. } if basic_match && sub_type.is_group() => {
160 let mut field_map = HashMap::new();
162 for field in self.get_fields() {
163 field_map.insert(field.name(), field);
164 }
165
166 for field in sub_type.get_fields() {
167 if !field_map
168 .get(field.name())
169 .map(|tpe| tpe.check_contains(field))
170 .unwrap_or(false)
171 {
172 return false;
173 }
174 }
175 true
176 }
177 _ => false,
178 }
179 }
180
181 pub fn is_primitive(&self) -> bool {
183 matches!(*self, Type::PrimitiveType { .. })
184 }
185
186 pub fn is_group(&self) -> bool {
188 matches!(*self, Type::GroupType { .. })
189 }
190
191 pub fn is_schema(&self) -> bool {
193 match *self {
194 Type::GroupType { ref basic_info, .. } => !basic_info.has_repetition(),
195 _ => false,
196 }
197 }
198
199 pub fn is_optional(&self) -> bool {
202 self.get_basic_info().has_repetition()
203 && self.get_basic_info().repetition() != Repetition::REQUIRED
204 }
205}
206
/// Builder for [`Type::PrimitiveType`] nodes.
///
/// Collects the settings of a primitive field; validation happens when `build` is called.
pub struct PrimitiveTypeBuilder<'a> {
    /// Field name, borrowed for the lifetime of the builder.
    name: &'a str,
    /// Repetition of the field; `new` defaults it to `OPTIONAL`.
    repetition: Repetition,
    /// Physical type of the field.
    physical_type: PhysicalType,
    /// Converted (legacy) type annotation; `new` defaults it to `NONE`.
    converted_type: ConvertedType,
    /// Optional logical type annotation.
    logical_type: Option<LogicalType>,
    /// Type length; `-1` until set.
    length: i32,
    /// Decimal precision; `-1` until set.
    precision: i32,
    /// Decimal scale; `-1` until set.
    scale: i32,
    /// Optional field id.
    id: Option<i32>,
}
221
222impl<'a> PrimitiveTypeBuilder<'a> {
223 pub fn new(name: &'a str, physical_type: PhysicalType) -> Self {
225 Self {
226 name,
227 repetition: Repetition::OPTIONAL,
228 physical_type,
229 converted_type: ConvertedType::NONE,
230 logical_type: None,
231 length: -1,
232 precision: -1,
233 scale: -1,
234 id: None,
235 }
236 }
237
238 pub fn with_repetition(self, repetition: Repetition) -> Self {
240 Self { repetition, ..self }
241 }
242
243 pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
245 Self {
246 converted_type,
247 ..self
248 }
249 }
250
251 pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
255 Self {
256 logical_type,
257 ..self
258 }
259 }
260
261 pub fn with_length(self, length: i32) -> Self {
266 Self { length, ..self }
267 }
268
269 pub fn with_precision(self, precision: i32) -> Self {
272 Self { precision, ..self }
273 }
274
275 pub fn with_scale(self, scale: i32) -> Self {
278 Self { scale, ..self }
279 }
280
281 pub fn with_id(self, id: Option<i32>) -> Self {
283 Self { id, ..self }
284 }
285
286 pub fn build(self) -> Result<Type> {
289 let mut basic_info = BasicTypeInfo {
290 name: String::from(self.name),
291 repetition: Some(self.repetition),
292 converted_type: self.converted_type,
293 logical_type: self.logical_type.clone(),
294 id: self.id,
295 };
296
297 if self.physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY && self.length < 0 {
299 return Err(general_err!(
300 "Invalid FIXED_LEN_BYTE_ARRAY length: {} for field '{}'",
301 self.length,
302 self.name
303 ));
304 }
305
306 if let Some(logical_type) = &self.logical_type {
307 if self.converted_type != ConvertedType::NONE {
310 if ConvertedType::from(self.logical_type.clone()) != self.converted_type {
311 return Err(general_err!(
312 "Logical type {:?} is incompatible with converted type {} for field '{}'",
313 logical_type,
314 self.converted_type,
315 self.name
316 ));
317 }
318 } else {
319 basic_info.converted_type = self.logical_type.clone().into();
321 }
322 match (logical_type, self.physical_type) {
324 (LogicalType::Map, _) | (LogicalType::List, _) => {
325 return Err(general_err!(
326 "{:?} cannot be applied to a primitive type for field '{}'",
327 logical_type,
328 self.name
329 ));
330 }
331 (LogicalType::Enum, PhysicalType::BYTE_ARRAY) => {}
332 (LogicalType::Decimal { scale, precision }, _) => {
333 if *scale != self.scale {
335 return Err(general_err!(
336 "DECIMAL logical type scale {} must match self.scale {} for field '{}'",
337 scale,
338 self.scale,
339 self.name
340 ));
341 }
342 if *precision != self.precision {
343 return Err(general_err!(
344 "DECIMAL logical type precision {} must match self.precision {} for field '{}'",
345 precision,
346 self.precision,
347 self.name
348 ));
349 }
350 self.check_decimal_precision_scale()?;
351 }
352 (LogicalType::Date, PhysicalType::INT32) => {}
353 (
354 LogicalType::Time {
355 unit: TimeUnit::MILLIS(_),
356 ..
357 },
358 PhysicalType::INT32,
359 ) => {}
360 (LogicalType::Time { unit, .. }, PhysicalType::INT64) => {
361 if *unit == TimeUnit::MILLIS(Default::default()) {
362 return Err(general_err!(
363 "Cannot use millisecond unit on INT64 type for field '{}'",
364 self.name
365 ));
366 }
367 }
368 (LogicalType::Timestamp { .. }, PhysicalType::INT64) => {}
369 (LogicalType::Integer { bit_width, .. }, PhysicalType::INT32)
370 if *bit_width <= 32 => {}
371 (LogicalType::Integer { bit_width, .. }, PhysicalType::INT64)
372 if *bit_width == 64 => {}
373 (LogicalType::Unknown, PhysicalType::INT32) => {}
375 (LogicalType::String, PhysicalType::BYTE_ARRAY) => {}
376 (LogicalType::Json, PhysicalType::BYTE_ARRAY) => {}
377 (LogicalType::Bson, PhysicalType::BYTE_ARRAY) => {}
378 (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) if self.length == 16 => {}
379 (LogicalType::Uuid, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
380 return Err(general_err!(
381 "UUID cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(16) field",
382 self.name
383 ))
384 }
385 (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY)
386 if self.length == 2 => {}
387 (LogicalType::Float16, PhysicalType::FIXED_LEN_BYTE_ARRAY) => {
388 return Err(general_err!(
389 "FLOAT16 cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(2) field",
390 self.name
391 ))
392 }
393 (a, b) => {
394 return Err(general_err!(
395 "Cannot annotate {:?} from {} for field '{}'",
396 a,
397 b,
398 self.name
399 ))
400 }
401 }
402 }
403
404 match self.converted_type {
405 ConvertedType::NONE => {}
406 ConvertedType::UTF8 | ConvertedType::BSON | ConvertedType::JSON => {
407 if self.physical_type != PhysicalType::BYTE_ARRAY {
408 return Err(general_err!(
409 "{} cannot annotate field '{}' because it is not a BYTE_ARRAY field",
410 self.converted_type,
411 self.name
412 ));
413 }
414 }
415 ConvertedType::DECIMAL => {
416 self.check_decimal_precision_scale()?;
417 }
418 ConvertedType::DATE
419 | ConvertedType::TIME_MILLIS
420 | ConvertedType::UINT_8
421 | ConvertedType::UINT_16
422 | ConvertedType::UINT_32
423 | ConvertedType::INT_8
424 | ConvertedType::INT_16
425 | ConvertedType::INT_32 => {
426 if self.physical_type != PhysicalType::INT32 {
427 return Err(general_err!(
428 "{} cannot annotate field '{}' because it is not a INT32 field",
429 self.converted_type,
430 self.name
431 ));
432 }
433 }
434 ConvertedType::TIME_MICROS
435 | ConvertedType::TIMESTAMP_MILLIS
436 | ConvertedType::TIMESTAMP_MICROS
437 | ConvertedType::UINT_64
438 | ConvertedType::INT_64 => {
439 if self.physical_type != PhysicalType::INT64 {
440 return Err(general_err!(
441 "{} cannot annotate field '{}' because it is not a INT64 field",
442 self.converted_type,
443 self.name
444 ));
445 }
446 }
447 ConvertedType::INTERVAL => {
448 if self.physical_type != PhysicalType::FIXED_LEN_BYTE_ARRAY || self.length != 12 {
449 return Err(general_err!(
450 "INTERVAL cannot annotate field '{}' because it is not a FIXED_LEN_BYTE_ARRAY(12) field",
451 self.name
452 ));
453 }
454 }
455 ConvertedType::ENUM => {
456 if self.physical_type != PhysicalType::BYTE_ARRAY {
457 return Err(general_err!(
458 "ENUM cannot annotate field '{}' because it is not a BYTE_ARRAY field",
459 self.name
460 ));
461 }
462 }
463 _ => {
464 return Err(general_err!(
465 "{} cannot be applied to primitive field '{}'",
466 self.converted_type,
467 self.name
468 ));
469 }
470 }
471
472 Ok(Type::PrimitiveType {
473 basic_info,
474 physical_type: self.physical_type,
475 type_length: self.length,
476 scale: self.scale,
477 precision: self.precision,
478 })
479 }
480
481 #[inline]
482 fn check_decimal_precision_scale(&self) -> Result<()> {
483 match self.physical_type {
484 PhysicalType::INT32
485 | PhysicalType::INT64
486 | PhysicalType::BYTE_ARRAY
487 | PhysicalType::FIXED_LEN_BYTE_ARRAY => (),
488 _ => {
489 return Err(general_err!(
490 "DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
491 ));
492 }
493 }
494
495 if self.precision < 1 {
497 return Err(general_err!(
498 "Invalid DECIMAL precision: {}",
499 self.precision
500 ));
501 }
502
503 if self.scale < 0 {
505 return Err(general_err!("Invalid DECIMAL scale: {}", self.scale));
506 }
507
508 if self.scale > self.precision {
509 return Err(general_err!(
510 "Invalid DECIMAL: scale ({}) cannot be greater than precision \
511 ({})",
512 self.scale,
513 self.precision
514 ));
515 }
516
517 match self.physical_type {
519 PhysicalType::INT32 => {
520 if self.precision > 9 {
521 return Err(general_err!(
522 "Cannot represent INT32 as DECIMAL with precision {}",
523 self.precision
524 ));
525 }
526 }
527 PhysicalType::INT64 => {
528 if self.precision > 18 {
529 return Err(general_err!(
530 "Cannot represent INT64 as DECIMAL with precision {}",
531 self.precision
532 ));
533 }
534 }
535 PhysicalType::FIXED_LEN_BYTE_ARRAY => {
536 let max_precision = (2f64.powi(8 * self.length - 1) - 1f64).log10().floor() as i32;
537
538 if self.precision > max_precision {
539 return Err(general_err!(
540 "Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length {} and \
541 precision {}. The max precision can only be {}",
542 self.length,
543 self.precision,
544 max_precision
545 ));
546 }
547 }
548 _ => (), }
550
551 Ok(())
552 }
553}
554
/// Builder for [`Type::GroupType`] nodes.
pub struct GroupTypeBuilder<'a> {
    /// Group name, borrowed for the lifetime of the builder.
    name: &'a str,
    /// Optional repetition; `new` leaves it unset.
    repetition: Option<Repetition>,
    /// Converted (legacy) type annotation; `new` defaults it to `NONE`.
    converted_type: ConvertedType,
    /// Optional logical type annotation.
    logical_type: Option<LogicalType>,
    /// Child nodes of the group.
    fields: Vec<TypePtr>,
    /// Optional field id.
    id: Option<i32>,
}
566
567impl<'a> GroupTypeBuilder<'a> {
568 pub fn new(name: &'a str) -> Self {
570 Self {
571 name,
572 repetition: None,
573 converted_type: ConvertedType::NONE,
574 logical_type: None,
575 fields: Vec::new(),
576 id: None,
577 }
578 }
579
580 pub fn with_repetition(mut self, repetition: Repetition) -> Self {
582 self.repetition = Some(repetition);
583 self
584 }
585
586 pub fn with_converted_type(self, converted_type: ConvertedType) -> Self {
588 Self {
589 converted_type,
590 ..self
591 }
592 }
593
594 pub fn with_logical_type(self, logical_type: Option<LogicalType>) -> Self {
596 Self {
597 logical_type,
598 ..self
599 }
600 }
601
602 pub fn with_fields(self, fields: Vec<TypePtr>) -> Self {
605 Self { fields, ..self }
606 }
607
608 pub fn with_id(self, id: Option<i32>) -> Self {
610 Self { id, ..self }
611 }
612
613 pub fn build(self) -> Result<Type> {
615 let mut basic_info = BasicTypeInfo {
616 name: String::from(self.name),
617 repetition: self.repetition,
618 converted_type: self.converted_type,
619 logical_type: self.logical_type.clone(),
620 id: self.id,
621 };
622 if self.logical_type.is_some() && self.converted_type == ConvertedType::NONE {
624 basic_info.converted_type = self.logical_type.into();
625 }
626 Ok(Type::GroupType {
627 basic_info,
628 fields: self.fields,
629 })
630 }
631}
632
/// Attributes common to primitive and group schema nodes.
#[derive(Clone, Debug, PartialEq, Eq)]
pub struct BasicTypeInfo {
    /// Name of the node.
    name: String,
    /// Repetition of the node; `None` when no repetition was set on the builder.
    repetition: Option<Repetition>,
    /// Converted (legacy) type annotation.
    converted_type: ConvertedType,
    /// Optional logical type annotation.
    logical_type: Option<LogicalType>,
    /// Optional field id.
    id: Option<i32>,
}
643
644impl HeapSize for BasicTypeInfo {
645 fn heap_size(&self) -> usize {
646 self.name.heap_size()
648 }
649}
650
651impl BasicTypeInfo {
652 pub fn name(&self) -> &str {
654 &self.name
655 }
656
657 pub fn has_repetition(&self) -> bool {
661 self.repetition.is_some()
662 }
663
664 pub fn repetition(&self) -> Repetition {
666 assert!(self.repetition.is_some());
667 self.repetition.unwrap()
668 }
669
670 pub fn converted_type(&self) -> ConvertedType {
672 self.converted_type
673 }
674
675 pub fn logical_type(&self) -> Option<LogicalType> {
677 self.logical_type.clone()
679 }
680
681 pub fn has_id(&self) -> bool {
683 self.id.is_some()
684 }
685
686 pub fn id(&self) -> i32 {
688 assert!(self.id.is_some());
689 self.id.unwrap()
690 }
691}
692
/// Path to a column, stored as an ordered list of name parts.
///
/// [`ColumnPath::string`] joins the parts with `.`.
#[derive(Clone, PartialEq, Debug, Eq, Hash)]
pub struct ColumnPath {
    /// Path components, outermost first.
    parts: Vec<String>,
}
718
719impl HeapSize for ColumnPath {
720 fn heap_size(&self) -> usize {
721 self.parts.heap_size()
722 }
723}
724
725impl ColumnPath {
726 pub fn new(parts: Vec<String>) -> Self {
728 ColumnPath { parts }
729 }
730
731 pub fn string(&self) -> String {
739 self.parts.join(".")
740 }
741
742 pub fn append(&mut self, mut tail: Vec<String>) {
754 self.parts.append(&mut tail);
755 }
756
757 pub fn parts(&self) -> &[String] {
759 &self.parts
760 }
761}
762
763impl fmt::Display for ColumnPath {
764 fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
765 write!(f, "{:?}", self.string())
766 }
767}
768
769impl From<Vec<String>> for ColumnPath {
770 fn from(parts: Vec<String>) -> Self {
771 ColumnPath { parts }
772 }
773}
774
775impl From<&str> for ColumnPath {
776 fn from(single_path: &str) -> Self {
777 let s = String::from(single_path);
778 ColumnPath::from(s)
779 }
780}
781
782impl From<String> for ColumnPath {
783 fn from(single_path: String) -> Self {
784 let v = vec![single_path];
785 ColumnPath { parts: v }
786 }
787}
788
789impl AsRef<[String]> for ColumnPath {
790 fn as_ref(&self) -> &[String] {
791 &self.parts
792 }
793}
794
/// Describes one leaf column of a schema: its primitive type, maximum definition and
/// repetition levels, and its path from the root.
#[derive(Debug, PartialEq)]
pub struct ColumnDescriptor {
    /// The schema node of the column; accessors panic if it is not a primitive type.
    primitive_type: TypePtr,

    /// Maximum definition level of the column.
    max_def_level: i16,

    /// Maximum repetition level of the column.
    max_rep_level: i16,

    /// Path of the column.
    path: ColumnPath,
}
813
814impl HeapSize for ColumnDescriptor {
815 fn heap_size(&self) -> usize {
816 self.primitive_type.heap_size() + self.path.heap_size()
817 }
818}
819
820impl ColumnDescriptor {
821 pub fn new(
823 primitive_type: TypePtr,
824 max_def_level: i16,
825 max_rep_level: i16,
826 path: ColumnPath,
827 ) -> Self {
828 Self {
829 primitive_type,
830 max_def_level,
831 max_rep_level,
832 path,
833 }
834 }
835
836 #[inline]
838 pub fn max_def_level(&self) -> i16 {
839 self.max_def_level
840 }
841
842 #[inline]
844 pub fn max_rep_level(&self) -> i16 {
845 self.max_rep_level
846 }
847
848 pub fn path(&self) -> &ColumnPath {
850 &self.path
851 }
852
853 pub fn self_type(&self) -> &Type {
855 self.primitive_type.as_ref()
856 }
857
858 pub fn self_type_ptr(&self) -> TypePtr {
861 self.primitive_type.clone()
862 }
863
864 pub fn name(&self) -> &str {
866 self.primitive_type.name()
867 }
868
869 pub fn converted_type(&self) -> ConvertedType {
871 self.primitive_type.get_basic_info().converted_type()
872 }
873
874 pub fn logical_type(&self) -> Option<LogicalType> {
876 self.primitive_type.get_basic_info().logical_type()
877 }
878
879 pub fn physical_type(&self) -> PhysicalType {
882 match self.primitive_type.as_ref() {
883 Type::PrimitiveType { physical_type, .. } => *physical_type,
884 _ => panic!("Expected primitive type!"),
885 }
886 }
887
888 pub fn type_length(&self) -> i32 {
891 match self.primitive_type.as_ref() {
892 Type::PrimitiveType { type_length, .. } => *type_length,
893 _ => panic!("Expected primitive type!"),
894 }
895 }
896
897 pub fn type_precision(&self) -> i32 {
900 match self.primitive_type.as_ref() {
901 Type::PrimitiveType { precision, .. } => *precision,
902 _ => panic!("Expected primitive type!"),
903 }
904 }
905
906 pub fn type_scale(&self) -> i32 {
909 match self.primitive_type.as_ref() {
910 Type::PrimitiveType { scale, .. } => *scale,
911 _ => panic!("Expected primitive type!"),
912 }
913 }
914
915 pub fn sort_order(&self) -> SortOrder {
917 ColumnOrder::get_sort_order(
918 self.logical_type(),
919 self.converted_type(),
920 self.physical_type(),
921 )
922 }
923}
924
/// A schema together with a flattened view of its leaf columns.
#[derive(PartialEq)]
pub struct SchemaDescriptor {
    /// Root of the schema tree; must be a group type.
    schema: TypePtr,

    /// One descriptor per primitive leaf, in the order the tree is traversed
    /// (depth first, fields in declaration order).
    leaves: Vec<ColumnDescPtr>,

    /// For each entry of `leaves`, the index of the top-level field of `schema`
    /// under which that leaf is located.
    leaf_to_base: Vec<usize>,
}
954
955impl fmt::Debug for SchemaDescriptor {
956 fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
957 f.debug_struct("SchemaDescriptor")
959 .field("schema", &self.schema)
960 .finish()
961 }
962}
963
964impl HeapSize for SchemaDescriptor {
966 fn heap_size(&self) -> usize {
967 self.schema.heap_size() + self.leaves.heap_size() + self.leaf_to_base.heap_size()
968 }
969}
970
971impl SchemaDescriptor {
972 pub fn new(tp: TypePtr) -> Self {
974 assert!(tp.is_group(), "SchemaDescriptor should take a GroupType");
975 let mut leaves = vec![];
976 let mut leaf_to_base = Vec::new();
977 for (root_idx, f) in tp.get_fields().iter().enumerate() {
978 let mut path = vec![];
979 build_tree(f, root_idx, 0, 0, &mut leaves, &mut leaf_to_base, &mut path);
980 }
981
982 Self {
983 schema: tp,
984 leaves,
985 leaf_to_base,
986 }
987 }
988
989 pub fn column(&self, i: usize) -> ColumnDescPtr {
991 assert!(
992 i < self.leaves.len(),
993 "Index out of bound: {} not in [0, {})",
994 i,
995 self.leaves.len()
996 );
997 self.leaves[i].clone()
998 }
999
1000 pub fn columns(&self) -> &[ColumnDescPtr] {
1002 &self.leaves
1003 }
1004
1005 pub fn num_columns(&self) -> usize {
1007 self.leaves.len()
1008 }
1009
1010 pub fn get_column_root(&self, i: usize) -> &Type {
1012 let result = self.column_root_of(i);
1013 result.as_ref()
1014 }
1015
1016 pub fn get_column_root_ptr(&self, i: usize) -> TypePtr {
1018 let result = self.column_root_of(i);
1019 result.clone()
1020 }
1021
1022 pub fn get_column_root_idx(&self, leaf: usize) -> usize {
1024 assert!(
1025 leaf < self.leaves.len(),
1026 "Index out of bound: {} not in [0, {})",
1027 leaf,
1028 self.leaves.len()
1029 );
1030
1031 *self
1032 .leaf_to_base
1033 .get(leaf)
1034 .unwrap_or_else(|| panic!("Expected a value for index {leaf} but found None"))
1035 }
1036
1037 fn column_root_of(&self, i: usize) -> &TypePtr {
1038 &self.schema.get_fields()[self.get_column_root_idx(i)]
1039 }
1040
1041 pub fn root_schema(&self) -> &Type {
1043 self.schema.as_ref()
1044 }
1045
1046 pub fn root_schema_ptr(&self) -> TypePtr {
1048 self.schema.clone()
1049 }
1050
1051 pub fn name(&self) -> &str {
1053 self.schema.name()
1054 }
1055}
1056
1057fn build_tree<'a>(
1058 tp: &'a TypePtr,
1059 root_idx: usize,
1060 mut max_rep_level: i16,
1061 mut max_def_level: i16,
1062 leaves: &mut Vec<ColumnDescPtr>,
1063 leaf_to_base: &mut Vec<usize>,
1064 path_so_far: &mut Vec<&'a str>,
1065) {
1066 assert!(tp.get_basic_info().has_repetition());
1067
1068 path_so_far.push(tp.name());
1069 match tp.get_basic_info().repetition() {
1070 Repetition::OPTIONAL => {
1071 max_def_level += 1;
1072 }
1073 Repetition::REPEATED => {
1074 max_def_level += 1;
1075 max_rep_level += 1;
1076 }
1077 _ => {}
1078 }
1079
1080 match tp.as_ref() {
1081 Type::PrimitiveType { .. } => {
1082 let mut path: Vec<String> = vec![];
1083 path.extend(path_so_far.iter().copied().map(String::from));
1084 leaves.push(Arc::new(ColumnDescriptor::new(
1085 tp.clone(),
1086 max_def_level,
1087 max_rep_level,
1088 ColumnPath::new(path),
1089 )));
1090 leaf_to_base.push(root_idx);
1091 }
1092 Type::GroupType { ref fields, .. } => {
1093 for f in fields {
1094 build_tree(
1095 f,
1096 root_idx,
1097 max_rep_level,
1098 max_def_level,
1099 leaves,
1100 leaf_to_base,
1101 path_so_far,
1102 );
1103 path_so_far.pop();
1104 }
1105 }
1106 }
1107}
1108
1109pub fn from_thrift(elements: &[SchemaElement]) -> Result<TypePtr> {
1111 let mut index = 0;
1112 let mut schema_nodes = Vec::new();
1113 while index < elements.len() {
1114 let t = from_thrift_helper(elements, index)?;
1115 index = t.0;
1116 schema_nodes.push(t.1);
1117 }
1118 if schema_nodes.len() != 1 {
1119 return Err(general_err!(
1120 "Expected exactly one root node, but found {}",
1121 schema_nodes.len()
1122 ));
1123 }
1124
1125 Ok(schema_nodes.remove(0))
1126}
1127
1128fn from_thrift_helper(elements: &[SchemaElement], index: usize) -> Result<(usize, TypePtr)> {
1133 let is_root_node = index == 0;
1136
1137 if index >= elements.len() {
1138 return Err(general_err!(
1139 "Index out of bound, index = {}, len = {}",
1140 index,
1141 elements.len()
1142 ));
1143 }
1144 let element = &elements[index];
1145 let converted_type = ConvertedType::try_from(element.converted_type)?;
1146 let logical_type = element
1149 .logical_type
1150 .as_ref()
1151 .map(|value| LogicalType::from(value.clone()));
1152 let field_id = elements[index].field_id;
1153 match elements[index].num_children {
1154 None | Some(0) => {
1160 if elements[index].repetition_type.is_none() {
1162 return Err(general_err!(
1163 "Repetition level must be defined for a primitive type"
1164 ));
1165 }
1166 let repetition = Repetition::try_from(elements[index].repetition_type.unwrap())?;
1167 if let Some(type_) = elements[index].type_ {
1168 let physical_type = PhysicalType::try_from(type_)?;
1169 let length = elements[index].type_length.unwrap_or(-1);
1170 let scale = elements[index].scale.unwrap_or(-1);
1171 let precision = elements[index].precision.unwrap_or(-1);
1172 let name = &elements[index].name;
1173 let builder = Type::primitive_type_builder(name, physical_type)
1174 .with_repetition(repetition)
1175 .with_converted_type(converted_type)
1176 .with_logical_type(logical_type)
1177 .with_length(length)
1178 .with_precision(precision)
1179 .with_scale(scale)
1180 .with_id(field_id);
1181 Ok((index + 1, Arc::new(builder.build()?)))
1182 } else {
1183 let mut builder = Type::group_type_builder(&elements[index].name)
1184 .with_converted_type(converted_type)
1185 .with_logical_type(logical_type)
1186 .with_id(field_id);
1187 if !is_root_node {
1188 builder = builder.with_repetition(repetition);
1196 }
1197 Ok((index + 1, Arc::new(builder.build().unwrap())))
1198 }
1199 }
1200 Some(n) => {
1201 let repetition = elements[index]
1202 .repetition_type
1203 .map(Repetition::try_from)
1204 .transpose()?;
1205
1206 let mut fields = vec![];
1207 let mut next_index = index + 1;
1208 for _ in 0..n {
1209 let child_result = from_thrift_helper(elements, next_index)?;
1210 next_index = child_result.0;
1211 fields.push(child_result.1);
1212 }
1213
1214 let mut builder = Type::group_type_builder(&elements[index].name)
1215 .with_converted_type(converted_type)
1216 .with_logical_type(logical_type)
1217 .with_fields(fields)
1218 .with_id(field_id);
1219 if let Some(rep) = repetition {
1220 if !is_root_node {
1228 builder = builder.with_repetition(rep);
1229 }
1230 }
1231 Ok((next_index, Arc::new(builder.build().unwrap())))
1232 }
1233 }
1234}
1235
1236pub fn to_thrift(schema: &Type) -> Result<Vec<SchemaElement>> {
1238 if !schema.is_group() {
1239 return Err(general_err!("Root schema must be Group type"));
1240 }
1241 let mut elements: Vec<SchemaElement> = Vec::new();
1242 to_thrift_helper(schema, &mut elements);
1243 Ok(elements)
1244}
1245
1246fn to_thrift_helper(schema: &Type, elements: &mut Vec<SchemaElement>) {
1249 match *schema {
1250 Type::PrimitiveType {
1251 ref basic_info,
1252 physical_type,
1253 type_length,
1254 scale,
1255 precision,
1256 } => {
1257 let element = SchemaElement {
1258 type_: Some(physical_type.into()),
1259 type_length: if type_length >= 0 {
1260 Some(type_length)
1261 } else {
1262 None
1263 },
1264 repetition_type: Some(basic_info.repetition().into()),
1265 name: basic_info.name().to_owned(),
1266 num_children: None,
1267 converted_type: basic_info.converted_type().into(),
1268 scale: if scale >= 0 { Some(scale) } else { None },
1269 precision: if precision >= 0 {
1270 Some(precision)
1271 } else {
1272 None
1273 },
1274 field_id: if basic_info.has_id() {
1275 Some(basic_info.id())
1276 } else {
1277 None
1278 },
1279 logical_type: basic_info.logical_type().map(|value| value.into()),
1280 };
1281
1282 elements.push(element);
1283 }
1284 Type::GroupType {
1285 ref basic_info,
1286 ref fields,
1287 } => {
1288 let repetition = if basic_info.has_repetition() {
1289 Some(basic_info.repetition().into())
1290 } else {
1291 None
1292 };
1293
1294 let element = SchemaElement {
1295 type_: None,
1296 type_length: None,
1297 repetition_type: repetition,
1298 name: basic_info.name().to_owned(),
1299 num_children: Some(fields.len() as i32),
1300 converted_type: basic_info.converted_type().into(),
1301 scale: None,
1302 precision: None,
1303 field_id: if basic_info.has_id() {
1304 Some(basic_info.id())
1305 } else {
1306 None
1307 },
1308 logical_type: basic_info.logical_type().map(|value| value.into()),
1309 };
1310
1311 elements.push(element);
1312
1313 for field in fields {
1315 to_thrift_helper(field, elements);
1316 }
1317 }
1318 }
1319}
1320
1321#[cfg(test)]
1322mod tests {
1323 use super::*;
1324
1325 use crate::schema::parser::parse_message_type;
1326
1327 #[test]
1330 fn test_primitive_type() {
1331 let mut result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1332 .with_logical_type(Some(LogicalType::Integer {
1333 bit_width: 32,
1334 is_signed: true,
1335 }))
1336 .with_id(Some(0))
1337 .build();
1338 assert!(result.is_ok());
1339
1340 if let Ok(tp) = result {
1341 assert!(tp.is_primitive());
1342 assert!(!tp.is_group());
1343 let basic_info = tp.get_basic_info();
1344 assert_eq!(basic_info.repetition(), Repetition::OPTIONAL);
1345 assert_eq!(
1346 basic_info.logical_type(),
1347 Some(LogicalType::Integer {
1348 bit_width: 32,
1349 is_signed: true
1350 })
1351 );
1352 assert_eq!(basic_info.converted_type(), ConvertedType::INT_32);
1353 assert_eq!(basic_info.id(), 0);
1354 match tp {
1355 Type::PrimitiveType { physical_type, .. } => {
1356 assert_eq!(physical_type, PhysicalType::INT32);
1357 }
1358 _ => panic!(),
1359 }
1360 }
1361
1362 result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1364 .with_repetition(Repetition::REPEATED)
1365 .with_logical_type(Some(LogicalType::Integer {
1366 is_signed: true,
1367 bit_width: 8,
1368 }))
1369 .build();
1370 assert!(result.is_err());
1371 if let Err(e) = result {
1372 assert_eq!(
1373 format!("{e}"),
1374 "Parquet error: Cannot annotate Integer { bit_width: 8, is_signed: true } from INT64 for field 'foo'"
1375 );
1376 }
1377
1378 result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1380 .with_repetition(Repetition::REPEATED)
1381 .with_converted_type(ConvertedType::BSON)
1382 .build();
1383 assert!(result.is_err());
1384 if let Err(e) = result {
1385 assert_eq!(
1386 format!("{e}"),
1387 "Parquet error: BSON cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1388 );
1389 }
1390
1391 result = Type::primitive_type_builder("foo", PhysicalType::INT96)
1392 .with_repetition(Repetition::REQUIRED)
1393 .with_converted_type(ConvertedType::DECIMAL)
1394 .with_precision(-1)
1395 .with_scale(-1)
1396 .build();
1397 assert!(result.is_err());
1398 if let Err(e) = result {
1399 assert_eq!(
1400 format!("{e}"),
1401 "Parquet error: DECIMAL can only annotate INT32, INT64, BYTE_ARRAY and FIXED_LEN_BYTE_ARRAY"
1402 );
1403 }
1404
1405 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1406 .with_repetition(Repetition::REQUIRED)
1407 .with_logical_type(Some(LogicalType::Decimal {
1408 scale: 32,
1409 precision: 12,
1410 }))
1411 .with_precision(-1)
1412 .with_scale(-1)
1413 .build();
1414 assert!(result.is_err());
1415 if let Err(e) = result {
1416 assert_eq!(
1417 format!("{e}"),
1418 "Parquet error: DECIMAL logical type scale 32 must match self.scale -1 for field 'foo'"
1419 );
1420 }
1421
1422 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1423 .with_repetition(Repetition::REQUIRED)
1424 .with_converted_type(ConvertedType::DECIMAL)
1425 .with_precision(-1)
1426 .with_scale(-1)
1427 .build();
1428 assert!(result.is_err());
1429 if let Err(e) = result {
1430 assert_eq!(
1431 format!("{e}"),
1432 "Parquet error: Invalid DECIMAL precision: -1"
1433 );
1434 }
1435
1436 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1437 .with_repetition(Repetition::REQUIRED)
1438 .with_converted_type(ConvertedType::DECIMAL)
1439 .with_precision(0)
1440 .with_scale(-1)
1441 .build();
1442 assert!(result.is_err());
1443 if let Err(e) = result {
1444 assert_eq!(
1445 format!("{e}"),
1446 "Parquet error: Invalid DECIMAL precision: 0"
1447 );
1448 }
1449
1450 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1451 .with_repetition(Repetition::REQUIRED)
1452 .with_converted_type(ConvertedType::DECIMAL)
1453 .with_precision(1)
1454 .with_scale(-1)
1455 .build();
1456 assert!(result.is_err());
1457 if let Err(e) = result {
1458 assert_eq!(format!("{e}"), "Parquet error: Invalid DECIMAL scale: -1");
1459 }
1460
1461 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1462 .with_repetition(Repetition::REQUIRED)
1463 .with_converted_type(ConvertedType::DECIMAL)
1464 .with_precision(1)
1465 .with_scale(2)
1466 .build();
1467 assert!(result.is_err());
1468 if let Err(e) = result {
1469 assert_eq!(
1470 format!("{e}"),
1471 "Parquet error: Invalid DECIMAL: scale (2) cannot be greater than precision (1)"
1472 );
1473 }
1474
1475 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1477 .with_repetition(Repetition::REQUIRED)
1478 .with_converted_type(ConvertedType::DECIMAL)
1479 .with_precision(1)
1480 .with_scale(1)
1481 .build();
1482 assert!(result.is_ok());
1483
1484 result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1485 .with_repetition(Repetition::REQUIRED)
1486 .with_converted_type(ConvertedType::DECIMAL)
1487 .with_precision(18)
1488 .with_scale(2)
1489 .build();
1490 assert!(result.is_err());
1491 if let Err(e) = result {
1492 assert_eq!(
1493 format!("{e}"),
1494 "Parquet error: Cannot represent INT32 as DECIMAL with precision 18"
1495 );
1496 }
1497
1498 result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1499 .with_repetition(Repetition::REQUIRED)
1500 .with_converted_type(ConvertedType::DECIMAL)
1501 .with_precision(32)
1502 .with_scale(2)
1503 .build();
1504 assert!(result.is_err());
1505 if let Err(e) = result {
1506 assert_eq!(
1507 format!("{e}"),
1508 "Parquet error: Cannot represent INT64 as DECIMAL with precision 32"
1509 );
1510 }
1511
1512 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1513 .with_repetition(Repetition::REQUIRED)
1514 .with_converted_type(ConvertedType::DECIMAL)
1515 .with_length(5)
1516 .with_precision(12)
1517 .with_scale(2)
1518 .build();
1519 assert!(result.is_err());
1520 if let Err(e) = result {
1521 assert_eq!(
1522 format!("{e}"),
1523 "Parquet error: Cannot represent FIXED_LEN_BYTE_ARRAY as DECIMAL with length 5 and precision 12. The max precision can only be 11"
1524 );
1525 }
1526
1527 result = Type::primitive_type_builder("foo", PhysicalType::INT64)
1528 .with_repetition(Repetition::REQUIRED)
1529 .with_converted_type(ConvertedType::UINT_8)
1530 .build();
1531 assert!(result.is_err());
1532 if let Err(e) = result {
1533 assert_eq!(
1534 format!("{e}"),
1535 "Parquet error: UINT_8 cannot annotate field 'foo' because it is not a INT32 field"
1536 );
1537 }
1538
1539 result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1540 .with_repetition(Repetition::REQUIRED)
1541 .with_converted_type(ConvertedType::TIME_MICROS)
1542 .build();
1543 assert!(result.is_err());
1544 if let Err(e) = result {
1545 assert_eq!(
1546 format!("{e}"),
1547 "Parquet error: TIME_MICROS cannot annotate field 'foo' because it is not a INT64 field"
1548 );
1549 }
1550
1551 result = Type::primitive_type_builder("foo", PhysicalType::BYTE_ARRAY)
1552 .with_repetition(Repetition::REQUIRED)
1553 .with_converted_type(ConvertedType::INTERVAL)
1554 .build();
1555 assert!(result.is_err());
1556 if let Err(e) = result {
1557 assert_eq!(
1558 format!("{e}"),
1559 "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1560 );
1561 }
1562
1563 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1564 .with_repetition(Repetition::REQUIRED)
1565 .with_converted_type(ConvertedType::INTERVAL)
1566 .with_length(1)
1567 .build();
1568 assert!(result.is_err());
1569 if let Err(e) = result {
1570 assert_eq!(
1571 format!("{e}"),
1572 "Parquet error: INTERVAL cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(12) field"
1573 );
1574 }
1575
1576 result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1577 .with_repetition(Repetition::REQUIRED)
1578 .with_converted_type(ConvertedType::ENUM)
1579 .build();
1580 assert!(result.is_err());
1581 if let Err(e) = result {
1582 assert_eq!(
1583 format!("{e}"),
1584 "Parquet error: ENUM cannot annotate field 'foo' because it is not a BYTE_ARRAY field"
1585 );
1586 }
1587
1588 result = Type::primitive_type_builder("foo", PhysicalType::INT32)
1589 .with_repetition(Repetition::REQUIRED)
1590 .with_converted_type(ConvertedType::MAP)
1591 .build();
1592 assert!(result.is_err());
1593 if let Err(e) = result {
1594 assert_eq!(
1595 format!("{e}"),
1596 "Parquet error: MAP cannot be applied to primitive field 'foo'"
1597 );
1598 }
1599
1600 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1601 .with_repetition(Repetition::REQUIRED)
1602 .with_converted_type(ConvertedType::DECIMAL)
1603 .with_length(-1)
1604 .build();
1605 assert!(result.is_err());
1606 if let Err(e) = result {
1607 assert_eq!(
1608 format!("{e}"),
1609 "Parquet error: Invalid FIXED_LEN_BYTE_ARRAY length: -1 for field 'foo'"
1610 );
1611 }
1612
1613 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1614 .with_repetition(Repetition::REQUIRED)
1615 .with_logical_type(Some(LogicalType::Float16))
1616 .with_length(2)
1617 .build();
1618 assert!(result.is_ok());
1619
1620 result = Type::primitive_type_builder("foo", PhysicalType::FLOAT)
1622 .with_repetition(Repetition::REQUIRED)
1623 .with_logical_type(Some(LogicalType::Float16))
1624 .with_length(2)
1625 .build();
1626 assert!(result.is_err());
1627 if let Err(e) = result {
1628 assert_eq!(
1629 format!("{e}"),
1630 "Parquet error: Cannot annotate Float16 from FLOAT for field 'foo'"
1631 );
1632 }
1633
1634 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1636 .with_repetition(Repetition::REQUIRED)
1637 .with_logical_type(Some(LogicalType::Float16))
1638 .with_length(4)
1639 .build();
1640 assert!(result.is_err());
1641 if let Err(e) = result {
1642 assert_eq!(
1643 format!("{e}"),
1644 "Parquet error: FLOAT16 cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(2) field"
1645 );
1646 }
1647
1648 result = Type::primitive_type_builder("foo", PhysicalType::FIXED_LEN_BYTE_ARRAY)
1650 .with_repetition(Repetition::REQUIRED)
1651 .with_logical_type(Some(LogicalType::Uuid))
1652 .with_length(15)
1653 .build();
1654 assert!(result.is_err());
1655 if let Err(e) = result {
1656 assert_eq!(
1657 format!("{e}"),
1658 "Parquet error: UUID cannot annotate field 'foo' because it is not a FIXED_LEN_BYTE_ARRAY(16) field"
1659 );
1660 }
1661 }
1662
/// Builds a repeated group named "foo" annotated with `LogicalType::List`
/// from two primitive children, then checks the kind, basic info and child
/// order reported by the resulting type.
#[test]
fn test_group_type() {
    let first_child = Type::primitive_type_builder("f1", PhysicalType::INT32)
        .with_converted_type(ConvertedType::INT_32)
        .with_id(Some(0))
        .build();
    assert!(first_child.is_ok());

    let second_child = Type::primitive_type_builder("f2", PhysicalType::BYTE_ARRAY)
        .with_converted_type(ConvertedType::UTF8)
        .with_id(Some(1))
        .build();
    assert!(second_child.is_ok());

    let children = vec![
        Arc::new(first_child.unwrap()),
        Arc::new(second_child.unwrap()),
    ];

    let built = Type::group_type_builder("foo")
        .with_repetition(Repetition::REPEATED)
        .with_logical_type(Some(LogicalType::List))
        .with_fields(children)
        .with_id(Some(1))
        .build();
    assert!(built.is_ok());
    let group = built.unwrap();

    // The built type must identify itself as a group, not a primitive.
    assert!(group.is_group());
    assert!(!group.is_primitive());

    // Only the logical type was set on the builder; the converted type is
    // expected to come back as LIST as well.
    let info = group.get_basic_info();
    assert_eq!(info.repetition(), Repetition::REPEATED);
    assert_eq!(info.logical_type(), Some(LogicalType::List));
    assert_eq!(info.converted_type(), ConvertedType::LIST);
    assert_eq!(info.id(), 1);

    // Children are reported in the order they were supplied.
    let child_names: Vec<&str> = group.get_fields().iter().map(|f| f.name()).collect();
    assert_eq!(child_names, ["f1", "f2"]);
}
1698
/// Runs `test_column_descriptor_helper` and fails with the returned error's
/// message if the helper reports one.
#[test]
fn test_column_descriptor() {
    if let Err(err) = test_column_descriptor_helper() {
        panic!("Expected result to be OK but got err:\n {}", err);
    }
}
1708
1709 fn test_column_descriptor_helper() -> Result<()> {
1710 let tp = Type::primitive_type_builder("name", PhysicalType::BYTE_ARRAY)
1711 .with_converted_type(ConvertedType::UTF8)
1712 .build()?;
1713
1714 let descr = ColumnDescriptor::new(Arc::new(tp), 4, 1, ColumnPath::from("name"));
1715
1716 assert_eq!(descr.path(), &ColumnPath::from("name"));
1717 assert_eq!(descr.converted_type(), ConvertedType::UTF8);
1718 assert_eq!(descr.physical_type(), PhysicalType::BYTE_ARRAY);
1719 assert_eq!(descr.max_def_level(), 4);
1720 assert_eq!(descr.max_rep_level(), 1);
1721 assert_eq!(descr.name(), "name");
1722 assert_eq!(descr.type_length(), -1);
1723 assert_eq!(descr.type_precision(), -1);
1724 assert_eq!(descr.type_scale(), -1);
1725
1726 Ok(())
1727 }
1728
/// Runs `test_schema_descriptor_helper` and fails with the returned error's
/// message if the helper reports one.
#[test]
fn test_schema_descriptor() {
    if let Err(err) = test_schema_descriptor_helper() {
        panic!("Expected result to be OK but got err:\n {}", err);
    }
}
1738
1739 fn test_schema_descriptor_helper() -> Result<()> {
1741 let mut fields = vec![];
1742
1743 let inta = Type::primitive_type_builder("a", PhysicalType::INT32)
1744 .with_repetition(Repetition::REQUIRED)
1745 .with_converted_type(ConvertedType::INT_32)
1746 .build()?;
1747 fields.push(Arc::new(inta));
1748 let intb = Type::primitive_type_builder("b", PhysicalType::INT64)
1749 .with_converted_type(ConvertedType::INT_64)
1750 .build()?;
1751 fields.push(Arc::new(intb));
1752 let intc = Type::primitive_type_builder("c", PhysicalType::BYTE_ARRAY)
1753 .with_repetition(Repetition::REPEATED)
1754 .with_converted_type(ConvertedType::UTF8)
1755 .build()?;
1756 fields.push(Arc::new(intc));
1757
1758 let item1 = Type::primitive_type_builder("item1", PhysicalType::INT64)
1760 .with_repetition(Repetition::REQUIRED)
1761 .with_converted_type(ConvertedType::INT_64)
1762 .build()?;
1763 let item2 = Type::primitive_type_builder("item2", PhysicalType::BOOLEAN).build()?;
1764 let item3 = Type::primitive_type_builder("item3", PhysicalType::INT32)
1765 .with_repetition(Repetition::REPEATED)
1766 .with_converted_type(ConvertedType::INT_32)
1767 .build()?;
1768 let list = Type::group_type_builder("records")
1769 .with_repetition(Repetition::REPEATED)
1770 .with_converted_type(ConvertedType::LIST)
1771 .with_fields(vec![Arc::new(item1), Arc::new(item2), Arc::new(item3)])
1772 .build()?;
1773 let bag = Type::group_type_builder("bag")
1774 .with_repetition(Repetition::OPTIONAL)
1775 .with_fields(vec![Arc::new(list)])
1776 .build()?;
1777 fields.push(Arc::new(bag));
1778
1779 let schema = Type::group_type_builder("schema")
1780 .with_repetition(Repetition::REPEATED)
1781 .with_fields(fields)
1782 .build()?;
1783 let descr = SchemaDescriptor::new(Arc::new(schema));
1784
1785 let nleaves = 6;
1786 assert_eq!(descr.num_columns(), nleaves);
1787
1788 let ex_max_def_levels = [0, 1, 1, 2, 3, 3];
1798 let ex_max_rep_levels = [0, 0, 1, 1, 1, 2];
1799
1800 for i in 0..nleaves {
1801 let col = descr.column(i);
1802 assert_eq!(col.max_def_level(), ex_max_def_levels[i], "{i}");
1803 assert_eq!(col.max_rep_level(), ex_max_rep_levels[i], "{i}");
1804 }
1805
1806 assert_eq!(descr.column(0).path().string(), "a");
1807 assert_eq!(descr.column(1).path().string(), "b");
1808 assert_eq!(descr.column(2).path().string(), "c");
1809 assert_eq!(descr.column(3).path().string(), "bag.records.item1");
1810 assert_eq!(descr.column(4).path().string(), "bag.records.item2");
1811 assert_eq!(descr.column(5).path().string(), "bag.records.item3");
1812
1813 assert_eq!(descr.get_column_root(0).name(), "a");
1814 assert_eq!(descr.get_column_root(3).name(), "bag");
1815 assert_eq!(descr.get_column_root(4).name(), "bag");
1816 assert_eq!(descr.get_column_root(5).name(), "bag");
1817
1818 Ok(())
1819 }
1820
/// Parses a small message type with a required leaf, an optional group of two
/// optional leaves and an optional LIST of optional elements, then checks the
/// max definition and repetition level of each of the four leaf columns.
#[test]
fn test_schema_build_tree_def_rep_levels() {
    let message_type = "
 message spark_schema {
 REQUIRED INT32 a;
 OPTIONAL group b {
 OPTIONAL INT32 _1;
 OPTIONAL INT32 _2;
 }
 OPTIONAL group c (LIST) {
 REPEATED group list {
 OPTIONAL INT32 element;
 }
 }
 }
 ";
    let parsed = parse_message_type(message_type).expect("should parse schema");
    let descr = SchemaDescriptor::new(Arc::new(parsed));

    // (max definition level, max repetition level) per leaf, in column order:
    // a, b._1, b._2, c.list.element.
    let expected_levels = [(0, 0), (2, 0), (2, 0), (3, 1)];
    for (i, &(max_def, max_rep)) in expected_levels.iter().enumerate() {
        assert_eq!(descr.column(i).max_def_level(), max_def);
        assert_eq!(descr.column(i).max_rep_level(), max_rep);
    }
}
1852
/// Asking a group type for its physical type must panic with the message
/// named in the `should_panic` attribute.
#[test]
#[should_panic(expected = "Cannot call get_physical_type() on a non-primitive type")]
fn test_get_physical_type_panic() {
    let built = Type::group_type_builder("records")
        .with_repetition(Repetition::REPEATED)
        .build();
    let group = built.unwrap();

    // The call itself is the assertion: it is expected to panic.
    group.get_physical_type();
}
1862
/// A primitive type must report the physical type it was built with; checked
/// for INT64 and BYTE_ARRAY.
#[test]
fn test_get_physical_type_primitive() {
    for &physical in &[PhysicalType::INT64, PhysicalType::BYTE_ARRAY] {
        let field = Type::primitive_type_builder("f", physical)
            .build()
            .unwrap();
        assert_eq!(field.get_physical_type(), physical);
    }
}
1875
/// Exercises `check_contains` between pairs of primitive types that differ in
/// exactly one aspect (or in none).
#[test]
fn test_check_contains_primitive_primitive() {
    // Same name and physical type: expected to be contained.
    let lhs = Type::primitive_type_builder("f", PhysicalType::INT32)
        .build()
        .unwrap();
    let rhs = Type::primitive_type_builder("f", PhysicalType::INT32)
        .build()
        .unwrap();
    let contained = lhs.check_contains(&rhs);
    assert!(contained);

    // Only the converted type differs (UINT_8 vs UINT_16): still expected to
    // be contained.
    let lhs = Type::primitive_type_builder("f", PhysicalType::INT32)
        .with_converted_type(ConvertedType::UINT_8)
        .build()
        .unwrap();
    let rhs = Type::primitive_type_builder("f", PhysicalType::INT32)
        .with_converted_type(ConvertedType::UINT_16)
        .build()
        .unwrap();
    let contained = lhs.check_contains(&rhs);
    assert!(contained);

    // Names differ: expected not to be contained.
    let lhs = Type::primitive_type_builder("f1", PhysicalType::INT32)
        .build()
        .unwrap();
    let rhs = Type::primitive_type_builder("f2", PhysicalType::INT32)
        .build()
        .unwrap();
    let contained = lhs.check_contains(&rhs);
    assert!(!contained);

    // Physical types differ: expected not to be contained.
    let lhs = Type::primitive_type_builder("f", PhysicalType::INT32)
        .build()
        .unwrap();
    let rhs = Type::primitive_type_builder("f", PhysicalType::INT64)
        .build()
        .unwrap();
    let contained = lhs.check_contains(&rhs);
    assert!(!contained);

    // Repetitions differ (REQUIRED vs OPTIONAL): expected not to be contained.
    let lhs = Type::primitive_type_builder("f", PhysicalType::INT32)
        .with_repetition(Repetition::REQUIRED)
        .build()
        .unwrap();
    let rhs = Type::primitive_type_builder("f", PhysicalType::INT32)
        .with_repetition(Repetition::OPTIONAL)
        .build()
        .unwrap();
    let contained = lhs.check_contains(&rhs);
    assert!(!contained);
}
1927
1928 fn test_new_group_type(name: &str, repetition: Repetition, types: Vec<Type>) -> Type {
1930 Type::group_type_builder(name)
1931 .with_repetition(repetition)
1932 .with_fields(types.into_iter().map(Arc::new).collect())
1933 .build()
1934 .unwrap()
1935 }
1936
/// Exercises `check_contains` between pairs of group types: identical groups,
/// a subset of children, and groups differing in name, repetition, child type
/// or child name.
#[test]
fn test_check_contains_group_group() {
    // Shorthand for a primitive child built with builder defaults.
    let leaf = |name: &str, physical: PhysicalType| -> Type {
        Type::primitive_type_builder(name, physical)
            .build()
            .unwrap()
    };

    // Two childless groups with the same name: expected to be contained.
    let lhs = Type::group_type_builder("f").build().unwrap();
    let rhs = Type::group_type_builder("f").build().unwrap();
    assert!(lhs.check_contains(&rhs));
    assert!(!lhs.is_optional());

    // Same name, repetition and children: expected to be contained.
    let lhs = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![
            leaf("f1", PhysicalType::INT32),
            leaf("f2", PhysicalType::INT64),
        ],
    );
    let rhs = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![
            leaf("f1", PhysicalType::INT32),
            leaf("f2", PhysicalType::INT64),
        ],
    );
    assert!(lhs.check_contains(&rhs));

    // Right-hand side holds only one of the left-hand side's children:
    // expected to be contained.
    let lhs = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![
            leaf("f1", PhysicalType::INT32),
            leaf("f2", PhysicalType::INT64),
        ],
    );
    let rhs = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![leaf("f2", PhysicalType::INT64)],
    );
    assert!(lhs.check_contains(&rhs));

    // Group names differ: expected not to be contained.
    let lhs = Type::group_type_builder("f1").build().unwrap();
    let rhs = Type::group_type_builder("f2").build().unwrap();
    assert!(!lhs.check_contains(&rhs));

    // Group repetitions differ (OPTIONAL vs REPEATED): expected not to be
    // contained.
    let lhs = Type::group_type_builder("f")
        .with_repetition(Repetition::OPTIONAL)
        .build()
        .unwrap();
    let rhs = Type::group_type_builder("f")
        .with_repetition(Repetition::REPEATED)
        .build()
        .unwrap();
    assert!(!lhs.check_contains(&rhs));

    // Child "f2" has a different physical type (INT64 vs BOOLEAN): expected
    // not to be contained.
    let lhs = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![
            leaf("f1", PhysicalType::INT32),
            leaf("f2", PhysicalType::INT64),
        ],
    );
    let rhs = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![
            leaf("f1", PhysicalType::INT32),
            leaf("f2", PhysicalType::BOOLEAN),
        ],
    );
    assert!(!lhs.check_contains(&rhs));

    // Right-hand side has a child ("f3") the left-hand side lacks: expected
    // not to be contained.
    let lhs = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![
            leaf("f1", PhysicalType::INT32),
            leaf("f2", PhysicalType::INT64),
        ],
    );
    let rhs = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![leaf("f3", PhysicalType::INT32)],
    );
    assert!(!lhs.check_contains(&rhs));
}
2059
/// Exercises `check_contains` between a group and a primitive (in both
/// directions), and between a nested group and a smaller nested group.
#[test]
fn test_check_contains_group_primitive() {
    // Shorthand for a primitive built with builder defaults.
    let leaf = |name: &str, physical: PhysicalType| -> Type {
        Type::primitive_type_builder(name, physical)
            .build()
            .unwrap()
    };

    // A group and a primitive sharing a name: neither is expected to contain
    // the other.
    let group = Type::group_type_builder("f").build().unwrap();
    let primitive = leaf("f", PhysicalType::INT64);
    assert!(!group.check_contains(&primitive));
    assert!(!primitive.check_contains(&group));

    // A group versus a primitive equal to its only child: neither is expected
    // to contain the other.
    let group = test_new_group_type(
        "f",
        Repetition::REPEATED,
        vec![leaf("f1", PhysicalType::INT32)],
    );
    let primitive = leaf("f1", PhysicalType::INT32);
    assert!(!group.check_contains(&primitive));
    assert!(!primitive.check_contains(&group));

    // Nested groups: `larger` is a { b { c }, d, e }, `smaller` is a { b { c } }.
    let larger = test_new_group_type(
        "a",
        Repetition::REPEATED,
        vec![
            test_new_group_type(
                "b",
                Repetition::REPEATED,
                vec![leaf("c", PhysicalType::INT32)],
            ),
            leaf("d", PhysicalType::INT64),
            leaf("e", PhysicalType::BOOLEAN),
        ],
    );
    let smaller = test_new_group_type(
        "a",
        Repetition::REPEATED,
        vec![test_new_group_type(
            "b",
            Repetition::REPEATED,
            vec![leaf("c", PhysicalType::INT32)],
        )],
    );
    // The larger group is expected to contain the smaller one, not vice versa.
    assert!(larger.check_contains(&smaller));
    assert!(!smaller.check_contains(&larger));
}
2118
/// Converting a primitive type (rather than a group) with `to_thrift` must
/// fail with the "Root schema must be Group type" error.
#[test]
fn test_schema_type_thrift_conversion_err() {
    let primitive_root = Type::primitive_type_builder("col", PhysicalType::INT32)
        .build()
        .unwrap();

    let conversion = to_thrift(&primitive_root);
    assert!(conversion.is_err());
    if let Err(err) = conversion {
        assert_eq!(
            err.to_string(),
            "Parquet error: Root schema must be Group type"
        );
    }
}
2133
/// Round-trips a schema with nested lists, maps, a FLOAT16 column and a
/// nested struct through `to_thrift` / `from_thrift` and checks that the
/// result equals the originally parsed schema.
#[test]
fn test_schema_type_thrift_conversion() {
    let message_type = "
 message conversions {
 REQUIRED INT64 id;
 OPTIONAL FIXED_LEN_BYTE_ARRAY (2) f16 (FLOAT16);
 OPTIONAL group int_array_Array (LIST) {
 REPEATED group list {
 OPTIONAL group element (LIST) {
 REPEATED group list {
 OPTIONAL INT32 element;
 }
 }
 }
 }
 OPTIONAL group int_map (MAP) {
 REPEATED group map (MAP_KEY_VALUE) {
 REQUIRED BYTE_ARRAY key (UTF8);
 OPTIONAL INT32 value;
 }
 }
 OPTIONAL group int_Map_Array (LIST) {
 REPEATED group list {
 OPTIONAL group g (MAP) {
 REPEATED group map (MAP_KEY_VALUE) {
 REQUIRED BYTE_ARRAY key (UTF8);
 OPTIONAL group value {
 OPTIONAL group H {
 OPTIONAL group i (LIST) {
 REPEATED group list {
 OPTIONAL DOUBLE element;
 }
 }
 }
 }
 }
 }
 }
 }
 OPTIONAL group nested_struct {
 OPTIONAL INT32 A;
 OPTIONAL group b (LIST) {
 REPEATED group list {
 REQUIRED FIXED_LEN_BYTE_ARRAY (16) element;
 }
 }
 }
 }
 ";
    let parsed = parse_message_type(message_type).unwrap();

    // Schema -> thrift elements -> schema.
    let thrift_elements = to_thrift(&parsed).unwrap();
    let round_tripped = from_thrift(&thrift_elements).unwrap();

    assert_eq!(round_tripped, Arc::new(parsed));
}
2188
/// Round-trips a schema containing DECIMAL-annotated INT64,
/// FIXED_LEN_BYTE_ARRAY and BYTE_ARRAY columns through `to_thrift` /
/// `from_thrift` and checks that the result equals the parsed schema.
#[test]
fn test_schema_type_thrift_conversion_decimal() {
    let message_type = "
 message decimals {
 OPTIONAL INT32 field0;
 OPTIONAL INT64 field1 (DECIMAL (18, 2));
 OPTIONAL FIXED_LEN_BYTE_ARRAY (16) field2 (DECIMAL (38, 18));
 OPTIONAL BYTE_ARRAY field3 (DECIMAL (9));
 }
 ";
    let parsed = parse_message_type(message_type).unwrap();

    // Schema -> thrift elements -> schema.
    let thrift_elements = to_thrift(&parsed).unwrap();
    let round_tripped = from_thrift(&thrift_elements).unwrap();

    assert_eq!(round_tripped, Arc::new(parsed));
}
2204
/// Converts a flat schema to thrift, sets `num_children` to `Some(0)` on
/// every element where it was `None`, and checks that `from_thrift` still
/// reproduces the originally parsed schema.
#[test]
fn test_schema_from_thrift_with_num_children_set() {
    let message_type = "
 message schema {
 OPTIONAL BYTE_ARRAY id (UTF8);
 OPTIONAL BYTE_ARRAY name (UTF8);
 OPTIONAL BYTE_ARRAY message (UTF8);
 OPTIONAL INT32 type (UINT_8);
 OPTIONAL INT64 author_time (TIMESTAMP_MILLIS);
 OPTIONAL INT64 __index_level_0__;
 }
 ";

    let parsed = parse_message_type(message_type).unwrap();
    let mut thrift_elements = to_thrift(&parsed).unwrap();

    // Fill in an explicit zero child count wherever none was recorded.
    thrift_elements[..]
        .iter_mut()
        .filter(|element| element.num_children.is_none())
        .for_each(|element| element.num_children = Some(0));

    let rebuilt = from_thrift(&thrift_elements).unwrap();
    assert_eq!(rebuilt, Arc::new(parsed));
}
2233
/// Converts a two-column schema to thrift, sets the root element's
/// `repetition_type` to REQUIRED, and checks that `from_thrift` still
/// reproduces the originally parsed schema.
#[test]
fn test_schema_from_thrift_root_has_repetition() {
    let message_type = "
 message schema {
 OPTIONAL BYTE_ARRAY a (UTF8);
 OPTIONAL INT32 b (UINT_8);
 }
 ";

    let parsed = parse_message_type(message_type).unwrap();
    let mut thrift_elements = to_thrift(&parsed).unwrap();

    // Element 0 is the one this test treats as the root.
    let root_repetition = Some(Repetition::REQUIRED.into());
    thrift_elements[0].repetition_type = root_repetition;

    let rebuilt = from_thrift(&thrift_elements).unwrap();
    assert_eq!(rebuilt, Arc::new(parsed));
}
2253
/// Converts a message type with no fields to thrift, sets the first element's
/// `repetition_type` to REQUIRED, and checks that `from_thrift` reproduces
/// the originally parsed schema.
#[test]
fn test_schema_from_thrift_group_has_no_child() {
    let parsed = parse_message_type("message schema {}").unwrap();
    let mut thrift_elements = to_thrift(&parsed).unwrap();

    let root_repetition = Some(Repetition::REQUIRED.into());
    thrift_elements[0].repetition_type = root_repetition;

    let rebuilt = from_thrift(&thrift_elements).unwrap();
    assert_eq!(rebuilt, Arc::new(parsed));
}
2265}