parquet/schema/
parser.rs

1// Licensed to the Apache Software Foundation (ASF) under one
2// or more contributor license agreements.  See the NOTICE file
3// distributed with this work for additional information
4// regarding copyright ownership.  The ASF licenses this file
5// to you under the Apache License, Version 2.0 (the
6// "License"); you may not use this file except in compliance
7// with the License.  You may obtain a copy of the License at
8//
9//   http://www.apache.org/licenses/LICENSE-2.0
10//
11// Unless required by applicable law or agreed to in writing,
12// software distributed under the License is distributed on an
13// "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
14// KIND, either express or implied.  See the License for the
15// specific language governing permissions and limitations
16// under the License.
17
18//! Parquet schema parser.
19//! Provides methods to parse and validate string message type into Parquet
20//! [`Type`].
21//!
22//! # Example
23//!
24//! ```rust
25//! use parquet::schema::parser::parse_message_type;
26//!
27//! let message_type = "
28//!   message spark_schema {
29//!     OPTIONAL BYTE_ARRAY a (UTF8);
30//!     REQUIRED INT32 b;
31//!     REQUIRED DOUBLE c;
32//!     REQUIRED BOOLEAN d;
33//!     OPTIONAL group e (LIST) {
34//!       REPEATED group list {
35//!         REQUIRED INT32 element;
36//!       }
37//!     }
38//!   }
39//! ";
40//!
41//! let schema = parse_message_type(message_type).expect("Expected valid schema");
42//! println!("{:?}", schema);
43//! ```
44
45use std::sync::Arc;
46
47use crate::basic::{ConvertedType, LogicalType, Repetition, TimeUnit, Type as PhysicalType};
48use crate::errors::{ParquetError, Result};
49use crate::schema::types::{Type, TypePtr};
50
51/// Parses message type as string into a Parquet [`Type`]
52/// which, for example, could be used to extract individual columns. Returns Parquet
53/// general error when parsing or validation fails.
54pub fn parse_message_type(message_type: &str) -> Result<Type> {
55    let mut parser = Parser {
56        tokenizer: &mut Tokenizer::from_str(message_type),
57    };
58    parser.parse_message_type()
59}
60
/// Tokenizer that splits a message type string into tokens.
///
/// The input is first split on whitespace and then on the characters accepted by
/// `is_schema_delim`; every delimiter is preserved as its own single-character token.
/// Tokens are consumed through the [`Iterator`] interface, and `backtrack` moves the
/// cursor one step back so that the previous token can be processed again.
struct Tokenizer<'a> {
    // All tokens of the input string, in order of appearance
    tokens: Vec<&'a str>,
    // Index of the token that the next call to `next` returns
    index: usize,
}

impl<'a> Tokenizer<'a> {
    // Create tokenizer from message type string
    pub fn from_str(string: &'a str) -> Self {
        let tokens = string
            .split_whitespace()
            .flat_map(Self::split_token)
            .collect();
        Tokenizer { tokens, index: 0 }
    }

    // List of all special characters in schema
    fn is_schema_delim(c: char) -> bool {
        matches!(c, ';' | '{' | '}' | '(' | ')' | '=' | ',')
    }

    /// Splits string into tokens; input string can already be token or can contain
    /// delimiters, e.g. "required" -> Vec("required") and
    /// "(UTF8);" -> Vec("(", "UTF8", ")", ";")
    fn split_token(string: &str) -> Vec<&str> {
        let mut buffer: Vec<&str> = Vec::new();
        let mut tail = string;
        while let Some(index) = tail.find(Self::is_schema_delim) {
            let (head, rest) = tail.split_at(index);
            if !head.is_empty() {
                buffer.push(head);
            }
            // Every schema delimiter is a single ASCII character, so it occupies exactly
            // one byte and slicing at byte offset 1 stays on a char boundary.
            buffer.push(&rest[0..1]);
            tail = &rest[1..];
        }
        if !tail.is_empty() {
            buffer.push(tail);
        }
        buffer
    }

    // Move pointer to a previous element.
    //
    // Calling this before any token has been consumed leaves the pointer at the first
    // token instead of underflowing the index.
    //
    // Note that `next` does not advance the pointer once the tokens are exhausted, so
    // backtracking after `next` returned `None` makes the last token available again.
    fn backtrack(&mut self) {
        self.index = self.index.saturating_sub(1);
    }
}

impl<'a> Iterator for Tokenizer<'a> {
    type Item = &'a str;

    // Returns the token under the pointer and advances the pointer, or returns `None`
    // (leaving the pointer untouched) when all tokens have been consumed.
    fn next(&mut self) -> Option<&'a str> {
        let token = self.tokens.get(self.index).copied()?;
        self.index += 1;
        Some(token)
    }
}
128
129/// Internal Schema parser.
130/// Traverses message type using tokenizer and parses each group/primitive type
131/// recursively.
132struct Parser<'a> {
133    tokenizer: &'a mut Tokenizer<'a>,
134}
135
136// Utility function to assert token on validity.
137fn assert_token(token: Option<&str>, expected: &str) -> Result<()> {
138    match token {
139        Some(value) if value == expected => Ok(()),
140        Some(other) => Err(general_err!(
141            "Expected '{}', found token '{}'",
142            expected,
143            other
144        )),
145        None => Err(general_err!(
146            "Expected '{}', but no token found (None)",
147            expected
148        )),
149    }
150}
151
152// Utility function to parse i32 or return general error.
153#[inline]
154fn parse_i32(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result<i32> {
155    value
156        .ok_or_else(|| general_err!(not_found_msg))
157        .and_then(|v| v.parse::<i32>().map_err(|_| general_err!(parse_fail_msg)))
158}
159
160// Utility function to parse boolean or return general error.
161#[inline]
162fn parse_bool(value: Option<&str>, not_found_msg: &str, parse_fail_msg: &str) -> Result<bool> {
163    value
164        .ok_or_else(|| general_err!(not_found_msg))
165        .and_then(|v| {
166            v.to_lowercase()
167                .parse::<bool>()
168                .map_err(|_| general_err!(parse_fail_msg))
169        })
170}
171
172// Utility function to parse TimeUnit or return general error.
173fn parse_timeunit(
174    value: Option<&str>,
175    not_found_msg: &str,
176    parse_fail_msg: &str,
177) -> Result<TimeUnit> {
178    value
179        .ok_or_else(|| general_err!(not_found_msg))
180        .and_then(|v| match v.to_uppercase().as_str() {
181            "MILLIS" => Ok(TimeUnit::MILLIS(Default::default())),
182            "MICROS" => Ok(TimeUnit::MICROS(Default::default())),
183            "NANOS" => Ok(TimeUnit::NANOS(Default::default())),
184            _ => Err(general_err!(parse_fail_msg)),
185        })
186}
187
188impl Parser<'_> {
189    // Entry function to parse message type, uses internal tokenizer.
190    fn parse_message_type(&mut self) -> Result<Type> {
191        // Check that message type starts with "message".
192        match self.tokenizer.next() {
193            Some("message") => {
194                let name = self
195                    .tokenizer
196                    .next()
197                    .ok_or_else(|| general_err!("Expected name, found None"))?;
198                Type::group_type_builder(name)
199                    .with_fields(self.parse_child_types()?)
200                    .build()
201            }
202            _ => Err(general_err!("Message type does not start with 'message'")),
203        }
204    }
205
206    // Parses child types for a current group type.
207    // This is only invoked on root and group types.
208    fn parse_child_types(&mut self) -> Result<Vec<TypePtr>> {
209        assert_token(self.tokenizer.next(), "{")?;
210        let mut vec = Vec::new();
211        while let Some(value) = self.tokenizer.next() {
212            if value == "}" {
213                break;
214            } else {
215                self.tokenizer.backtrack();
216                vec.push(Arc::new(self.add_type()?));
217            }
218        }
219        Ok(vec)
220    }
221
222    fn add_type(&mut self) -> Result<Type> {
223        // Parse repetition
224        let repetition = self
225            .tokenizer
226            .next()
227            .ok_or_else(|| general_err!("Expected repetition, found None"))
228            .and_then(|v| v.to_uppercase().parse::<Repetition>())?;
229
230        match self.tokenizer.next() {
231            Some(group) if group.to_uppercase() == "GROUP" => self.add_group_type(Some(repetition)),
232            Some(type_string) => {
233                let physical_type = type_string.to_uppercase().parse::<PhysicalType>()?;
234                self.add_primitive_type(repetition, physical_type)
235            }
236            None => Err(general_err!("Invalid type, could not extract next token")),
237        }
238    }
239
240    fn add_group_type(&mut self, repetition: Option<Repetition>) -> Result<Type> {
241        // Parse name of the group type
242        let name = self
243            .tokenizer
244            .next()
245            .ok_or_else(|| general_err!("Expected name, found None"))?;
246
247        // Parse logical or converted type if exists
248        let (logical_type, converted_type) = if let Some("(") = self.tokenizer.next() {
249            let tpe = self
250                .tokenizer
251                .next()
252                .ok_or_else(|| general_err!("Expected converted type, found None"))
253                .and_then(|v| {
254                    // Try logical type first
255                    let upper = v.to_uppercase();
256                    let logical = upper.parse::<LogicalType>();
257                    match logical {
258                        Ok(logical) => {
259                            Ok((Some(logical.clone()), ConvertedType::from(Some(logical))))
260                        }
261                        Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
262                    }
263                })?;
264            assert_token(self.tokenizer.next(), ")")?;
265            tpe
266        } else {
267            self.tokenizer.backtrack();
268            (None, ConvertedType::NONE)
269        };
270
271        // Parse optional id
272        let id = if let Some("=") = self.tokenizer.next() {
273            self.tokenizer.next().and_then(|v| v.parse::<i32>().ok())
274        } else {
275            self.tokenizer.backtrack();
276            None
277        };
278
279        let mut builder = Type::group_type_builder(name)
280            .with_logical_type(logical_type)
281            .with_converted_type(converted_type)
282            .with_fields(self.parse_child_types()?)
283            .with_id(id);
284        if let Some(rep) = repetition {
285            builder = builder.with_repetition(rep);
286        }
287        builder.build()
288    }
289
290    fn add_primitive_type(
291        &mut self,
292        repetition: Repetition,
293        physical_type: PhysicalType,
294    ) -> Result<Type> {
295        // Read type length if the type is FIXED_LEN_BYTE_ARRAY.
296        let mut length: i32 = -1;
297        if physical_type == PhysicalType::FIXED_LEN_BYTE_ARRAY {
298            assert_token(self.tokenizer.next(), "(")?;
299            length = parse_i32(
300                self.tokenizer.next(),
301                "Expected length for FIXED_LEN_BYTE_ARRAY, found None",
302                "Failed to parse length for FIXED_LEN_BYTE_ARRAY",
303            )?;
304            assert_token(self.tokenizer.next(), ")")?;
305        }
306
307        // Parse name of the primitive type
308        let name = self
309            .tokenizer
310            .next()
311            .ok_or_else(|| general_err!("Expected name, found None"))?;
312
313        // Parse converted type
314        let (logical_type, converted_type, precision, scale) =
315            if let Some("(") = self.tokenizer.next() {
316                let (mut logical, mut converted) = self
317                    .tokenizer
318                    .next()
319                    .ok_or_else(|| general_err!("Expected logical or converted type, found None"))
320                    .and_then(|v| {
321                        let upper = v.to_uppercase();
322                        let logical = upper.parse::<LogicalType>();
323                        match logical {
324                            Ok(logical) => {
325                                Ok((Some(logical.clone()), ConvertedType::from(Some(logical))))
326                            }
327                            Err(_) => Ok((None, upper.parse::<ConvertedType>()?)),
328                        }
329                    })?;
330
331                // Parse precision and scale for decimals
332                let mut precision: i32 = -1;
333                let mut scale: i32 = -1;
334
335                // Parse the concrete logical type
336                if let Some(tpe) = &logical {
337                    match tpe {
338                        LogicalType::Decimal { .. } => {
339                            if let Some("(") = self.tokenizer.next() {
340                                precision = parse_i32(
341                                    self.tokenizer.next(),
342                                    "Expected precision, found None",
343                                    "Failed to parse precision for DECIMAL type",
344                                )?;
345                                if let Some(",") = self.tokenizer.next() {
346                                    scale = parse_i32(
347                                        self.tokenizer.next(),
348                                        "Expected scale, found None",
349                                        "Failed to parse scale for DECIMAL type",
350                                    )?;
351                                    assert_token(self.tokenizer.next(), ")")?;
352                                } else {
353                                    scale = 0
354                                }
355                                logical = Some(LogicalType::Decimal { scale, precision });
356                                converted = ConvertedType::from(logical.clone());
357                            }
358                        }
359                        LogicalType::Time { .. } => {
360                            if let Some("(") = self.tokenizer.next() {
361                                let unit = parse_timeunit(
362                                    self.tokenizer.next(),
363                                    "Invalid timeunit found",
364                                    "Failed to parse timeunit for TIME type",
365                                )?;
366                                if let Some(",") = self.tokenizer.next() {
367                                    let is_adjusted_to_u_t_c = parse_bool(
368                                        self.tokenizer.next(),
369                                        "Invalid boolean found",
370                                        "Failed to parse timezone info for TIME type",
371                                    )?;
372                                    assert_token(self.tokenizer.next(), ")")?;
373                                    logical = Some(LogicalType::Time {
374                                        is_adjusted_to_u_t_c,
375                                        unit,
376                                    });
377                                    converted = ConvertedType::from(logical.clone());
378                                } else {
379                                    // Invalid token for unit
380                                    self.tokenizer.backtrack();
381                                }
382                            }
383                        }
384                        LogicalType::Timestamp { .. } => {
385                            if let Some("(") = self.tokenizer.next() {
386                                let unit = parse_timeunit(
387                                    self.tokenizer.next(),
388                                    "Invalid timeunit found",
389                                    "Failed to parse timeunit for TIMESTAMP type",
390                                )?;
391                                if let Some(",") = self.tokenizer.next() {
392                                    let is_adjusted_to_u_t_c = parse_bool(
393                                        self.tokenizer.next(),
394                                        "Invalid boolean found",
395                                        "Failed to parse timezone info for TIMESTAMP type",
396                                    )?;
397                                    assert_token(self.tokenizer.next(), ")")?;
398                                    logical = Some(LogicalType::Timestamp {
399                                        is_adjusted_to_u_t_c,
400                                        unit,
401                                    });
402                                    converted = ConvertedType::from(logical.clone());
403                                } else {
404                                    // Invalid token for unit
405                                    self.tokenizer.backtrack();
406                                }
407                            }
408                        }
409                        LogicalType::Integer { .. } => {
410                            if let Some("(") = self.tokenizer.next() {
411                                let bit_width = parse_i32(
412                                    self.tokenizer.next(),
413                                    "Invalid bit_width found",
414                                    "Failed to parse bit_width for INTEGER type",
415                                )? as i8;
416                                match physical_type {
417                                    PhysicalType::INT32 => match bit_width {
418                                        8 | 16 | 32 => {}
419                                        _ => {
420                                            return Err(general_err!(
421                                                "Incorrect bit width {} for INT32",
422                                                bit_width
423                                            ))
424                                        }
425                                    },
426                                    PhysicalType::INT64 => {
427                                        if bit_width != 64 {
428                                            return Err(general_err!(
429                                                "Incorrect bit width {} for INT64",
430                                                bit_width
431                                            ));
432                                        }
433                                    }
434                                    _ => {
435                                        return Err(general_err!(
436                                        "Logical type Integer cannot be used with physical type {}",
437                                        physical_type
438                                    ))
439                                    }
440                                }
441                                if let Some(",") = self.tokenizer.next() {
442                                    let is_signed = parse_bool(
443                                        self.tokenizer.next(),
444                                        "Invalid boolean found",
445                                        "Failed to parse is_signed for INTEGER type",
446                                    )?;
447                                    assert_token(self.tokenizer.next(), ")")?;
448                                    logical = Some(LogicalType::Integer {
449                                        bit_width,
450                                        is_signed,
451                                    });
452                                    converted = ConvertedType::from(logical.clone());
453                                } else {
454                                    // Invalid token for unit
455                                    self.tokenizer.backtrack();
456                                }
457                            }
458                        }
459                        _ => {}
460                    }
461                } else if converted == ConvertedType::DECIMAL {
462                    if let Some("(") = self.tokenizer.next() {
463                        // Parse precision
464                        precision = parse_i32(
465                            self.tokenizer.next(),
466                            "Expected precision, found None",
467                            "Failed to parse precision for DECIMAL type",
468                        )?;
469
470                        // Parse scale
471                        scale = if let Some(",") = self.tokenizer.next() {
472                            parse_i32(
473                                self.tokenizer.next(),
474                                "Expected scale, found None",
475                                "Failed to parse scale for DECIMAL type",
476                            )?
477                        } else {
478                            // Scale is not provided, set it to 0.
479                            self.tokenizer.backtrack();
480                            0
481                        };
482
483                        assert_token(self.tokenizer.next(), ")")?;
484                    } else {
485                        self.tokenizer.backtrack();
486                    }
487                }
488
489                assert_token(self.tokenizer.next(), ")")?;
490                (logical, converted, precision, scale)
491            } else {
492                self.tokenizer.backtrack();
493                (None, ConvertedType::NONE, -1, -1)
494            };
495
496        // Parse optional id
497        let id = if let Some("=") = self.tokenizer.next() {
498            self.tokenizer.next().and_then(|v| v.parse::<i32>().ok())
499        } else {
500            self.tokenizer.backtrack();
501            None
502        };
503        assert_token(self.tokenizer.next(), ";")?;
504
505        Type::primitive_type_builder(name, physical_type)
506            .with_repetition(repetition)
507            .with_logical_type(logical_type)
508            .with_converted_type(converted_type)
509            .with_length(length)
510            .with_precision(precision)
511            .with_scale(scale)
512            .with_id(id)
513            .build()
514    }
515}
516
517#[cfg(test)]
518mod tests {
519    use super::*;
520
521    #[test]
522    fn test_tokenize_empty_string() {
523        assert_eq!(Tokenizer::from_str("").next(), None);
524    }
525
526    #[test]
527    fn test_tokenize_delimiters() {
528        let mut iter = Tokenizer::from_str(",;{}()=");
529        assert_eq!(iter.next(), Some(","));
530        assert_eq!(iter.next(), Some(";"));
531        assert_eq!(iter.next(), Some("{"));
532        assert_eq!(iter.next(), Some("}"));
533        assert_eq!(iter.next(), Some("("));
534        assert_eq!(iter.next(), Some(")"));
535        assert_eq!(iter.next(), Some("="));
536        assert_eq!(iter.next(), None);
537    }
538
539    #[test]
540    fn test_tokenize_delimiters_with_whitespaces() {
541        let mut iter = Tokenizer::from_str(" , ; { } ( ) = ");
542        assert_eq!(iter.next(), Some(","));
543        assert_eq!(iter.next(), Some(";"));
544        assert_eq!(iter.next(), Some("{"));
545        assert_eq!(iter.next(), Some("}"));
546        assert_eq!(iter.next(), Some("("));
547        assert_eq!(iter.next(), Some(")"));
548        assert_eq!(iter.next(), Some("="));
549        assert_eq!(iter.next(), None);
550    }
551
552    #[test]
553    fn test_tokenize_words() {
554        let mut iter = Tokenizer::from_str("abc def ghi jkl mno");
555        assert_eq!(iter.next(), Some("abc"));
556        assert_eq!(iter.next(), Some("def"));
557        assert_eq!(iter.next(), Some("ghi"));
558        assert_eq!(iter.next(), Some("jkl"));
559        assert_eq!(iter.next(), Some("mno"));
560        assert_eq!(iter.next(), None);
561    }
562
563    #[test]
564    fn test_tokenize_backtrack() {
565        let mut iter = Tokenizer::from_str("abc;");
566        assert_eq!(iter.next(), Some("abc"));
567        assert_eq!(iter.next(), Some(";"));
568        iter.backtrack();
569        assert_eq!(iter.next(), Some(";"));
570        assert_eq!(iter.next(), None);
571    }
572
573    #[test]
574    fn test_tokenize_message_type() {
575        let schema = "
576    message schema {
577      required int32 a;
578      optional binary c (UTF8);
579      required group d {
580        required int32 a;
581        optional binary c (UTF8);
582      }
583      required group e (LIST) {
584        repeated group list {
585          required int32 element;
586        }
587      }
588    }
589    ";
590        let iter = Tokenizer::from_str(schema);
591        let mut res = Vec::new();
592        for token in iter {
593            res.push(token);
594        }
595        assert_eq!(
596            res,
597            vec![
598                "message", "schema", "{", "required", "int32", "a", ";", "optional", "binary", "c",
599                "(", "UTF8", ")", ";", "required", "group", "d", "{", "required", "int32", "a",
600                ";", "optional", "binary", "c", "(", "UTF8", ")", ";", "}", "required", "group",
601                "e", "(", "LIST", ")", "{", "repeated", "group", "list", "{", "required", "int32",
602                "element", ";", "}", "}", "}"
603            ]
604        );
605    }
606
607    #[test]
608    fn test_assert_token() {
609        assert!(assert_token(Some("a"), "a").is_ok());
610        assert!(assert_token(Some("a"), "b").is_err());
611        assert!(assert_token(None, "b").is_err());
612    }
613
614    fn parse(schema: &str) -> Result<Type, ParquetError> {
615        let mut iter = Tokenizer::from_str(schema);
616        Parser {
617            tokenizer: &mut iter,
618        }
619        .parse_message_type()
620    }
621
622    #[test]
623    fn test_parse_message_type_invalid() {
624        assert_eq!(
625            parse("test").unwrap_err().to_string(),
626            "Parquet error: Message type does not start with 'message'"
627        );
628    }
629
630    #[test]
631    fn test_parse_message_type_no_name() {
632        assert_eq!(
633            parse("message").unwrap_err().to_string(),
634            "Parquet error: Expected name, found None"
635        );
636    }
637
638    #[test]
639    fn test_parse_message_type_fixed_byte_array() {
640        let schema = "
641            message schema {
642              REQUIRED FIXED_LEN_BYTE_ARRAY col;
643            }
644        ";
645        assert_eq!(
646            parse(schema).unwrap_err().to_string(),
647            "Parquet error: Expected '(', found token 'col'"
648        );
649
650        let schema = "
651            message schema {
652              REQUIRED FIXED_LEN_BYTE_ARRAY(16) col;
653            }
654        ";
655        parse(schema).unwrap();
656    }
657
658    #[test]
659    fn test_parse_message_type_integer() {
660        // Invalid integer syntax
661        let schema = "
662            message root {
663              optional int64 f1 (INTEGER());
664            }
665        ";
666        assert_eq!(
667            parse(schema).unwrap_err().to_string(),
668            "Parquet error: Failed to parse bit_width for INTEGER type"
669        );
670
671        // Invalid integer syntax, needs both bit-width and UTC sign
672        let schema = "
673    message root {
674      optional int64 f1 (INTEGER(32,));
675    }
676    ";
677        assert_eq!(
678            parse(schema).unwrap_err().to_string(),
679            "Parquet error: Incorrect bit width 32 for INT64"
680        );
681
682        // Invalid integer because of non-numeric bit width
683        let schema = "
684            message root {
685              optional int32 f1 (INTEGER(eight,true));
686            }
687        ";
688        assert_eq!(
689            parse(schema).unwrap_err().to_string(),
690            "Parquet error: Failed to parse bit_width for INTEGER type"
691        );
692
693        // Valid types
694        let schema = "
695            message root {
696              optional int32 f1 (INTEGER(8,false));
697              optional int32 f2 (INTEGER(8,true));
698              optional int32 f3 (INTEGER(16,false));
699              optional int32 f4 (INTEGER(16,true));
700              optional int32 f5 (INTEGER(32,false));
701              optional int32 f6 (INTEGER(32,true));
702              optional int64 f7 (INTEGER(64,false));
703              optional int64 f7 (INTEGER(64,true));
704            }
705        ";
706        parse(schema).unwrap();
707    }
708
709    #[test]
710    fn test_parse_message_type_temporal() {
711        // Invalid timestamp syntax
712        let schema = "
713            message root {
714              optional int64 f1 (TIMESTAMP();
715            }
716        ";
717        assert_eq!(
718            parse(schema).unwrap_err().to_string(),
719            "Parquet error: Failed to parse timeunit for TIMESTAMP type"
720        );
721
722        // Invalid timestamp syntax, needs both unit and UTC adjustment
723        let schema = "
724            message root {
725              optional int64 f1 (TIMESTAMP(MILLIS,));
726            }
727        ";
728        assert_eq!(
729            parse(schema).unwrap_err().to_string(),
730            "Parquet error: Failed to parse timezone info for TIMESTAMP type"
731        );
732
733        // Invalid timestamp because of unknown unit
734        let schema = "
735            message root {
736              optional int64 f1 (TIMESTAMP(YOCTOS,));
737            }
738        ";
739
740        assert_eq!(
741            parse(schema).unwrap_err().to_string(),
742            "Parquet error: Failed to parse timeunit for TIMESTAMP type"
743        );
744
745        // Valid types
746        let schema = "
747            message root {
748              optional int32 f1 (DATE);
749              optional int32 f2 (TIME(MILLIS,true));
750              optional int64 f3 (TIME(MICROS,false));
751              optional int64 f4 (TIME(NANOS,true));
752              optional int64 f5 (TIMESTAMP(MILLIS,true));
753              optional int64 f6 (TIMESTAMP(MICROS,true));
754              optional int64 f7 (TIMESTAMP(NANOS,false));
755            }
756        ";
757        parse(schema).unwrap();
758    }
759
760    #[test]
761    fn test_parse_message_type_decimal() {
762        // It is okay for decimal to omit precision and scale with right syntax.
763        // Here we test wrong syntax of decimal type
764
765        // Invalid decimal syntax
766        let schema = "
767            message root {
768              optional int32 f1 (DECIMAL();
769            }
770        ";
771        assert_eq!(
772            parse(schema).unwrap_err().to_string(),
773            "Parquet error: Failed to parse precision for DECIMAL type"
774        );
775
776        // Invalid decimal, need precision and scale
777        let schema = "
778            message root {
779              optional int32 f1 (DECIMAL());
780            }
781        ";
782        assert_eq!(
783            parse(schema).unwrap_err().to_string(),
784            "Parquet error: Failed to parse precision for DECIMAL type"
785        );
786
787        // Invalid decimal because of `,` - has precision, needs scale
788        let schema = "
789            message root {
790              optional int32 f1 (DECIMAL(8,));
791            }
792        ";
793        assert_eq!(
794            parse(schema).unwrap_err().to_string(),
795            "Parquet error: Failed to parse scale for DECIMAL type"
796        );
797
798        // Invalid decimal because, we always require either precision or scale to be
799        // specified as part of converted type
800        let schema = "
801            message root {
802              optional int32 f3 (DECIMAL);
803            }
804        ";
805        assert_eq!(
806            parse(schema).unwrap_err().to_string(),
807            "Parquet error: Expected ')', found token ';'"
808        );
809
810        // Valid decimal (precision, scale)
811        let schema = "
812            message root {
813              optional int32 f1 (DECIMAL(8, 3));
814              optional int32 f2 (DECIMAL(8));
815            }
816        ";
817        parse(schema).unwrap();
818    }
819
820    #[test]
821    fn test_parse_message_type_compare_1() {
822        let schema = "
823            message root {
824              optional fixed_len_byte_array(5) f1 (DECIMAL(9, 3));
825              optional fixed_len_byte_array (16) f2 (DECIMAL (38, 18));
826              optional fixed_len_byte_array (2) f3 (FLOAT16);
827            }
828        ";
829        let message = parse(schema).unwrap();
830
831        let expected = Type::group_type_builder("root")
832            .with_fields(vec![
833                Arc::new(
834                    Type::primitive_type_builder("f1", PhysicalType::FIXED_LEN_BYTE_ARRAY)
835                        .with_logical_type(Some(LogicalType::Decimal {
836                            precision: 9,
837                            scale: 3,
838                        }))
839                        .with_converted_type(ConvertedType::DECIMAL)
840                        .with_length(5)
841                        .with_precision(9)
842                        .with_scale(3)
843                        .build()
844                        .unwrap(),
845                ),
846                Arc::new(
847                    Type::primitive_type_builder("f2", PhysicalType::FIXED_LEN_BYTE_ARRAY)
848                        .with_logical_type(Some(LogicalType::Decimal {
849                            precision: 38,
850                            scale: 18,
851                        }))
852                        .with_converted_type(ConvertedType::DECIMAL)
853                        .with_length(16)
854                        .with_precision(38)
855                        .with_scale(18)
856                        .build()
857                        .unwrap(),
858                ),
859                Arc::new(
860                    Type::primitive_type_builder("f3", PhysicalType::FIXED_LEN_BYTE_ARRAY)
861                        .with_logical_type(Some(LogicalType::Float16))
862                        .with_length(2)
863                        .build()
864                        .unwrap(),
865                ),
866            ])
867            .build()
868            .unwrap();
869
870        assert_eq!(message, expected);
871    }
872
873    #[test]
874    fn test_parse_message_type_compare_2() {
875        let schema = "
876            message root {
877              required group a0 {
878                optional group a1 (LIST) {
879                  repeated binary a2 (UTF8);
880                }
881
882                optional group b1 (LIST) {
883                  repeated group b2 {
884                    optional int32 b3;
885                    optional double b4;
886                  }
887                }
888              }
889            }
890        ";
891        let message = parse(schema).unwrap();
892
893        let expected = Type::group_type_builder("root")
894            .with_fields(vec![Arc::new(
895                Type::group_type_builder("a0")
896                    .with_repetition(Repetition::REQUIRED)
897                    .with_fields(vec![
898                        Arc::new(
899                            Type::group_type_builder("a1")
900                                .with_repetition(Repetition::OPTIONAL)
901                                .with_logical_type(Some(LogicalType::List))
902                                .with_converted_type(ConvertedType::LIST)
903                                .with_fields(vec![Arc::new(
904                                    Type::primitive_type_builder("a2", PhysicalType::BYTE_ARRAY)
905                                        .with_repetition(Repetition::REPEATED)
906                                        .with_converted_type(ConvertedType::UTF8)
907                                        .build()
908                                        .unwrap(),
909                                )])
910                                .build()
911                                .unwrap(),
912                        ),
913                        Arc::new(
914                            Type::group_type_builder("b1")
915                                .with_repetition(Repetition::OPTIONAL)
916                                .with_logical_type(Some(LogicalType::List))
917                                .with_converted_type(ConvertedType::LIST)
918                                .with_fields(vec![Arc::new(
919                                    Type::group_type_builder("b2")
920                                        .with_repetition(Repetition::REPEATED)
921                                        .with_fields(vec![
922                                            Arc::new(
923                                                Type::primitive_type_builder(
924                                                    "b3",
925                                                    PhysicalType::INT32,
926                                                )
927                                                .build()
928                                                .unwrap(),
929                                            ),
930                                            Arc::new(
931                                                Type::primitive_type_builder(
932                                                    "b4",
933                                                    PhysicalType::DOUBLE,
934                                                )
935                                                .build()
936                                                .unwrap(),
937                                            ),
938                                        ])
939                                        .build()
940                                        .unwrap(),
941                                )])
942                                .build()
943                                .unwrap(),
944                        ),
945                    ])
946                    .build()
947                    .unwrap(),
948            )])
949            .build()
950            .unwrap();
951
952        assert_eq!(message, expected);
953    }
954
955    #[test]
956    fn test_parse_message_type_compare_3() {
957        let schema = "
958            message root {
959              required int32 _1 (INT_8);
960              required int32 _2 (INT_16);
961              required float _3;
962              required double _4;
963              optional int32 _5 (DATE);
964              optional binary _6 (UTF8);
965            }
966        ";
967        let message = parse(schema).unwrap();
968
969        let fields = vec![
970            Arc::new(
971                Type::primitive_type_builder("_1", PhysicalType::INT32)
972                    .with_repetition(Repetition::REQUIRED)
973                    .with_converted_type(ConvertedType::INT_8)
974                    .build()
975                    .unwrap(),
976            ),
977            Arc::new(
978                Type::primitive_type_builder("_2", PhysicalType::INT32)
979                    .with_repetition(Repetition::REQUIRED)
980                    .with_converted_type(ConvertedType::INT_16)
981                    .build()
982                    .unwrap(),
983            ),
984            Arc::new(
985                Type::primitive_type_builder("_3", PhysicalType::FLOAT)
986                    .with_repetition(Repetition::REQUIRED)
987                    .build()
988                    .unwrap(),
989            ),
990            Arc::new(
991                Type::primitive_type_builder("_4", PhysicalType::DOUBLE)
992                    .with_repetition(Repetition::REQUIRED)
993                    .build()
994                    .unwrap(),
995            ),
996            Arc::new(
997                Type::primitive_type_builder("_5", PhysicalType::INT32)
998                    .with_logical_type(Some(LogicalType::Date))
999                    .with_converted_type(ConvertedType::DATE)
1000                    .build()
1001                    .unwrap(),
1002            ),
1003            Arc::new(
1004                Type::primitive_type_builder("_6", PhysicalType::BYTE_ARRAY)
1005                    .with_converted_type(ConvertedType::UTF8)
1006                    .build()
1007                    .unwrap(),
1008            ),
1009        ];
1010
1011        let expected = Type::group_type_builder("root")
1012            .with_fields(fields)
1013            .build()
1014            .unwrap();
1015        assert_eq!(message, expected);
1016    }
1017
1018    #[test]
1019    fn test_parse_message_type_compare_4() {
1020        let schema = "
1021            message root {
1022              required int32 _1 (INTEGER(8,true));
1023              required int32 _2 (INTEGER(16,false));
1024              required float _3;
1025              required double _4;
1026              optional int32 _5 (DATE);
1027              optional int32 _6 (TIME(MILLIS,false));
1028              optional int64 _7 (TIME(MICROS,true));
1029              optional int64 _8 (TIMESTAMP(MILLIS,true));
1030              optional int64 _9 (TIMESTAMP(NANOS,false));
1031              optional binary _10 (STRING);
1032            }
1033        ";
1034        let message = parse(schema).unwrap();
1035
1036        let fields = vec![
1037            Arc::new(
1038                Type::primitive_type_builder("_1", PhysicalType::INT32)
1039                    .with_repetition(Repetition::REQUIRED)
1040                    .with_logical_type(Some(LogicalType::Integer {
1041                        bit_width: 8,
1042                        is_signed: true,
1043                    }))
1044                    .build()
1045                    .unwrap(),
1046            ),
1047            Arc::new(
1048                Type::primitive_type_builder("_2", PhysicalType::INT32)
1049                    .with_repetition(Repetition::REQUIRED)
1050                    .with_logical_type(Some(LogicalType::Integer {
1051                        bit_width: 16,
1052                        is_signed: false,
1053                    }))
1054                    .build()
1055                    .unwrap(),
1056            ),
1057            Arc::new(
1058                Type::primitive_type_builder("_3", PhysicalType::FLOAT)
1059                    .with_repetition(Repetition::REQUIRED)
1060                    .build()
1061                    .unwrap(),
1062            ),
1063            Arc::new(
1064                Type::primitive_type_builder("_4", PhysicalType::DOUBLE)
1065                    .with_repetition(Repetition::REQUIRED)
1066                    .build()
1067                    .unwrap(),
1068            ),
1069            Arc::new(
1070                Type::primitive_type_builder("_5", PhysicalType::INT32)
1071                    .with_logical_type(Some(LogicalType::Date))
1072                    .build()
1073                    .unwrap(),
1074            ),
1075            Arc::new(
1076                Type::primitive_type_builder("_6", PhysicalType::INT32)
1077                    .with_logical_type(Some(LogicalType::Time {
1078                        unit: TimeUnit::MILLIS(Default::default()),
1079                        is_adjusted_to_u_t_c: false,
1080                    }))
1081                    .build()
1082                    .unwrap(),
1083            ),
1084            Arc::new(
1085                Type::primitive_type_builder("_7", PhysicalType::INT64)
1086                    .with_logical_type(Some(LogicalType::Time {
1087                        unit: TimeUnit::MICROS(Default::default()),
1088                        is_adjusted_to_u_t_c: true,
1089                    }))
1090                    .build()
1091                    .unwrap(),
1092            ),
1093            Arc::new(
1094                Type::primitive_type_builder("_8", PhysicalType::INT64)
1095                    .with_logical_type(Some(LogicalType::Timestamp {
1096                        unit: TimeUnit::MILLIS(Default::default()),
1097                        is_adjusted_to_u_t_c: true,
1098                    }))
1099                    .build()
1100                    .unwrap(),
1101            ),
1102            Arc::new(
1103                Type::primitive_type_builder("_9", PhysicalType::INT64)
1104                    .with_logical_type(Some(LogicalType::Timestamp {
1105                        unit: TimeUnit::NANOS(Default::default()),
1106                        is_adjusted_to_u_t_c: false,
1107                    }))
1108                    .build()
1109                    .unwrap(),
1110            ),
1111            Arc::new(
1112                Type::primitive_type_builder("_10", PhysicalType::BYTE_ARRAY)
1113                    .with_logical_type(Some(LogicalType::String))
1114                    .build()
1115                    .unwrap(),
1116            ),
1117        ];
1118
1119        let expected = Type::group_type_builder("root")
1120            .with_fields(fields)
1121            .build()
1122            .unwrap();
1123        assert_eq!(message, expected);
1124    }
1125}
parquet/schema/parser.rs

parquet/schema/
parser.rs