mz_sqllogictest/
parser.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10//! A parser for sqllogictest.
11
12use std::borrow::ToOwned;
13use std::sync::LazyLock;
14
15use anyhow::{anyhow, bail};
16use mz_repr::ColumnName;
17use regex::Regex;
18
19use crate::ast::{Location, Mode, Output, QueryOutput, Record, Sort, Type};
20
// Matches the `----` separator at the start of a line (together with the line
// break preceding it). Used to split a record's SQL from its expected output.
static QUERY_OUTPUT_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"\r?\n----").unwrap());
// Matches two consecutive line terminators, where the end of the input also
// counts as a terminator. Used to find the blank line (or end of file) that
// ends a record.
static DOUBLE_LINE_REGEX: LazyLock<Regex> =
    LazyLock::new(|| Regex::new(r"(\n|\r\n|$)(\n|\r\n|$)").unwrap());
// Matches a line consisting solely of `EOF`, including the line breaks on
// either side. Used to terminate the expected output of `multiline` records.
static EOF_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r"(\n|\r\n)EOF(\n|\r\n)").unwrap());
// Separates the regex from the replacement in a `replace` directive. Two or
// more spaces, so that the regex itself may contain single spaces.
static REPLACE_SEP_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new(r" {2,}").unwrap());
28
/// An incremental parser over the text of a single sqllogictest file.
///
/// The parser borrows the file contents and hands out records whose string
/// fields are slices of that same buffer.
#[derive(Debug, Clone)]
pub struct Parser<'a> {
    /// The portion of the input that has not been consumed yet.
    contents: &'a str,
    /// Name of the file being parsed, reported in [`Location`]s.
    fname: String,
    /// Line number of the start of `contents`; starts at 1 and is advanced
    /// by one for every `\n` consumed.
    curline: usize,
    /// Current parse mode, switched by the `mode` directive.
    mode: Mode,
}
36
37impl<'a> Parser<'a> {
38    pub fn new(fname: &str, contents: &'a str) -> Self {
39        Parser {
40            contents,
41            fname: fname.to_string(),
42            curline: 1,
43            mode: Mode::Standard,
44        }
45    }
46
    /// Reports whether the entire input has been consumed.
    pub fn is_done(&self) -> bool {
        self.contents.is_empty()
    }
50
51    pub fn location(&self) -> Location {
52        Location {
53            file: self.fname.clone(),
54            line: self.curline,
55        }
56    }
57
58    fn consume(&mut self, upto: usize) {
59        for ch in self.contents[..upto].chars() {
60            if ch == '\n' {
61                self.curline += 1;
62            }
63        }
64        self.contents = &self.contents[upto..];
65    }
66
67    pub fn split_at(&mut self, sep: &Regex) -> Result<&'a str, anyhow::Error> {
68        match sep.find(self.contents) {
69            Some(found) => {
70                let result = &self.contents[..found.start()];
71                self.consume(found.end());
72                Ok(result)
73            }
74            None => bail!("Couldn't split {:?} at {:?}", self.contents, sep),
75        }
76    }
77
78    pub fn parse_record(&mut self) -> Result<Record<'a>, anyhow::Error> {
79        if self.is_done() {
80            return Ok(Record::Halt);
81        }
82
83        let line_number = self.curline;
84
85        static COMMENT_AND_LINE_REGEX: LazyLock<Regex> =
86            LazyLock::new(|| Regex::new("(#[^\n]*)?\r?(\n|$)").unwrap());
87        let first_line = self.split_at(&COMMENT_AND_LINE_REGEX)?.trim();
88
89        if first_line.is_empty() {
90            // query starts on the next line
91            return self.parse_record();
92        }
93
94        let mut words = first_line.split(' ').peekable();
95        match words.next().unwrap() {
96            "statement" => self.parse_statement(words, first_line),
97
98            "query" => self.parse_query(words, first_line),
99
100            "simple" => self.parse_simple(words),
101
102            "hash-threshold" => {
103                let threshold = words
104                    .next()
105                    .ok_or_else(|| anyhow!("missing threshold in: {}", first_line))?
106                    .parse::<u64>()
107                    .map_err(|err| anyhow!("invalid threshold ({}) in: {}", err, first_line))?;
108                Ok(Record::HashThreshold { threshold })
109            }
110
111            // we'll follow the postgresql version of all these tests
112            "skipif" => {
113                match words.next().unwrap() {
114                    "postgresql" => {
115                        // discard next record
116                        self.parse_record()?;
117                        self.parse_record()
118                    }
119                    _ => self.parse_record(),
120                }
121            }
122            "onlyif" => {
123                match words.next().unwrap() {
124                    "postgresql" => self.parse_record(),
125                    _ => {
126                        // discard next record
127                        self.parse_record()?;
128                        self.parse_record()
129                    }
130                }
131            }
132
133            "halt" => Ok(Record::Halt),
134
135            // this is some cockroach-specific thing, we don't care
136            "subtest" | "kv-batch-size" | "skip_on_retry" => self.parse_record(),
137
138            // CockroachDB's `user` directive switches the session user for
139            // subsequent records.
140            "user" => Ok(Record::User {
141                location: self.location(),
142                user: words
143                    .next()
144                    .ok_or_else(|| anyhow!("user directive missing name"))?,
145            }),
146
147            // CockroachDB's `let $var` binds the result of the following query
148            // to a variable that later records reference. We don't support the
149            // binding, so skip the directive and its query block. Records that
150            // reference the variable fail at execution time instead.
151            "let" => {
152                self.split_at(&DOUBLE_LINE_REGEX)?;
153                self.parse_record()
154            }
155
156            "mode" => {
157                self.mode = match words.next() {
158                    Some("cockroach") => Mode::Cockroach,
159                    Some("standard") | Some("sqlite") => Mode::Standard,
160                    other => bail!("unknown parse mode: {:?}", other),
161                };
162                self.parse_record()
163            }
164
165            "copy" => Ok(Record::Copy {
166                table_name: words
167                    .next()
168                    .ok_or_else(|| anyhow!("load directive missing table name"))?,
169                tsv_path: words
170                    .next()
171                    .ok_or_else(|| anyhow!("load directive missing TSV path"))?,
172            }),
173
174            "reset-server" => Ok(Record::ResetServer),
175
176            // `replace <regex>  <replacement>`: register a substitution applied
177            // to the actual output of subsequent queries before comparison. The
178            // regex and replacement are separated by two-or-more spaces, so the
179            // regex may contain single spaces. See `Record::Replace`.
180            "replace" => {
181                let args = first_line
182                    .strip_prefix("replace")
183                    .expect("dispatched on \"replace\"")
184                    .trim_start();
185                let mut parts = REPLACE_SEP_REGEX.splitn(args, 2);
186                let pattern = parts
187                    .next()
188                    .filter(|s| !s.is_empty())
189                    .ok_or_else(|| anyhow!("replace directive missing regex in: {}", first_line))?;
190                let replacement = parts.next().ok_or_else(|| {
191                    anyhow!(
192                        "replace directive missing replacement (separate the regex \
193                         and replacement with two or more spaces) in: {}",
194                        first_line
195                    )
196                })?;
197                // Validate the regex now, so an error is located at parse time.
198                Regex::new(pattern).map_err(|e| {
199                    anyhow!("invalid regex {:?} in replace directive: {}", pattern, e)
200                })?;
201                Ok(Record::Replace {
202                    pattern: pattern.to_owned(),
203                    replacement: replacement.to_owned(),
204                })
205            }
206
207            other => bail!(
208                "Unexpected start of record on line {}: {}",
209                line_number,
210                other
211            ),
212        }
213    }
214
215    pub fn parse_records(&mut self) -> Result<Vec<Record<'a>>, anyhow::Error> {
216        let mut records = vec![];
217        loop {
218            match self.parse_record()? {
219                Record::Halt => break,
220                record => records.push(record),
221            }
222        }
223        Ok(records)
224    }
225
226    fn parse_statement(
227        &mut self,
228        mut words: impl Iterator<Item = &'a str>,
229        first_line: &'a str,
230    ) -> Result<Record<'a>, anyhow::Error> {
231        let location = self.location();
232        let mut expected_error = None;
233        let mut rows_affected = None;
234        match words.next() {
235            Some("count") => {
236                rows_affected = Some(
237                    words
238                        .next()
239                        .ok_or_else(|| anyhow!("missing count of rows affected"))?
240                        .parse::<u64>()
241                        .map_err(|err| anyhow!("parsing count of rows affected: {}", err))?,
242                );
243            }
244            Some("error") => expected_error = Some(parse_expected_error(first_line)),
245            // CockroachDB's `statement notice <regex>` expects the statement
246            // to succeed and additionally emit a matching notice. We only
247            // check for success.
248            Some("notice") => (),
249            // An `ok` prefix accepts the typos present in files imported from
250            // CockroachDB (`oK`, `ok;`, `oko`), which CockroachDB's own
251            // lenient runner treats as plain `ok`.
252            Some(disposition) if disposition.to_lowercase().starts_with("ok") => (),
253            // A bare `statement` with no disposition expects success.
254            None => (),
255            _ => bail!("invalid statement disposition: {}", first_line),
256        };
257        let sql = self.split_at(&DOUBLE_LINE_REGEX)?;
258        Ok(Record::Statement {
259            expected_error,
260            rows_affected,
261            sql,
262            location,
263        })
264    }
265
266    fn parse_query(
267        &mut self,
268        mut words: std::iter::Peekable<impl Iterator<Item = &'a str>>,
269        first_line: &'a str,
270    ) -> Result<Record<'a>, anyhow::Error> {
271        let location = self.location();
272        if words.peek() == Some(&"error") {
273            let error = parse_expected_error(first_line);
274            let sql = self.split_at(&DOUBLE_LINE_REGEX)?;
275            return Ok(Record::Query {
276                sql,
277                output: Err(error),
278                location,
279            });
280        }
281
282        let types = words.next().map_or(Ok(vec![]), parse_types)?;
283        let mut sort = Sort::No;
284        let mut check_column_names = false;
285        let mut multiline = false;
286        let mut noticetrace = false;
287        if let Some(options) = words.next() {
288            for option in options.split(',') {
289                match option {
290                    "nosort" => sort = Sort::No,
291                    "rowsort" => sort = Sort::Row,
292                    "valuesort" => sort = Sort::Value,
293                    "colnames" => check_column_names = true,
294                    "multiline" => multiline = true,
295                    // CockroachDB re-runs `retry` queries until the output
296                    // converges. We run them once, like any other query.
297                    "retry" => (),
298                    // CockroachDB `noticetrace` queries assert the emitted
299                    // notices rather than rows. We can't observe notices, so
300                    // the whole record is skipped below.
301                    "noticetrace" => noticetrace = true,
302                    other => {
303                        if other.starts_with("partialsort") {
304                            // TODO(jamii) https://github.com/cockroachdb/cockroach/blob/d2f7fbf5dd1fc1a099bbad790a2e1f7c60a66cc3/pkg/sql/logictest/logic.go#L153
305                            // partialsort has comma-separated arguments so our parsing is totally broken
306                            // luckily it always comes last in the existing tests, so we can just bail out for now
307                            sort = Sort::Row;
308                            break;
309                        } else {
310                            bail!("Unrecognized option {:?} in {:?}", other, options);
311                        }
312                    }
313                };
314            }
315        }
316        if multiline && (check_column_names || sort.yes()) {
317            bail!("multiline option is incompatible with all other options");
318        }
319        let label = words.next();
320        static LINE_REGEX: LazyLock<Regex> = LazyLock::new(|| Regex::new("\r?(\n|$)").unwrap());
321        static HASH_REGEX: LazyLock<Regex> =
322            LazyLock::new(|| Regex::new(r"(\S+) values hashing to (\S+)").unwrap());
323        // CockroachDB queries may omit the `----` separator entirely, in
324        // which case the query must succeed and return no rows. Detect this
325        // by checking whether the record ends before the next `----`, which
326        // otherwise belongs to a later record. A blank line only ends the
327        // record if what follows it starts a new record (directive, comment,
328        // or end of file): hand-written files contain blank lines inside a
329        // query's SQL, which CockroachDB's own files never do.
330        static RECORD_START_REGEX: LazyLock<Regex> = LazyLock::new(|| {
331            Regex::new(
332                r"^(#|$|statement( |$)|query( |$)|simple( |$)|halt( |$)|mode |copy |user |subtest( |$)|let |skipif |onlyif |hash-threshold |reset-server( |$)|replace |kv-batch-size |skip_on_retry( |$))",
333            )
334            .unwrap()
335        });
336        let sep = QUERY_OUTPUT_REGEX.find(self.contents);
337        let end = DOUBLE_LINE_REGEX.find(self.contents);
338        let no_separator = match (&sep, &end) {
339            (Some(sep), Some(end)) if sep.start() < end.start() => false,
340            (_, Some(end)) => {
341                let mut rest = &self.contents[end.end()..];
342                loop {
343                    let line_end = rest.find('\n').map_or(rest.len(), |i| i + 1);
344                    let line = rest[..line_end].trim();
345                    if line.is_empty() && line_end < rest.len() {
346                        rest = &rest[line_end..];
347                        continue;
348                    }
349                    break RECORD_START_REGEX.is_match(line);
350                }
351            }
352            (_, None) => false,
353        };
354        if no_separator {
355            let sql = self.split_at(&DOUBLE_LINE_REGEX)?;
356            if noticetrace {
357                return self.parse_record();
358            }
359            return Ok(Record::Query {
360                sql,
361                output: Ok(QueryOutput {
362                    types,
363                    sort,
364                    multiline,
365                    label,
366                    column_names: None,
367                    mode: self.mode,
368                    output: Output::Values(vec![]),
369                    // An empty slice at the end of the SQL, so rewriting has
370                    // an in-bounds position to work with.
371                    output_str: &sql[sql.len()..],
372                }),
373                location,
374            });
375        }
376
377        let sql = self.split_at(&QUERY_OUTPUT_REGEX)?;
378        let mut output_str = self.split_at(if multiline {
379            &EOF_REGEX
380        } else {
381            &DOUBLE_LINE_REGEX
382        })?;
383
384        if noticetrace {
385            return self.parse_record();
386        }
387
388        // The `split_at(&QUERY_OUTPUT_REGEX)` stopped at the end of `----`, so `output_str` usually
389        // starts with a newline, which is not actually part of the expected output. Strip off this
390        // newline.
391        output_str = if let Some(output_str_stripped) = regexp_strip_prefix(output_str, &LINE_REGEX)
392        {
393            output_str_stripped
394        } else {
395            // There should always be a newline after `----`, because we have a lint that there is
396            // always a newline at the end of a file. However, we can still get here, when
397            // the expected output is empty, in which case the EOF_REGEX or DOUBLE_LINE_REGEX eats
398            // the newline at the end of the `----`.
399            assert!(output_str.is_empty());
400            output_str
401        };
402
403        // We don't want to advance the expected output past the column names so rewriting works,
404        // but need to be able to parse past them, so remember the position before possible column
405        // names.
406        let query_output_str = output_str;
407        let column_names = if check_column_names {
408            Some(
409                split_at(&mut output_str, &LINE_REGEX)?
410                    .split(' ')
411                    .filter(|s| !s.is_empty())
412                    .map(|s| ColumnName::from(s.replace('␠', " ")))
413                    .collect(),
414            )
415        } else {
416            None
417        };
418        let output = match HASH_REGEX.captures(output_str) {
419            Some(captures) => Output::Hashed {
420                num_values: captures.get(1).unwrap().as_str().parse::<usize>()?,
421                md5: captures.get(2).unwrap().as_str().to_owned(),
422            },
423            None => {
424                if multiline {
425                    Output::Values(vec![output_str.to_owned()])
426                } else if output_str.starts_with('\r') || output_str.starts_with('\n') {
427                    Output::Values(vec![])
428                } else {
429                    let mut vals: Vec<String> = output_str.lines().map(|s| s.to_owned()).collect();
430                    match self.mode {
431                        Mode::Standard => {
432                            if !multiline {
433                                vals = vals.into_iter().map(|val| val.replace('⏎', "\n")).collect();
434                            }
435
436                            if sort == Sort::Value {
437                                vals.sort();
438                            }
439                        }
440                        Mode::Cockroach => {
441                            let mut rows: Vec<Vec<String>> = vec![];
442                            for line in vals {
443                                let cols = split_cols(&line, types.len());
444                                if sort != Sort::No && cols.len() != types.len() {
445                                    // We can't check this condition for
446                                    // Sort::No, because some tests use strings
447                                    // with whitespace that look like extra
448                                    // columns. (Note that these tests never
449                                    // use any of the sorting options.)
450                                    bail!(
451                                        "col len ({}) did not match declared col len ({})",
452                                        cols.len(),
453                                        types.len()
454                                    );
455                                }
456                                rows.push(
457                                    cols.into_iter()
458                                        .map(|col| {
459                                            let mut col = col.replace('␠', " ");
460                                            if !multiline {
461                                                col = col.replace('⏎', "\n");
462                                            }
463                                            col
464                                        })
465                                        .collect(),
466                                );
467                            }
468                            if sort == Sort::Row {
469                                rows.sort();
470                            }
471                            vals = rows.into_iter().flatten().collect();
472                            if sort == Sort::Value {
473                                vals.sort();
474                            }
475                        }
476                    }
477                    Output::Values(vals)
478                }
479            }
480        };
481        Ok(Record::Query {
482            sql,
483            output: Ok(QueryOutput {
484                types,
485                sort,
486                multiline,
487                label,
488                column_names,
489                mode: self.mode,
490                output,
491                output_str: query_output_str,
492            }),
493            location,
494        })
495    }
496
497    fn parse_simple(
498        &mut self,
499        mut words: std::iter::Peekable<impl Iterator<Item = &'a str>>,
500    ) -> Result<Record<'a>, anyhow::Error> {
501        let location = self.location();
502        let mut conn = None;
503        let mut user = None;
504        let mut password = None;
505        let mut multiline = false;
506        let mut sort = Sort::No;
507        if let Some(options) = words.next() {
508            for option in options.split(',') {
509                if let Some(value) = option.strip_prefix("conn=") {
510                    conn = Some(value);
511                } else if let Some(value) = option.strip_prefix("user=") {
512                    user = Some(value);
513                } else if let Some(value) = option.strip_prefix("password=") {
514                    password = Some(value);
515                } else if option == "rowsort" {
516                    sort = Sort::Row;
517                } else if option == "multiline" {
518                    multiline = true;
519                } else {
520                    bail!("Unrecognized option {:?} in {:?}", option, options);
521                }
522            }
523        }
524        if user.is_some() && conn.is_none() {
525            bail!("cannot set user without also setting conn");
526        }
527        if password.is_some() && user.is_none() {
528            bail!("cannot set password without also setting user");
529        }
530        let sql = self.split_at(&QUERY_OUTPUT_REGEX)?;
531        let output_str = self
532            .split_at(if multiline {
533                &EOF_REGEX
534            } else {
535                &DOUBLE_LINE_REGEX
536            })?
537            .trim_start();
538        let output = if multiline {
539            Output::Values({
540                let mut v = vec![output_str.to_owned()];
541                // for simple queries we still have to pass the COMPLETE string after the EOF
542                let complete_str = self.split_at(&DOUBLE_LINE_REGEX)?.trim_start();
543                v.extend(complete_str.lines().map(String::from));
544                v
545            })
546        } else {
547            // We only apply rowsort in mode cockroach, for "query" statements,
548            // so mirror that here.
549            let mut output_lines: Vec<String> = output_str.lines().map(String::from).collect();
550
551            if self.mode == Mode::Cockroach && sort == Sort::Row {
552                output_lines.sort();
553            }
554
555            Output::Values(output_lines)
556        };
557        Ok(Record::Simple {
558            location,
559            conn,
560            user,
561            password,
562            sql,
563            sort,
564            output,
565            output_str,
566        })
567    }
568}
569
570fn split_at<'a>(input: &mut &'a str, sep: &Regex) -> Result<&'a str, anyhow::Error> {
571    match sep.find(input) {
572        Some(found) => {
573            let result = &input[..found.start()];
574            *input = &input[found.end()..];
575            Ok(result)
576        }
577        None => bail!("Couldn't split {:?} at {:?}", input, sep),
578    }
579}
580
581/// Parse a query result type string into a vec of expected types
582fn parse_types(input: &str) -> Result<Vec<Type>, anyhow::Error> {
583    input
584        .chars()
585        .map(|char| {
586            Ok(match char {
587                'T' => Type::Text,
588                'I' => Type::Integer,
589                'R' => Type::Real,
590                // CockroachDB uses `F` for floats and `R` for decimals. We
591                // don't distinguish the two.
592                'F' => Type::Real,
593                'B' => Type::Bool,
594                'O' => Type::Oid,
595                _ => bail!("Unexpected type char {} in: {}", char, input),
596            })
597        })
598        .collect()
599}
600
601fn parse_expected_error(line: &str) -> &str {
602    static PGCODE_RE: LazyLock<Regex> =
603        LazyLock::new(|| Regex::new("(statement|query) error( pgcode [a-zA-Z0-9]{5})? ?").unwrap());
604    // TODO(benesch): one day this should record the expected pgcode, if
605    // specified.
606    let pos = PGCODE_RE.find(line).unwrap().end();
607    &line[pos..]
608}
609
/// Breaks an output line into column values by splitting on runs of
/// whitespace, which normalizes multiple spaces to one. This happens
/// unconditionally in Cockroach mode, regardless of the sort option.
///
/// When exactly one column is expected the line is not split at all; it is
/// only trimmed.
///
/// TODO: this doesn't have the whitespace-collapsing behavior for
/// single-column values that cockroach relies on
pub(crate) fn split_cols(line: &str, expected_columns: usize) -> Vec<&str> {
    match expected_columns {
        1 => vec![line.trim()],
        _ => line.split_whitespace().collect(),
    }
}
622
623pub fn regexp_strip_prefix<'a>(text: &'a str, regexp: &Regex) -> Option<&'a str> {
624    match regexp.find(text) {
625        Some(found) => {
626            if found.start() == 0 {
627                Some(&text[found.end()..])
628            } else {
629                None
630            }
631        }
632        None => None,
633    }
634}
635
636#[mz_ore::test]
637fn test_parse_query_blank_lines_and_missing_separator() {
638    // A blank line inside a query's SQL does not end the record.
639    let file = "query I\nSELECT 1\n\nUNION ALL SELECT 2\n----\n1\n2\n\n";
640    let records = Parser::new("f", file).parse_records().unwrap();
641    assert_eq!(records.len(), 1);
642    match &records[0] {
643        Record::Query { sql, .. } => {
644            assert!(sql.contains("UNION ALL"), "sql: {sql}")
645        }
646        other => panic!("unexpected record: {other:?}"),
647    }
648
649    // A query with no ---- separator expects zero rows, and the blank line
650    // ends the record when a new record follows.
651    let file = "query I\nSELECT 3\n\nstatement ok\nSELECT 4\n\n# comment\n\nquery I\nSELECT 5\n";
652    let records = Parser::new("f", file).parse_records().unwrap();
653    assert_eq!(records.len(), 3);
654    match &records[0] {
655        Record::Query { sql, output, .. } => {
656            assert_eq!(*sql, "SELECT 3");
657            assert_eq!(output.as_ref().unwrap().output, Output::Values(vec![]));
658        }
659        other => panic!("unexpected record: {other:?}"),
660    }
661
662    // No separator at end of file.
663    let file = "query I\nSELECT 6\n";
664    let records = Parser::new("f", file).parse_records().unwrap();
665    assert_eq!(records.len(), 1);
666    match &records[0] {
667        Record::Query { output, .. } => {
668            assert_eq!(output.as_ref().unwrap().output, Output::Values(vec![]));
669        }
670        other => panic!("unexpected record: {other:?}"),
671    }
672}
mz_sqllogictest/parser.rs

mz_sqllogictest/
parser.rs