Skip to main content

mz_regexp/
lib.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use mz_repr::adt::regex::Regex;
11
12pub fn regexp_split_to_array<'a>(text: &'a str, regexp: &Regex) -> Vec<&'a str> {
13    // Postgres regex split handling differs a bit from spec regex split, so we can't use
14    // regexp.split here. See: https://www.postgresql.org/docs/15/functions-matching.html:
15    // > the regexp split functions ignore zero-length matches that occur at the start or end
16    // > of the string or immediately after a previous match
17
18    let mut finder = regexp.find_iter(text);
19    let mut last = 0;
20    let mut found = Vec::new();
21    loop {
22        match finder.next() {
23            None => {
24                if last <= text.len() {
25                    let s = &text[last..];
26                    found.push(s);
27                }
28                break;
29            }
30            Some(m) => {
31                // Ignore zero length matches at start and end of string.
32                if m.end() > 0 && m.start() < text.len() {
33                    let matched = &text[last..m.start()];
34                    last = m.end();
35                    found.push(matched);
36                }
37            }
38        }
39    }
40    found
41}
42
43#[cfg(test)]
44mod tests {
45    use mz_repr::adt::regex::Regex;
46
47    use crate::regexp_split_to_array;
48
49    fn build_regex(needle: &str, flags: &str) -> Result<Regex, anyhow::Error> {
50        let mut case_insensitive = false;
51        // Note: Postgres accepts it when both flags are present, taking the last one. We do the same.
52        for f in flags.chars() {
53            match f {
54                'i' => {
55                    case_insensitive = true;
56                }
57                'c' => {
58                    case_insensitive = false;
59                }
60                _ => anyhow::bail!("unexpected regex flags"),
61            }
62        }
63        Regex::new(needle, case_insensitive).map_err(|e| anyhow::anyhow!("{}", e))
64    }
65
66    // Assert equivalency to postgres and generate TestCases.
67    #[mz_ore::test]
68    #[cfg_attr(miri, ignore)] // unsupported operation: returning ready events from epoll_wait is not yet implemented
69    fn test_pg_regexp_split_array() {
70        let Ok(postgres_url) = std::env::var("POSTGRES_URL") else {
71            return;
72        };
73        let mut client = postgres::Client::connect(&postgres_url, postgres::NoTls).unwrap();
74
75        let inputs = vec!["", " ", "  ", "12 34", "12  34", " 12 34 "];
76        let regexps = vec!["", "\\s", "\\s+", "\\s*"];
77        for input in inputs {
78            for re in &regexps {
79                let regex = build_regex(re, "").unwrap();
80                // This test cross-checks against the sync `postgres` crate,
81                // while `mz_postgres_util` wrappers target async tokio-postgres.
82                let pg: Vec<String> = client
83                    .query_one("select regexp_split_to_array($1, $2)", &[&input, re])
84                    .unwrap()
85                    .get(0);
86                let mz = regexp_split_to_array(input, &regex);
87                assert_eq!(pg, mz);
88                // Generate TestCases for static use.
89                println!(
90                    r#"TestCase {{
91                text: "{input}",
92                regexp: "{}",
93                expect: &{pg:?},
94            }},"#,
95                    re.replace('\\', "\\\\"),
96                );
97            }
98        }
99    }
100
101    #[mz_ore::test]
102    #[cfg_attr(miri, ignore)] // too slow
103    fn test_regexp_split_array() {
104        // Expected outputs generated from postgres.
105        struct TestCase {
106            text: &'static str,
107            regexp: &'static str,
108            expect: &'static [&'static str],
109        }
110        let tests = vec![
111            TestCase {
112                text: "",
113                regexp: "",
114                expect: &[""],
115            },
116            TestCase {
117                text: "",
118                regexp: "\\s",
119                expect: &[""],
120            },
121            TestCase {
122                text: "",
123                regexp: "\\s+",
124                expect: &[""],
125            },
126            TestCase {
127                text: "",
128                regexp: "\\s*",
129                expect: &[""],
130            },
131            TestCase {
132                text: " ",
133                regexp: "",
134                expect: &[" "],
135            },
136            TestCase {
137                text: " ",
138                regexp: "\\s",
139                expect: &["", ""],
140            },
141            TestCase {
142                text: " ",
143                regexp: "\\s+",
144                expect: &["", ""],
145            },
146            TestCase {
147                text: " ",
148                regexp: "\\s*",
149                expect: &["", ""],
150            },
151            TestCase {
152                text: "  ",
153                regexp: "",
154                expect: &[" ", " "],
155            },
156            TestCase {
157                text: "  ",
158                regexp: "\\s",
159                expect: &["", "", ""],
160            },
161            TestCase {
162                text: "  ",
163                regexp: "\\s+",
164                expect: &["", ""],
165            },
166            TestCase {
167                text: "  ",
168                regexp: "\\s*",
169                expect: &["", ""],
170            },
171            TestCase {
172                text: "12 34",
173                regexp: "",
174                expect: &["1", "2", " ", "3", "4"],
175            },
176            TestCase {
177                text: "12 34",
178                regexp: "\\s",
179                expect: &["12", "34"],
180            },
181            TestCase {
182                text: "12 34",
183                regexp: "\\s+",
184                expect: &["12", "34"],
185            },
186            TestCase {
187                text: "12 34",
188                regexp: "\\s*",
189                expect: &["1", "2", "3", "4"],
190            },
191            TestCase {
192                text: "12  34",
193                regexp: "",
194                expect: &["1", "2", " ", " ", "3", "4"],
195            },
196            TestCase {
197                text: "12  34",
198                regexp: "\\s",
199                expect: &["12", "", "34"],
200            },
201            TestCase {
202                text: "12  34",
203                regexp: "\\s+",
204                expect: &["12", "34"],
205            },
206            TestCase {
207                text: "12  34",
208                regexp: "\\s*",
209                expect: &["1", "2", "3", "4"],
210            },
211            TestCase {
212                text: " 12 34 ",
213                regexp: "",
214                expect: &[" ", "1", "2", " ", "3", "4", " "],
215            },
216            TestCase {
217                text: " 12 34 ",
218                regexp: "\\s",
219                expect: &["", "12", "34", ""],
220            },
221            TestCase {
222                text: " 12 34 ",
223                regexp: "\\s+",
224                expect: &["", "12", "34", ""],
225            },
226            TestCase {
227                text: " 12 34 ",
228                regexp: "\\s*",
229                expect: &["", "1", "2", "3", "4", ""],
230            },
231        ];
232        for tc in tests {
233            let regex = build_regex(tc.regexp, "").unwrap();
234            let result = regexp_split_to_array(tc.text, &regex);
235            if tc.expect != result {
236                println!(
237                    "input: `{}`, regex: `{}`, got: {:?}, expect: {:?}",
238                    tc.text, tc.regexp, result, tc.expect
239                );
240            }
241        }
242    }
243}