mz_regexp/
lib.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use mz_repr::adt::regex::Regex;
11
12pub fn regexp_split_to_array<'a>(text: &'a str, regexp: &Regex) -> Vec<&'a str> {
13    // Postgres regex split handling differs a bit from spec regex split, so we can't use
14    // regexp.split here. See: https://www.postgresql.org/docs/15/functions-matching.html:
15    // > the regexp split functions ignore zero-length matches that occur at the start or end
16    // > of the string or immediately after a previous match
17
18    let mut finder = regexp.find_iter(text);
19    let mut last = 0;
20    let mut found = Vec::new();
21    loop {
22        match finder.next() {
23            None => {
24                if last <= text.len() {
25                    let s = &text[last..];
26                    found.push(s);
27                }
28                break;
29            }
30            Some(m) => {
31                // Ignore zero length matches at start and end of string.
32                if m.end() > 0 && m.start() < text.len() {
33                    let matched = &text[last..m.start()];
34                    last = m.end();
35                    found.push(matched);
36                }
37            }
38        }
39    }
40    found
41}
42
#[cfg(test)]
mod tests {
    use mz_repr::adt::regex::Regex;

    use crate::regexp_split_to_array;

    /// Compiles `needle` into a [`Regex`], interpreting `flags` as a sequence of
    /// single-character options: `i` turns case-insensitivity on, `c` turns it
    /// off. Any other flag character yields an error.
    fn build_regex(needle: &str, flags: &str) -> Result<Regex, anyhow::Error> {
        let mut case_insensitive = false;
        // Note: Postgres accepts it when both flags are present, taking the last one. We do the same.
        for f in flags.chars() {
            match f {
                'i' => {
                    case_insensitive = true;
                }
                'c' => {
                    case_insensitive = false;
                }
                _ => anyhow::bail!("unexpected regex flags"),
            }
        }
        Ok(Regex::new(needle, case_insensitive)?)
    }

    // Assert equivalency to postgres and generate TestCases.
    //
    // Only runs when the `POSTGRES_URL` environment variable is set; otherwise it
    // returns immediately without checking anything.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // unsupported operation: returning ready events from epoll_wait is not yet implemented
    fn test_pg_regexp_split_array() {
        let Ok(postgres_url) = std::env::var("POSTGRES_URL") else {
            return;
        };
        let mut client = postgres::Client::connect(&postgres_url, postgres::NoTls).unwrap();

        let inputs = vec!["", " ", "  ", "12 34", "12  34", " 12 34 "];
        let regexps = vec!["", "\\s", "\\s+", "\\s*"];
        for input in inputs {
            for re in &regexps {
                let regex = build_regex(re, "").unwrap();
                let pg: Vec<String> = client
                    .query_one("select regexp_split_to_array($1, $2)", &[&input, re])
                    .unwrap()
                    .get(0);
                let mz = regexp_split_to_array(input, &regex);
                assert_eq!(pg, mz);
                // Generate TestCases for static use.
                println!(
                    r#"TestCase {{
                text: "{input}",
                regexp: "{}",
                expect: &{pg:?},
            }},"#,
                    re.replace('\\', "\\\\"),
                );
            }
        }
    }

    // Checks `regexp_split_to_array` against a static table of expected outputs,
    // so that the behavior is verified even when no postgres instance is available.
    #[mz_ore::test]
    #[cfg_attr(miri, ignore)] // too slow
    fn test_regexp_split_array() {
        // Expected outputs generated from postgres.
        struct TestCase {
            text: &'static str,
            regexp: &'static str,
            expect: &'static [&'static str],
        }
        let tests = vec![
            TestCase {
                text: "",
                regexp: "",
                expect: &[""],
            },
            TestCase {
                text: "",
                regexp: "\\s",
                expect: &[""],
            },
            TestCase {
                text: "",
                regexp: "\\s+",
                expect: &[""],
            },
            TestCase {
                text: "",
                regexp: "\\s*",
                expect: &[""],
            },
            TestCase {
                text: " ",
                regexp: "",
                expect: &[" "],
            },
            TestCase {
                text: " ",
                regexp: "\\s",
                expect: &["", ""],
            },
            TestCase {
                text: " ",
                regexp: "\\s+",
                expect: &["", ""],
            },
            TestCase {
                text: " ",
                regexp: "\\s*",
                expect: &["", ""],
            },
            TestCase {
                text: "  ",
                regexp: "",
                expect: &[" ", " "],
            },
            TestCase {
                text: "  ",
                regexp: "\\s",
                expect: &["", "", ""],
            },
            TestCase {
                text: "  ",
                regexp: "\\s+",
                expect: &["", ""],
            },
            TestCase {
                text: "  ",
                regexp: "\\s*",
                expect: &["", ""],
            },
            TestCase {
                text: "12 34",
                regexp: "",
                expect: &["1", "2", " ", "3", "4"],
            },
            TestCase {
                text: "12 34",
                regexp: "\\s",
                expect: &["12", "34"],
            },
            TestCase {
                text: "12 34",
                regexp: "\\s+",
                expect: &["12", "34"],
            },
            TestCase {
                text: "12 34",
                regexp: "\\s*",
                expect: &["1", "2", "3", "4"],
            },
            TestCase {
                text: "12  34",
                regexp: "",
                expect: &["1", "2", " ", " ", "3", "4"],
            },
            TestCase {
                text: "12  34",
                regexp: "\\s",
                expect: &["12", "", "34"],
            },
            TestCase {
                text: "12  34",
                regexp: "\\s+",
                expect: &["12", "34"],
            },
            TestCase {
                text: "12  34",
                regexp: "\\s*",
                expect: &["1", "2", "3", "4"],
            },
            TestCase {
                text: " 12 34 ",
                regexp: "",
                expect: &[" ", "1", "2", " ", "3", "4", " "],
            },
            TestCase {
                text: " 12 34 ",
                regexp: "\\s",
                expect: &["", "12", "34", ""],
            },
            TestCase {
                text: " 12 34 ",
                regexp: "\\s+",
                expect: &["", "12", "34", ""],
            },
            TestCase {
                text: " 12 34 ",
                regexp: "\\s*",
                expect: &["", "1", "2", "3", "4", ""],
            },
        ];
        for tc in tests {
            let regex = build_regex(tc.regexp, "").unwrap();
            let result = regexp_split_to_array(tc.text, &regex);
            // Fail the test on a mismatch; merely printing the difference would
            // let regressions pass unnoticed.
            assert_eq!(
                result, tc.expect,
                "input: `{}`, regex: `{}`",
                tc.text, tc.regexp
            );
        }
    }
}