Skip to main content

mz_expr/scalar/
like_pattern.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::mem;
11use std::str::FromStr;
12
13use derivative::Derivative;
14use mz_lowertest::MzReflect;
15use mz_ore::fmt::FormatBuffer;
16use mz_repr::adt::regex::{Regex, RegexCompilationError};
17use serde::{Deserialize, Serialize};
18
19use crate::scalar::EvalError;
20
21/// The number of subpatterns after which using regexes would be more efficient.
22const MAX_SUBPATTERNS: usize = 5;
23
24/// The escape character to use by default in LIKE patterns.
25const DEFAULT_ESCAPE: char = '\\';
26const DOUBLED_ESCAPE: &str = "\\\\";
27
28/// Specifies escape behavior for the LIKE pattern.
29#[derive(Clone, Copy, Debug)]
30pub enum EscapeBehavior {
31    /// No escape character.
32    Disabled,
33    /// Use a custom escape character.
34    Char(char),
35}
36
37impl Default for EscapeBehavior {
38    fn default() -> EscapeBehavior {
39        EscapeBehavior::Char(DEFAULT_ESCAPE)
40    }
41}
42
43impl FromStr for EscapeBehavior {
44    type Err = EvalError;
45
46    fn from_str(s: &str) -> Result<EscapeBehavior, EvalError> {
47        let mut chars = s.chars();
48        match chars.next() {
49            None => Ok(EscapeBehavior::Disabled),
50            Some(c) => match chars.next() {
51                None => Ok(EscapeBehavior::Char(c)),
52                Some(_) => Err(EvalError::LikeEscapeTooLong),
53            },
54        }
55    }
56}
57
58/// Converts a pattern string that uses a custom escape character to one that uses the default.
59pub fn normalize_pattern(pattern: &str, escape: EscapeBehavior) -> Result<String, EvalError> {
60    match escape {
61        EscapeBehavior::Disabled => Ok(pattern.replace(DEFAULT_ESCAPE, DOUBLED_ESCAPE)),
62        EscapeBehavior::Char(DEFAULT_ESCAPE) => Ok(pattern.into()),
63        EscapeBehavior::Char(custom_escape_char) => {
64            let mut p = String::with_capacity(2 * pattern.len());
65            let mut cs = pattern.chars();
66            while let Some(c) = cs.next() {
67                if c == custom_escape_char {
68                    match cs.next() {
69                        Some(c2) => {
70                            p.push(DEFAULT_ESCAPE);
71                            p.push(c2);
72                        }
73                        None => return Err(EvalError::UnterminatedLikeEscapeSequence),
74                    }
75                } else if c == DEFAULT_ESCAPE {
76                    p.push_str(DOUBLED_ESCAPE);
77                } else {
78                    p.push(c);
79                }
80            }
81            p.shrink_to_fit();
82            Ok(p)
83        }
84    }
85}
86
87// This implementation supports a couple of different methods of matching
88// text against a SQL LIKE or ILIKE pattern.
89//
90// The most general approach is to convert the LIKE pattern into a
91// regular expression and use the well-tested Regex library to perform the
92// match. This works well with complex patterns and case-insensitive matches
93// that are hard to get right.
94//
95// That said, regular expressions aren't that efficient. For most patterns
96// we can do better using built-in string matching.
97
98pub use matcher::Matcher;
99use matcher::MatcherImpl;
100
101// This lint interacts poorly with `derivative` here; we are confident it generates
102// compatible `PartialOrd` and `Ord` impls. Unfortunately it also requires we introduce
103// this module to allow it.
104#[allow(clippy::non_canonical_partial_ord_impl)]
105mod matcher {
106    use super::*;
107
108    /// An object that can test whether a string matches a LIKE or ILIKE pattern.
109    #[derive(Debug, Clone, Deserialize, Serialize, Derivative, MzReflect)]
110    #[derivative(Eq, PartialEq, Ord, PartialOrd, Hash)]
111    pub struct Matcher {
112        pub pattern: String,
113        pub case_insensitive: bool,
114        #[derivative(
115            PartialEq = "ignore",
116            Hash = "ignore",
117            Ord = "ignore",
118            PartialOrd = "ignore"
119        )]
120        pub(super) matcher_impl: MatcherImpl,
121    }
122
123    impl Matcher {
124        pub fn is_match(&self, text: &str) -> bool {
125            match &self.matcher_impl {
126                MatcherImpl::String(subpatterns) => is_match_subpatterns(subpatterns, text),
127                MatcherImpl::Regex(r) => r.is_match(text),
128            }
129        }
130    }
131
132    #[derive(Debug, Clone, Deserialize, Serialize, MzReflect)]
133    pub(super) enum MatcherImpl {
134        String(Vec<Subpattern>),
135        Regex(Regex),
136    }
137}
138
139/// Builds a Matcher that matches a SQL LIKE pattern.
140pub fn compile(pattern: &str, case_insensitive: bool) -> Result<Matcher, EvalError> {
141    // We would like to have a consistent, documented limit to the size of
142    // supported LIKE patterns. The real limiting factor is the number of states
143    // that can be handled by the Regex library. In testing, I was able to
144    // create an adversarial pattern "%a%b%c%d%e..." that started failing around
145    // 9 KiB, so we chose 8 KiB as the limit. This is consistent with limits
146    // set by other databases, like SQL Server.
147    // On the other hand, PostgreSQL does not have a documented limit.
148    if pattern.len() > 8 << 10 {
149        return Err(EvalError::LikePatternTooLong);
150    }
151    let subpatterns = build_subpatterns(pattern)?;
152    // `is_match_subpatterns` resolves each `%` (a `many` subpattern) by searching
153    // for the following suffix and backtracking over every candidate position. A
154    // single `%` is near-linear, but with two or more the backtracking nests and
155    // the cost becomes super-linear in the text length — an adversarial pattern
156    // like `%a%a%a` against a long run of `a`s takes time proportional to
157    // `len(text)^(number of %)`, which can stall a worker for many minutes. The
158    // regex engine matches the same patterns in linear time with no backtracking,
159    // so fall back to it whenever more than one `many` subpattern is present (and
160    // for the existing case-insensitive / too-many-subpatterns reasons).
161    let many_subpatterns = subpatterns.iter().filter(|s| s.many).count();
162    let use_regex = case_insensitive || subpatterns.len() > MAX_SUBPATTERNS || many_subpatterns > 1;
163    let matcher_impl = match use_regex {
164        false => MatcherImpl::String(subpatterns),
165        true => MatcherImpl::Regex(build_regex(&subpatterns, case_insensitive)?),
166    };
167    Ok(Matcher {
168        pattern: pattern.into(),
169        case_insensitive,
170        matcher_impl,
171    })
172}
173
174// The algorithm below is based on the observation that any LIKE pattern can be
175// decomposed into multiple parts:
176//     <PATTERN> := <SUB-PATTERN> (<SUB-PATTERN> ...)
177//     <SUB-PATTERN> := <WILDCARDS> <SUFFIX>
178//
179// The sub-patterns start with zero or more wildcard characters, eventually
180// followed by (non-wildcard) literal characters. The last sub-pattern may
181// have an empty SUFFIX.
182//
183// Example: the PATTERN "n__dl%" can be broken into the following parts:
184//   1. SUB-PATTERN = <WILDCARDS ""> <SUFFIX "n">
185//   2. SUB-PATTERN = <WILDCARDS "__"> <SUFFIX "dl">
186//   3. SUB-PATTERN = <WILDCARDS "%"> <SUFFIX "">
187//
188// The WILDCARDS can be any combination of '_', which matches exactly 1 char,
189// and '%' which matches zero or more chars. These wildcards can be simplified
190// down to the (min, max) of characters they might consume:
191//     ""  = (0, 0)    // doesn't consume any characters
192//     "_" = (1, 1)    // consumes exactly one
193//     "%" = (0, many) // zero or more
194// These are additive, so:
195//     "_%"    = (1, many)
196//     "__%__" = (4, many)
197//     "%%%_"  = (1, many)
198
199#[derive(Debug, Default, Clone, Deserialize, Serialize, MzReflect)]
200struct Subpattern {
201    /// The minimum number of characters that can be consumed by the wildcard expression.
202    consume: usize,
203    /// Whether the wildcard expression can consume an arbitrary number of characters.
204    many: bool,
205    /// A string literal that is expected after the wildcards.
206    suffix: String,
207}
208
209impl Subpattern {
210    /// Converts a Subpattern to an equivalent regular expression and writes it to a given string.
211    fn write_regex_to(&self, r: &mut String) {
212        match self.consume {
213            0 => {
214                if self.many {
215                    r.push_str(".*");
216                }
217            }
218            1 => {
219                r.push('.');
220                if self.many {
221                    r.push('+');
222                }
223            }
224            n => {
225                r.push_str(".{");
226                write!(r, "{}", n);
227                if self.many {
228                    r.push(',');
229                }
230                r.push('}');
231            }
232        }
233        regex_syntax::escape_into(&self.suffix, r);
234    }
235}
236
237fn is_match_subpatterns(subpatterns: &[Subpattern], mut text: &str) -> bool {
238    let (subpattern, subpatterns) = match subpatterns {
239        [] => return text.is_empty(),
240        [subpattern, subpatterns @ ..] => (subpattern, subpatterns),
241    };
242    // Go ahead and skip the minimum number of characters the sub-pattern consumes:
243    if subpattern.consume > 0 {
244        let mut chars = text.chars();
245        if chars.nth(subpattern.consume - 1).is_none() {
246            return false;
247        }
248        text = chars.as_str();
249    }
250    if subpattern.many {
251        // The sub-pattern might consume any number of characters, but we need to find
252        // where it terminates so we can match any subsequent sub-patterns. We do this
253        // by searching for the suffix string using str::find.
254        //
255        // We could investigate using a fancier substring search like Boyer-Moore:
256        // https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm
257        //
258        // .. but it's likely not worth it. It's slower for small strings,
259        // and doesn't really start outperforming the naive approach until
260        // haystack sizes of 1KB or greater. See benchmarking results from:
261        // https://github.com/killerswan/boyer-moore-search/blob/master/README.md
262        //
263        // Another approach that may be interesting to look at is a
264        // hardware-optimized search:
265        // http://0x80.pl/articles/simd-strfind.html
266        if subpattern.suffix.len() == 0 {
267            // Nothing to find... This should only happen in the last sub-pattern.
268            assert!(
269                subpatterns.is_empty(),
270                "empty suffix in middle of a pattern"
271            );
272            return true;
273        }
274        // Use rfind so we perform a greedy capture, like Regex.
275        let mut found = text.rfind(&subpattern.suffix);
276        loop {
277            match found {
278                None => return false,
279                Some(offset) => {
280                    let mut end = offset + subpattern.suffix.len();
281                    if is_match_subpatterns(subpatterns, &text[end..]) {
282                        return true;
283                    }
284                    // Didn't match, look for the next rfind.
285                    if offset == 0 {
286                        return false;
287                    }
288                    // Find the previous valid char byte.
289                    loop {
290                        end -= 1;
291                        if text.is_char_boundary(end) {
292                            break;
293                        }
294                    }
295                    found = text[..end].rfind(&subpattern.suffix);
296                }
297            }
298        }
299    }
300    // No string search needed, we just use a prefix match on rest.
301    if !text.starts_with(&subpattern.suffix) {
302        return false;
303    }
304    is_match_subpatterns(subpatterns, &text[subpattern.suffix.len()..])
305}
306
307/// Breaks a LIKE pattern into a chain of sub-patterns.
308fn build_subpatterns(pattern: &str) -> Result<Vec<Subpattern>, EvalError> {
309    let mut subpatterns = Vec::with_capacity(MAX_SUBPATTERNS);
310    let mut current = Subpattern::default();
311    let mut in_wildcard = true;
312    let mut in_escape = false;
313    for c in pattern.chars() {
314        match c {
315            c if !in_escape && c == DEFAULT_ESCAPE => {
316                in_escape = true;
317                in_wildcard = false;
318            }
319            '_' if !in_escape => {
320                if !in_wildcard {
321                    current.suffix.shrink_to_fit();
322                    subpatterns.push(mem::take(&mut current));
323                    in_wildcard = true;
324                }
325                current.consume += 1;
326            }
327            '%' if !in_escape => {
328                if !in_wildcard {
329                    current.suffix.shrink_to_fit();
330                    subpatterns.push(mem::take(&mut current));
331                    in_wildcard = true;
332                }
333                current.many = true;
334            }
335            c => {
336                current.suffix.push(c);
337                in_escape = false;
338                in_wildcard = false;
339            }
340        }
341    }
342    if in_escape {
343        return Err(EvalError::UnterminatedLikeEscapeSequence);
344    }
345    current.suffix.shrink_to_fit();
346    subpatterns.push(current);
347    subpatterns.shrink_to_fit();
348    Ok(subpatterns)
349}
350
351/// Builds a regular expression that matches some parsed Subpatterns.
352fn build_regex(subpatterns: &[Subpattern], case_insensitive: bool) -> Result<Regex, EvalError> {
353    let mut r = String::from("^");
354    for sp in subpatterns {
355        sp.write_regex_to(&mut r);
356    }
357    r.push('$');
358    match Regex::new(&r, case_insensitive) {
359        Ok(regex) => Ok(regex),
360        Err(RegexCompilationError::PatternTooLarge { .. }) => Err(EvalError::LikePatternTooLong),
361        Err(RegexCompilationError::RegexError(regex::Error::CompiledTooBig(_))) => {
362            Err(EvalError::LikePatternTooLong)
363        }
364        Err(e) => Err(EvalError::Internal(
365            format!("build_regex produced invalid regex: {}", e).into(),
366        )),
367    }
368}
369
370// Unit Tests
371//
372// Most of the unit tests for LIKE and ILIKE can be found in:
373//    test/sqllogictest/cockroach/like.slt
374// These tests are here as a convenient place to run quick tests while
375// actively working on changes to the implementation. Make sure you
376// run the full test suite before submitting any changes.
377
378#[cfg(test)]
379mod test {
380    use super::*;
381
382    #[mz_ore::test]
383    fn test_normalize_pattern() {
384        struct TestCase<'a> {
385            pattern: &'a str,
386            escape: EscapeBehavior,
387            expected: &'a str,
388        }
389        let test_cases = vec![
390            TestCase {
391                pattern: "",
392                escape: EscapeBehavior::Disabled,
393                expected: "",
394            },
395            TestCase {
396                pattern: "ban%na!",
397                escape: EscapeBehavior::default(),
398                expected: "ban%na!",
399            },
400            TestCase {
401                pattern: "ban%%%na!",
402                escape: EscapeBehavior::Char('%'),
403                expected: "ban\\%\\na!",
404            },
405            TestCase {
406                pattern: "ban%na\\!",
407                escape: EscapeBehavior::Char('n'),
408                expected: "ba\\%\\a\\\\!",
409            },
410            TestCase {
411                pattern: "ban%na\\!",
412                escape: EscapeBehavior::Disabled,
413                expected: "ban%na\\\\!",
414            },
415            TestCase {
416                pattern: "ban\\na!",
417                escape: EscapeBehavior::Char('n'),
418                expected: "ba\\\\\\a!",
419            },
420            TestCase {
421                pattern: "ban\\\\na!",
422                escape: EscapeBehavior::Char('n'),
423                expected: "ba\\\\\\\\\\a!",
424            },
425            TestCase {
426                pattern: "food",
427                escape: EscapeBehavior::Char('o'),
428                expected: "f\\od",
429            },
430            TestCase {
431                pattern: "漢漢",
432                escape: EscapeBehavior::Char('漢'),
433                expected: "\\漢",
434            },
435        ];
436
437        for input in test_cases {
438            let actual = normalize_pattern(input.pattern, input.escape).unwrap();
439            assert!(
440                actual == input.expected,
441                "normalize_pattern({:?}, {:?}):\n\tactual: {:?}\n\texpected: {:?}\n",
442                input.pattern,
443                input.escape,
444                actual,
445                input.expected,
446            );
447        }
448    }
449
450    #[mz_ore::test]
451    fn test_escape_too_long() {
452        match EscapeBehavior::from_str("foo") {
453            Err(EvalError::LikeEscapeTooLong) => {}
454            _ => {
455                panic!("expected error when using escape string with >1 character");
456            }
457        }
458    }
459
460    #[mz_ore::test]
461    fn test_like() {
462        struct Input<'a> {
463            haystack: &'a str,
464            matches: bool,
465        }
466        let input = |haystack, matches| Input { haystack, matches };
467        struct Pattern<'a> {
468            needle: &'a str,
469            case_insensitive: bool,
470            inputs: Vec<Input<'a>>,
471        }
472        let test_cases = vec![
473            Pattern {
474                needle: "ban%na!",
475                case_insensitive: false,
476                inputs: vec![input("banana!", true)],
477            },
478            Pattern {
479                needle: "foo",
480                case_insensitive: true,
481                inputs: vec![
482                    input("", false),
483                    input("f", false),
484                    input("fo", false),
485                    input("foo", true),
486                    input("FOO", true),
487                    input("Foo", true),
488                    input("fOO", true),
489                    input("food", false),
490                ],
491            },
492        ];
493
494        for tc in test_cases {
495            let matcher = compile(tc.needle, tc.case_insensitive).unwrap();
496            for input in tc.inputs {
497                let actual = matcher.is_match(input.haystack);
498                assert!(
499                    actual == input.matches,
500                    "{:?} {} {:?}:\n\tactual: {:?}\n\texpected: {:?}\n",
501                    input.haystack,
502                    match tc.case_insensitive {
503                        true => "ILIKE",
504                        false => "LIKE",
505                    },
506                    tc.needle,
507                    actual,
508                    input.matches,
509                );
510            }
511        }
512    }
513
514    // Patterns with two or more `%` wildcards now compile to the linear regex
515    // matcher instead of the back-tracking string matcher, which was
516    // super-linear: an adversarial pattern like `%a%a%a` against a long run of
517    // `a`s took time proportional to `len(text)^(number of %)`. Verify the
518    // routing change still yields correct match results (the regex matcher and
519    // the string matcher must agree), including the previously-pathological
520    // shape, which now completes instantly regardless of text length.
521    #[mz_ore::test]
522    fn test_many_wildcards_match_correctly() {
523        let long_a = "a".repeat(64);
524        let cases: &[(&str, &str, bool)] = &[
525            ("%a%a%a", "xaxaxa", true),
526            ("%a%a%a", "aaa", true),
527            ("%a%a%a", "aa", false),
528            ("%a%b%c%", "zzabqcz", true),
529            ("%a%b%c%", "cba", false),
530            ("a%b%c", "abc", true),
531            ("a%b%c", "axxbxxc", true),
532            ("a%b%c", "abcd", false),
533            // A single `%` still uses the (near-linear) string matcher.
534            ("%_%_%", "ab", true),
535            ("%%%%", "", true),
536            // The exact pathological shape from the fuzzer; super-linear before
537            // the fix, instant after.
538            ("%a%a%a%a%a", &long_a, true),
539            ("%a%a%a%a%a", "aaaa", false),
540        ];
541        for (pat, text, expected) in cases {
542            let m = compile(pat, false).unwrap();
543            assert_eq!(
544                m.is_match(text),
545                *expected,
546                "pattern {pat:?} against {text:?}"
547            );
548        }
549    }
550}