Skip to main content

mz_expr/scalar/
like_pattern.rs

1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::mem;
11use std::str::FromStr;
12
13use derivative::Derivative;
14use mz_lowertest::MzReflect;
15use mz_ore::fmt::FormatBuffer;
16use mz_repr::adt::regex::{Regex, RegexCompilationError};
17use serde::{Deserialize, Serialize};
18
19use crate::scalar::EvalError;
20
21/// The number of subpatterns after which using regexes would be more efficient.
22const MAX_SUBPATTERNS: usize = 5;
23
24/// The escape character to use by default in LIKE patterns.
25const DEFAULT_ESCAPE: char = '\\';
26const DOUBLED_ESCAPE: &str = "\\\\";
27
28/// Specifies escape behavior for the LIKE pattern.
29#[derive(Clone, Copy, Debug)]
30pub enum EscapeBehavior {
31    /// No escape character.
32    Disabled,
33    /// Use a custom escape character.
34    Char(char),
35}
36
37impl Default for EscapeBehavior {
38    fn default() -> EscapeBehavior {
39        EscapeBehavior::Char(DEFAULT_ESCAPE)
40    }
41}
42
43impl FromStr for EscapeBehavior {
44    type Err = EvalError;
45
46    fn from_str(s: &str) -> Result<EscapeBehavior, EvalError> {
47        let mut chars = s.chars();
48        match chars.next() {
49            None => Ok(EscapeBehavior::Disabled),
50            Some(c) => match chars.next() {
51                None => Ok(EscapeBehavior::Char(c)),
52                Some(_) => Err(EvalError::LikeEscapeTooLong),
53            },
54        }
55    }
56}
57
58/// Converts a pattern string that uses a custom escape character to one that uses the default.
59pub fn normalize_pattern(pattern: &str, escape: EscapeBehavior) -> Result<String, EvalError> {
60    match escape {
61        EscapeBehavior::Disabled => Ok(pattern.replace(DEFAULT_ESCAPE, DOUBLED_ESCAPE)),
62        EscapeBehavior::Char(DEFAULT_ESCAPE) => Ok(pattern.into()),
63        EscapeBehavior::Char(custom_escape_char) => {
64            let mut p = String::with_capacity(2 * pattern.len());
65            let mut cs = pattern.chars();
66            while let Some(c) = cs.next() {
67                if c == custom_escape_char {
68                    match cs.next() {
69                        Some(c2) => {
70                            p.push(DEFAULT_ESCAPE);
71                            p.push(c2);
72                        }
73                        None => return Err(EvalError::UnterminatedLikeEscapeSequence),
74                    }
75                } else if c == DEFAULT_ESCAPE {
76                    p.push_str(DOUBLED_ESCAPE);
77                } else {
78                    p.push(c);
79                }
80            }
81            p.shrink_to_fit();
82            Ok(p)
83        }
84    }
85}
86
87// This implementation supports a couple of different methods of matching
88// text against a SQL LIKE or ILIKE pattern.
89//
90// The most general approach is to convert the LIKE pattern into a
91// regular expression and use the well-tested Regex library to perform the
92// match. This works well with complex patterns and case-insensitive matches
93// that are hard to get right.
94//
95// That said, regular expressions aren't that efficient. For most patterns
96// we can do better using built-in string matching.
97
98pub use matcher::Matcher;
99use matcher::MatcherImpl;
100
101// This lint interacts poorly with `derivative` here; we are confident it generates
102// compatible `PartialOrd` and `Ord` impls. Unfortunately it also requires we introduce
103// this module to allow it.
104#[allow(clippy::non_canonical_partial_ord_impl)]
105mod matcher {
106    use super::*;
107
108    /// An object that can test whether a string matches a LIKE or ILIKE pattern.
109    #[derive(Debug, Clone, Deserialize, Serialize, Derivative, MzReflect)]
110    #[derivative(Eq, PartialEq, Ord, PartialOrd, Hash)]
111    pub struct Matcher {
112        pub pattern: String,
113        pub case_insensitive: bool,
114        #[derivative(
115            PartialEq = "ignore",
116            Hash = "ignore",
117            Ord = "ignore",
118            PartialOrd = "ignore"
119        )]
120        pub(super) matcher_impl: MatcherImpl,
121    }
122
123    impl Matcher {
124        pub fn is_match(&self, text: &str) -> bool {
125            match &self.matcher_impl {
126                MatcherImpl::String(subpatterns) => is_match_subpatterns(subpatterns, text),
127                MatcherImpl::Regex(r) => r.is_match(text),
128            }
129        }
130    }
131
132    #[derive(Debug, Clone, Deserialize, Serialize, MzReflect)]
133    pub(super) enum MatcherImpl {
134        String(Vec<Subpattern>),
135        Regex(Regex),
136    }
137}
138
139/// Builds a Matcher that matches a SQL LIKE pattern.
140pub fn compile(pattern: &str, case_insensitive: bool) -> Result<Matcher, EvalError> {
141    // We would like to have a consistent, documented limit to the size of
142    // supported LIKE patterns. The real limiting factor is the number of states
143    // that can be handled by the Regex library. In testing, I was able to
144    // create an adversarial pattern "%a%b%c%d%e..." that started failing around
145    // 9 KiB, so we chose 8 KiB as the limit. This is consistent with limits
146    // set by other databases, like SQL Server.
147    // On the other hand, PostgreSQL does not have a documented limit.
148    if pattern.len() > 8 << 10 {
149        return Err(EvalError::LikePatternTooLong);
150    }
151    let subpatterns = build_subpatterns(pattern)?;
152    // `is_match_subpatterns` resolves each `%` (a `many` subpattern) by searching
153    // for the following suffix and backtracking over every candidate position. A
154    // single `%` is near-linear, but with two or more the backtracking nests and
155    // the cost becomes super-linear in the text length — an adversarial pattern
156    // like `%a%a%a` against a long run of `a`s takes time proportional to
157    // `len(text)^(number of %)`, which can stall a worker for many minutes. The
158    // regex engine matches the same patterns in linear time with no backtracking,
159    // so fall back to it whenever more than one backtracking `%` is present (and
160    // for the existing case-insensitive / too-many-subpatterns reasons).
161    //
162    // A `%` with an *empty* suffix short-circuits in `is_match_subpatterns`
163    // without any `rfind`/backtracking, and (per `build_subpatterns`) such a `%`
164    // can only ever be the trailing subpattern. Only `%` subpatterns with a
165    // non-empty suffix nest the backtracking loops, so we count just those. This
166    // keeps the common "contains" pattern `%foo%` — which decomposes into one
167    // non-empty-suffix `%` plus a trailing empty-suffix `%` — on the fast string
168    // matcher, where a single `rfind` is far cheaper than a forward regex scan.
169    let backtracking_manys = subpatterns
170        .iter()
171        .filter(|s| s.many && !s.suffix.is_empty())
172        .count();
173    let use_regex =
174        case_insensitive || subpatterns.len() > MAX_SUBPATTERNS || backtracking_manys > 1;
175    let matcher_impl = match use_regex {
176        false => MatcherImpl::String(subpatterns),
177        true => MatcherImpl::Regex(build_regex(&subpatterns, case_insensitive)?),
178    };
179    Ok(Matcher {
180        pattern: pattern.into(),
181        case_insensitive,
182        matcher_impl,
183    })
184}
185
186// The algorithm below is based on the observation that any LIKE pattern can be
187// decomposed into multiple parts:
188//     <PATTERN> := <SUB-PATTERN> (<SUB-PATTERN> ...)
189//     <SUB-PATTERN> := <WILDCARDS> <SUFFIX>
190//
191// The sub-patterns start with zero or more wildcard characters, eventually
192// followed by (non-wildcard) literal characters. The last sub-pattern may
193// have an empty SUFFIX.
194//
195// Example: the PATTERN "n__dl%" can be broken into the following parts:
196//   1. SUB-PATTERN = <WILDCARDS ""> <SUFFIX "n">
197//   2. SUB-PATTERN = <WILDCARDS "__"> <SUFFIX "dl">
198//   3. SUB-PATTERN = <WILDCARDS "%"> <SUFFIX "">
199//
200// The WILDCARDS can be any combination of '_', which matches exactly 1 char,
201// and '%' which matches zero or more chars. These wildcards can be simplified
202// down to the (min, max) of characters they might consume:
203//     ""  = (0, 0)    // doesn't consume any characters
204//     "_" = (1, 1)    // consumes exactly one
205//     "%" = (0, many) // zero or more
206// These are additive, so:
207//     "_%"    = (1, many)
208//     "__%__" = (4, many)
209//     "%%%_"  = (1, many)
210
211#[derive(Debug, Default, Clone, Deserialize, Serialize, MzReflect)]
212struct Subpattern {
213    /// The minimum number of characters that can be consumed by the wildcard expression.
214    consume: usize,
215    /// Whether the wildcard expression can consume an arbitrary number of characters.
216    many: bool,
217    /// A string literal that is expected after the wildcards.
218    suffix: String,
219}
220
221impl Subpattern {
222    /// Converts a Subpattern to an equivalent regular expression and writes it to a given string.
223    fn write_regex_to(&self, r: &mut String) {
224        match self.consume {
225            0 => {
226                if self.many {
227                    r.push_str(".*");
228                }
229            }
230            1 => {
231                r.push('.');
232                if self.many {
233                    r.push('+');
234                }
235            }
236            n => {
237                r.push_str(".{");
238                write!(r, "{}", n);
239                if self.many {
240                    r.push(',');
241                }
242                r.push('}');
243            }
244        }
245        regex_syntax::escape_into(&self.suffix, r);
246    }
247}
248
249fn is_match_subpatterns(subpatterns: &[Subpattern], mut text: &str) -> bool {
250    let (subpattern, subpatterns) = match subpatterns {
251        [] => return text.is_empty(),
252        [subpattern, subpatterns @ ..] => (subpattern, subpatterns),
253    };
254    // Go ahead and skip the minimum number of characters the sub-pattern consumes:
255    if subpattern.consume > 0 {
256        let mut chars = text.chars();
257        if chars.nth(subpattern.consume - 1).is_none() {
258            return false;
259        }
260        text = chars.as_str();
261    }
262    if subpattern.many {
263        // The sub-pattern might consume any number of characters, but we need to find
264        // where it terminates so we can match any subsequent sub-patterns. We do this
265        // by searching for the suffix string using str::find.
266        //
267        // We could investigate using a fancier substring search like Boyer-Moore:
268        // https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm
269        //
270        // .. but it's likely not worth it. It's slower for small strings,
271        // and doesn't really start outperforming the naive approach until
272        // haystack sizes of 1KB or greater. See benchmarking results from:
273        // https://github.com/killerswan/boyer-moore-search/blob/master/README.md
274        //
275        // Another approach that may be interesting to look at is a
276        // hardware-optimized search:
277        // http://0x80.pl/articles/simd-strfind.html
278        if subpattern.suffix.len() == 0 {
279            // Nothing to find... This should only happen in the last sub-pattern.
280            assert!(
281                subpatterns.is_empty(),
282                "empty suffix in middle of a pattern"
283            );
284            return true;
285        }
286        // Use rfind so we perform a greedy capture, like Regex.
287        let mut found = text.rfind(&subpattern.suffix);
288        loop {
289            match found {
290                None => return false,
291                Some(offset) => {
292                    let mut end = offset + subpattern.suffix.len();
293                    if is_match_subpatterns(subpatterns, &text[end..]) {
294                        return true;
295                    }
296                    // Didn't match, look for the next rfind.
297                    if offset == 0 {
298                        return false;
299                    }
300                    // Find the previous valid char byte.
301                    loop {
302                        end -= 1;
303                        if text.is_char_boundary(end) {
304                            break;
305                        }
306                    }
307                    found = text[..end].rfind(&subpattern.suffix);
308                }
309            }
310        }
311    }
312    // No string search needed, we just use a prefix match on rest.
313    if !text.starts_with(&subpattern.suffix) {
314        return false;
315    }
316    is_match_subpatterns(subpatterns, &text[subpattern.suffix.len()..])
317}
318
319/// Breaks a LIKE pattern into a chain of sub-patterns.
320fn build_subpatterns(pattern: &str) -> Result<Vec<Subpattern>, EvalError> {
321    let mut subpatterns = Vec::with_capacity(MAX_SUBPATTERNS);
322    let mut current = Subpattern::default();
323    let mut in_wildcard = true;
324    let mut in_escape = false;
325    for c in pattern.chars() {
326        match c {
327            c if !in_escape && c == DEFAULT_ESCAPE => {
328                in_escape = true;
329                in_wildcard = false;
330            }
331            '_' if !in_escape => {
332                if !in_wildcard {
333                    current.suffix.shrink_to_fit();
334                    subpatterns.push(mem::take(&mut current));
335                    in_wildcard = true;
336                }
337                current.consume += 1;
338            }
339            '%' if !in_escape => {
340                if !in_wildcard {
341                    current.suffix.shrink_to_fit();
342                    subpatterns.push(mem::take(&mut current));
343                    in_wildcard = true;
344                }
345                current.many = true;
346            }
347            c => {
348                current.suffix.push(c);
349                in_escape = false;
350                in_wildcard = false;
351            }
352        }
353    }
354    if in_escape {
355        return Err(EvalError::UnterminatedLikeEscapeSequence);
356    }
357    current.suffix.shrink_to_fit();
358    subpatterns.push(current);
359    subpatterns.shrink_to_fit();
360    Ok(subpatterns)
361}
362
363/// Builds a regular expression that matches some parsed Subpatterns.
364fn build_regex(subpatterns: &[Subpattern], case_insensitive: bool) -> Result<Regex, EvalError> {
365    let mut r = String::from("^");
366    for sp in subpatterns {
367        sp.write_regex_to(&mut r);
368    }
369    r.push('$');
370    match Regex::new(&r, case_insensitive) {
371        Ok(regex) => Ok(regex),
372        Err(RegexCompilationError::PatternTooLarge { .. }) => Err(EvalError::LikePatternTooLong),
373        Err(RegexCompilationError::RegexError(regex::Error::CompiledTooBig(_))) => {
374            Err(EvalError::LikePatternTooLong)
375        }
376        Err(e) => Err(EvalError::Internal(
377            format!("build_regex produced invalid regex: {}", e).into(),
378        )),
379    }
380}
381
382// Unit Tests
383//
384// Most of the unit tests for LIKE and ILIKE can be found in:
385//    test/sqllogictest/cockroach/like.slt
386// These tests are here as a convenient place to run quick tests while
387// actively working on changes to the implementation. Make sure you
388// run the full test suite before submitting any changes.
389
390#[cfg(test)]
391mod test {
392    use super::*;
393
394    #[mz_ore::test]
395    fn test_normalize_pattern() {
396        struct TestCase<'a> {
397            pattern: &'a str,
398            escape: EscapeBehavior,
399            expected: &'a str,
400        }
401        let test_cases = vec![
402            TestCase {
403                pattern: "",
404                escape: EscapeBehavior::Disabled,
405                expected: "",
406            },
407            TestCase {
408                pattern: "ban%na!",
409                escape: EscapeBehavior::default(),
410                expected: "ban%na!",
411            },
412            TestCase {
413                pattern: "ban%%%na!",
414                escape: EscapeBehavior::Char('%'),
415                expected: "ban\\%\\na!",
416            },
417            TestCase {
418                pattern: "ban%na\\!",
419                escape: EscapeBehavior::Char('n'),
420                expected: "ba\\%\\a\\\\!",
421            },
422            TestCase {
423                pattern: "ban%na\\!",
424                escape: EscapeBehavior::Disabled,
425                expected: "ban%na\\\\!",
426            },
427            TestCase {
428                pattern: "ban\\na!",
429                escape: EscapeBehavior::Char('n'),
430                expected: "ba\\\\\\a!",
431            },
432            TestCase {
433                pattern: "ban\\\\na!",
434                escape: EscapeBehavior::Char('n'),
435                expected: "ba\\\\\\\\\\a!",
436            },
437            TestCase {
438                pattern: "food",
439                escape: EscapeBehavior::Char('o'),
440                expected: "f\\od",
441            },
442            TestCase {
443                pattern: "漢漢",
444                escape: EscapeBehavior::Char('漢'),
445                expected: "\\漢",
446            },
447        ];
448
449        for input in test_cases {
450            let actual = normalize_pattern(input.pattern, input.escape).unwrap();
451            assert!(
452                actual == input.expected,
453                "normalize_pattern({:?}, {:?}):\n\tactual: {:?}\n\texpected: {:?}\n",
454                input.pattern,
455                input.escape,
456                actual,
457                input.expected,
458            );
459        }
460    }
461
462    #[mz_ore::test]
463    fn test_escape_too_long() {
464        match EscapeBehavior::from_str("foo") {
465            Err(EvalError::LikeEscapeTooLong) => {}
466            _ => {
467                panic!("expected error when using escape string with >1 character");
468            }
469        }
470    }
471
472    #[mz_ore::test]
473    fn test_like() {
474        struct Input<'a> {
475            haystack: &'a str,
476            matches: bool,
477        }
478        let input = |haystack, matches| Input { haystack, matches };
479        struct Pattern<'a> {
480            needle: &'a str,
481            case_insensitive: bool,
482            inputs: Vec<Input<'a>>,
483        }
484        let test_cases = vec![
485            Pattern {
486                needle: "ban%na!",
487                case_insensitive: false,
488                inputs: vec![input("banana!", true)],
489            },
490            Pattern {
491                needle: "foo",
492                case_insensitive: true,
493                inputs: vec![
494                    input("", false),
495                    input("f", false),
496                    input("fo", false),
497                    input("foo", true),
498                    input("FOO", true),
499                    input("Foo", true),
500                    input("fOO", true),
501                    input("food", false),
502                ],
503            },
504        ];
505
506        for tc in test_cases {
507            let matcher = compile(tc.needle, tc.case_insensitive).unwrap();
508            for input in tc.inputs {
509                let actual = matcher.is_match(input.haystack);
510                assert!(
511                    actual == input.matches,
512                    "{:?} {} {:?}:\n\tactual: {:?}\n\texpected: {:?}\n",
513                    input.haystack,
514                    match tc.case_insensitive {
515                        true => "ILIKE",
516                        false => "LIKE",
517                    },
518                    tc.needle,
519                    actual,
520                    input.matches,
521                );
522            }
523        }
524    }
525
526    // Patterns with two or more `%` wildcards now compile to the linear regex
527    // matcher instead of the back-tracking string matcher, which was
528    // super-linear: an adversarial pattern like `%a%a%a` against a long run of
529    // `a`s took time proportional to `len(text)^(number of %)`. Verify the
530    // routing change still yields correct match results (the regex matcher and
531    // the string matcher must agree), including the previously-pathological
532    // shape, which now completes instantly regardless of text length.
533    #[mz_ore::test]
534    fn test_many_wildcards_match_correctly() {
535        let long_a = "a".repeat(64);
536        let cases: &[(&str, &str, bool)] = &[
537            ("%a%a%a", "xaxaxa", true),
538            ("%a%a%a", "aaa", true),
539            ("%a%a%a", "aa", false),
540            ("%a%b%c%", "zzabqcz", true),
541            ("%a%b%c%", "cba", false),
542            ("a%b%c", "abc", true),
543            ("a%b%c", "axxbxxc", true),
544            ("a%b%c", "abcd", false),
545            // A single `%` still uses the (near-linear) string matcher.
546            ("%_%_%", "ab", true),
547            ("%%%%", "", true),
548            // The exact pathological shape from the fuzzer; super-linear before
549            // the fix, instant after.
550            ("%a%a%a%a%a", &long_a, true),
551            ("%a%a%a%a%a", "aaaa", false),
552        ];
553        for (pat, text, expected) in cases {
554            let m = compile(pat, false).unwrap();
555            assert_eq!(
556                m.is_match(text),
557                *expected,
558                "pattern {pat:?} against {text:?}"
559            );
560        }
561    }
562
563    // Only `%` subpatterns with a non-empty suffix nest the back-tracking loops,
564    // so only two or more of *those* should reroute to the regex engine. A `%`
565    // with an empty suffix (always trailing) short-circuits without back-tracking
566    // and must not count. In particular the ubiquitous "contains"/prefix/suffix
567    // shapes (`%foo%`, `%foo`, `foo%`) must stay on the fast string matcher, where
568    // a single `rfind` is far cheaper than a forward regex scan. Lock the routing
569    // decision in so a future tweak to the predicate can't silently de-optimize
570    // the common case.
571    #[mz_ore::test]
572    fn test_string_matcher_routing() {
573        let uses_regex = |pat: &str| -> bool {
574            match compile(pat, false).unwrap().matcher_impl {
575                MatcherImpl::String(_) => false,
576                MatcherImpl::Regex(_) => true,
577            }
578        };
579        // Fast string-matcher path: at most one back-tracking `%`. The trailing
580        // `%` in `%foo%` has an empty suffix and short-circuits, so it does not
581        // count.
582        for pat in ["%foo%", "%foo", "foo%", "foo", "f_o", "%_%_%", "%%%%"] {
583            assert!(
584                !uses_regex(pat),
585                "pattern {pat:?} should use the fast string matcher"
586            );
587        }
588        // Regex path: two or more non-empty-suffix `%`, which are super-linear in
589        // the string matcher.
590        for pat in ["%foo%bar%", "%foo%bar", "a%b%c", "%a%a%a"] {
591            assert!(
592                uses_regex(pat),
593                "pattern {pat:?} should use the regex engine"
594            );
595        }
596        // Case-insensitive always uses regex, regardless of `%` count.
597        assert!(matches!(
598            compile("%foo%", true).unwrap().matcher_impl,
599            MatcherImpl::Regex(_)
600        ));
601    }
602}