mz_expr/scalar/like_pattern.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::mem;
11use std::str::FromStr;
12
13use derivative::Derivative;
14use mz_lowertest::MzReflect;
15use mz_ore::fmt::FormatBuffer;
16use mz_repr::adt::regex::{Regex, RegexCompilationError};
17use serde::{Deserialize, Serialize};
18
19use crate::scalar::EvalError;
20
21/// The number of subpatterns after which using regexes would be more efficient.
22const MAX_SUBPATTERNS: usize = 5;
23
24/// The escape character to use by default in LIKE patterns.
25const DEFAULT_ESCAPE: char = '\\';
26const DOUBLED_ESCAPE: &str = "\\\\";
27
28/// Specifies escape behavior for the LIKE pattern.
29#[derive(Clone, Copy, Debug)]
30pub enum EscapeBehavior {
31 /// No escape character.
32 Disabled,
33 /// Use a custom escape character.
34 Char(char),
35}
36
37impl Default for EscapeBehavior {
38 fn default() -> EscapeBehavior {
39 EscapeBehavior::Char(DEFAULT_ESCAPE)
40 }
41}
42
43impl FromStr for EscapeBehavior {
44 type Err = EvalError;
45
46 fn from_str(s: &str) -> Result<EscapeBehavior, EvalError> {
47 let mut chars = s.chars();
48 match chars.next() {
49 None => Ok(EscapeBehavior::Disabled),
50 Some(c) => match chars.next() {
51 None => Ok(EscapeBehavior::Char(c)),
52 Some(_) => Err(EvalError::LikeEscapeTooLong),
53 },
54 }
55 }
56}
57
58/// Converts a pattern string that uses a custom escape character to one that uses the default.
59pub fn normalize_pattern(pattern: &str, escape: EscapeBehavior) -> Result<String, EvalError> {
60 match escape {
61 EscapeBehavior::Disabled => Ok(pattern.replace(DEFAULT_ESCAPE, DOUBLED_ESCAPE)),
62 EscapeBehavior::Char(DEFAULT_ESCAPE) => Ok(pattern.into()),
63 EscapeBehavior::Char(custom_escape_char) => {
64 let mut p = String::with_capacity(2 * pattern.len());
65 let mut cs = pattern.chars();
66 while let Some(c) = cs.next() {
67 if c == custom_escape_char {
68 match cs.next() {
69 Some(c2) => {
70 p.push(DEFAULT_ESCAPE);
71 p.push(c2);
72 }
73 None => return Err(EvalError::UnterminatedLikeEscapeSequence),
74 }
75 } else if c == DEFAULT_ESCAPE {
76 p.push_str(DOUBLED_ESCAPE);
77 } else {
78 p.push(c);
79 }
80 }
81 p.shrink_to_fit();
82 Ok(p)
83 }
84 }
85}
86
87// This implementation supports a couple of different methods of matching
88// text against a SQL LIKE or ILIKE pattern.
89//
90// The most general approach is to convert the LIKE pattern into a
91// regular expression and use the well-tested Regex library to perform the
92// match. This works well with complex patterns and case-insensitive matches
93// that are hard to get right.
94//
95// That said, regular expressions aren't that efficient. For most patterns
96// we can do better using built-in string matching.
97
98pub use matcher::Matcher;
99use matcher::MatcherImpl;
100
101// This lint interacts poorly with `derivative` here; we are confident it generates
102// compatible `PartialOrd` and `Ord` impls. Unfortunately it also requires we introduce
103// this module to allow it.
104#[allow(clippy::non_canonical_partial_ord_impl)]
105mod matcher {
106 use super::*;
107
108 /// An object that can test whether a string matches a LIKE or ILIKE pattern.
109 #[derive(Debug, Clone, Deserialize, Serialize, Derivative, MzReflect)]
110 #[derivative(Eq, PartialEq, Ord, PartialOrd, Hash)]
111 pub struct Matcher {
112 pub pattern: String,
113 pub case_insensitive: bool,
114 #[derivative(
115 PartialEq = "ignore",
116 Hash = "ignore",
117 Ord = "ignore",
118 PartialOrd = "ignore"
119 )]
120 pub(super) matcher_impl: MatcherImpl,
121 }
122
123 impl Matcher {
124 pub fn is_match(&self, text: &str) -> bool {
125 match &self.matcher_impl {
126 MatcherImpl::String(subpatterns) => is_match_subpatterns(subpatterns, text),
127 MatcherImpl::Regex(r) => r.is_match(text),
128 }
129 }
130 }
131
132 #[derive(Debug, Clone, Deserialize, Serialize, MzReflect)]
133 pub(super) enum MatcherImpl {
134 String(Vec<Subpattern>),
135 Regex(Regex),
136 }
137}
138
139/// Builds a Matcher that matches a SQL LIKE pattern.
140pub fn compile(pattern: &str, case_insensitive: bool) -> Result<Matcher, EvalError> {
141 // We would like to have a consistent, documented limit to the size of
142 // supported LIKE patterns. The real limiting factor is the number of states
143 // that can be handled by the Regex library. In testing, I was able to
144 // create an adversarial pattern "%a%b%c%d%e..." that started failing around
145 // 9 KiB, so we chose 8 KiB as the limit. This is consistent with limits
146 // set by other databases, like SQL Server.
147 // On the other hand, PostgreSQL does not have a documented limit.
148 if pattern.len() > 8 << 10 {
149 return Err(EvalError::LikePatternTooLong);
150 }
151 let subpatterns = build_subpatterns(pattern)?;
152 // `is_match_subpatterns` resolves each `%` (a `many` subpattern) by searching
153 // for the following suffix and backtracking over every candidate position. A
154 // single `%` is near-linear, but with two or more the backtracking nests and
155 // the cost becomes super-linear in the text length — an adversarial pattern
156 // like `%a%a%a` against a long run of `a`s takes time proportional to
157 // `len(text)^(number of %)`, which can stall a worker for many minutes. The
158 // regex engine matches the same patterns in linear time with no backtracking,
159 // so fall back to it whenever more than one backtracking `%` is present (and
160 // for the existing case-insensitive / too-many-subpatterns reasons).
161 //
162 // A `%` with an *empty* suffix short-circuits in `is_match_subpatterns`
163 // without any `rfind`/backtracking, and (per `build_subpatterns`) such a `%`
164 // can only ever be the trailing subpattern. Only `%` subpatterns with a
165 // non-empty suffix nest the backtracking loops, so we count just those. This
166 // keeps the common "contains" pattern `%foo%` — which decomposes into one
167 // non-empty-suffix `%` plus a trailing empty-suffix `%` — on the fast string
168 // matcher, where a single `rfind` is far cheaper than a forward regex scan.
169 let backtracking_manys = subpatterns
170 .iter()
171 .filter(|s| s.many && !s.suffix.is_empty())
172 .count();
173 let use_regex =
174 case_insensitive || subpatterns.len() > MAX_SUBPATTERNS || backtracking_manys > 1;
175 let matcher_impl = match use_regex {
176 false => MatcherImpl::String(subpatterns),
177 true => MatcherImpl::Regex(build_regex(&subpatterns, case_insensitive)?),
178 };
179 Ok(Matcher {
180 pattern: pattern.into(),
181 case_insensitive,
182 matcher_impl,
183 })
184}
185
186// The algorithm below is based on the observation that any LIKE pattern can be
187// decomposed into multiple parts:
188// <PATTERN> := <SUB-PATTERN> (<SUB-PATTERN> ...)
189// <SUB-PATTERN> := <WILDCARDS> <SUFFIX>
190//
191// The sub-patterns start with zero or more wildcard characters, eventually
192// followed by (non-wildcard) literal characters. The last sub-pattern may
193// have an empty SUFFIX.
194//
195// Example: the PATTERN "n__dl%" can be broken into the following parts:
196// 1. SUB-PATTERN = <WILDCARDS ""> <SUFFIX "n">
197// 2. SUB-PATTERN = <WILDCARDS "__"> <SUFFIX "dl">
198// 3. SUB-PATTERN = <WILDCARDS "%"> <SUFFIX "">
199//
200// The WILDCARDS can be any combination of '_', which matches exactly 1 char,
201// and '%' which matches zero or more chars. These wildcards can be simplified
202// down to the (min, max) of characters they might consume:
203// "" = (0, 0) // doesn't consume any characters
204// "_" = (1, 1) // consumes exactly one
205// "%" = (0, many) // zero or more
206// These are additive, so:
207// "_%" = (1, many)
208// "__%__" = (4, many)
209// "%%%_" = (1, many)
210
211#[derive(Debug, Default, Clone, Deserialize, Serialize, MzReflect)]
212struct Subpattern {
213 /// The minimum number of characters that can be consumed by the wildcard expression.
214 consume: usize,
215 /// Whether the wildcard expression can consume an arbitrary number of characters.
216 many: bool,
217 /// A string literal that is expected after the wildcards.
218 suffix: String,
219}
220
221impl Subpattern {
222 /// Converts a Subpattern to an equivalent regular expression and writes it to a given string.
223 fn write_regex_to(&self, r: &mut String) {
224 match self.consume {
225 0 => {
226 if self.many {
227 r.push_str(".*");
228 }
229 }
230 1 => {
231 r.push('.');
232 if self.many {
233 r.push('+');
234 }
235 }
236 n => {
237 r.push_str(".{");
238 write!(r, "{}", n);
239 if self.many {
240 r.push(',');
241 }
242 r.push('}');
243 }
244 }
245 regex_syntax::escape_into(&self.suffix, r);
246 }
247}
248
249fn is_match_subpatterns(subpatterns: &[Subpattern], mut text: &str) -> bool {
250 let (subpattern, subpatterns) = match subpatterns {
251 [] => return text.is_empty(),
252 [subpattern, subpatterns @ ..] => (subpattern, subpatterns),
253 };
254 // Go ahead and skip the minimum number of characters the sub-pattern consumes:
255 if subpattern.consume > 0 {
256 let mut chars = text.chars();
257 if chars.nth(subpattern.consume - 1).is_none() {
258 return false;
259 }
260 text = chars.as_str();
261 }
262 if subpattern.many {
263 // The sub-pattern might consume any number of characters, but we need to find
264 // where it terminates so we can match any subsequent sub-patterns. We do this
265 // by searching for the suffix string using str::find.
266 //
267 // We could investigate using a fancier substring search like Boyer-Moore:
268 // https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm
269 //
270 // .. but it's likely not worth it. It's slower for small strings,
271 // and doesn't really start outperforming the naive approach until
272 // haystack sizes of 1KB or greater. See benchmarking results from:
273 // https://github.com/killerswan/boyer-moore-search/blob/master/README.md
274 //
275 // Another approach that may be interesting to look at is a
276 // hardware-optimized search:
277 // http://0x80.pl/articles/simd-strfind.html
278 if subpattern.suffix.len() == 0 {
279 // Nothing to find... This should only happen in the last sub-pattern.
280 assert!(
281 subpatterns.is_empty(),
282 "empty suffix in middle of a pattern"
283 );
284 return true;
285 }
286 // Use rfind so we perform a greedy capture, like Regex.
287 let mut found = text.rfind(&subpattern.suffix);
288 loop {
289 match found {
290 None => return false,
291 Some(offset) => {
292 let mut end = offset + subpattern.suffix.len();
293 if is_match_subpatterns(subpatterns, &text[end..]) {
294 return true;
295 }
296 // Didn't match, look for the next rfind.
297 if offset == 0 {
298 return false;
299 }
300 // Find the previous valid char byte.
301 loop {
302 end -= 1;
303 if text.is_char_boundary(end) {
304 break;
305 }
306 }
307 found = text[..end].rfind(&subpattern.suffix);
308 }
309 }
310 }
311 }
312 // No string search needed, we just use a prefix match on rest.
313 if !text.starts_with(&subpattern.suffix) {
314 return false;
315 }
316 is_match_subpatterns(subpatterns, &text[subpattern.suffix.len()..])
317}
318
319/// Breaks a LIKE pattern into a chain of sub-patterns.
320fn build_subpatterns(pattern: &str) -> Result<Vec<Subpattern>, EvalError> {
321 let mut subpatterns = Vec::with_capacity(MAX_SUBPATTERNS);
322 let mut current = Subpattern::default();
323 let mut in_wildcard = true;
324 let mut in_escape = false;
325 for c in pattern.chars() {
326 match c {
327 c if !in_escape && c == DEFAULT_ESCAPE => {
328 in_escape = true;
329 in_wildcard = false;
330 }
331 '_' if !in_escape => {
332 if !in_wildcard {
333 current.suffix.shrink_to_fit();
334 subpatterns.push(mem::take(&mut current));
335 in_wildcard = true;
336 }
337 current.consume += 1;
338 }
339 '%' if !in_escape => {
340 if !in_wildcard {
341 current.suffix.shrink_to_fit();
342 subpatterns.push(mem::take(&mut current));
343 in_wildcard = true;
344 }
345 current.many = true;
346 }
347 c => {
348 current.suffix.push(c);
349 in_escape = false;
350 in_wildcard = false;
351 }
352 }
353 }
354 if in_escape {
355 return Err(EvalError::UnterminatedLikeEscapeSequence);
356 }
357 current.suffix.shrink_to_fit();
358 subpatterns.push(current);
359 subpatterns.shrink_to_fit();
360 Ok(subpatterns)
361}
362
363/// Builds a regular expression that matches some parsed Subpatterns.
364fn build_regex(subpatterns: &[Subpattern], case_insensitive: bool) -> Result<Regex, EvalError> {
365 let mut r = String::from("^");
366 for sp in subpatterns {
367 sp.write_regex_to(&mut r);
368 }
369 r.push('$');
370 match Regex::new(&r, case_insensitive) {
371 Ok(regex) => Ok(regex),
372 Err(RegexCompilationError::PatternTooLarge { .. }) => Err(EvalError::LikePatternTooLong),
373 Err(RegexCompilationError::RegexError(regex::Error::CompiledTooBig(_))) => {
374 Err(EvalError::LikePatternTooLong)
375 }
376 Err(e) => Err(EvalError::Internal(
377 format!("build_regex produced invalid regex: {}", e).into(),
378 )),
379 }
380}
381
382// Unit Tests
383//
384// Most of the unit tests for LIKE and ILIKE can be found in:
385// test/sqllogictest/cockroach/like.slt
386// These tests are here as a convenient place to run quick tests while
387// actively working on changes to the implementation. Make sure you
388// run the full test suite before submitting any changes.
389
390#[cfg(test)]
391mod test {
392 use super::*;
393
394 #[mz_ore::test]
395 fn test_normalize_pattern() {
396 struct TestCase<'a> {
397 pattern: &'a str,
398 escape: EscapeBehavior,
399 expected: &'a str,
400 }
401 let test_cases = vec![
402 TestCase {
403 pattern: "",
404 escape: EscapeBehavior::Disabled,
405 expected: "",
406 },
407 TestCase {
408 pattern: "ban%na!",
409 escape: EscapeBehavior::default(),
410 expected: "ban%na!",
411 },
412 TestCase {
413 pattern: "ban%%%na!",
414 escape: EscapeBehavior::Char('%'),
415 expected: "ban\\%\\na!",
416 },
417 TestCase {
418 pattern: "ban%na\\!",
419 escape: EscapeBehavior::Char('n'),
420 expected: "ba\\%\\a\\\\!",
421 },
422 TestCase {
423 pattern: "ban%na\\!",
424 escape: EscapeBehavior::Disabled,
425 expected: "ban%na\\\\!",
426 },
427 TestCase {
428 pattern: "ban\\na!",
429 escape: EscapeBehavior::Char('n'),
430 expected: "ba\\\\\\a!",
431 },
432 TestCase {
433 pattern: "ban\\\\na!",
434 escape: EscapeBehavior::Char('n'),
435 expected: "ba\\\\\\\\\\a!",
436 },
437 TestCase {
438 pattern: "food",
439 escape: EscapeBehavior::Char('o'),
440 expected: "f\\od",
441 },
442 TestCase {
443 pattern: "漢漢",
444 escape: EscapeBehavior::Char('漢'),
445 expected: "\\漢",
446 },
447 ];
448
449 for input in test_cases {
450 let actual = normalize_pattern(input.pattern, input.escape).unwrap();
451 assert!(
452 actual == input.expected,
453 "normalize_pattern({:?}, {:?}):\n\tactual: {:?}\n\texpected: {:?}\n",
454 input.pattern,
455 input.escape,
456 actual,
457 input.expected,
458 );
459 }
460 }
461
462 #[mz_ore::test]
463 fn test_escape_too_long() {
464 match EscapeBehavior::from_str("foo") {
465 Err(EvalError::LikeEscapeTooLong) => {}
466 _ => {
467 panic!("expected error when using escape string with >1 character");
468 }
469 }
470 }
471
472 #[mz_ore::test]
473 fn test_like() {
474 struct Input<'a> {
475 haystack: &'a str,
476 matches: bool,
477 }
478 let input = |haystack, matches| Input { haystack, matches };
479 struct Pattern<'a> {
480 needle: &'a str,
481 case_insensitive: bool,
482 inputs: Vec<Input<'a>>,
483 }
484 let test_cases = vec![
485 Pattern {
486 needle: "ban%na!",
487 case_insensitive: false,
488 inputs: vec![input("banana!", true)],
489 },
490 Pattern {
491 needle: "foo",
492 case_insensitive: true,
493 inputs: vec![
494 input("", false),
495 input("f", false),
496 input("fo", false),
497 input("foo", true),
498 input("FOO", true),
499 input("Foo", true),
500 input("fOO", true),
501 input("food", false),
502 ],
503 },
504 ];
505
506 for tc in test_cases {
507 let matcher = compile(tc.needle, tc.case_insensitive).unwrap();
508 for input in tc.inputs {
509 let actual = matcher.is_match(input.haystack);
510 assert!(
511 actual == input.matches,
512 "{:?} {} {:?}:\n\tactual: {:?}\n\texpected: {:?}\n",
513 input.haystack,
514 match tc.case_insensitive {
515 true => "ILIKE",
516 false => "LIKE",
517 },
518 tc.needle,
519 actual,
520 input.matches,
521 );
522 }
523 }
524 }
525
526 // Patterns with two or more `%` wildcards now compile to the linear regex
527 // matcher instead of the back-tracking string matcher, which was
528 // super-linear: an adversarial pattern like `%a%a%a` against a long run of
529 // `a`s took time proportional to `len(text)^(number of %)`. Verify the
530 // routing change still yields correct match results (the regex matcher and
531 // the string matcher must agree), including the previously-pathological
532 // shape, which now completes instantly regardless of text length.
533 #[mz_ore::test]
534 fn test_many_wildcards_match_correctly() {
535 let long_a = "a".repeat(64);
536 let cases: &[(&str, &str, bool)] = &[
537 ("%a%a%a", "xaxaxa", true),
538 ("%a%a%a", "aaa", true),
539 ("%a%a%a", "aa", false),
540 ("%a%b%c%", "zzabqcz", true),
541 ("%a%b%c%", "cba", false),
542 ("a%b%c", "abc", true),
543 ("a%b%c", "axxbxxc", true),
544 ("a%b%c", "abcd", false),
545 // A single `%` still uses the (near-linear) string matcher.
546 ("%_%_%", "ab", true),
547 ("%%%%", "", true),
548 // The exact pathological shape from the fuzzer; super-linear before
549 // the fix, instant after.
550 ("%a%a%a%a%a", &long_a, true),
551 ("%a%a%a%a%a", "aaaa", false),
552 ];
553 for (pat, text, expected) in cases {
554 let m = compile(pat, false).unwrap();
555 assert_eq!(
556 m.is_match(text),
557 *expected,
558 "pattern {pat:?} against {text:?}"
559 );
560 }
561 }
562
563 // Only `%` subpatterns with a non-empty suffix nest the back-tracking loops,
564 // so only two or more of *those* should reroute to the regex engine. A `%`
565 // with an empty suffix (always trailing) short-circuits without back-tracking
566 // and must not count. In particular the ubiquitous "contains"/prefix/suffix
567 // shapes (`%foo%`, `%foo`, `foo%`) must stay on the fast string matcher, where
568 // a single `rfind` is far cheaper than a forward regex scan. Lock the routing
569 // decision in so a future tweak to the predicate can't silently de-optimize
570 // the common case.
571 #[mz_ore::test]
572 fn test_string_matcher_routing() {
573 let uses_regex = |pat: &str| -> bool {
574 match compile(pat, false).unwrap().matcher_impl {
575 MatcherImpl::String(_) => false,
576 MatcherImpl::Regex(_) => true,
577 }
578 };
579 // Fast string-matcher path: at most one back-tracking `%`. The trailing
580 // `%` in `%foo%` has an empty suffix and short-circuits, so it does not
581 // count.
582 for pat in ["%foo%", "%foo", "foo%", "foo", "f_o", "%_%_%", "%%%%"] {
583 assert!(
584 !uses_regex(pat),
585 "pattern {pat:?} should use the fast string matcher"
586 );
587 }
588 // Regex path: two or more non-empty-suffix `%`, which are super-linear in
589 // the string matcher.
590 for pat in ["%foo%bar%", "%foo%bar", "a%b%c", "%a%a%a"] {
591 assert!(
592 uses_regex(pat),
593 "pattern {pat:?} should use the regex engine"
594 );
595 }
596 // Case-insensitive always uses regex, regardless of `%` count.
597 assert!(matches!(
598 compile("%foo%", true).unwrap().matcher_impl,
599 MatcherImpl::Regex(_)
600 ));
601 }
602}