mz_expr/scalar/like_pattern.rs
1// Copyright Materialize, Inc. and contributors. All rights reserved.
2//
3// Use of this software is governed by the Business Source License
4// included in the LICENSE file.
5//
6// As of the Change Date specified in that file, in accordance with
7// the Business Source License, use of this software will be governed
8// by the Apache License, Version 2.0.
9
10use std::mem;
11use std::str::FromStr;
12
13use derivative::Derivative;
14use mz_lowertest::MzReflect;
15use mz_ore::fmt::FormatBuffer;
16use mz_repr::adt::regex::{Regex, RegexCompilationError};
17use serde::{Deserialize, Serialize};
18
19use crate::scalar::EvalError;
20
21/// The number of subpatterns after which using regexes would be more efficient.
22const MAX_SUBPATTERNS: usize = 5;
23
24/// The escape character to use by default in LIKE patterns.
25const DEFAULT_ESCAPE: char = '\\';
26const DOUBLED_ESCAPE: &str = "\\\\";
27
28/// Specifies escape behavior for the LIKE pattern.
29#[derive(Clone, Copy, Debug)]
30pub enum EscapeBehavior {
31 /// No escape character.
32 Disabled,
33 /// Use a custom escape character.
34 Char(char),
35}
36
37impl Default for EscapeBehavior {
38 fn default() -> EscapeBehavior {
39 EscapeBehavior::Char(DEFAULT_ESCAPE)
40 }
41}
42
43impl FromStr for EscapeBehavior {
44 type Err = EvalError;
45
46 fn from_str(s: &str) -> Result<EscapeBehavior, EvalError> {
47 let mut chars = s.chars();
48 match chars.next() {
49 None => Ok(EscapeBehavior::Disabled),
50 Some(c) => match chars.next() {
51 None => Ok(EscapeBehavior::Char(c)),
52 Some(_) => Err(EvalError::LikeEscapeTooLong),
53 },
54 }
55 }
56}
57
58/// Converts a pattern string that uses a custom escape character to one that uses the default.
59pub fn normalize_pattern(pattern: &str, escape: EscapeBehavior) -> Result<String, EvalError> {
60 match escape {
61 EscapeBehavior::Disabled => Ok(pattern.replace(DEFAULT_ESCAPE, DOUBLED_ESCAPE)),
62 EscapeBehavior::Char(DEFAULT_ESCAPE) => Ok(pattern.into()),
63 EscapeBehavior::Char(custom_escape_char) => {
64 let mut p = String::with_capacity(2 * pattern.len());
65 let mut cs = pattern.chars();
66 while let Some(c) = cs.next() {
67 if c == custom_escape_char {
68 match cs.next() {
69 Some(c2) => {
70 p.push(DEFAULT_ESCAPE);
71 p.push(c2);
72 }
73 None => return Err(EvalError::UnterminatedLikeEscapeSequence),
74 }
75 } else if c == DEFAULT_ESCAPE {
76 p.push_str(DOUBLED_ESCAPE);
77 } else {
78 p.push(c);
79 }
80 }
81 p.shrink_to_fit();
82 Ok(p)
83 }
84 }
85}
86
87// This implementation supports a couple of different methods of matching
88// text against a SQL LIKE or ILIKE pattern.
89//
90// The most general approach is to convert the LIKE pattern into a
91// regular expression and use the well-tested Regex library to perform the
92// match. This works well with complex patterns and case-insensitive matches
93// that are hard to get right.
94//
95// That said, regular expressions aren't that efficient. For most patterns
96// we can do better using built-in string matching.
97
98pub use matcher::Matcher;
99use matcher::MatcherImpl;
100
101// This lint interacts poorly with `derivative` here; we are confident it generates
102// compatible `PartialOrd` and `Ord` impls. Unfortunately it also requires we introduce
103// this module to allow it.
104#[allow(clippy::non_canonical_partial_ord_impl)]
105mod matcher {
106 use super::*;
107
108 /// An object that can test whether a string matches a LIKE or ILIKE pattern.
109 #[derive(Debug, Clone, Deserialize, Serialize, Derivative, MzReflect)]
110 #[derivative(Eq, PartialEq, Ord, PartialOrd, Hash)]
111 pub struct Matcher {
112 pub pattern: String,
113 pub case_insensitive: bool,
114 #[derivative(
115 PartialEq = "ignore",
116 Hash = "ignore",
117 Ord = "ignore",
118 PartialOrd = "ignore"
119 )]
120 pub(super) matcher_impl: MatcherImpl,
121 }
122
123 impl Matcher {
124 pub fn is_match(&self, text: &str) -> bool {
125 match &self.matcher_impl {
126 MatcherImpl::String(subpatterns) => is_match_subpatterns(subpatterns, text),
127 MatcherImpl::Regex(r) => r.is_match(text),
128 }
129 }
130 }
131
132 #[derive(Debug, Clone, Deserialize, Serialize, MzReflect)]
133 pub(super) enum MatcherImpl {
134 String(Vec<Subpattern>),
135 Regex(Regex),
136 }
137}
138
139/// Builds a Matcher that matches a SQL LIKE pattern.
140pub fn compile(pattern: &str, case_insensitive: bool) -> Result<Matcher, EvalError> {
141 // We would like to have a consistent, documented limit to the size of
142 // supported LIKE patterns. The real limiting factor is the number of states
143 // that can be handled by the Regex library. In testing, I was able to
144 // create an adversarial pattern "%a%b%c%d%e..." that started failing around
145 // 9 KiB, so we chose 8 KiB as the limit. This is consistent with limits
146 // set by other databases, like SQL Server.
147 // On the other hand, PostgreSQL does not have a documented limit.
148 if pattern.len() > 8 << 10 {
149 return Err(EvalError::LikePatternTooLong);
150 }
151 let subpatterns = build_subpatterns(pattern)?;
152 // `is_match_subpatterns` resolves each `%` (a `many` subpattern) by searching
153 // for the following suffix and backtracking over every candidate position. A
154 // single `%` is near-linear, but with two or more the backtracking nests and
155 // the cost becomes super-linear in the text length — an adversarial pattern
156 // like `%a%a%a` against a long run of `a`s takes time proportional to
157 // `len(text)^(number of %)`, which can stall a worker for many minutes. The
158 // regex engine matches the same patterns in linear time with no backtracking,
159 // so fall back to it whenever more than one `many` subpattern is present (and
160 // for the existing case-insensitive / too-many-subpatterns reasons).
161 let many_subpatterns = subpatterns.iter().filter(|s| s.many).count();
162 let use_regex = case_insensitive || subpatterns.len() > MAX_SUBPATTERNS || many_subpatterns > 1;
163 let matcher_impl = match use_regex {
164 false => MatcherImpl::String(subpatterns),
165 true => MatcherImpl::Regex(build_regex(&subpatterns, case_insensitive)?),
166 };
167 Ok(Matcher {
168 pattern: pattern.into(),
169 case_insensitive,
170 matcher_impl,
171 })
172}
173
174// The algorithm below is based on the observation that any LIKE pattern can be
175// decomposed into multiple parts:
176// <PATTERN> := <SUB-PATTERN> (<SUB-PATTERN> ...)
177// <SUB-PATTERN> := <WILDCARDS> <SUFFIX>
178//
179// The sub-patterns start with zero or more wildcard characters, eventually
180// followed by (non-wildcard) literal characters. The last sub-pattern may
181// have an empty SUFFIX.
182//
183// Example: the PATTERN "n__dl%" can be broken into the following parts:
184// 1. SUB-PATTERN = <WILDCARDS ""> <SUFFIX "n">
185// 2. SUB-PATTERN = <WILDCARDS "__"> <SUFFIX "dl">
186// 3. SUB-PATTERN = <WILDCARDS "%"> <SUFFIX "">
187//
188// The WILDCARDS can be any combination of '_', which matches exactly 1 char,
189// and '%' which matches zero or more chars. These wildcards can be simplified
190// down to the (min, max) of characters they might consume:
191// "" = (0, 0) // doesn't consume any characters
192// "_" = (1, 1) // consumes exactly one
193// "%" = (0, many) // zero or more
194// These are additive, so:
195// "_%" = (1, many)
196// "__%__" = (4, many)
197// "%%%_" = (1, many)
198
199#[derive(Debug, Default, Clone, Deserialize, Serialize, MzReflect)]
200struct Subpattern {
201 /// The minimum number of characters that can be consumed by the wildcard expression.
202 consume: usize,
203 /// Whether the wildcard expression can consume an arbitrary number of characters.
204 many: bool,
205 /// A string literal that is expected after the wildcards.
206 suffix: String,
207}
208
209impl Subpattern {
210 /// Converts a Subpattern to an equivalent regular expression and writes it to a given string.
211 fn write_regex_to(&self, r: &mut String) {
212 match self.consume {
213 0 => {
214 if self.many {
215 r.push_str(".*");
216 }
217 }
218 1 => {
219 r.push('.');
220 if self.many {
221 r.push('+');
222 }
223 }
224 n => {
225 r.push_str(".{");
226 write!(r, "{}", n);
227 if self.many {
228 r.push(',');
229 }
230 r.push('}');
231 }
232 }
233 regex_syntax::escape_into(&self.suffix, r);
234 }
235}
236
237fn is_match_subpatterns(subpatterns: &[Subpattern], mut text: &str) -> bool {
238 let (subpattern, subpatterns) = match subpatterns {
239 [] => return text.is_empty(),
240 [subpattern, subpatterns @ ..] => (subpattern, subpatterns),
241 };
242 // Go ahead and skip the minimum number of characters the sub-pattern consumes:
243 if subpattern.consume > 0 {
244 let mut chars = text.chars();
245 if chars.nth(subpattern.consume - 1).is_none() {
246 return false;
247 }
248 text = chars.as_str();
249 }
250 if subpattern.many {
251 // The sub-pattern might consume any number of characters, but we need to find
252 // where it terminates so we can match any subsequent sub-patterns. We do this
253 // by searching for the suffix string using str::find.
254 //
255 // We could investigate using a fancier substring search like Boyer-Moore:
256 // https://en.wikipedia.org/wiki/Boyer%E2%80%93Moore_string-search_algorithm
257 //
258 // .. but it's likely not worth it. It's slower for small strings,
259 // and doesn't really start outperforming the naive approach until
260 // haystack sizes of 1KB or greater. See benchmarking results from:
261 // https://github.com/killerswan/boyer-moore-search/blob/master/README.md
262 //
263 // Another approach that may be interesting to look at is a
264 // hardware-optimized search:
265 // http://0x80.pl/articles/simd-strfind.html
266 if subpattern.suffix.len() == 0 {
267 // Nothing to find... This should only happen in the last sub-pattern.
268 assert!(
269 subpatterns.is_empty(),
270 "empty suffix in middle of a pattern"
271 );
272 return true;
273 }
274 // Use rfind so we perform a greedy capture, like Regex.
275 let mut found = text.rfind(&subpattern.suffix);
276 loop {
277 match found {
278 None => return false,
279 Some(offset) => {
280 let mut end = offset + subpattern.suffix.len();
281 if is_match_subpatterns(subpatterns, &text[end..]) {
282 return true;
283 }
284 // Didn't match, look for the next rfind.
285 if offset == 0 {
286 return false;
287 }
288 // Find the previous valid char byte.
289 loop {
290 end -= 1;
291 if text.is_char_boundary(end) {
292 break;
293 }
294 }
295 found = text[..end].rfind(&subpattern.suffix);
296 }
297 }
298 }
299 }
300 // No string search needed, we just use a prefix match on rest.
301 if !text.starts_with(&subpattern.suffix) {
302 return false;
303 }
304 is_match_subpatterns(subpatterns, &text[subpattern.suffix.len()..])
305}
306
307/// Breaks a LIKE pattern into a chain of sub-patterns.
308fn build_subpatterns(pattern: &str) -> Result<Vec<Subpattern>, EvalError> {
309 let mut subpatterns = Vec::with_capacity(MAX_SUBPATTERNS);
310 let mut current = Subpattern::default();
311 let mut in_wildcard = true;
312 let mut in_escape = false;
313 for c in pattern.chars() {
314 match c {
315 c if !in_escape && c == DEFAULT_ESCAPE => {
316 in_escape = true;
317 in_wildcard = false;
318 }
319 '_' if !in_escape => {
320 if !in_wildcard {
321 current.suffix.shrink_to_fit();
322 subpatterns.push(mem::take(&mut current));
323 in_wildcard = true;
324 }
325 current.consume += 1;
326 }
327 '%' if !in_escape => {
328 if !in_wildcard {
329 current.suffix.shrink_to_fit();
330 subpatterns.push(mem::take(&mut current));
331 in_wildcard = true;
332 }
333 current.many = true;
334 }
335 c => {
336 current.suffix.push(c);
337 in_escape = false;
338 in_wildcard = false;
339 }
340 }
341 }
342 if in_escape {
343 return Err(EvalError::UnterminatedLikeEscapeSequence);
344 }
345 current.suffix.shrink_to_fit();
346 subpatterns.push(current);
347 subpatterns.shrink_to_fit();
348 Ok(subpatterns)
349}
350
351/// Builds a regular expression that matches some parsed Subpatterns.
352fn build_regex(subpatterns: &[Subpattern], case_insensitive: bool) -> Result<Regex, EvalError> {
353 let mut r = String::from("^");
354 for sp in subpatterns {
355 sp.write_regex_to(&mut r);
356 }
357 r.push('$');
358 match Regex::new(&r, case_insensitive) {
359 Ok(regex) => Ok(regex),
360 Err(RegexCompilationError::PatternTooLarge { .. }) => Err(EvalError::LikePatternTooLong),
361 Err(RegexCompilationError::RegexError(regex::Error::CompiledTooBig(_))) => {
362 Err(EvalError::LikePatternTooLong)
363 }
364 Err(e) => Err(EvalError::Internal(
365 format!("build_regex produced invalid regex: {}", e).into(),
366 )),
367 }
368}
369
370// Unit Tests
371//
372// Most of the unit tests for LIKE and ILIKE can be found in:
373// test/sqllogictest/cockroach/like.slt
374// These tests are here as a convenient place to run quick tests while
375// actively working on changes to the implementation. Make sure you
376// run the full test suite before submitting any changes.
377
378#[cfg(test)]
379mod test {
380 use super::*;
381
382 #[mz_ore::test]
383 fn test_normalize_pattern() {
384 struct TestCase<'a> {
385 pattern: &'a str,
386 escape: EscapeBehavior,
387 expected: &'a str,
388 }
389 let test_cases = vec![
390 TestCase {
391 pattern: "",
392 escape: EscapeBehavior::Disabled,
393 expected: "",
394 },
395 TestCase {
396 pattern: "ban%na!",
397 escape: EscapeBehavior::default(),
398 expected: "ban%na!",
399 },
400 TestCase {
401 pattern: "ban%%%na!",
402 escape: EscapeBehavior::Char('%'),
403 expected: "ban\\%\\na!",
404 },
405 TestCase {
406 pattern: "ban%na\\!",
407 escape: EscapeBehavior::Char('n'),
408 expected: "ba\\%\\a\\\\!",
409 },
410 TestCase {
411 pattern: "ban%na\\!",
412 escape: EscapeBehavior::Disabled,
413 expected: "ban%na\\\\!",
414 },
415 TestCase {
416 pattern: "ban\\na!",
417 escape: EscapeBehavior::Char('n'),
418 expected: "ba\\\\\\a!",
419 },
420 TestCase {
421 pattern: "ban\\\\na!",
422 escape: EscapeBehavior::Char('n'),
423 expected: "ba\\\\\\\\\\a!",
424 },
425 TestCase {
426 pattern: "food",
427 escape: EscapeBehavior::Char('o'),
428 expected: "f\\od",
429 },
430 TestCase {
431 pattern: "漢漢",
432 escape: EscapeBehavior::Char('漢'),
433 expected: "\\漢",
434 },
435 ];
436
437 for input in test_cases {
438 let actual = normalize_pattern(input.pattern, input.escape).unwrap();
439 assert!(
440 actual == input.expected,
441 "normalize_pattern({:?}, {:?}):\n\tactual: {:?}\n\texpected: {:?}\n",
442 input.pattern,
443 input.escape,
444 actual,
445 input.expected,
446 );
447 }
448 }
449
450 #[mz_ore::test]
451 fn test_escape_too_long() {
452 match EscapeBehavior::from_str("foo") {
453 Err(EvalError::LikeEscapeTooLong) => {}
454 _ => {
455 panic!("expected error when using escape string with >1 character");
456 }
457 }
458 }
459
460 #[mz_ore::test]
461 fn test_like() {
462 struct Input<'a> {
463 haystack: &'a str,
464 matches: bool,
465 }
466 let input = |haystack, matches| Input { haystack, matches };
467 struct Pattern<'a> {
468 needle: &'a str,
469 case_insensitive: bool,
470 inputs: Vec<Input<'a>>,
471 }
472 let test_cases = vec![
473 Pattern {
474 needle: "ban%na!",
475 case_insensitive: false,
476 inputs: vec![input("banana!", true)],
477 },
478 Pattern {
479 needle: "foo",
480 case_insensitive: true,
481 inputs: vec![
482 input("", false),
483 input("f", false),
484 input("fo", false),
485 input("foo", true),
486 input("FOO", true),
487 input("Foo", true),
488 input("fOO", true),
489 input("food", false),
490 ],
491 },
492 ];
493
494 for tc in test_cases {
495 let matcher = compile(tc.needle, tc.case_insensitive).unwrap();
496 for input in tc.inputs {
497 let actual = matcher.is_match(input.haystack);
498 assert!(
499 actual == input.matches,
500 "{:?} {} {:?}:\n\tactual: {:?}\n\texpected: {:?}\n",
501 input.haystack,
502 match tc.case_insensitive {
503 true => "ILIKE",
504 false => "LIKE",
505 },
506 tc.needle,
507 actual,
508 input.matches,
509 );
510 }
511 }
512 }
513
514 // Patterns with two or more `%` wildcards now compile to the linear regex
515 // matcher instead of the back-tracking string matcher, which was
516 // super-linear: an adversarial pattern like `%a%a%a` against a long run of
517 // `a`s took time proportional to `len(text)^(number of %)`. Verify the
518 // routing change still yields correct match results (the regex matcher and
519 // the string matcher must agree), including the previously-pathological
520 // shape, which now completes instantly regardless of text length.
521 #[mz_ore::test]
522 fn test_many_wildcards_match_correctly() {
523 let long_a = "a".repeat(64);
524 let cases: &[(&str, &str, bool)] = &[
525 ("%a%a%a", "xaxaxa", true),
526 ("%a%a%a", "aaa", true),
527 ("%a%a%a", "aa", false),
528 ("%a%b%c%", "zzabqcz", true),
529 ("%a%b%c%", "cba", false),
530 ("a%b%c", "abc", true),
531 ("a%b%c", "axxbxxc", true),
532 ("a%b%c", "abcd", false),
533 // A single `%` still uses the (near-linear) string matcher.
534 ("%_%_%", "ab", true),
535 ("%%%%", "", true),
536 // The exact pathological shape from the fuzzer; super-linear before
537 // the fix, instant after.
538 ("%a%a%a%a%a", &long_a, true),
539 ("%a%a%a%a%a", "aaaa", false),
540 ];
541 for (pat, text, expected) in cases {
542 let m = compile(pat, false).unwrap();
543 assert_eq!(
544 m.is_match(text),
545 *expected,
546 "pattern {pat:?} against {text:?}"
547 );
548 }
549 }
550}