regex_automata/dfa/
search.rs

1use crate::{
2    dfa::{
3        accel,
4        automaton::{Automaton, OverlappingState},
5    },
6    util::{
7        prefilter::Prefilter,
8        primitives::StateID,
9        search::{Anchored, HalfMatch, Input, Span},
10    },
11    MatchError,
12};
13
14#[inline(never)]
15pub fn find_fwd<A: Automaton + ?Sized>(
16    dfa: &A,
17    input: &Input<'_>,
18) -> Result<Option<HalfMatch>, MatchError> {
19    if input.is_done() {
20        return Ok(None);
21    }
22    let pre = if input.get_anchored().is_anchored() {
23        None
24    } else {
25        dfa.get_prefilter()
26    };
27    // Searching with a pattern ID is always anchored, so we should never use
28    // a prefilter.
29    if pre.is_some() {
30        if input.get_earliest() {
31            find_fwd_imp(dfa, input, pre, true)
32        } else {
33            find_fwd_imp(dfa, input, pre, false)
34        }
35    } else {
36        if input.get_earliest() {
37            find_fwd_imp(dfa, input, None, true)
38        } else {
39            find_fwd_imp(dfa, input, None, false)
40        }
41    }
42}
43
44#[cfg_attr(feature = "perf-inline", inline(always))]
45fn find_fwd_imp<A: Automaton + ?Sized>(
46    dfa: &A,
47    input: &Input<'_>,
48    pre: Option<&'_ Prefilter>,
49    earliest: bool,
50) -> Result<Option<HalfMatch>, MatchError> {
51    // See 'prefilter_restart' docs for explanation.
52    let universal_start = dfa.universal_start_state(Anchored::No).is_some();
53    let mut mat = None;
54    let mut sid = init_fwd(dfa, input)?;
55    let mut at = input.start();
56    // This could just be a closure, but then I think it would be unsound
57    // because it would need to be safe to invoke. This way, the lack of safety
58    // is clearer in the code below.
59    macro_rules! next_unchecked {
60        ($sid:expr, $at:expr) => {{
61            let byte = *input.haystack().get_unchecked($at);
62            dfa.next_state_unchecked($sid, byte)
63        }};
64    }
65
66    if let Some(ref pre) = pre {
67        let span = Span::from(at..input.end());
68        // If a prefilter doesn't report false positives, then we don't need to
69        // touch the DFA at all. However, since all matches include the pattern
70        // ID, and the prefilter infrastructure doesn't report pattern IDs, we
71        // limit this optimization to cases where there is exactly one pattern.
72        // In that case, any match must be the 0th pattern.
73        match pre.find(input.haystack(), span) {
74            None => return Ok(mat),
75            Some(ref span) => {
76                at = span.start;
77                if !universal_start {
78                    sid = prefilter_restart(dfa, &input, at)?;
79                }
80            }
81        }
82    }
83    while at < input.end() {
84        // SAFETY: There are two safety invariants we need to uphold here in
85        // the loops below: that 'sid' and 'prev_sid' are valid state IDs
86        // for this DFA, and that 'at' is a valid index into 'haystack'.
87        // For the former, we rely on the invariant that next_state* and
88        // start_state_forward always returns a valid state ID (given a valid
89        // state ID in the former case). For the latter safety invariant, we
90        // always guard unchecked access with a check that 'at' is less than
91        // 'end', where 'end <= haystack.len()'. In the unrolled loop below, we
92        // ensure that 'at' is always in bounds.
93        //
94        // PERF: See a similar comment in src/hybrid/search.rs that justifies
95        // this extra work to make the search loop fast. The same reasoning and
96        // benchmarks apply here.
97        let mut prev_sid;
98        while at < input.end() {
99            prev_sid = unsafe { next_unchecked!(sid, at) };
100            if dfa.is_special_state(prev_sid) || at + 3 >= input.end() {
101                core::mem::swap(&mut prev_sid, &mut sid);
102                break;
103            }
104            at += 1;
105
106            sid = unsafe { next_unchecked!(prev_sid, at) };
107            if dfa.is_special_state(sid) {
108                break;
109            }
110            at += 1;
111
112            prev_sid = unsafe { next_unchecked!(sid, at) };
113            if dfa.is_special_state(prev_sid) {
114                core::mem::swap(&mut prev_sid, &mut sid);
115                break;
116            }
117            at += 1;
118
119            sid = unsafe { next_unchecked!(prev_sid, at) };
120            if dfa.is_special_state(sid) {
121                break;
122            }
123            at += 1;
124        }
125        if dfa.is_special_state(sid) {
126            if dfa.is_start_state(sid) {
127                if let Some(ref pre) = pre {
128                    let span = Span::from(at..input.end());
129                    match pre.find(input.haystack(), span) {
130                        None => return Ok(mat),
131                        Some(ref span) => {
132                            // We want to skip any update to 'at' below
133                            // at the end of this iteration and just
134                            // jump immediately back to the next state
135                            // transition at the leading position of the
136                            // candidate match.
137                            //
138                            // ... but only if we actually made progress
139                            // with our prefilter, otherwise if the start
140                            // state has a self-loop, we can get stuck.
141                            if span.start > at {
142                                at = span.start;
143                                if !universal_start {
144                                    sid = prefilter_restart(dfa, &input, at)?;
145                                }
146                                continue;
147                            }
148                        }
149                    }
150                } else if dfa.is_accel_state(sid) {
151                    let needles = dfa.accelerator(sid);
152                    at = accel::find_fwd(needles, input.haystack(), at + 1)
153                        .unwrap_or(input.end());
154                    continue;
155                }
156            } else if dfa.is_match_state(sid) {
157                let pattern = dfa.match_pattern(sid, 0);
158                mat = Some(HalfMatch::new(pattern, at));
159                if earliest {
160                    return Ok(mat);
161                }
162                if dfa.is_accel_state(sid) {
163                    let needles = dfa.accelerator(sid);
164                    at = accel::find_fwd(needles, input.haystack(), at + 1)
165                        .unwrap_or(input.end());
166                    continue;
167                }
168            } else if dfa.is_accel_state(sid) {
169                let needs = dfa.accelerator(sid);
170                at = accel::find_fwd(needs, input.haystack(), at + 1)
171                    .unwrap_or(input.end());
172                continue;
173            } else if dfa.is_dead_state(sid) {
174                return Ok(mat);
175            } else {
176                // It's important that this is a debug_assert, since this can
177                // actually be tripped even if DFA::from_bytes succeeds and
178                // returns a supposedly valid DFA.
179                return Err(MatchError::quit(input.haystack()[at], at));
180            }
181        }
182        at += 1;
183    }
184    eoi_fwd(dfa, input, &mut sid, &mut mat)?;
185    Ok(mat)
186}
187
188#[inline(never)]
189pub fn find_rev<A: Automaton + ?Sized>(
190    dfa: &A,
191    input: &Input<'_>,
192) -> Result<Option<HalfMatch>, MatchError> {
193    if input.is_done() {
194        return Ok(None);
195    }
196    if input.get_earliest() {
197        find_rev_imp(dfa, input, true)
198    } else {
199        find_rev_imp(dfa, input, false)
200    }
201}
202
203#[cfg_attr(feature = "perf-inline", inline(always))]
204fn find_rev_imp<A: Automaton + ?Sized>(
205    dfa: &A,
206    input: &Input<'_>,
207    earliest: bool,
208) -> Result<Option<HalfMatch>, MatchError> {
209    let mut mat = None;
210    let mut sid = init_rev(dfa, input)?;
211    // In reverse search, the loop below can't handle the case of searching an
212    // empty slice. Ideally we could write something congruent to the forward
213    // search, i.e., 'while at >= start', but 'start' might be 0. Since we use
214    // an unsigned offset, 'at >= 0' is trivially always true. We could avoid
215    // this extra case handling by using a signed offset, but Rust makes it
216    // annoying to do. So... We just handle the empty case separately.
217    if input.start() == input.end() {
218        eoi_rev(dfa, input, &mut sid, &mut mat)?;
219        return Ok(mat);
220    }
221
222    let mut at = input.end() - 1;
223    macro_rules! next_unchecked {
224        ($sid:expr, $at:expr) => {{
225            let byte = *input.haystack().get_unchecked($at);
226            dfa.next_state_unchecked($sid, byte)
227        }};
228    }
229    loop {
230        // SAFETY: See comments in 'find_fwd' for a safety argument.
231        let mut prev_sid;
232        while at >= input.start() {
233            prev_sid = unsafe { next_unchecked!(sid, at) };
234            if dfa.is_special_state(prev_sid)
235                || at <= input.start().saturating_add(3)
236            {
237                core::mem::swap(&mut prev_sid, &mut sid);
238                break;
239            }
240            at -= 1;
241
242            sid = unsafe { next_unchecked!(prev_sid, at) };
243            if dfa.is_special_state(sid) {
244                break;
245            }
246            at -= 1;
247
248            prev_sid = unsafe { next_unchecked!(sid, at) };
249            if dfa.is_special_state(prev_sid) {
250                core::mem::swap(&mut prev_sid, &mut sid);
251                break;
252            }
253            at -= 1;
254
255            sid = unsafe { next_unchecked!(prev_sid, at) };
256            if dfa.is_special_state(sid) {
257                break;
258            }
259            at -= 1;
260        }
261        if dfa.is_special_state(sid) {
262            if dfa.is_start_state(sid) {
263                if dfa.is_accel_state(sid) {
264                    let needles = dfa.accelerator(sid);
265                    at = accel::find_rev(needles, input.haystack(), at)
266                        .map(|i| i + 1)
267                        .unwrap_or(input.start());
268                }
269            } else if dfa.is_match_state(sid) {
270                let pattern = dfa.match_pattern(sid, 0);
271                // Since reverse searches report the beginning of a match
272                // and the beginning is inclusive (not exclusive like the
273                // end of a match), we add 1 to make it inclusive.
274                mat = Some(HalfMatch::new(pattern, at + 1));
275                if earliest {
276                    return Ok(mat);
277                }
278                if dfa.is_accel_state(sid) {
279                    let needles = dfa.accelerator(sid);
280                    at = accel::find_rev(needles, input.haystack(), at)
281                        .map(|i| i + 1)
282                        .unwrap_or(input.start());
283                }
284            } else if dfa.is_accel_state(sid) {
285                let needles = dfa.accelerator(sid);
286                // If the accelerator returns nothing, why don't we quit the
287                // search? Well, if the accelerator doesn't find anything, that
288                // doesn't mean we don't have a match. It just means that we
289                // can't leave the current state given one of the 255 possible
290                // byte values. However, there might be an EOI transition. So
291                // we set 'at' to the end of the haystack, which will cause
292                // this loop to stop and fall down into the EOI transition.
293                at = accel::find_rev(needles, input.haystack(), at)
294                    .map(|i| i + 1)
295                    .unwrap_or(input.start());
296            } else if dfa.is_dead_state(sid) {
297                return Ok(mat);
298            } else {
299                return Err(MatchError::quit(input.haystack()[at], at));
300            }
301        }
302        if at == input.start() {
303            break;
304        }
305        at -= 1;
306    }
307    eoi_rev(dfa, input, &mut sid, &mut mat)?;
308    Ok(mat)
309}
310
311#[inline(never)]
312pub fn find_overlapping_fwd<A: Automaton + ?Sized>(
313    dfa: &A,
314    input: &Input<'_>,
315    state: &mut OverlappingState,
316) -> Result<(), MatchError> {
317    state.mat = None;
318    if input.is_done() {
319        return Ok(());
320    }
321    let pre = if input.get_anchored().is_anchored() {
322        None
323    } else {
324        dfa.get_prefilter()
325    };
326    if pre.is_some() {
327        find_overlapping_fwd_imp(dfa, input, pre, state)
328    } else {
329        find_overlapping_fwd_imp(dfa, input, None, state)
330    }
331}
332
333#[cfg_attr(feature = "perf-inline", inline(always))]
334fn find_overlapping_fwd_imp<A: Automaton + ?Sized>(
335    dfa: &A,
336    input: &Input<'_>,
337    pre: Option<&'_ Prefilter>,
338    state: &mut OverlappingState,
339) -> Result<(), MatchError> {
340    // See 'prefilter_restart' docs for explanation.
341    let universal_start = dfa.universal_start_state(Anchored::No).is_some();
342    let mut sid = match state.id {
343        None => {
344            state.at = input.start();
345            init_fwd(dfa, input)?
346        }
347        Some(sid) => {
348            if let Some(match_index) = state.next_match_index {
349                let match_len = dfa.match_len(sid);
350                if match_index < match_len {
351                    state.next_match_index = Some(match_index + 1);
352                    let pattern = dfa.match_pattern(sid, match_index);
353                    state.mat = Some(HalfMatch::new(pattern, state.at));
354                    return Ok(());
355                }
356            }
357            // Once we've reported all matches at a given position, we need to
358            // advance the search to the next position.
359            state.at += 1;
360            if state.at > input.end() {
361                return Ok(());
362            }
363            sid
364        }
365    };
366
367    // NOTE: We don't optimize the crap out of this routine primarily because
368    // it seems like most find_overlapping searches will have higher match
369    // counts, and thus, throughput is perhaps not as important. But if you
370    // have a use case for something faster, feel free to file an issue.
371    while state.at < input.end() {
372        sid = dfa.next_state(sid, input.haystack()[state.at]);
373        if dfa.is_special_state(sid) {
374            state.id = Some(sid);
375            if dfa.is_start_state(sid) {
376                if let Some(ref pre) = pre {
377                    let span = Span::from(state.at..input.end());
378                    match pre.find(input.haystack(), span) {
379                        None => return Ok(()),
380                        Some(ref span) => {
381                            if span.start > state.at {
382                                state.at = span.start;
383                                if !universal_start {
384                                    sid = prefilter_restart(
385                                        dfa, &input, state.at,
386                                    )?;
387                                }
388                                continue;
389                            }
390                        }
391                    }
392                } else if dfa.is_accel_state(sid) {
393                    let needles = dfa.accelerator(sid);
394                    state.at = accel::find_fwd(
395                        needles,
396                        input.haystack(),
397                        state.at + 1,
398                    )
399                    .unwrap_or(input.end());
400                    continue;
401                }
402            } else if dfa.is_match_state(sid) {
403                state.next_match_index = Some(1);
404                let pattern = dfa.match_pattern(sid, 0);
405                state.mat = Some(HalfMatch::new(pattern, state.at));
406                return Ok(());
407            } else if dfa.is_accel_state(sid) {
408                let needs = dfa.accelerator(sid);
409                // If the accelerator returns nothing, why don't we quit the
410                // search? Well, if the accelerator doesn't find anything, that
411                // doesn't mean we don't have a match. It just means that we
412                // can't leave the current state given one of the 255 possible
413                // byte values. However, there might be an EOI transition. So
414                // we set 'at' to the end of the haystack, which will cause
415                // this loop to stop and fall down into the EOI transition.
416                state.at =
417                    accel::find_fwd(needs, input.haystack(), state.at + 1)
418                        .unwrap_or(input.end());
419                continue;
420            } else if dfa.is_dead_state(sid) {
421                return Ok(());
422            } else {
423                return Err(MatchError::quit(
424                    input.haystack()[state.at],
425                    state.at,
426                ));
427            }
428        }
429        state.at += 1;
430    }
431
432    let result = eoi_fwd(dfa, input, &mut sid, &mut state.mat);
433    state.id = Some(sid);
434    if state.mat.is_some() {
435        // '1' is always correct here since if we get to this point, this
436        // always corresponds to the first (index '0') match discovered at
437        // this position. So the next match to report at this position (if
438        // it exists) is at index '1'.
439        state.next_match_index = Some(1);
440    }
441    result
442}
443
444#[inline(never)]
445pub(crate) fn find_overlapping_rev<A: Automaton + ?Sized>(
446    dfa: &A,
447    input: &Input<'_>,
448    state: &mut OverlappingState,
449) -> Result<(), MatchError> {
450    state.mat = None;
451    if input.is_done() {
452        return Ok(());
453    }
454    let mut sid = match state.id {
455        None => {
456            let sid = init_rev(dfa, input)?;
457            state.id = Some(sid);
458            if input.start() == input.end() {
459                state.rev_eoi = true;
460            } else {
461                state.at = input.end() - 1;
462            }
463            sid
464        }
465        Some(sid) => {
466            if let Some(match_index) = state.next_match_index {
467                let match_len = dfa.match_len(sid);
468                if match_index < match_len {
469                    state.next_match_index = Some(match_index + 1);
470                    let pattern = dfa.match_pattern(sid, match_index);
471                    state.mat = Some(HalfMatch::new(pattern, state.at));
472                    return Ok(());
473                }
474            }
475            // Once we've reported all matches at a given position, we need
476            // to advance the search to the next position. However, if we've
477            // already followed the EOI transition, then we know we're done
478            // with the search and there cannot be any more matches to report.
479            if state.rev_eoi {
480                return Ok(());
481            } else if state.at == input.start() {
482                // At this point, we should follow the EOI transition. This
483                // will cause us the skip the main loop below and fall through
484                // to the final 'eoi_rev' transition.
485                state.rev_eoi = true;
486            } else {
487                // We haven't hit the end of the search yet, so move on.
488                state.at -= 1;
489            }
490            sid
491        }
492    };
493    while !state.rev_eoi {
494        sid = dfa.next_state(sid, input.haystack()[state.at]);
495        if dfa.is_special_state(sid) {
496            state.id = Some(sid);
497            if dfa.is_start_state(sid) {
498                if dfa.is_accel_state(sid) {
499                    let needles = dfa.accelerator(sid);
500                    state.at =
501                        accel::find_rev(needles, input.haystack(), state.at)
502                            .map(|i| i + 1)
503                            .unwrap_or(input.start());
504                }
505            } else if dfa.is_match_state(sid) {
506                state.next_match_index = Some(1);
507                let pattern = dfa.match_pattern(sid, 0);
508                state.mat = Some(HalfMatch::new(pattern, state.at + 1));
509                return Ok(());
510            } else if dfa.is_accel_state(sid) {
511                let needles = dfa.accelerator(sid);
512                // If the accelerator returns nothing, why don't we quit the
513                // search? Well, if the accelerator doesn't find anything, that
514                // doesn't mean we don't have a match. It just means that we
515                // can't leave the current state given one of the 255 possible
516                // byte values. However, there might be an EOI transition. So
517                // we set 'at' to the end of the haystack, which will cause
518                // this loop to stop and fall down into the EOI transition.
519                state.at =
520                    accel::find_rev(needles, input.haystack(), state.at)
521                        .map(|i| i + 1)
522                        .unwrap_or(input.start());
523            } else if dfa.is_dead_state(sid) {
524                return Ok(());
525            } else {
526                return Err(MatchError::quit(
527                    input.haystack()[state.at],
528                    state.at,
529                ));
530            }
531        }
532        if state.at == input.start() {
533            break;
534        }
535        state.at -= 1;
536    }
537
538    let result = eoi_rev(dfa, input, &mut sid, &mut state.mat);
539    state.rev_eoi = true;
540    state.id = Some(sid);
541    if state.mat.is_some() {
542        // '1' is always correct here since if we get to this point, this
543        // always corresponds to the first (index '0') match discovered at
544        // this position. So the next match to report at this position (if
545        // it exists) is at index '1'.
546        state.next_match_index = Some(1);
547    }
548    result
549}
550
551#[cfg_attr(feature = "perf-inline", inline(always))]
552fn init_fwd<A: Automaton + ?Sized>(
553    dfa: &A,
554    input: &Input<'_>,
555) -> Result<StateID, MatchError> {
556    let sid = dfa.start_state_forward(input)?;
557    // Start states can never be match states, since all matches are delayed
558    // by 1 byte.
559    debug_assert!(!dfa.is_match_state(sid));
560    Ok(sid)
561}
562
563#[cfg_attr(feature = "perf-inline", inline(always))]
564fn init_rev<A: Automaton + ?Sized>(
565    dfa: &A,
566    input: &Input<'_>,
567) -> Result<StateID, MatchError> {
568    let sid = dfa.start_state_reverse(input)?;
569    // Start states can never be match states, since all matches are delayed
570    // by 1 byte.
571    debug_assert!(!dfa.is_match_state(sid));
572    Ok(sid)
573}
574
575#[cfg_attr(feature = "perf-inline", inline(always))]
576fn eoi_fwd<A: Automaton + ?Sized>(
577    dfa: &A,
578    input: &Input<'_>,
579    sid: &mut StateID,
580    mat: &mut Option<HalfMatch>,
581) -> Result<(), MatchError> {
582    let sp = input.get_span();
583    match input.haystack().get(sp.end) {
584        Some(&b) => {
585            *sid = dfa.next_state(*sid, b);
586            if dfa.is_match_state(*sid) {
587                let pattern = dfa.match_pattern(*sid, 0);
588                *mat = Some(HalfMatch::new(pattern, sp.end));
589            } else if dfa.is_quit_state(*sid) {
590                return Err(MatchError::quit(b, sp.end));
591            }
592        }
593        None => {
594            *sid = dfa.next_eoi_state(*sid);
595            if dfa.is_match_state(*sid) {
596                let pattern = dfa.match_pattern(*sid, 0);
597                *mat = Some(HalfMatch::new(pattern, input.haystack().len()));
598            }
599        }
600    }
601    Ok(())
602}
603
604#[cfg_attr(feature = "perf-inline", inline(always))]
605fn eoi_rev<A: Automaton + ?Sized>(
606    dfa: &A,
607    input: &Input<'_>,
608    sid: &mut StateID,
609    mat: &mut Option<HalfMatch>,
610) -> Result<(), MatchError> {
611    let sp = input.get_span();
612    if sp.start > 0 {
613        let byte = input.haystack()[sp.start - 1];
614        *sid = dfa.next_state(*sid, byte);
615        if dfa.is_match_state(*sid) {
616            let pattern = dfa.match_pattern(*sid, 0);
617            *mat = Some(HalfMatch::new(pattern, sp.start));
618        } else if dfa.is_quit_state(*sid) {
619            return Err(MatchError::quit(byte, sp.start - 1));
620        }
621    } else {
622        *sid = dfa.next_eoi_state(*sid);
623        if dfa.is_match_state(*sid) {
624            let pattern = dfa.match_pattern(*sid, 0);
625            *mat = Some(HalfMatch::new(pattern, 0));
626        }
627    }
628    Ok(())
629}
630
631/// Re-compute the starting state that a DFA should be in after finding a
632/// prefilter candidate match at the position `at`.
633///
634/// The function with the same name has a bit more docs in hybrid/search.rs.
635#[cfg_attr(feature = "perf-inline", inline(always))]
636fn prefilter_restart<A: Automaton + ?Sized>(
637    dfa: &A,
638    input: &Input<'_>,
639    at: usize,
640) -> Result<StateID, MatchError> {
641    let mut input = input.clone();
642    input.set_start(at);
643    init_fwd(dfa, &input)
644}