// regex_lite/pikevm.rs
1use alloc::{vec, vec::Vec};
2
3use crate::{
4 int::{NonMaxUsize, U32},
5 nfa::{State, StateID, NFA},
6 pool::CachePoolGuard,
7 utf8,
8};
9
/// A PikeVM searcher.
///
/// A PikeVM uses the standard Thompson NFA linear time search algorithm, but
/// augmented to support tracking the offsets of matching capture groups.
#[derive(Clone, Debug)]
pub(crate) struct PikeVM {
    /// The NFA this searcher simulates. All searches (and any caches built
    /// for them) are coupled to this particular NFA.
    nfa: NFA,
}
18
19impl PikeVM {
    /// Create a new PikeVM searcher that uses the given NFA.
    ///
    /// The searcher itself holds no mutable state; callers supply a `Cache`
    /// (e.g. via a pool guard) and a haystack at search time.
    pub(crate) fn new(nfa: NFA) -> PikeVM {
        PikeVM { nfa }
    }
24
    /// Return the underlying NFA used by this PikeVM.
    ///
    /// This is a cheap borrowing accessor; no work is done here.
    pub(crate) fn nfa(&self) -> &NFA {
        &self.nfa
    }
29
30 /// Returns an iterator of non-overlapping matches in the given haystack.
31 pub(crate) fn find_iter<'r, 'h>(
32 &'r self,
33 cache: CachePoolGuard<'r>,
34 haystack: &'h [u8],
35 ) -> FindMatches<'r, 'h> {
36 FindMatches {
37 pikevm: self,
38 cache,
39 haystack,
40 at: 0,
41 slots: vec![None, None],
42 last_match_end: None,
43 }
44 }
45
46 /// Returns an iterator of non-overlapping capture matches in the given
47 /// haystack.
48 pub(crate) fn captures_iter<'r, 'h>(
49 &'r self,
50 cache: CachePoolGuard<'r>,
51 haystack: &'h [u8],
52 ) -> CapturesMatches<'r, 'h> {
53 // OK because the NFA wouldn't have compiled if this could overflow.
54 let len = self.nfa().group_len().checked_mul(2).unwrap();
55 CapturesMatches {
56 it: FindMatches {
57 pikevm: self,
58 cache,
59 haystack,
60 at: 0,
61 slots: vec![None; len],
62 last_match_end: None,
63 },
64 }
65 }
66
    /// The implementation of standard leftmost search.
    ///
    /// Capturing group spans are written to `slots`, but only if requested.
    /// `slots` can be any length. Any slot in the NFA that is activated but
    /// which is out of bounds for the given `slots` is ignored.
    ///
    /// Returns true if and only if a match was found. When `earliest` is
    /// true, the search stops at the first position where a match is known,
    /// rather than extending it for leftmost-first semantics.
    pub(crate) fn search(
        &self,
        cache: &mut Cache,
        haystack: &[u8],
        start: usize,
        end: usize,
        earliest: bool,
        slots: &mut [Option<NonMaxUsize>],
    ) -> bool {
        cache.setup_search(slots.len());
        // An inverted span can never match anything.
        if start > end {
            return false;
        }
        // Why do we even care about this? Well, in our `slots` representation,
        // we use usize::MAX as a sentinel to indicate "no match." This isn't
        // problematic so long as our haystack doesn't have a maximal length.
        // Byte slices are guaranteed by Rust to have a length that fits into
        // isize, and so this assert should always pass. But we put it here to
        // make our assumption explicit.
        assert!(
            haystack.len() < core::usize::MAX,
            "byte slice lengths must be less than usize MAX",
        );

        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
        let start_id = self.nfa().start();
        let anchored = self.nfa().is_start_anchored();
        let mut matched = false;
        // Yes, our search doesn't end at `end`, but includes it. This is
        // necessary because matches are delayed by one byte. The delay is used
        // to handle look-behind assertions. In the case of the PikeVM, the
        // delay is implemented by not considering a match to exist until it
        // is visited in `nexts`. Technically, we know a match exists in the
        // previous iteration via `epsilon_closure`.
        let mut at = start;
        while at <= end {
            // If we have no states left to visit, then there are some cases
            // where we know we can quit early or even skip ahead.
            if curr.set.is_empty() {
                // We have a match so we can quit.
                if matched {
                    break;
                }
                // If we're running an anchored search and we've advanced
                // beyond the start position with no other states to try, then
                // we will never observe a match and thus can stop.
                if anchored && at > start {
                    break;
                }
            }
            // Instead of using a hypothetical unanchored start state in the
            // NFA (which doesn't exist, but we could add it), we actually
            // always use its anchored starting state. As a result, when doing
            // an unanchored search, we need to simulate our own '(?s:.)*?'
            // prefix, to permit a match to appear anywhere.
            //
            // Now, we don't *have* to do things this way. We could create
            // a proper unanchored start state in the NFA and do one
            // `epsilon_closure` call from that starting state before the main
            // loop here. And that is just as correct. However, it turns out to
            // be slower than our approach here because it slightly increases
            // the cost of processing each byte by requiring us to visit
            // more NFA states to deal with the additional NFA states in the
            // unanchored prefix. By simulating it explicitly here, we lower
            // those costs substantially. The cost is itself small, but it adds
            // up for large haystacks.
            //
            // In order to simulate the '(?s:.)*?' prefix---which is not
            // greedy---we are careful not to perform an epsilon closure on
            // the start state if we already have a match. Namely, if we
            // did otherwise, we would never reach a terminating condition
            // because there would always be additional states to process.
            if !matched {
                // Since we are adding to the 'curr' active states and since
                // this is for the start ID, we use a slots slice that is
                // guaranteed to have the right length but where every element
                // is absent. This is exactly what we want, because this
                // epsilon closure is responsible for simulating an unanchored
                // '(?s:.)*?' prefix. It is specifically outside of any
                // capturing groups, and thus, using slots that are always
                // absent is correct.
                //
                // Note though that we can't just use `&mut []` here, since
                // this epsilon closure may traverse through `Capture` states
                // transitions, and thus must be able to write offsets to the
                // slots given which are later copied to slot values in `curr`.
                let slots = next.slot_table.all_absent();
                self.epsilon_closure(
                    stack, slots, curr, haystack, at, start_id,
                );
            }
            let (ch, len) = utf8::decode_lossy(&haystack[at..]);
            if self.nexts(stack, curr, next, haystack, at, ch, len, slots) {
                matched = true;
            }
            // Unless the caller asked us to return early, we need to mush
            // on to see if we can extend our match. (But note that 'nexts'
            // will quit right after seeing a match, as is consistent with
            // leftmost-first match priority.) A decoded length of zero
            // presumably only occurs once `at` has reached the end of the
            // haystack -- TODO confirm against `utf8::decode_lossy`.
            if (earliest && matched) || len == 0 {
                break;
            }
            core::mem::swap(curr, next);
            next.set.clear();
            at += len;
        }
        matched
    }
180
181 /// Process the active states in 'curr' to find the states (written to
182 /// 'next') we should process for the next byte in the haystack.
183 ///
184 /// 'stack' is used to perform a depth first traversal of the NFA when
185 /// computing an epsilon closure.
186 ///
187 /// When a match is found, the slots for that match state (in 'curr') are
188 /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr'
189 /// stops (unless the PikeVM was configured with MatchKind::All semantics).
190 ///
191 /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
192 /// in `haystack`.
193 ///
194 /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
195 /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
196 /// where `at_ch` is the replacement codepoint that results from decoding
197 /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
198 fn nexts(
199 &self,
200 stack: &mut Vec<FollowEpsilon>,
201 curr: &mut ActiveStates,
202 next: &mut ActiveStates,
203 haystack: &[u8],
204 at: usize,
205 at_ch: char,
206 at_len: usize,
207 slots: &mut [Option<NonMaxUsize>],
208 ) -> bool {
209 let ActiveStates { ref set, ref mut slot_table } = *curr;
210 for sid in set.iter() {
211 if self.next(
212 stack, slot_table, next, haystack, at, at_ch, at_len, sid,
213 ) {
214 slots.copy_from_slice(slot_table.for_state(sid));
215 return true;
216 }
217 }
218 false
219 }
220
    /// Starting from `sid`, if the position `at` in the `haystack` has a
    /// transition defined out of `sid`, then add the state transitioned to and
    /// its epsilon closure to the `next` set of states to explore.
    ///
    /// `stack` is used by the epsilon closure computation to perform a depth
    /// first traversal of the NFA.
    ///
    /// `curr_slot_table` should be the table of slots for the current set of
    /// states being explored. If there is a transition out of `sid`, then
    /// sid's row in the slot table is used to perform the epsilon closure.
    ///
    /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
    /// in `haystack`. The caller provides it so that this routine doesn't
    /// need to re-decode it. (Since it's expected that this routine is called
    /// multiple times for each position.)
    ///
    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
    /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
    /// where `at_ch` is the replacement codepoint that results from decoding
    /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
    ///
    /// Returns true if and only if `sid` is a match state.
    fn next(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slot_table: &mut SlotTable,
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        at_ch: char,
        at_len: usize,
        sid: StateID,
    ) -> bool {
        match *self.nfa.state(sid) {
            // These are all epsilon (or dead) states: they define no byte
            // transition, so they contribute nothing to `next` here. They
            // are handled during epsilon closure instead.
            State::Fail
            | State::Goto { .. }
            | State::Splits { .. }
            | State::Capture { .. } => false,
            State::Char { target, ch } => {
                // `at_len == 0` means there is no character at `at` to
                // consume, so nothing can transition.
                if at_ch == ch && at_len > 0 {
                    let slots = curr_slot_table.for_state(sid);
                    // OK because `at_len` is always derived from the number
                    // of bytes read from `at` that make up `at_ch`. So this
                    // will never wrap.
                    let at = at.wrapping_add(at_len);
                    self.epsilon_closure(
                        stack, slots, next, haystack, at, target,
                    );
                }
                false
            }
            State::Ranges { target, ref ranges } => {
                // NOTE(review): the early `break` below assumes `ranges` is
                // sorted in increasing order by range start -- presumably
                // guaranteed by NFA construction; confirm there.
                for (start, end) in ranges.iter().copied() {
                    if start > at_ch {
                        break;
                    } else if start <= at_ch && at_ch <= end {
                        if at_len == 0 {
                            return false;
                        }
                        let slots = curr_slot_table.for_state(sid);
                        // OK because `at_len` is always derived from the
                        // number of bytes read from `at` that make up `at_ch`.
                        // So this will never wrap.
                        let at = at.wrapping_add(at_len);
                        self.epsilon_closure(
                            stack, slots, next, haystack, at, target,
                        );
                    }
                }
                false
            }
            State::Match => true,
        }
    }
293
294 /// Compute the epsilon closure of `sid`, writing the closure into `next`
295 /// while copying slot values from `curr_slots` into corresponding states
296 /// in `next`. `curr_slots` should be the slot values corresponding to
297 /// `sid`.
298 ///
299 /// The given `stack` is used to perform a depth first traversal of the
300 /// NFA by recursively following all epsilon transitions out of `sid`.
301 /// Conditional epsilon transitions are followed if and only if they are
302 /// satisfied for the position `at` in the `input` haystack.
303 ///
304 /// While this routine may write to `curr_slots`, once it returns, any
305 /// writes are undone and the original values (even if absent) are
306 /// restored.
307 fn epsilon_closure(
308 &self,
309 stack: &mut Vec<FollowEpsilon>,
310 curr_slots: &mut [Option<NonMaxUsize>],
311 next: &mut ActiveStates,
312 haystack: &[u8],
313 at: usize,
314 sid: StateID,
315 ) {
316 stack.push(FollowEpsilon::Explore(sid));
317 while let Some(frame) = stack.pop() {
318 match frame {
319 FollowEpsilon::RestoreCapture { slot, offset } => {
320 curr_slots[slot.as_usize()] = offset;
321 }
322 FollowEpsilon::Explore(sid) => {
323 self.epsilon_closure_explore(
324 stack, curr_slots, next, haystack, at, sid,
325 );
326 }
327 }
328 }
329 }
330
    /// Explore all of the epsilon transitions out of `sid`. This is mostly
    /// split out from `epsilon_closure` in order to clearly delineate
    /// the actual work of computing an epsilon closure from the stack
    /// book-keeping.
    ///
    /// This will push any additional explorations needed on to `stack`.
    ///
    /// `curr_slots` should refer to the slots for the currently active NFA
    /// state. That is, the current state we are stepping through. These
    /// slots are mutated in place as new `Captures` states are traversed
    /// during epsilon closure, but the slots are restored to their original
    /// values once the full epsilon closure is completed. The ultimate use of
    /// `curr_slots` is to copy them to the corresponding `next_slots`, so that
    /// the capturing group spans are forwarded from the currently active state
    /// to the next.
    ///
    /// `next` refers to the next set of active states. Computing an epsilon
    /// closure may increase the next set of active states.
    ///
    /// `haystack` refers to the what we're searching and `at` refers to the
    /// current position in the haystack. These are used to check whether
    /// conditional epsilon transitions (like look-around) are satisfied at
    /// the current position. If they aren't, then the epsilon closure won't
    /// include them.
    fn epsilon_closure_explore(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slots: &mut [Option<NonMaxUsize>],
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        mut sid: StateID,
    ) {
        // We can avoid pushing some state IDs on to our stack in precisely
        // the cases where a 'push(x)' would be immediately followed by a 'x
        // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
        // to be the next state ID we want to explore once we're done with
        // our initial exploration. In practice, this avoids a lot of stack
        // thrashing.
        loop {
            // Record this state as part of our next set of active states. If
            // we've already explored it, then no need to do it again.
            if !next.set.insert(sid) {
                return;
            }
            match *self.nfa.state(sid) {
                // Terminal states of the closure: no further epsilon
                // transitions. Forward this state's capturing group slots
                // from the state that sparked the closure.
                State::Fail
                | State::Match { .. }
                | State::Char { .. }
                | State::Ranges { .. } => {
                    next.slot_table.for_state(sid).copy_from_slice(curr_slots);
                    return;
                }
                // Unconditional epsilon transition: just continue the loop
                // with the target, no stack push needed.
                State::Goto { target, look: None } => {
                    sid = target;
                }
                // Conditional epsilon transition: only follow it if the
                // look-around assertion holds at the current position.
                State::Goto { target, look: Some(look) } => {
                    if !look.is_match(haystack, at) {
                        return;
                    }
                    sid = target;
                }
                State::Splits { ref targets, reverse: false } => {
                    // Explore the first target directly; push the rest in
                    // reverse so that the LIFO stack pops them back in their
                    // original (priority) order.
                    sid = match targets.get(0) {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[1..]
                            .iter()
                            .copied()
                            .rev()
                            .map(FollowEpsilon::Explore),
                    );
                }
                State::Splits { ref targets, reverse: true } => {
                    // Reversed priority: explore the last target directly and
                    // push the remainder in order, so they pop last-to-first.
                    sid = match targets.last() {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[..targets.len() - 1]
                            .iter()
                            .copied()
                            .map(FollowEpsilon::Explore),
                    );
                }
                State::Capture { target, slot } => {
                    // There's no need to do anything with slots that
                    // ultimately won't be copied into the caller-provided
                    // 'Captures' value. So we just skip dealing with them at
                    // all.
                    if slot.as_usize() < curr_slots.len() {
                        // Push a frame that will restore the old offset once
                        // everything "after" this capture has been explored.
                        stack.push(FollowEpsilon::RestoreCapture {
                            slot,
                            offset: curr_slots[slot.as_usize()],
                        });
                        // OK because length of a slice must fit into an isize.
                        curr_slots[slot.as_usize()] =
                            Some(NonMaxUsize::new(at).unwrap());
                    }
                    sid = target;
                }
            }
        }
    }
437}
438
/// An iterator over all successive non-overlapping matches in a particular
/// haystack. `'r` represents the lifetime of the regex (which also bounds the
/// cache pool guard) and `'h` represents the lifetime of the haystack.
#[derive(Debug)]
pub(crate) struct FindMatches<'r, 'h> {
    /// The searcher that executes each search.
    pikevm: &'r PikeVM,
    /// Mutable scratch state for the PikeVM, borrowed from a pool.
    cache: CachePoolGuard<'r>,
    /// The bytes being searched.
    haystack: &'h [u8],
    /// The position at which the next search starts.
    at: usize,
    /// Slot storage reused across searches; slots 0 and 1 hold the overall
    /// match span.
    slots: Vec<Option<NonMaxUsize>>,
    /// The end offset of the previous match, used to reject overlapping
    /// empty matches.
    last_match_end: Option<usize>,
}
451
452impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
453 type Item = (usize, usize);
454
455 fn next(&mut self) -> Option<(usize, usize)> {
456 if !self.pikevm.search(
457 &mut self.cache,
458 self.haystack,
459 self.at,
460 self.haystack.len(),
461 false,
462 &mut self.slots,
463 ) {
464 return None;
465 }
466 let mut m =
467 (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
468 if m.0 >= m.1 {
469 m = self.handle_overlapping_empty_match(m)?;
470 }
471 self.at = m.1;
472 self.last_match_end = Some(m.1);
473 Some(m)
474 }
475}
476
impl<'r, 'h> FindMatches<'r, 'h> {
    /// Handles the special case of an empty match by ensuring that 1) the
    /// iterator always advances and 2) empty matches never overlap with other
    /// matches.
    ///
    /// Note that we mark this cold and forcefully prevent inlining because
    /// handling empty matches like this is extremely rare and does require a
    /// bit of code, comparatively. Keeping this code out of the main iterator
    /// function keeps it smaller and more amenable to inlining itself.
    #[cold]
    #[inline(never)]
    fn handle_overlapping_empty_match(
        &mut self,
        mut m: (usize, usize),
    ) -> Option<(usize, usize)> {
        // Caller invariant: this is only reached for empty matches.
        assert!(m.0 >= m.1);
        // Only skip ahead when this empty match abuts the end of the
        // previous match; otherwise it's a legitimate non-overlapping match.
        if Some(m.1) == self.last_match_end {
            // Advance by at least one byte to guarantee progress. The second
            // component of `utf8::decode` is presumably the decoded length at
            // `self.at` -- TODO confirm against the `utf8` module; `max(1, _)`
            // protects against a zero-length decode at the end of haystack.
            let len =
                core::cmp::max(1, utf8::decode(&self.haystack[self.at..]).1);
            self.at = self.at.checked_add(len).unwrap();
            if !self.pikevm.search(
                &mut self.cache,
                self.haystack,
                self.at,
                self.haystack.len(),
                false,
                &mut self.slots,
            ) {
                return None;
            }
            // A successful search always fills the overall match slots.
            m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
        }
        Some(m)
    }
}
512
/// An iterator over all successive non-overlapping capture matches in a
/// particular haystack. `'r` represents the lifetime of the regex (which also
/// bounds the cache pool guard) and `'h` represents the lifetime of the
/// haystack.
#[derive(Debug)]
pub(crate) struct CapturesMatches<'r, 'h> {
    /// The underlying match iterator; its `slots` are sized for all
    /// capturing groups, not just the overall match.
    it: FindMatches<'r, 'h>,
}
520
521impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
522 type Item = Vec<Option<NonMaxUsize>>;
523
524 fn next(&mut self) -> Option<Vec<Option<NonMaxUsize>>> {
525 self.it.next()?;
526 Some(self.it.slots.clone())
527 }
528}
529
/// A cache represents mutable state that a `PikeVM` requires during a search.
///
/// NOTE(review): the paragraphs below mention `PikeVM::create_cache` and
/// `Cache::reset`, neither of which is visible in this file -- confirm they
/// exist elsewhere or update this doc.
///
/// For a given `PikeVM`, its corresponding cache may be created either via
/// `PikeVM::create_cache`, or via `Cache::new`. They are equivalent in every
/// way, except the former does not require explicitly importing `Cache`.
///
/// A particular `Cache` is coupled with the `PikeVM` from which it was
/// created. It may only be used with that `PikeVM`. A cache and its
/// allocations may be re-purposed via `Cache::reset`, in which case, it can
/// only be used with the new `PikeVM` (and not the old one).
#[derive(Clone, Debug)]
pub(crate) struct Cache {
    /// Stack used while computing epsilon closure. This effectively lets us
    /// move what is more naturally expressed through recursion to a stack
    /// on the heap.
    stack: Vec<FollowEpsilon>,
    /// The current active states being explored for the current byte in the
    /// haystack.
    curr: ActiveStates,
    /// The next set of states we're building that will be explored for the
    /// next byte in the haystack.
    next: ActiveStates,
}
553
554impl Cache {
555 /// Create a new `PikeVM` cache.
556 ///
557 /// A potentially more convenient routine to create a cache is
558 /// `PikeVM::create_cache`, as it does not require also importing the
559 /// `Cache` type.
560 ///
561 /// If you want to reuse the returned `Cache` with some other `PikeVM`,
562 /// then you must call `Cache::reset` with the desired `PikeVM`.
563 pub(crate) fn new(re: &PikeVM) -> Cache {
564 Cache {
565 stack: vec![],
566 curr: ActiveStates::new(re),
567 next: ActiveStates::new(re),
568 }
569 }
570
571 /// Clears this cache. This should be called at the start of every search
572 /// to ensure we start with a clean slate.
573 ///
574 /// This also sets the length of the capturing groups used in the current
575 /// search. This permits an optimization where by 'SlotTable::for_state'
576 /// only returns the number of slots equivalent to the number of slots
577 /// given in the 'Captures' value. This may be less than the total number
578 /// of possible slots, e.g., when one only wants to track overall match
579 /// offsets. This in turn permits less copying of capturing group spans
580 /// in the PikeVM.
581 fn setup_search(&mut self, captures_slot_len: usize) {
582 self.stack.clear();
583 self.curr.setup_search(captures_slot_len);
584 self.next.setup_search(captures_slot_len);
585 }
586}
587
/// A set of active states used to "simulate" the execution of an NFA via the
/// PikeVM.
///
/// There are two sets of these used during NFA simulation. One set corresponds
/// to the "current" set of states being traversed for the current position
/// in a haystack. The other set corresponds to the "next" set of states being
/// built, which will become the new "current" set for the next position in the
/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
/// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
///
/// In addition to representing a set of NFA states, this also maintains slot
/// values for each state. These slot values are what turn the NFA simulation
/// into the "Pike VM." Namely, they track capturing group values for each
/// state. During the computation of epsilon closure, we copy slot values from
/// states in the "current" set to the "next" set. Eventually, once a match
/// is found, the slot values for that match state are what we write to the
/// caller provided slots.
#[derive(Clone, Debug)]
struct ActiveStates {
    /// The set of active NFA states. This set preserves insertion order, which
    /// is critical for simulating the match semantics of backtracking regex
    /// engines.
    set: SparseSet,
    /// The slots for every NFA state, where each slot stores a (possibly
    /// absent) offset. Every capturing group has two slots. One for a start
    /// offset and one for an end offset.
    slot_table: SlotTable,
}
616
617impl ActiveStates {
618 /// Create a new set of active states for the given PikeVM. The active
619 /// states returned may only be used with the given PikeVM. (Use 'reset'
620 /// to re-purpose the allocation for a different PikeVM.)
621 fn new(re: &PikeVM) -> ActiveStates {
622 let mut active = ActiveStates {
623 set: SparseSet::new(0),
624 slot_table: SlotTable::new(),
625 };
626 active.reset(re);
627 active
628 }
629
630 /// Reset this set of active states such that it can be used with the given
631 /// PikeVM (and only that PikeVM).
632 fn reset(&mut self, re: &PikeVM) {
633 self.set.resize(re.nfa().len());
634 self.slot_table.reset(re);
635 }
636
637 /// Setup this set of active states for a new search. The given slot
638 /// length should be the number of slots in a caller provided 'Captures'
639 /// (and may be zero).
640 fn setup_search(&mut self, captures_slot_len: usize) {
641 self.set.clear();
642 self.slot_table.setup_search(captures_slot_len);
643 }
644}
645
/// A table of slots, where each row represent a state in an NFA. Thus, the
/// table has room for storing slots for every single state in an NFA.
///
/// This table is represented with a single contiguous allocation. In general,
/// the notion of "capturing group" doesn't really exist at this level of
/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
/// maps to a pair of slots, one for the start offset and one for the end
/// offset.) Slots are indexed by the `Captures` NFA state.
#[derive(Clone, Debug)]
struct SlotTable {
    /// The actual table of offsets, laid out row-major: one row of
    /// `slots_per_state` entries per NFA state (plus one scratch row).
    table: Vec<Option<NonMaxUsize>>,
    /// The number of slots per state, i.e., the table's stride or the length
    /// of each row.
    slots_per_state: usize,
    /// The number of slots in the caller-provided `Captures` value for the
    /// current search. Setting this to `slots_per_state` is always correct,
    /// but may be wasteful.
    slots_for_captures: usize,
}
666
667impl SlotTable {
668 /// Create a new slot table.
669 ///
670 /// One should call 'reset' with the corresponding PikeVM before use.
671 fn new() -> SlotTable {
672 SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
673 }
674
675 /// Reset this slot table such that it can be used with the given PikeVM
676 /// (and only that PikeVM).
677 fn reset(&mut self, re: &PikeVM) {
678 let nfa = re.nfa();
679 // OK because NFA construction would have failed if this overflowed.
680 self.slots_per_state = nfa.group_len().checked_mul(2).unwrap();
681 // This is always correct, but may be reduced for a particular search
682 // if fewer slots were given by the caller, e.g., none at all or only
683 // slots for tracking the overall match instead of all slots for every
684 // group.
685 self.slots_for_captures = self.slots_per_state;
686 let len = nfa
687 .len()
688 // We add 1 so that our last row is always empty. We use it as
689 // "scratch" space for computing the epsilon closure off of the
690 // starting state.
691 .checked_add(1)
692 .and_then(|x| x.checked_mul(self.slots_per_state))
693 // It seems like this could actually panic on legitimate inputs
694 // on 32-bit targets. Should we somehow convert this to an error?
695 // What about something similar for the lazy DFA cache? If you're
696 // tripping this assert, please file a bug.
697 .expect("slot table length doesn't overflow");
698 self.table.resize(len, None);
699 }
700
701 /// Perform any per-search setup for this slot table.
702 ///
703 /// In particular, this sets the length of the number of slots used in the
704 /// slots given by the caller (if any at all). This number may be smaller
705 /// than the total number of slots available, e.g., when the caller is only
706 /// interested in tracking the overall match and not the spans of every
707 /// matching capturing group. Only tracking the overall match can save a
708 /// substantial amount of time copying capturing spans during a search.
709 fn setup_search(&mut self, captures_slot_len: usize) {
710 self.slots_for_captures = captures_slot_len;
711 }
712
713 /// Return a mutable slice of the slots for the given state.
714 ///
715 /// Note that the length of the slice returned may be less than the total
716 /// number of slots available for this state. In particular, the length
717 /// always matches the number of slots indicated via `setup_search`.
718 fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
719 let i = sid.as_usize() * self.slots_per_state;
720 &mut self.table[i..i + self.slots_for_captures]
721 }
722
723 /// Return a slice of slots of appropriate length where every slot offset
724 /// is guaranteed to be absent. This is useful in cases where you need to
725 /// compute an epsilon closure outside of the user supplied regex, and thus
726 /// never want it to have any capturing slots set.
727 fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
728 let i = self.table.len() - self.slots_per_state;
729 &mut self.table[i..i + self.slots_for_captures]
730 }
731}
732
/// Represents a stack frame for use while computing an epsilon closure.
///
/// (An "epsilon closure" refers to the set of reachable NFA states from a
/// single state without consuming any input. That is, the set of all epsilon
/// transitions not only from that single state, but from every other state
/// reachable by an epsilon transition as well. This is why it's called a
/// "closure.")
///
/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
/// first traversal over all epsilon transitions from a particular state.
/// (A depth first traversal is important because it emulates the same priority
/// of matches that is typically found in backtracking regex engines.) This
/// depth first traversal is naturally expressed using recursion, but to avoid
/// a call stack size proportional to the size of a regex, we put our stack on
/// the heap instead.
///
/// This stack thus consists of call frames. The typical call frame is
/// `Explore`, which instructs epsilon closure to explore the epsilon
/// transitions from that state. (Subsequent epsilon transitions are then
/// pushed on to the stack as more `Explore` frames.) If the state ID being
/// explored has no epsilon transitions, then the capturing group slots are
/// copied from the original state that sparked the epsilon closure (from the
/// 'nexts' routine) to the state ID being explored. This way, capturing group
/// slots are forwarded from the previous state to the next.
///
/// The other stack frame, `RestoreCapture`, instructs the epsilon closure to
/// set the position for a particular slot back to some particular offset. This
/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
/// set the offset of the slot indicated in `Capture` to the current offset,
/// and then push the old offset on to the stack as a `RestoreCapture` frame.
/// Thus, the new offset is only used until the epsilon closure reverts back to
/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
/// transition its "scope" to only states that come "after" it during depth
/// first traversal.
#[derive(Clone, Debug)]
enum FollowEpsilon {
    /// Explore the epsilon transitions from a state ID.
    Explore(StateID),
    /// Reset the given `slot` to the given `offset` (which might be `None`).
    RestoreCapture { slot: u32, offset: Option<NonMaxUsize> },
}
774
/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: https://research.swtch.com/sparse
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However, its
/// other properties listed above are extremely useful.
#[derive(Clone)]
struct SparseSet {
    /// The number of elements currently in this set.
    len: usize,
    /// Dense contains the ids in the order in which they were inserted.
    dense: Vec<StateID>,
    /// Sparse maps ids to their location in dense.
    ///
    /// A state ID is in the set if and only if
    /// sparse[id] < len && id == dense[sparse[id]].
    ///
    /// Note that these are indices into 'dense'. It's a little weird to use
    /// StateID here, but we know our length can never exceed the bounds of
    /// StateID (enforced by 'resize') and StateID will be at most 4 bytes
    /// where as a usize is likely double that in most cases.
    sparse: Vec<StateID>,
}
802
803impl SparseSet {
804 /// Create a new sparse set with the given capacity.
805 ///
806 /// Sparse sets have a fixed size and they cannot grow. Attempting to
807 /// insert more distinct elements than the total capacity of the set will
808 /// result in a panic.
809 ///
810 /// This panics if the capacity given is bigger than `StateID::LIMIT`.
811 fn new(capacity: usize) -> SparseSet {
812 let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
813 set.resize(capacity);
814 set
815 }
816
817 /// Resizes this sparse set to have the new capacity given.
818 ///
819 /// This set is automatically cleared.
820 ///
821 /// This panics if the capacity given is bigger than `StateID::LIMIT`.
822 fn resize(&mut self, new_capacity: usize) {
823 assert!(
824 new_capacity <= u32::MAX.as_usize(),
825 "sparse set capacity cannot excced {:?}",
826 u32::MAX,
827 );
828 self.clear();
829 self.dense.resize(new_capacity, 0);
830 self.sparse.resize(new_capacity, 0);
831 }
832
833 /// Returns the capacity of this set.
834 ///
835 /// The capacity represents a fixed limit on the number of distinct
836 /// elements that are allowed in this set. The capacity cannot be changed.
837 fn capacity(&self) -> usize {
838 self.dense.len()
839 }
840
841 /// Returns the number of elements in this set.
842 fn len(&self) -> usize {
843 self.len
844 }
845
846 /// Returns true if and only if this set is empty.
847 fn is_empty(&self) -> bool {
848 self.len() == 0
849 }
850
851 /// Insert the state ID value into this set and return true if the given
852 /// state ID was not previously in this set.
853 ///
854 /// This operation is idempotent. If the given value is already in this
855 /// set, then this is a no-op.
856 ///
857 /// If more than `capacity` ids are inserted, then this panics.
858 fn insert(&mut self, id: StateID) -> bool {
859 if self.contains(id) {
860 return false;
861 }
862
863 let index = self.len();
864 assert!(
865 index < self.capacity(),
866 "{:?} exceeds capacity of {:?} when inserting {:?}",
867 index,
868 self.capacity(),
869 id,
870 );
871 self.dense[index] = id;
872 // OK because we don't permit the capacity to be set higher than
873 // u32::MAX.
874 self.sparse[id.as_usize()] = u32::try_from(index).unwrap();
875 self.len += 1;
876 true
877 }
878
879 /// Returns true if and only if this set contains the given value.
880 fn contains(&self, id: StateID) -> bool {
881 let index = self.sparse[id.as_usize()];
882 index.as_usize() < self.len() && self.dense[index.as_usize()] == id
883 }
884
885 /// Clear this set such that it has no members.
886 fn clear(&mut self) {
887 self.len = 0;
888 }
889
890 /// Returns an iterator over all the state IDs in this set in the order in
891 /// which they were inserted.
892 fn iter(&self) -> SparseSetIter<'_> {
893 SparseSetIter(self.dense[..self.len()].iter())
894 }
895}
896
897impl core::fmt::Debug for SparseSet {
898 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
899 let elements: Vec<StateID> = self.iter().collect();
900 f.debug_tuple("SparseSet").field(&elements).finish()
901 }
902}
903
/// An iterator over all elements in a sparse set, in insertion order.
///
/// The lifetime `'a` refers to the lifetime of the set being iterated over.
#[derive(Debug)]
struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
909
910impl<'a> Iterator for SparseSetIter<'a> {
911 type Item = StateID;
912
913 fn next(&mut self) -> Option<StateID> {
914 self.0.next().map(|&id| id)
915 }
916}