// regex_lite/pikevm.rs
1use alloc::{vec, vec::Vec};
2
3use crate::{
4 int::{NonMaxUsize, U32},
5 nfa::{State, StateID, NFA},
6 pool::CachePoolGuard,
7 utf8,
8};
9
/// A PikeVM searcher.
///
/// A PikeVM uses the standard Thompson NFA linear time search algorithm, but
/// augmented to support tracking the offsets of matching capture groups.
#[derive(Clone, Debug)]
pub(crate) struct PikeVM {
    /// The NFA this searcher simulates. All searches (and any caches built
    /// for them) are coupled to this particular NFA.
    nfa: NFA,
}
18
19impl PikeVM {
    /// Create a new PikeVM searcher that uses the given NFA.
    ///
    /// The searcher itself holds no mutable state; callers supply a `Cache`
    /// (e.g. via a pool guard) and a haystack at search time.
    pub(crate) fn new(nfa: NFA) -> PikeVM {
        PikeVM { nfa }
    }
24
    /// Return the underlying NFA used by this PikeVM.
    ///
    /// This is a cheap borrowing accessor; no work is done here.
    pub(crate) fn nfa(&self) -> &NFA {
        &self.nfa
    }
29
30 /// Returns an iterator of non-overlapping matches in the given haystack.
31 pub(crate) fn find_iter<'r, 'h>(
32 &'r self,
33 cache: CachePoolGuard<'r>,
34 haystack: &'h [u8],
35 ) -> FindMatches<'r, 'h> {
36 FindMatches {
37 pikevm: self,
38 cache,
39 haystack,
40 at: 0,
41 slots: vec![None, None],
42 last_match_end: None,
43 }
44 }
45
46 /// Returns an iterator of non-overlapping capture matches in the given
47 /// haystack.
48 pub(crate) fn captures_iter<'r, 'h>(
49 &'r self,
50 cache: CachePoolGuard<'r>,
51 haystack: &'h [u8],
52 ) -> CapturesMatches<'r, 'h> {
53 // OK because the NFA wouldn't have compiled if this could overflow.
54 let len = self.nfa().group_len().checked_mul(2).unwrap();
55 CapturesMatches {
56 it: FindMatches {
57 pikevm: self,
58 cache,
59 haystack,
60 at: 0,
61 slots: vec![None; len],
62 last_match_end: None,
63 },
64 }
65 }
66
    /// The implementation of standard leftmost search.
    ///
    /// Capturing group spans are written to `slots`, but only if requested.
    /// `slots` can be any length. Any slot in the NFA that is activated but
    /// which is out of bounds for the given `slots` is ignored.
    ///
    /// Returns true if and only if a match was found. When `earliest` is
    /// true, the search stops at the first position where a match is known,
    /// rather than extending it for leftmost-first semantics.
    pub(crate) fn search(
        &self,
        cache: &mut Cache,
        haystack: &[u8],
        start: usize,
        end: usize,
        earliest: bool,
        slots: &mut [Option<NonMaxUsize>],
    ) -> bool {
        cache.setup_search(slots.len());
        // An inverted span can never match anything.
        if start > end {
            return false;
        }
        // Why do we even care about this? Well, in our `slots` representation,
        // we use usize::MAX as a sentinel to indicate "no match." This isn't
        // problematic so long as our haystack doesn't have a maximal length.
        // Byte slices are guaranteed by Rust to have a length that fits into
        // isize, and so this assert should always pass. But we put it here to
        // make our assumption explicit.
        assert!(
            haystack.len() < core::usize::MAX,
            "byte slice lengths must be less than usize MAX",
        );

        let Cache { ref mut stack, ref mut curr, ref mut next } = cache;
        let start_id = self.nfa().start();
        let anchored = self.nfa().is_start_anchored();
        let mut matched = false;
        // Yes, our search doesn't end at `end`, but includes it. This is
        // necessary because matches are delayed by one byte. The delay is used
        // to handle look-behind assertions. In the case of the PikeVM, the
        // delay is implemented by not considering a match to exist until it
        // is visited in `nexts`. Technically, we know a match exists in the
        // previous iteration via `epsilon_closure`.
        let mut at = start;
        while at <= end {
            // If we have no states left to visit, then there are some cases
            // where we know we can quit early or even skip ahead.
            if curr.set.is_empty() {
                // We have a match so we can quit.
                if matched {
                    break;
                }
                // If we're running an anchored search and we've advanced
                // beyond the start position with no other states to try, then
                // we will never observe a match and thus can stop.
                if anchored && at > start {
                    break;
                }
            }
            // Instead of using a hypothetical unanchored start state in the
            // NFA (which doesn't exist, but we could add it), we actually
            // always use its anchored starting state. As a result, when doing
            // an unanchored search, we need to simulate our own '(?s:.)*?'
            // prefix, to permit a match to appear anywhere.
            //
            // Now, we don't *have* to do things this way. We could create
            // a proper unanchored start state in the NFA and do one
            // `epsilon_closure` call from that starting state before the main
            // loop here. And that is just as correct. However, it turns out to
            // be slower than our approach here because it slightly increases
            // the cost of processing each byte by requiring us to visit
            // more NFA states to deal with the additional NFA states in the
            // unanchored prefix. By simulating it explicitly here, we lower
            // those costs substantially. The cost is itself small, but it adds
            // up for large haystacks.
            //
            // In order to simulate the '(?s:.)*?' prefix---which is not
            // greedy---we are careful not to perform an epsilon closure on
            // the start state if we already have a match. Namely, if we
            // did otherwise, we would never reach a terminating condition
            // because there would always be additional states to process.
            if !matched {
                // Since we are adding to the 'curr' active states and since
                // this is for the start ID, we use a slots slice that is
                // guaranteed to have the right length but where every element
                // is absent. This is exactly what we want, because this
                // epsilon closure is responsible for simulating an unanchored
                // '(?s:.)*?' prefix. It is specifically outside of any
                // capturing groups, and thus, using slots that are always
                // absent is correct.
                //
                // Note though that we can't just use `&mut []` here, since
                // this epsilon closure may traverse through `Capture` states
                // transitions, and thus must be able to write offsets to the
                // slots given which are later copied to slot values in `curr`.
                let slots = next.slot_table.all_absent();
                self.epsilon_closure(
                    stack, slots, curr, haystack, at, start_id,
                );
            }
            let (ch, len) = utf8::decode_lossy(&haystack[at..]);
            if self.nexts(stack, curr, next, haystack, at, ch, len, slots) {
                matched = true;
            }
            // Unless the caller asked us to return early, we need to mush
            // on to see if we can extend our match. (But note that 'nexts'
            // will quit right after seeing a match, as is consistent with
            // leftmost-first match priority.) A decoded length of zero
            // presumably only occurs once `at` has reached the end of the
            // haystack -- TODO confirm against `utf8::decode_lossy`.
            if (earliest && matched) || len == 0 {
                break;
            }
            core::mem::swap(curr, next);
            next.set.clear();
            at += len;
        }
        matched
    }
180
181 /// Process the active states in 'curr' to find the states (written to
182 /// 'next') we should process for the next byte in the haystack.
183 ///
184 /// 'stack' is used to perform a depth first traversal of the NFA when
185 /// computing an epsilon closure.
186 ///
187 /// When a match is found, the slots for that match state (in 'curr') are
188 /// copied to 'caps'. Moreover, once a match is seen, processing for 'curr'
189 /// stops (unless the PikeVM was configured with MatchKind::All semantics).
190 ///
191 /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
192 /// in `haystack`.
193 ///
194 /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
195 /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
196 /// where `at_ch` is the replacement codepoint that results from decoding
197 /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
198 fn nexts(
199 &self,
200 stack: &mut Vec<FollowEpsilon>,
201 curr: &mut ActiveStates,
202 next: &mut ActiveStates,
203 haystack: &[u8],
204 at: usize,
205 at_ch: char,
206 at_len: usize,
207 slots: &mut [Option<NonMaxUsize>],
208 ) -> bool {
209 let ActiveStates { ref set, ref mut slot_table } = *curr;
210 for sid in set.iter() {
211 if self.next(
212 stack, slot_table, next, haystack, at, at_ch, at_len, sid,
213 ) {
214 slots.copy_from_slice(slot_table.for_state(sid));
215 return true;
216 }
217 }
218 false
219 }
220
    /// Starting from `sid`, if the position `at` in the `haystack` has a
    /// transition defined out of `sid`, then add the state transitioned to and
    /// its epsilon closure to the `next` set of states to explore.
    ///
    /// `stack` is used by the epsilon closure computation to perform a depth
    /// first traversal of the NFA.
    ///
    /// `curr_slot_table` should be the table of slots for the current set of
    /// states being explored. If there is a transition out of `sid`, then
    /// sid's row in the slot table is used to perform the epsilon closure.
    ///
    /// `at_ch` is the Unicode scalar value whose UTF-8 encoding begins at `at`
    /// in `haystack`. The caller provides it so that this routine doesn't
    /// need to re-decode it. (Since it's expected that this routine is called
    /// multiple times for each position.)
    ///
    /// `at_len` is the number of bytes consumed by `at_ch`. This is usually
    /// equal to `at_ch.len_utf8()`, but not always. For example, in the case
    /// where `at_ch` is the replacement codepoint that results from decoding
    /// invalid UTF-8. In that case, `at_len` can be 1, 2 or 3.
    ///
    /// Returns true if and only if `sid` is a match state.
    fn next(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slot_table: &mut SlotTable,
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        at_ch: char,
        at_len: usize,
        sid: StateID,
    ) -> bool {
        match *self.nfa.state(sid) {
            // These are all epsilon (or dead) states: they define no byte
            // transition, so they contribute nothing to `next` here. They
            // are handled during epsilon closure instead.
            State::Fail
            | State::Goto { .. }
            | State::Splits { .. }
            | State::Capture { .. } => false,
            State::Char { target, ch } => {
                // `at_len == 0` means there is no character at `at` to
                // consume, so nothing can transition.
                if at_ch == ch && at_len > 0 {
                    let slots = curr_slot_table.for_state(sid);
                    // OK because `at_len` is always derived from the number
                    // of bytes read from `at` that make up `at_ch`. So this
                    // will never wrap.
                    let at = at.wrapping_add(at_len);
                    self.epsilon_closure(
                        stack, slots, next, haystack, at, target,
                    );
                }
                false
            }
            State::Ranges { target, ref ranges } => {
                // NOTE(review): the early `break` below assumes `ranges` is
                // sorted in increasing order by range start -- presumably
                // guaranteed by NFA construction; confirm there.
                for (start, end) in ranges.iter().copied() {
                    if start > at_ch {
                        break;
                    } else if start <= at_ch && at_ch <= end {
                        if at_len == 0 {
                            return false;
                        }
                        let slots = curr_slot_table.for_state(sid);
                        // OK because `at_len` is always derived from the
                        // number of bytes read from `at` that make up `at_ch`.
                        // So this will never wrap.
                        let at = at.wrapping_add(at_len);
                        self.epsilon_closure(
                            stack, slots, next, haystack, at, target,
                        );
                    }
                }
                false
            }
            State::Match => true,
        }
    }
293
294 /// Compute the epsilon closure of `sid`, writing the closure into `next`
295 /// while copying slot values from `curr_slots` into corresponding states
296 /// in `next`. `curr_slots` should be the slot values corresponding to
297 /// `sid`.
298 ///
299 /// The given `stack` is used to perform a depth first traversal of the
300 /// NFA by recursively following all epsilon transitions out of `sid`.
301 /// Conditional epsilon transitions are followed if and only if they are
302 /// satisfied for the position `at` in the `input` haystack.
303 ///
304 /// While this routine may write to `curr_slots`, once it returns, any
305 /// writes are undone and the original values (even if absent) are
306 /// restored.
307 fn epsilon_closure(
308 &self,
309 stack: &mut Vec<FollowEpsilon>,
310 curr_slots: &mut [Option<NonMaxUsize>],
311 next: &mut ActiveStates,
312 haystack: &[u8],
313 at: usize,
314 sid: StateID,
315 ) {
316 stack.push(FollowEpsilon::Explore(sid));
317 while let Some(frame) = stack.pop() {
318 match frame {
319 FollowEpsilon::RestoreCapture { slot, offset } => {
320 curr_slots[slot.as_usize()] = offset;
321 }
322 FollowEpsilon::Explore(sid) => {
323 self.epsilon_closure_explore(
324 stack, curr_slots, next, haystack, at, sid,
325 );
326 }
327 }
328 }
329 }
330
    /// Explore all of the epsilon transitions out of `sid`. This is mostly
    /// split out from `epsilon_closure` in order to clearly delineate
    /// the actual work of computing an epsilon closure from the stack
    /// book-keeping.
    ///
    /// This will push any additional explorations needed on to `stack`.
    ///
    /// `curr_slots` should refer to the slots for the currently active NFA
    /// state. That is, the current state we are stepping through. These
    /// slots are mutated in place as new `Captures` states are traversed
    /// during epsilon closure, but the slots are restored to their original
    /// values once the full epsilon closure is completed. The ultimate use of
    /// `curr_slots` is to copy them to the corresponding `next_slots`, so that
    /// the capturing group spans are forwarded from the currently active state
    /// to the next.
    ///
    /// `next` refers to the next set of active states. Computing an epsilon
    /// closure may increase the next set of active states.
    ///
    /// `haystack` refers to the what we're searching and `at` refers to the
    /// current position in the haystack. These are used to check whether
    /// conditional epsilon transitions (like look-around) are satisfied at
    /// the current position. If they aren't, then the epsilon closure won't
    /// include them.
    fn epsilon_closure_explore(
        &self,
        stack: &mut Vec<FollowEpsilon>,
        curr_slots: &mut [Option<NonMaxUsize>],
        next: &mut ActiveStates,
        haystack: &[u8],
        at: usize,
        mut sid: StateID,
    ) {
        // We can avoid pushing some state IDs on to our stack in precisely
        // the cases where a 'push(x)' would be immediately followed by a 'x
        // = pop()'. This is achieved by this outer-loop. We simply set 'sid'
        // to be the next state ID we want to explore once we're done with
        // our initial exploration. In practice, this avoids a lot of stack
        // thrashing.
        loop {
            // Record this state as part of our next set of active states. If
            // we've already explored it, then no need to do it again.
            if !next.set.insert(sid) {
                return;
            }
            match *self.nfa.state(sid) {
                // Terminal states of the closure: no further epsilon
                // transitions. Forward this state's capturing group slots
                // from the state that sparked the closure.
                State::Fail
                | State::Match { .. }
                | State::Char { .. }
                | State::Ranges { .. } => {
                    next.slot_table.for_state(sid).copy_from_slice(curr_slots);
                    return;
                }
                // Unconditional epsilon transition: just continue the loop
                // with the target, no stack push needed.
                State::Goto { target, look: None } => {
                    sid = target;
                }
                // Conditional epsilon transition: only follow it if the
                // look-around assertion holds at the current position.
                State::Goto { target, look: Some(look) } => {
                    if !look.is_match(haystack, at) {
                        return;
                    }
                    sid = target;
                }
                State::Splits { ref targets, reverse: false } => {
                    // Explore the first target directly; push the rest in
                    // reverse so that the LIFO stack pops them back in their
                    // original (priority) order.
                    sid = match targets.get(0) {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[1..]
                            .iter()
                            .copied()
                            .rev()
                            .map(FollowEpsilon::Explore),
                    );
                }
                State::Splits { ref targets, reverse: true } => {
                    // Reversed priority: explore the last target directly and
                    // push the remainder in order, so they pop last-to-first.
                    sid = match targets.last() {
                        None => return,
                        Some(&sid) => sid,
                    };
                    stack.extend(
                        targets[..targets.len() - 1]
                            .iter()
                            .copied()
                            .map(FollowEpsilon::Explore),
                    );
                }
                State::Capture { target, slot } => {
                    // There's no need to do anything with slots that
                    // ultimately won't be copied into the caller-provided
                    // 'Captures' value. So we just skip dealing with them at
                    // all.
                    if slot.as_usize() < curr_slots.len() {
                        // Push a frame that will restore the old offset once
                        // everything "after" this capture has been explored.
                        stack.push(FollowEpsilon::RestoreCapture {
                            slot,
                            offset: curr_slots[slot.as_usize()],
                        });
                        // OK because length of a slice must fit into an isize.
                        curr_slots[slot.as_usize()] =
                            Some(NonMaxUsize::new(at).unwrap());
                    }
                    sid = target;
                }
            }
        }
    }
437}
438
/// An iterator over all successive non-overlapping matches in a particular
/// haystack. `'r` represents the lifetime of the regex (which also bounds the
/// cache pool guard) and `'h` represents the lifetime of the haystack.
#[derive(Debug)]
pub(crate) struct FindMatches<'r, 'h> {
    /// The searcher that executes each search.
    pikevm: &'r PikeVM,
    /// Mutable scratch state for the PikeVM, borrowed from a pool.
    cache: CachePoolGuard<'r>,
    /// The bytes being searched.
    haystack: &'h [u8],
    /// The position at which the next search starts.
    at: usize,
    /// Slot storage reused across searches; slots 0 and 1 hold the overall
    /// match span.
    slots: Vec<Option<NonMaxUsize>>,
    /// The end offset of the previous match, used to reject overlapping
    /// empty matches.
    last_match_end: Option<usize>,
}
451
452impl<'r, 'h> Iterator for FindMatches<'r, 'h> {
453 type Item = (usize, usize);
454
455 fn next(&mut self) -> Option<(usize, usize)> {
456 if !self.pikevm.search(
457 &mut self.cache,
458 self.haystack,
459 self.at,
460 self.haystack.len(),
461 false,
462 &mut self.slots,
463 ) {
464 return None;
465 }
466 let mut m =
467 (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
468 if m.0 >= m.1 {
469 m = self.handle_overlapping_empty_match(m)?;
470 }
471 self.at = m.1;
472 self.last_match_end = Some(m.1);
473 Some(m)
474 }
475}
476
impl<'r, 'h> FindMatches<'r, 'h> {
    /// Handles the special case of an empty match by ensuring that 1) the
    /// iterator always advances and 2) empty matches never overlap with other
    /// matches.
    ///
    /// Note that we mark this cold and forcefully prevent inlining because
    /// handling empty matches like this is extremely rare and does require a
    /// bit of code, comparatively. Keeping this code out of the main iterator
    /// function keeps it smaller and more amenable to inlining itself.
    #[cold]
    #[inline(never)]
    fn handle_overlapping_empty_match(
        &mut self,
        mut m: (usize, usize),
    ) -> Option<(usize, usize)> {
        // Caller invariant: this is only reached for empty matches.
        assert!(m.0 >= m.1);
        // Only skip ahead when this empty match abuts the end of the
        // previous match; otherwise it's a legitimate non-overlapping match.
        if Some(m.1) == self.last_match_end {
            // Advance by at least one byte to guarantee progress. The second
            // component of `utf8::decode` is presumably the decoded length at
            // `self.at` -- TODO confirm against the `utf8` module; `max(1, _)`
            // protects against a zero-length decode at the end of haystack.
            let len =
                core::cmp::max(1, utf8::decode(&self.haystack[self.at..]).1);
            self.at = self.at.checked_add(len).unwrap();
            if !self.pikevm.search(
                &mut self.cache,
                self.haystack,
                self.at,
                self.haystack.len(),
                false,
                &mut self.slots,
            ) {
                return None;
            }
            // A successful search always fills the overall match slots.
            m = (self.slots[0].unwrap().get(), self.slots[1].unwrap().get());
        }
        Some(m)
    }
}
512
/// An iterator over all successive non-overlapping capture matches in a
/// particular haystack. `'r` represents the lifetime of the regex (which also
/// bounds the cache pool guard) and `'h` represents the lifetime of the
/// haystack.
#[derive(Debug)]
pub(crate) struct CapturesMatches<'r, 'h> {
    /// The underlying match iterator; its `slots` are sized for all
    /// capturing groups, not just the overall match.
    it: FindMatches<'r, 'h>,
}
520
521impl<'r, 'h> Iterator for CapturesMatches<'r, 'h> {
522 type Item = Vec<Option<NonMaxUsize>>;
523
524 fn next(&mut self) -> Option<Vec<Option<NonMaxUsize>>> {
525 self.it.next()?;
526 Some(self.it.slots.clone())
527 }
528}
529
/// A cache represents mutable state that a `PikeVM` requires during a search.
///
/// NOTE(review): the paragraphs below mention `PikeVM::create_cache` and
/// `Cache::reset`, neither of which is visible in this file -- confirm they
/// exist elsewhere or update this doc.
///
/// For a given `PikeVM`, its corresponding cache may be created either via
/// `PikeVM::create_cache`, or via `Cache::new`. They are equivalent in every
/// way, except the former does not require explicitly importing `Cache`.
///
/// A particular `Cache` is coupled with the `PikeVM` from which it was
/// created. It may only be used with that `PikeVM`. A cache and its
/// allocations may be re-purposed via `Cache::reset`, in which case, it can
/// only be used with the new `PikeVM` (and not the old one).
#[derive(Clone, Debug)]
pub(crate) struct Cache {
    /// Stack used while computing epsilon closure. This effectively lets us
    /// move what is more naturally expressed through recursion to a stack
    /// on the heap.
    stack: Vec<FollowEpsilon>,
    /// The current active states being explored for the current byte in the
    /// haystack.
    curr: ActiveStates,
    /// The next set of states we're building that will be explored for the
    /// next byte in the haystack.
    next: ActiveStates,
}
553
554impl Cache {
555 /// Create a new `PikeVM` cache.
556 ///
557 /// A potentially more convenient routine to create a cache is
558 /// `PikeVM::create_cache`, as it does not require also importing the
559 /// `Cache` type.
560 ///
561 /// If you want to reuse the returned `Cache` with some other `PikeVM`,
562 /// then you must call `Cache::reset` with the desired `PikeVM`.
563 pub(crate) fn new(re: &PikeVM) -> Cache {
564 Cache {
565 stack: vec![],
566 curr: ActiveStates::new(re),
567 next: ActiveStates::new(re),
568 }
569 }
570
571 /// Clears this cache. This should be called at the start of every search
572 /// to ensure we start with a clean slate.
573 ///
574 /// This also sets the length of the capturing groups used in the current
575 /// search. This permits an optimization where by 'SlotTable::for_state'
576 /// only returns the number of slots equivalent to the number of slots
577 /// given in the 'Captures' value. This may be less than the total number
578 /// of possible slots, e.g., when one only wants to track overall match
579 /// offsets. This in turn permits less copying of capturing group spans
580 /// in the PikeVM.
581 fn setup_search(&mut self, captures_slot_len: usize) {
582 self.stack.clear();
583 self.curr.setup_search(captures_slot_len);
584 self.next.setup_search(captures_slot_len);
585 }
586}
587
/// A set of active states used to "simulate" the execution of an NFA via the
/// PikeVM.
///
/// There are two sets of these used during NFA simulation. One set corresponds
/// to the "current" set of states being traversed for the current position
/// in a haystack. The other set corresponds to the "next" set of states being
/// built, which will become the new "current" set for the next position in the
/// haystack. These two sets correspond to CLIST and NLIST in Thompson's
/// original paper regexes: https://dl.acm.org/doi/pdf/10.1145/363347.363387
///
/// In addition to representing a set of NFA states, this also maintains slot
/// values for each state. These slot values are what turn the NFA simulation
/// into the "Pike VM." Namely, they track capturing group values for each
/// state. During the computation of epsilon closure, we copy slot values from
/// states in the "current" set to the "next" set. Eventually, once a match
/// is found, the slot values for that match state are what we write to the
/// caller provided slots.
#[derive(Clone, Debug)]
struct ActiveStates {
    /// The set of active NFA states. This set preserves insertion order, which
    /// is critical for simulating the match semantics of backtracking regex
    /// engines.
    set: SparseSet,
    /// The slots for every NFA state, where each slot stores a (possibly
    /// absent) offset. Every capturing group has two slots. One for a start
    /// offset and one for an end offset.
    slot_table: SlotTable,
}
616
617impl ActiveStates {
618 /// Create a new set of active states for the given PikeVM. The active
619 /// states returned may only be used with the given PikeVM. (Use 'reset'
620 /// to re-purpose the allocation for a different PikeVM.)
621 fn new(re: &PikeVM) -> ActiveStates {
622 let mut active = ActiveStates {
623 set: SparseSet::new(0),
624 slot_table: SlotTable::new(),
625 };
626 active.reset(re);
627 active
628 }
629
630 /// Reset this set of active states such that it can be used with the given
631 /// PikeVM (and only that PikeVM).
632 fn reset(&mut self, re: &PikeVM) {
633 self.set.resize(re.nfa().len());
634 self.slot_table.reset(re);
635 }
636
637 /// Setup this set of active states for a new search. The given slot
638 /// length should be the number of slots in a caller provided 'Captures'
639 /// (and may be zero).
640 fn setup_search(&mut self, captures_slot_len: usize) {
641 self.set.clear();
642 self.slot_table.setup_search(captures_slot_len);
643 }
644}
645
/// A table of slots, where each row represent a state in an NFA. Thus, the
/// table has room for storing slots for every single state in an NFA.
///
/// This table is represented with a single contiguous allocation. In general,
/// the notion of "capturing group" doesn't really exist at this level of
/// abstraction, hence the name "slot" instead. (Indeed, every capturing group
/// maps to a pair of slots, one for the start offset and one for the end
/// offset.) Slots are indexed by the `Captures` NFA state.
#[derive(Clone, Debug)]
struct SlotTable {
    /// The actual table of offsets, laid out row-major: one row of
    /// `slots_per_state` entries per NFA state (plus one scratch row).
    table: Vec<Option<NonMaxUsize>>,
    /// The number of slots per state, i.e., the table's stride or the length
    /// of each row.
    slots_per_state: usize,
    /// The number of slots in the caller-provided `Captures` value for the
    /// current search. Setting this to `slots_per_state` is always correct,
    /// but may be wasteful.
    slots_for_captures: usize,
}
666
667impl SlotTable {
668 /// Create a new slot table.
669 ///
670 /// One should call 'reset' with the corresponding PikeVM before use.
671 fn new() -> SlotTable {
672 SlotTable { table: vec![], slots_for_captures: 0, slots_per_state: 0 }
673 }
674
675 /// Reset this slot table such that it can be used with the given PikeVM
676 /// (and only that PikeVM).
677 fn reset(&mut self, re: &PikeVM) {
678 let nfa = re.nfa();
679 // OK because NFA construction would have failed if this overflowed.
680 self.slots_per_state = nfa.group_len().checked_mul(2).unwrap();
681 // This is always correct, but may be reduced for a particular search
682 // if fewer slots were given by the caller, e.g., none at all or only
683 // slots for tracking the overall match instead of all slots for every
684 // group.
685 self.slots_for_captures = self.slots_per_state;
686 let len = nfa
687 .len()
688 // We add 1 so that our last row is always empty. We use it as
689 // "scratch" space for computing the epsilon closure off of the
690 // starting state.
691 .checked_add(1)
692 .and_then(|x| x.checked_mul(self.slots_per_state))
693 // It seems like this could actually panic on legitimate inputs
694 // on 32-bit targets. Should we somehow convert this to an error?
695 // What about something similar for the lazy DFA cache? If you're
696 // tripping this assert, please file a bug.
697 .expect("slot table length doesn't overflow");
698 self.table.resize(len, None);
699 }
700
701 /// Perform any per-search setup for this slot table.
702 ///
703 /// In particular, this sets the length of the number of slots used in the
704 /// slots given by the caller (if any at all). This number may be smaller
705 /// than the total number of slots available, e.g., when the caller is only
706 /// interested in tracking the overall match and not the spans of every
707 /// matching capturing group. Only tracking the overall match can save a
708 /// substantial amount of time copying capturing spans during a search.
709 fn setup_search(&mut self, captures_slot_len: usize) {
710 self.slots_for_captures = captures_slot_len;
711 }
712
713 /// Return a mutable slice of the slots for the given state.
714 ///
715 /// Note that the length of the slice returned may be less than the total
716 /// number of slots available for this state. In particular, the length
717 /// always matches the number of slots indicated via `setup_search`.
718 fn for_state(&mut self, sid: StateID) -> &mut [Option<NonMaxUsize>] {
719 let i = sid.as_usize() * self.slots_per_state;
720 &mut self.table[i..i + self.slots_for_captures]
721 }
722
723 /// Return a slice of slots of appropriate length where every slot offset
724 /// is guaranteed to be absent. This is useful in cases where you need to
725 /// compute an epsilon closure outside of the user supplied regex, and thus
726 /// never want it to have any capturing slots set.
727 fn all_absent(&mut self) -> &mut [Option<NonMaxUsize>] {
728 let i = self.table.len() - self.slots_per_state;
729 &mut self.table[i..i + self.slots_for_captures]
730 }
731}
732
/// Represents a stack frame for use while computing an epsilon closure.
///
/// (An "epsilon closure" refers to the set of reachable NFA states from a
/// single state without consuming any input. That is, the set of all epsilon
/// transitions not only from that single state, but from every other state
/// reachable by an epsilon transition as well. This is why it's called a
/// "closure.")
///
/// Computing the epsilon closure in a Thompson NFA proceeds via a depth
/// first traversal over all epsilon transitions from a particular state.
/// (A depth first traversal is important because it emulates the same priority
/// of matches that is typically found in backtracking regex engines.) This
/// depth first traversal is naturally expressed using recursion, but to avoid
/// a call stack size proportional to the size of a regex, we put our stack on
/// the heap instead.
///
/// This stack thus consists of call frames. The typical call frame is
/// `Explore`, which instructs epsilon closure to explore the epsilon
/// transitions from that state. (Subsequent epsilon transitions are then
/// pushed on to the stack as more `Explore` frames.) If the state ID being
/// explored has no epsilon transitions, then the capturing group slots are
/// copied from the original state that sparked the epsilon closure (from the
/// 'nexts' routine) to the state ID being explored. This way, capturing group
/// slots are forwarded from the previous state to the next.
///
/// The other stack frame, `RestoreCapture`, instructs the epsilon closure to
/// set the position for a particular slot back to some particular offset. This
/// frame is pushed when `Explore` sees a `Capture` transition. `Explore` will
/// set the offset of the slot indicated in `Capture` to the current offset,
/// and then push the old offset on to the stack as a `RestoreCapture` frame.
/// Thus, the new offset is only used until the epsilon closure reverts back to
/// the `RestoreCapture` frame. In effect, this gives the `Capture` epsilon
/// transition its "scope" to only states that come "after" it during depth
/// first traversal.
#[derive(Clone, Debug)]
enum FollowEpsilon {
    /// Explore the epsilon transitions from a state ID.
    Explore(StateID),
    /// Reset the given `slot` to the given `offset` (which might be `None`).
    RestoreCapture { slot: u32, offset: Option<NonMaxUsize> },
}
774
/// A sparse set used for representing ordered NFA states.
///
/// This supports constant time addition and membership testing. Clearing an
/// entire set can also be done in constant time. Iteration yields elements
/// in the order in which they were inserted.
///
/// The data structure is based on: https://research.swtch.com/sparse
/// Note though that we don't actually use uninitialized memory. We generally
/// reuse sparse sets, so the initial allocation cost is bearable. However, its
/// other properties listed above are extremely useful.
#[derive(Clone)]
struct SparseSet {
    /// The number of elements currently in this set.
    len: usize,
    /// Dense contains the ids in the order in which they were inserted.
    dense: Vec<StateID>,
    /// Sparse maps ids to their location in dense.
    ///
    /// A state ID is in the set if and only if
    /// sparse[id] < len && id == dense[sparse[id]].
    ///
    /// Note that these are indices into 'dense'. It's a little weird to use
    /// StateID here, but we know our length can never exceed the bounds of
    /// StateID (enforced by 'resize') and StateID will be at most 4 bytes
    /// where as a usize is likely double that in most cases.
    sparse: Vec<StateID>,
}
802
803impl SparseSet {
804 /// Create a new sparse set with the given capacity.
805 ///
806 /// Sparse sets have a fixed size and they cannot grow. Attempting to
807 /// insert more distinct elements than the total capacity of the set will
808 /// result in a panic.
809 ///
810 /// This panics if the capacity given is bigger than `StateID::LIMIT`.
811 fn new(capacity: usize) -> SparseSet {
812 let mut set = SparseSet { len: 0, dense: vec![], sparse: vec![] };
813 set.resize(capacity);
814 set
815 }
816
817 /// Resizes this sparse set to have the new capacity given.
818 ///
819 /// This set is automatically cleared.
820 ///
821 /// This panics if the capacity given is bigger than `StateID::LIMIT`.
822 fn resize(&mut self, new_capacity: usize) {
823 assert!(
824 new_capacity <= u32::MAX.as_usize(),
825 "sparse set capacity cannot excced {:?}",
826 u32::MAX,
827 );
828 self.clear();
829 self.dense.resize(new_capacity, 0);
830 self.sparse.resize(new_capacity, 0);
831 }
832
833 /// Returns the capacity of this set.
834 ///
835 /// The capacity represents a fixed limit on the number of distinct
836 /// elements that are allowed in this set. The capacity cannot be changed.
837 fn capacity(&self) -> usize {
838 self.dense.len()
839 }
840
841 /// Returns the number of elements in this set.
842 fn len(&self) -> usize {
843 self.len
844 }
845
846 /// Returns true if and only if this set is empty.
847 fn is_empty(&self) -> bool {
848 self.len() == 0
849 }
850
851 /// Insert the state ID value into this set and return true if the given
852 /// state ID was not previously in this set.
853 ///
854 /// This operation is idempotent. If the given value is already in this
855 /// set, then this is a no-op.
856 ///
857 /// If more than `capacity` ids are inserted, then this panics.
858 fn insert(&mut self, id: StateID) -> bool {
859 if self.contains(id) {
860 return false;
861 }
862
863 let index = self.len();
864 assert!(
865 index < self.capacity(),
866 "{:?} exceeds capacity of {:?} when inserting {:?}",
867 index,
868 self.capacity(),
869 id,
870 );
871 self.dense[index] = id;
872 // OK because we don't permit the capacity to be set higher than
873 // u32::MAX.
874 self.sparse[id.as_usize()] = u32::try_from(index).unwrap();
875 self.len += 1;
876 true
877 }
878
879 /// Returns true if and only if this set contains the given value.
880 fn contains(&self, id: StateID) -> bool {
881 let index = self.sparse[id.as_usize()];
882 index.as_usize() < self.len() && self.dense[index.as_usize()] == id
883 }
884
885 /// Clear this set such that it has no members.
886 fn clear(&mut self) {
887 self.len = 0;
888 }
889
890 /// Returns an iterator over all the state IDs in this set in the order in
891 /// which they were inserted.
892 fn iter(&self) -> SparseSetIter<'_> {
893 SparseSetIter(self.dense[..self.len()].iter())
894 }
895}
896
897impl core::fmt::Debug for SparseSet {
898 fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
899 let elements: Vec<StateID> = self.iter().collect();
900 f.debug_tuple("SparseSet").field(&elements).finish()
901 }
902}
903
/// An iterator over all elements in a sparse set, in insertion order.
///
/// The lifetime `'a` refers to the lifetime of the set being iterated over.
#[derive(Debug)]
struct SparseSetIter<'a>(core::slice::Iter<'a, StateID>);
909
910impl<'a> Iterator for SparseSetIter<'a> {
911 type Item = StateID;
912
913 fn next(&mut self) -> Option<StateID> {
914 self.0.next().map(|&id| id)
915 }
916}