regex_automata/dfa/
determinize.rs

1use alloc::{collections::BTreeMap, vec::Vec};
2
3use crate::{
4    dfa::{
5        dense::{self, BuildError},
6        DEAD,
7    },
8    nfa::thompson,
9    util::{
10        self,
11        alphabet::{self, ByteSet},
12        determinize::{State, StateBuilderEmpty, StateBuilderNFA},
13        primitives::{PatternID, StateID},
14        search::{Anchored, MatchKind},
15        sparse_set::SparseSets,
16        start::Start,
17    },
18};
19
20/// A builder for configuring and running a DFA determinizer.
21#[derive(Clone, Debug)]
22pub(crate) struct Config {
23    match_kind: MatchKind,
24    quit: ByteSet,
25    dfa_size_limit: Option<usize>,
26    determinize_size_limit: Option<usize>,
27}
28
29impl Config {
30    /// Create a new default config for a determinizer. The determinizer may be
31    /// configured before calling `run`.
32    pub fn new() -> Config {
33        Config {
34            match_kind: MatchKind::LeftmostFirst,
35            quit: ByteSet::empty(),
36            dfa_size_limit: None,
37            determinize_size_limit: None,
38        }
39    }
40
41    /// Run determinization on the given NFA and write the resulting DFA into
42    /// the one given. The DFA given should be initialized but otherwise empty.
43    /// "Initialized" means that it is setup to handle the NFA's byte classes,
44    /// number of patterns and whether to build start states for each pattern.
45    pub fn run(
46        &self,
47        nfa: &thompson::NFA,
48        dfa: &mut dense::OwnedDFA,
49    ) -> Result<(), BuildError> {
50        let dead = State::dead();
51        let quit = State::dead();
52        let mut cache = StateMap::default();
53        // We only insert the dead state here since its representation is
54        // identical to the quit state. And we never want anything pointing
55        // to the quit state other than specific transitions derived from the
56        // determinizer's configured "quit" bytes.
57        //
58        // We do put the quit state into 'builder_states' below. This ensures
59        // that a proper DFA state ID is allocated for it, and that no other
60        // DFA state uses the "location after the DEAD state." That is, it
61        // is assumed that the quit state is always the state immediately
62        // following the DEAD state.
63        cache.insert(dead.clone(), DEAD);
64
65        let runner = Runner {
66            config: self.clone(),
67            nfa,
68            dfa,
69            builder_states: alloc::vec![dead, quit],
70            cache,
71            memory_usage_state: 0,
72            sparses: SparseSets::new(nfa.states().len()),
73            stack: alloc::vec![],
74            scratch_state_builder: StateBuilderEmpty::new(),
75        };
76        runner.run()
77    }
78
79    /// The match semantics to use for determinization.
80    ///
81    /// MatchKind::All corresponds to the standard textbook construction.
82    /// All possible match states are represented in the DFA.
83    /// MatchKind::LeftmostFirst permits greediness and otherwise tries to
84    /// simulate the match semantics of backtracking regex engines. Namely,
85    /// only a subset of match states are built, and dead states are used to
86    /// stop searches with an unanchored prefix.
87    ///
88    /// The default is MatchKind::LeftmostFirst.
89    pub fn match_kind(&mut self, kind: MatchKind) -> &mut Config {
90        self.match_kind = kind;
91        self
92    }
93
94    /// The set of bytes to use that will cause the DFA to enter a quit state,
95    /// stop searching and return an error. By default, this is empty.
96    pub fn quit(&mut self, set: ByteSet) -> &mut Config {
97        self.quit = set;
98        self
99    }
100
101    /// The limit, in bytes of the heap, that the DFA is permitted to use. This
102    /// does not include the auxiliary heap storage used by determinization.
103    pub fn dfa_size_limit(&mut self, bytes: Option<usize>) -> &mut Config {
104        self.dfa_size_limit = bytes;
105        self
106    }
107
108    /// The limit, in bytes of the heap, that determinization itself is allowed
109    /// to use. This does not include the size of the DFA being built.
110    pub fn determinize_size_limit(
111        &mut self,
112        bytes: Option<usize>,
113    ) -> &mut Config {
114        self.determinize_size_limit = bytes;
115        self
116    }
117}
118
119/// The actual implementation of determinization that converts an NFA to a DFA
120/// through powerset construction.
121///
122/// This determinizer roughly follows the typical powerset construction, where
123/// each DFA state is comprised of one or more NFA states. In the worst case,
124/// there is one DFA state for every possible combination of NFA states. In
125/// practice, this only happens in certain conditions, typically when there are
126/// bounded repetitions.
127///
128/// The main differences between this implementation and typical deteminization
129/// are that this implementation delays matches by one state and hackily makes
130/// look-around work. Comments below attempt to explain this.
131///
132/// The lifetime variable `'a` refers to the lifetime of the NFA or DFA,
133/// whichever is shorter.
134#[derive(Debug)]
135struct Runner<'a> {
136    /// The configuration used to initialize determinization.
137    config: Config,
138    /// The NFA we're converting into a DFA.
139    nfa: &'a thompson::NFA,
140    /// The DFA we're building.
141    dfa: &'a mut dense::OwnedDFA,
142    /// Each DFA state being built is defined as an *ordered* set of NFA
143    /// states, along with some meta facts about the ordered set of NFA states.
144    ///
145    /// This is never empty. The first state is always a dummy state such that
146    /// a state id == 0 corresponds to a dead state. The second state is always
147    /// the quit state.
148    ///
149    /// Why do we have states in both a `Vec` and in a cache map below?
150    /// Well, they serve two different roles based on access patterns.
151    /// `builder_states` is the canonical home of each state, and provides
152    /// constant random access by a DFA state's ID. The cache map below, on
153    /// the other hand, provides a quick way of searching for identical DFA
154    /// states by using the DFA state as a key in the map. Of course, we use
155    /// reference counting to avoid actually duplicating the state's data
156    /// itself. (Although this has never been benchmarked.) Note that the cache
157    /// map does not give us full minimization; it just lets us avoid some very
158    /// obvious redundant states.
159    ///
160    /// Note that the index into this Vec isn't quite the DFA's state ID.
161    /// Rather, it's just an index. To get the state ID, you have to multiply
162    /// it by the DFA's stride. That's done by self.dfa.from_index. And the
163    /// inverse is self.dfa.to_index.
164    ///
165    /// Moreover, DFA states don't usually retain the IDs assigned to them
166    /// by their position in this Vec. After determinization completes,
167    /// states are shuffled around to support other optimizations. See the
168    /// sibling 'special' module for more details on that. (The reason for
169    /// mentioning this is that if you print out the DFA for debugging during
170    /// determinization, and then print out the final DFA after it is fully
171    /// built, then the state IDs likely won't match up.)
172    builder_states: Vec<State>,
173    /// A cache of DFA states that already exist and can be easily looked up
174    /// via ordered sets of NFA states.
175    ///
176    /// See `builder_states` docs for why we store states in two different
177    /// ways.
178    cache: StateMap,
179    /// The memory usage, in bytes, used by builder_states and cache. We track
180    /// this as new states are added since states use a variable amount of
181    /// heap. Tracking this as we add states makes it possible to compute the
182    /// total amount of memory used by the determinizer in constant time.
183    memory_usage_state: usize,
184    /// A pair of sparse sets for tracking ordered sets of NFA state IDs.
185    /// These are reused throughout determinization. A bounded sparse set
186    /// gives us constant time insertion, membership testing and clearing.
187    sparses: SparseSets,
188    /// Scratch space for a stack of NFA states to visit, for depth first
189    /// visiting without recursion.
190    stack: Vec<StateID>,
191    /// Scratch space for storing an ordered sequence of NFA states, for
192    /// amortizing allocation. This is principally useful for when we avoid
193    /// adding a new DFA state since it already exists. In order to detect this
194    /// case though, we still need an ordered set of NFA state IDs. So we use
195    /// this space to stage that ordered set before we know whether we need to
196    /// create a new DFA state or not.
197    scratch_state_builder: StateBuilderEmpty,
198}
199
200/// A map from states to state identifiers. When using std, we use a standard
201/// hashmap, since it's a bit faster for this use case. (Other maps, like
202/// one's based on FNV, have not yet been benchmarked.)
203///
204/// The main purpose of this map is to reuse states where possible. This won't
205/// fully minimize the DFA, but it works well in a lot of cases.
206#[cfg(feature = "std")]
207type StateMap = std::collections::HashMap<State, StateID>;
208#[cfg(not(feature = "std"))]
209type StateMap = BTreeMap<State, StateID>;
210
211impl<'a> Runner<'a> {
212    /// Build the DFA. If there was a problem constructing the DFA (e.g., if
213    /// the chosen state identifier representation is too small), then an error
214    /// is returned.
215    fn run(mut self) -> Result<(), BuildError> {
216        if self.nfa.look_set_any().contains_word_unicode()
217            && !self.config.quit.contains_range(0x80, 0xFF)
218        {
219            return Err(BuildError::unsupported_dfa_word_boundary_unicode());
220        }
221
222        // A sequence of "representative" bytes drawn from each equivalence
223        // class. These representative bytes are fed to the NFA to compute
224        // state transitions. This allows us to avoid re-computing state
225        // transitions for bytes that are guaranteed to produce identical
226        // results. Since computing the representatives needs to do a little
227        // work, we do it once here because we'll be iterating over them a lot.
228        let representatives: Vec<alphabet::Unit> =
229            self.dfa.byte_classes().representatives(..).collect();
230        // The set of all DFA state IDs that still need to have their
231        // transitions set. We start by seeding this with all starting states.
232        let mut uncompiled = alloc::vec![];
233        self.add_all_starts(&mut uncompiled)?;
234        while let Some(dfa_id) = uncompiled.pop() {
235            for &unit in &representatives {
236                if unit.as_u8().map_or(false, |b| self.config.quit.contains(b))
237                {
238                    continue;
239                }
240                // In many cases, the state we transition to has already been
241                // computed. 'cached_state' will do the minimal amount of work
242                // to check this, and if it exists, immediately return an
243                // already existing state ID.
244                let (next_dfa_id, is_new) = self.cached_state(dfa_id, unit)?;
245                self.dfa.set_transition(dfa_id, unit, next_dfa_id);
246                // If the state ID we got back is newly created, then we need
247                // to compile it, so add it to our uncompiled frontier.
248                if is_new {
249                    uncompiled.push(next_dfa_id);
250                }
251            }
252        }
253        debug!(
254            "determinization complete, memory usage: {}, \
255             dense DFA size: {}, \
256             is reverse? {}",
257            self.memory_usage(),
258            self.dfa.memory_usage(),
259            self.nfa.is_reverse(),
260        );
261
262        // A map from DFA state ID to one or more NFA match IDs. Each NFA match
263        // ID corresponds to a distinct regex pattern that matches in the state
264        // corresponding to the key.
265        let mut matches: BTreeMap<StateID, Vec<PatternID>> = BTreeMap::new();
266        self.cache.clear();
267        #[cfg(feature = "logging")]
268        let mut total_pat_len = 0;
269        for (i, state) in self.builder_states.into_iter().enumerate() {
270            if let Some(pat_ids) = state.match_pattern_ids() {
271                let id = self.dfa.to_state_id(i);
272                log! {
273                    total_pat_len += pat_ids.len();
274                }
275                matches.insert(id, pat_ids);
276            }
277        }
278        log! {
279            use core::mem::size_of;
280            let per_elem = size_of::<StateID>() + size_of::<Vec<PatternID>>();
281            let pats = total_pat_len * size_of::<PatternID>();
282            let mem = (matches.len() * per_elem) + pats;
283            log::debug!("matches map built, memory usage: {}", mem);
284        }
285        // At this point, we shuffle the "special" states in the final DFA.
286        // This permits a DFA's match loop to detect a match condition (among
287        // other things) by merely inspecting the current state's identifier,
288        // and avoids the need for any additional auxiliary storage.
289        self.dfa.shuffle(matches)?;
290        Ok(())
291    }
292
293    /// Return the identifier for the next DFA state given an existing DFA
294    /// state and an input byte. If the next DFA state already exists, then
295    /// return its identifier from the cache. Otherwise, build the state, cache
296    /// it and return its identifier.
297    ///
298    /// This routine returns a boolean indicating whether a new state was
299    /// built. If a new state is built, then the caller needs to add it to its
300    /// frontier of uncompiled DFA states to compute transitions for.
301    fn cached_state(
302        &mut self,
303        dfa_id: StateID,
304        unit: alphabet::Unit,
305    ) -> Result<(StateID, bool), BuildError> {
306        // Compute the set of all reachable NFA states, including epsilons.
307        let empty_builder = self.get_state_builder();
308        let builder = util::determinize::next(
309            self.nfa,
310            self.config.match_kind,
311            &mut self.sparses,
312            &mut self.stack,
313            &self.builder_states[self.dfa.to_index(dfa_id)],
314            unit,
315            empty_builder,
316        );
317        self.maybe_add_state(builder)
318    }
319
320    /// Compute the set of DFA start states and add their identifiers in
321    /// 'dfa_state_ids' (no duplicates are added).
322    fn add_all_starts(
323        &mut self,
324        dfa_state_ids: &mut Vec<StateID>,
325    ) -> Result<(), BuildError> {
326        // These should be the first states added.
327        assert!(dfa_state_ids.is_empty());
328        // We only want to add (un)anchored starting states that is consistent
329        // with our DFA's configuration. Unconditionally adding both (although
330        // it is the default) can make DFAs quite a bit bigger.
331        if self.dfa.start_kind().has_unanchored() {
332            self.add_start_group(Anchored::No, dfa_state_ids)?;
333        }
334        if self.dfa.start_kind().has_anchored() {
335            self.add_start_group(Anchored::Yes, dfa_state_ids)?;
336        }
337        // I previously has an 'assert' here checking that either
338        // 'dfa_state_ids' was non-empty, or the NFA had zero patterns. But it
339        // turns out this isn't always true. For example, the NFA might have
340        // one or more patterns but where all such patterns are just 'fail'
341        // states. These will ultimately just compile down to DFA dead states,
342        // and since the dead state was added earlier, no new DFA states are
343        // added. And thus, it is valid and okay for 'dfa_state_ids' to be
344        // empty even if there are a non-zero number of patterns in the NFA.
345
346        // We only need to compute anchored start states for each pattern if it
347        // was requested to do so.
348        if self.dfa.starts_for_each_pattern() {
349            for pid in self.nfa.patterns() {
350                self.add_start_group(Anchored::Pattern(pid), dfa_state_ids)?;
351            }
352        }
353        Ok(())
354    }
355
356    /// Add a group of start states for the given match pattern ID. Any new
357    /// DFA states added are pushed on to 'dfa_state_ids'. (No duplicates are
358    /// pushed.)
359    ///
360    /// When pattern_id is None, then this will compile a group of unanchored
361    /// start states (if the DFA is unanchored). When the pattern_id is
362    /// present, then this will compile a group of anchored start states that
363    /// only match the given pattern.
364    ///
365    /// This panics if `anchored` corresponds to an invalid pattern ID.
366    fn add_start_group(
367        &mut self,
368        anchored: Anchored,
369        dfa_state_ids: &mut Vec<StateID>,
370    ) -> Result<(), BuildError> {
371        let nfa_start = match anchored {
372            Anchored::No => self.nfa.start_unanchored(),
373            Anchored::Yes => self.nfa.start_anchored(),
374            Anchored::Pattern(pid) => {
375                self.nfa.start_pattern(pid).expect("valid pattern ID")
376            }
377        };
378
379        // When compiling start states, we're careful not to build additional
380        // states that aren't necessary. For example, if the NFA has no word
381        // boundary assertion, then there's no reason to have distinct start
382        // states for 'NonWordByte' and 'WordByte' starting configurations.
383        // Instead, the 'WordByte' starting configuration can just point
384        // directly to the start state for the 'NonWordByte' config.
385        //
386        // Note though that we only need to care about assertions in the prefix
387        // of an NFA since this only concerns the starting states. (Actually,
388        // the most precisely thing we could do it is look at the prefix
389        // assertions of each pattern when 'anchored == Anchored::Pattern',
390        // and then only compile extra states if the prefix is non-empty.) But
391        // we settle for simplicity here instead of absolute minimalism. It is
392        // somewhat rare, after all, for multiple patterns in the same regex to
393        // have different prefix look-arounds.
394
395        let (id, is_new) =
396            self.add_one_start(nfa_start, Start::NonWordByte)?;
397        self.dfa.set_start_state(anchored, Start::NonWordByte, id);
398        if is_new {
399            dfa_state_ids.push(id);
400        }
401
402        if !self.nfa.look_set_prefix_any().contains_word() {
403            self.dfa.set_start_state(anchored, Start::WordByte, id);
404        } else {
405            let (id, is_new) =
406                self.add_one_start(nfa_start, Start::WordByte)?;
407            self.dfa.set_start_state(anchored, Start::WordByte, id);
408            if is_new {
409                dfa_state_ids.push(id);
410            }
411        }
412        if !self.nfa.look_set_prefix_any().contains_anchor() {
413            self.dfa.set_start_state(anchored, Start::Text, id);
414            self.dfa.set_start_state(anchored, Start::LineLF, id);
415            self.dfa.set_start_state(anchored, Start::LineCR, id);
416            self.dfa.set_start_state(
417                anchored,
418                Start::CustomLineTerminator,
419                id,
420            );
421        } else {
422            let (id, is_new) = self.add_one_start(nfa_start, Start::Text)?;
423            self.dfa.set_start_state(anchored, Start::Text, id);
424            if is_new {
425                dfa_state_ids.push(id);
426            }
427
428            let (id, is_new) = self.add_one_start(nfa_start, Start::LineLF)?;
429            self.dfa.set_start_state(anchored, Start::LineLF, id);
430            if is_new {
431                dfa_state_ids.push(id);
432            }
433
434            let (id, is_new) = self.add_one_start(nfa_start, Start::LineCR)?;
435            self.dfa.set_start_state(anchored, Start::LineCR, id);
436            if is_new {
437                dfa_state_ids.push(id);
438            }
439
440            let (id, is_new) =
441                self.add_one_start(nfa_start, Start::CustomLineTerminator)?;
442            self.dfa.set_start_state(
443                anchored,
444                Start::CustomLineTerminator,
445                id,
446            );
447            if is_new {
448                dfa_state_ids.push(id);
449            }
450        }
451
452        Ok(())
453    }
454
455    /// Add a new DFA start state corresponding to the given starting NFA
456    /// state, and the starting search configuration. (The starting search
457    /// configuration essentially tells us which look-behind assertions are
458    /// true for this particular state.)
459    ///
460    /// The boolean returned indicates whether the state ID returned is a newly
461    /// created state, or a previously cached state.
462    fn add_one_start(
463        &mut self,
464        nfa_start: StateID,
465        start: Start,
466    ) -> Result<(StateID, bool), BuildError> {
467        // Compute the look-behind assertions that are true in this starting
468        // configuration, and the determine the epsilon closure. While
469        // computing the epsilon closure, we only follow condiional epsilon
470        // transitions that satisfy the look-behind assertions in 'look_have'.
471        let mut builder_matches = self.get_state_builder().into_matches();
472        util::determinize::set_lookbehind_from_start(
473            self.nfa,
474            &start,
475            &mut builder_matches,
476        );
477        self.sparses.set1.clear();
478        util::determinize::epsilon_closure(
479            self.nfa,
480            nfa_start,
481            builder_matches.look_have(),
482            &mut self.stack,
483            &mut self.sparses.set1,
484        );
485        let mut builder = builder_matches.into_nfa();
486        util::determinize::add_nfa_states(
487            &self.nfa,
488            &self.sparses.set1,
489            &mut builder,
490        );
491        self.maybe_add_state(builder)
492    }
493
494    /// Adds the given state to the DFA being built depending on whether it
495    /// already exists in this determinizer's cache.
496    ///
497    /// If it does exist, then the memory used by 'state' is put back into the
498    /// determinizer and the previously created state's ID is returned. (Along
499    /// with 'false', indicating that no new state was added.)
500    ///
501    /// If it does not exist, then the state is added to the DFA being built
502    /// and a fresh ID is allocated (if ID allocation fails, then an error is
503    /// returned) and returned. (Along with 'true', indicating that a new state
504    /// was added.)
505    fn maybe_add_state(
506        &mut self,
507        builder: StateBuilderNFA,
508    ) -> Result<(StateID, bool), BuildError> {
509        if let Some(&cached_id) = self.cache.get(builder.as_bytes()) {
510            // Since we have a cached state, put the constructed state's
511            // memory back into our scratch space, so that it can be reused.
512            self.put_state_builder(builder);
513            return Ok((cached_id, false));
514        }
515        self.add_state(builder).map(|sid| (sid, true))
516    }
517
518    /// Add the given state to the DFA and make it available in the cache.
519    ///
520    /// The state initially has no transitions. That is, it transitions to the
521    /// dead state for all possible inputs, and transitions to the quit state
522    /// for all quit bytes.
523    ///
524    /// If adding the state would exceed the maximum value for StateID, then an
525    /// error is returned.
526    fn add_state(
527        &mut self,
528        builder: StateBuilderNFA,
529    ) -> Result<StateID, BuildError> {
530        let id = self.dfa.add_empty_state()?;
531        if !self.config.quit.is_empty() {
532            for b in self.config.quit.iter() {
533                self.dfa.set_transition(
534                    id,
535                    alphabet::Unit::u8(b),
536                    self.dfa.quit_id(),
537                );
538            }
539        }
540        let state = builder.to_state();
541        // States use reference counting internally, so we only need to count
542        // their memory usage once.
543        self.memory_usage_state += state.memory_usage();
544        self.builder_states.push(state.clone());
545        self.cache.insert(state, id);
546        self.put_state_builder(builder);
547        if let Some(limit) = self.config.dfa_size_limit {
548            if self.dfa.memory_usage() > limit {
549                return Err(BuildError::dfa_exceeded_size_limit(limit));
550            }
551        }
552        if let Some(limit) = self.config.determinize_size_limit {
553            if self.memory_usage() > limit {
554                return Err(BuildError::determinize_exceeded_size_limit(
555                    limit,
556                ));
557            }
558        }
559        Ok(id)
560    }
561
562    /// Returns a state builder from this determinizer that might have existing
563    /// capacity. This helps avoid allocs in cases where a state is built that
564    /// turns out to already be cached.
565    ///
566    /// Callers must put the state builder back with 'put_state_builder',
567    /// otherwise the allocation reuse won't work.
568    fn get_state_builder(&mut self) -> StateBuilderEmpty {
569        core::mem::replace(
570            &mut self.scratch_state_builder,
571            StateBuilderEmpty::new(),
572        )
573    }
574
575    /// Puts the given state builder back into this determinizer for reuse.
576    ///
577    /// Note that building a 'State' from a builder always creates a new
578    /// alloc, so callers should always put the builder back.
579    fn put_state_builder(&mut self, builder: StateBuilderNFA) {
580        let _ = core::mem::replace(
581            &mut self.scratch_state_builder,
582            builder.clear(),
583        );
584    }
585
586    /// Return the memory usage, in bytes, of this determinizer at the current
587    /// point in time. This does not include memory used by the NFA or the
588    /// dense DFA itself.
589    fn memory_usage(&self) -> usize {
590        use core::mem::size_of;
591
592        self.builder_states.len() * size_of::<State>()
593        // Maps likely use more memory than this, but it's probably close.
594        + self.cache.len() * (size_of::<State>() + size_of::<StateID>())
595        + self.memory_usage_state
596        + self.stack.capacity() * size_of::<StateID>()
597        + self.scratch_state_builder.capacity()
598    }
599}