regex_automata/dfa/
accel.rs

1// This module defines some core types for dealing with accelerated DFA states.
2// Briefly, a DFA state can be "accelerated" if all of its transitions except
3// for a few loop back to itself. This directly implies that the only way out
4// of such a state is if a byte corresponding to one of those non-loopback
5// transitions is found. Such states are often found in simple repetitions in
6// non-Unicode regexes. For example, consider '(?-u)[^a]+a'. We can look at its
7// DFA with regex-cli:
8//
9//     $ regex-cli debug dense dfa -p '(?-u)[^a]+a' -BbC --no-table
10//     D 000000:
11//     Q 000001:
12//      *000002:
13//     A 000003: \x00-` => 3, a => 8, b-\xFF => 3
14//     A 000004: \x00-` => 4, a => 7, b-\xFF => 4
15//       000005: \x00-` => 4, b-\xFF => 4
16//       000006: \x00-` => 3, a => 6, b-\xFF => 3
17//       000007: \x00-\xFF => 2, EOI => 2
18//       000008: \x00-\xFF => 2, EOI => 2
19//
20// In particular, state 3 is accelerated (shown via the 'A' indicator) since
21// the only way to leave that state once entered is to see an 'a' byte. If
22// there is a long run of non-'a' bytes, then using something like 'memchr'
23// to find the next 'a' byte can be significantly faster than just using the
24// standard byte-at-a-time state machine.
25//
26// Unfortunately, this optimization rarely applies when Unicode is enabled.
27// For example, patterns like '[^a]' don't actually match any byte that isn't
28// 'a', but rather, any UTF-8 encoding of a Unicode scalar value that isn't
29// 'a'. This makes the state machine much more complex---far beyond a single
30// state---and removes the ability to easily accelerate it. (Because if the
31// machine sees a non-UTF-8 sequence, then the machine won't match through it.)
32//
33// In practice, we only consider accelerating states that have 3 or fewer
34// non-loop transitions. At a certain point, you get diminishing returns, but
35// also because that's what the memchr crate supports. The structures below
36// hard-code this assumption and provide (de)serialization APIs for use inside
37// a DFA.
38//
39// And finally, note that there is some trickery involved in making it very
40// fast to not only check whether a state is accelerated at search time, but
41// also to access the bytes to search for to implement the acceleration itself.
42// dfa/special.rs provides more detail, but the short story is that all
43// accelerated states appear contiguously in a DFA. This means we can represent
44// the ID space of all accelerated DFA states with a single range. So given
45// a state ID, we can determine whether it's accelerated via
46//
47//     min_accel_id <= id <= max_accel_id
48//
49// And find its corresponding accelerator with:
50//
51//     accels.get((id - min_accel_id) / dfa_stride)
52
53#[cfg(feature = "dfa-build")]
54use alloc::{vec, vec::Vec};
55
56use crate::util::{
57    int::Pointer,
58    memchr,
59    wire::{self, DeserializeError, Endian, SerializeError},
60};
61
62/// The base type used to represent a collection of accelerators.
63///
64/// While an `Accel` is represented as a fixed size array of bytes, a
65/// *collection* of `Accel`s (called `Accels`) is represented internally as a
66/// slice of u32. While it's a bit unnatural to do this and costs us a bit of
67/// fairly low-risk not-safe code, it lets us remove the need for a second type
68/// parameter in the definition of dense::DFA. (Which really wants everything
69/// to be a slice of u32.)
70type AccelTy = u32;
71
72/// The size of the unit of representation for accelerators.
73///
74/// ACCEL_CAP *must* be a multiple of this size.
75const ACCEL_TY_SIZE: usize = core::mem::size_of::<AccelTy>();
76
77/// The maximum length in bytes that a single Accel can be. This is distinct
78/// from the capacity of an accelerator in that the length represents only the
79/// bytes that should be read.
80const ACCEL_LEN: usize = 4;
81
82/// The capacity of each accelerator, in bytes. We set this to 8 since it's a
83/// multiple of 4 (our ID size) and because it gives us a little wiggle room
84/// if we want to support more accel bytes in the future without a breaking
85/// change.
86///
87/// This MUST be a multiple of ACCEL_TY_SIZE.
88const ACCEL_CAP: usize = 8;
89
90/// Search for between 1 and 3 needle bytes in the given haystack, starting the
91/// search at the given position. If `needles` has a length other than 1-3,
92/// then this panics.
93#[cfg_attr(feature = "perf-inline", inline(always))]
94pub(crate) fn find_fwd(
95    needles: &[u8],
96    haystack: &[u8],
97    at: usize,
98) -> Option<usize> {
99    let bs = needles;
100    let i = match needles.len() {
101        1 => memchr::memchr(bs[0], &haystack[at..])?,
102        2 => memchr::memchr2(bs[0], bs[1], &haystack[at..])?,
103        3 => memchr::memchr3(bs[0], bs[1], bs[2], &haystack[at..])?,
104        0 => panic!("cannot find with empty needles"),
105        n => panic!("invalid needles length: {}", n),
106    };
107    Some(at + i)
108}
109
110/// Search for between 1 and 3 needle bytes in the given haystack in reverse,
111/// starting the search at the given position. If `needles` has a length other
112/// than 1-3, then this panics.
113#[cfg_attr(feature = "perf-inline", inline(always))]
114pub(crate) fn find_rev(
115    needles: &[u8],
116    haystack: &[u8],
117    at: usize,
118) -> Option<usize> {
119    let bs = needles;
120    match needles.len() {
121        1 => memchr::memrchr(bs[0], &haystack[..at]),
122        2 => memchr::memrchr2(bs[0], bs[1], &haystack[..at]),
123        3 => memchr::memrchr3(bs[0], bs[1], bs[2], &haystack[..at]),
124        0 => panic!("cannot find with empty needles"),
125        n => panic!("invalid needles length: {}", n),
126    }
127}
128
129/// Represents the accelerators for all accelerated states in a dense DFA.
130///
131/// The `A` type parameter represents the type of the underlying bytes.
132/// Generally, this is either `&[AccelTy]` or `Vec<AccelTy>`.
133#[derive(Clone)]
134pub(crate) struct Accels<A> {
135    /// A length prefixed slice of contiguous accelerators. See the top comment
136    /// in this module for more details on how we can jump from a DFA's state
137    /// ID to an accelerator in this list.
138    ///
139    /// The first 4 bytes always correspond to the number of accelerators
140    /// that follow.
141    accels: A,
142}
143
144#[cfg(feature = "dfa-build")]
145impl Accels<Vec<AccelTy>> {
146    /// Create an empty sequence of accelerators for a DFA.
147    pub fn empty() -> Accels<Vec<AccelTy>> {
148        Accels { accels: vec![0] }
149    }
150
151    /// Add an accelerator to this sequence.
152    ///
153    /// This adds to the accelerator to the end of the sequence and therefore
154    /// should be done in correspondence with its state in the DFA.
155    ///
156    /// This panics if this results in more accelerators than AccelTy::MAX.
157    pub fn add(&mut self, accel: Accel) {
158        self.accels.extend_from_slice(&accel.as_accel_tys());
159        let len = self.len();
160        self.set_len(len + 1);
161    }
162
163    /// Set the number of accelerators in this sequence, which is encoded in
164    /// the first 4 bytes of the underlying bytes.
165    fn set_len(&mut self, new_len: usize) {
166        // The only way an accelerator gets added is if a state exists for
167        // it, and if a state exists, then its index is guaranteed to be
168        // representable by a AccelTy by virtue of the guarantees provided by
169        // StateID.
170        let new_len = AccelTy::try_from(new_len).unwrap();
171        self.accels[0] = new_len;
172    }
173}
174
175impl<'a> Accels<&'a [AccelTy]> {
176    /// Deserialize a sequence of accelerators from the given bytes. If there
177    /// was a problem deserializing, then an error is returned.
178    ///
179    /// This is guaranteed to run in constant time. This does not guarantee
180    /// that every accelerator in the returned collection is valid. Thus,
181    /// accessing one may panic, or not-safe code that relies on accelerators
182    /// being correct my result in UB.
183    ///
184    /// Callers may check the validity of every accelerator with the `validate`
185    /// method.
186    pub fn from_bytes_unchecked(
187        mut slice: &'a [u8],
188    ) -> Result<(Accels<&'a [AccelTy]>, usize), DeserializeError> {
189        let slice_start = slice.as_ptr().as_usize();
190
191        let (accel_len, _) =
192            wire::try_read_u32_as_usize(slice, "accelerators length")?;
193        // The accelerator length is part of the accel_tys slice that
194        // we deserialize. This is perhaps a bit idiosyncratic. It would
195        // probably be better to split out the length into a real field.
196
197        let accel_tys_len = wire::add(
198            wire::mul(accel_len, 2, "total number of accelerator accel_tys")?,
199            1,
200            "total number of accel_tys",
201        )?;
202        let accel_tys_bytes_len = wire::mul(
203            ACCEL_TY_SIZE,
204            accel_tys_len,
205            "total number of bytes in accelerators",
206        )?;
207        wire::check_slice_len(slice, accel_tys_bytes_len, "accelerators")?;
208        wire::check_alignment::<AccelTy>(slice)?;
209        let accel_tys = &slice[..accel_tys_bytes_len];
210        slice = &slice[accel_tys_bytes_len..];
211        // SAFETY: We've checked the length and alignment above, and since
212        // slice is just bytes and AccelTy is just a u32, we can safely cast to
213        // a slice of &[AccelTy].
214        let accels = unsafe {
215            core::slice::from_raw_parts(
216                accel_tys.as_ptr().cast::<AccelTy>(),
217                accel_tys_len,
218            )
219        };
220        Ok((Accels { accels }, slice.as_ptr().as_usize() - slice_start))
221    }
222}
223
224impl<A: AsRef<[AccelTy]>> Accels<A> {
225    /// Return an owned version of the accelerators.
226    #[cfg(feature = "alloc")]
227    pub fn to_owned(&self) -> Accels<alloc::vec::Vec<AccelTy>> {
228        Accels { accels: self.accels.as_ref().to_vec() }
229    }
230
231    /// Return a borrowed version of the accelerators.
232    pub fn as_ref(&self) -> Accels<&[AccelTy]> {
233        Accels { accels: self.accels.as_ref() }
234    }
235
236    /// Return the bytes representing the serialization of the accelerators.
237    pub fn as_bytes(&self) -> &[u8] {
238        let accels = self.accels.as_ref();
239        // SAFETY: This is safe because accels is a just a slice of AccelTy,
240        // and u8 always has a smaller alignment.
241        unsafe {
242            core::slice::from_raw_parts(
243                accels.as_ptr().cast::<u8>(),
244                accels.len() * ACCEL_TY_SIZE,
245            )
246        }
247    }
248
249    /// Returns the memory usage, in bytes, of these accelerators.
250    ///
251    /// The memory usage is computed based on the number of bytes used to
252    /// represent all of the accelerators.
253    ///
254    /// This does **not** include the stack size used by this value.
255    pub fn memory_usage(&self) -> usize {
256        self.as_bytes().len()
257    }
258
259    /// Return the bytes to search for corresponding to the accelerator in this
260    /// sequence at index `i`. If no such accelerator exists, then this panics.
261    ///
262    /// The significance of the index is that it should be in correspondence
263    /// with the index of the corresponding DFA. That is, accelerated DFA
264    /// states are stored contiguously in the DFA and have an ordering implied
265    /// by their respective state IDs. The state's index in that sequence
266    /// corresponds to the index of its corresponding accelerator.
267    #[cfg_attr(feature = "perf-inline", inline(always))]
268    pub fn needles(&self, i: usize) -> &[u8] {
269        if i >= self.len() {
270            panic!("invalid accelerator index {}", i);
271        }
272        let bytes = self.as_bytes();
273        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
274        let len = usize::from(bytes[offset]);
275        &bytes[offset + 1..offset + 1 + len]
276    }
277
278    /// Return the total number of accelerators in this sequence.
279    pub fn len(&self) -> usize {
280        // This should never panic since deserialization checks that the
281        // length can fit into a usize.
282        usize::try_from(self.accels.as_ref()[0]).unwrap()
283    }
284
285    /// Return the accelerator in this sequence at index `i`. If no such
286    /// accelerator exists, then this returns None.
287    ///
288    /// See the docs for `needles` on the significance of the index.
289    fn get(&self, i: usize) -> Option<Accel> {
290        if i >= self.len() {
291            return None;
292        }
293        let offset = ACCEL_TY_SIZE + i * ACCEL_CAP;
294        let accel = Accel::from_slice(&self.as_bytes()[offset..])
295            .expect("Accels must contain valid accelerators");
296        Some(accel)
297    }
298
299    /// Returns an iterator of accelerators in this sequence.
300    fn iter(&self) -> IterAccels<'_, A> {
301        IterAccels { accels: self, i: 0 }
302    }
303
304    /// Writes these accelerators to the given byte buffer using the indicated
305    /// endianness. If the given buffer is too small, then an error is
306    /// returned. Upon success, the total number of bytes written is returned.
307    /// The number of bytes written is guaranteed to be a multiple of 8.
308    pub fn write_to<E: Endian>(
309        &self,
310        dst: &mut [u8],
311    ) -> Result<usize, SerializeError> {
312        let nwrite = self.write_to_len();
313        assert_eq!(
314            nwrite % ACCEL_TY_SIZE,
315            0,
316            "expected accelerator bytes written to be a multiple of {}",
317            ACCEL_TY_SIZE,
318        );
319        if dst.len() < nwrite {
320            return Err(SerializeError::buffer_too_small("accelerators"));
321        }
322
323        // The number of accelerators can never exceed AccelTy::MAX.
324        E::write_u32(AccelTy::try_from(self.len()).unwrap(), dst);
325        // The actual accelerators are just raw bytes and thus their endianness
326        // is irrelevant. So we can copy them as bytes.
327        dst[ACCEL_TY_SIZE..nwrite]
328            .copy_from_slice(&self.as_bytes()[ACCEL_TY_SIZE..nwrite]);
329        Ok(nwrite)
330    }
331
332    /// Validates that every accelerator in this collection can be successfully
333    /// deserialized as a valid accelerator.
334    pub fn validate(&self) -> Result<(), DeserializeError> {
335        for chunk in self.as_bytes()[ACCEL_TY_SIZE..].chunks(ACCEL_CAP) {
336            let _ = Accel::from_slice(chunk)?;
337        }
338        Ok(())
339    }
340
341    /// Returns the total number of bytes written by `write_to`.
342    pub fn write_to_len(&self) -> usize {
343        self.as_bytes().len()
344    }
345}
346
347impl<A: AsRef<[AccelTy]>> core::fmt::Debug for Accels<A> {
348    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
349        write!(f, "Accels(")?;
350        let mut list = f.debug_list();
351        for a in self.iter() {
352            list.entry(&a);
353        }
354        list.finish()?;
355        write!(f, ")")
356    }
357}
358
359#[derive(Debug)]
360struct IterAccels<'a, A: AsRef<[AccelTy]>> {
361    accels: &'a Accels<A>,
362    i: usize,
363}
364
365impl<'a, A: AsRef<[AccelTy]>> Iterator for IterAccels<'a, A> {
366    type Item = Accel;
367
368    fn next(&mut self) -> Option<Accel> {
369        let accel = self.accels.get(self.i)?;
370        self.i += 1;
371        Some(accel)
372    }
373}
374
375/// Accel represents a structure for determining how to "accelerate" a DFA
376/// state.
377///
378/// Namely, it contains zero or more bytes that must be seen in order for the
379/// DFA to leave the state it is associated with. In practice, the actual range
380/// is 1 to 3 bytes.
381///
382/// The purpose of acceleration is to identify states whose vast majority
383/// of transitions are just loops back to the same state. For example,
384/// in the regex `(?-u)^[^a]+b`, the corresponding DFA will have a state
385/// (corresponding to `[^a]+`) where all transitions *except* for `a` and
386/// `b` loop back to itself. Thus, this state can be "accelerated" by simply
387/// looking for the next occurrence of either `a` or `b` instead of explicitly
388/// following transitions. (In this case, `b` transitions to the next state
389/// where as `a` would transition to the dead state.)
390#[derive(Clone)]
391pub(crate) struct Accel {
392    /// The first byte is the length. Subsequent bytes are the accelerated
393    /// bytes.
394    ///
395    /// Note that we make every accelerator 8 bytes as a slightly wasteful
396    /// way of making sure alignment is always correct for state ID sizes of
397    /// 1, 2, 4 and 8. This should be okay since accelerated states aren't
398    /// particularly common, especially when Unicode is enabled.
399    bytes: [u8; ACCEL_CAP],
400}
401
402impl Accel {
403    /// Returns an empty accel, where no bytes are accelerated.
404    #[cfg(feature = "dfa-build")]
405    pub fn new() -> Accel {
406        Accel { bytes: [0; ACCEL_CAP] }
407    }
408
409    /// Returns a verified accelerator derived from the beginning of the given
410    /// slice.
411    ///
412    /// If the slice is not long enough or contains invalid bytes for an
413    /// accelerator, then this returns an error.
414    pub fn from_slice(mut slice: &[u8]) -> Result<Accel, DeserializeError> {
415        slice = &slice[..core::cmp::min(ACCEL_LEN, slice.len())];
416        let bytes = slice
417            .try_into()
418            .map_err(|_| DeserializeError::buffer_too_small("accelerator"))?;
419        Accel::from_bytes(bytes)
420    }
421
422    /// Returns a verified accelerator derived from raw bytes.
423    ///
424    /// If the given bytes are invalid, then this returns an error.
425    fn from_bytes(bytes: [u8; 4]) -> Result<Accel, DeserializeError> {
426        if usize::from(bytes[0]) >= ACCEL_LEN {
427            return Err(DeserializeError::generic(
428                "accelerator bytes cannot have length more than 3",
429            ));
430        }
431        Ok(Accel::from_bytes_unchecked(bytes))
432    }
433
434    /// Returns an accelerator derived from raw bytes.
435    ///
436    /// This does not check whether the given bytes are valid. Invalid bytes
437    /// cannot sacrifice memory safety, but may result in panics or silent
438    /// logic bugs.
439    fn from_bytes_unchecked(bytes: [u8; 4]) -> Accel {
440        Accel { bytes: [bytes[0], bytes[1], bytes[2], bytes[3], 0, 0, 0, 0] }
441    }
442
443    /// Attempts to add the given byte to this accelerator. If the accelerator
444    /// is already full or thinks the byte is a poor accelerator, then this
445    /// returns false. Otherwise, returns true.
446    ///
447    /// If the given byte is already in this accelerator, then it panics.
448    #[cfg(feature = "dfa-build")]
449    pub fn add(&mut self, byte: u8) -> bool {
450        if self.len() >= 3 {
451            return false;
452        }
453        // As a special case, we totally reject trying to accelerate a state
454        // with an ASCII space. In most cases, it occurs very frequently, and
455        // tends to result in worse overall performance.
456        if byte == b' ' {
457            return false;
458        }
459        assert!(
460            !self.contains(byte),
461            "accelerator already contains {:?}",
462            crate::util::escape::DebugByte(byte)
463        );
464        self.bytes[self.len() + 1] = byte;
465        self.bytes[0] += 1;
466        true
467    }
468
469    /// Return the number of bytes in this accelerator.
470    pub fn len(&self) -> usize {
471        usize::from(self.bytes[0])
472    }
473
474    /// Returns true if and only if there are no bytes in this accelerator.
475    #[cfg(feature = "dfa-build")]
476    pub fn is_empty(&self) -> bool {
477        self.len() == 0
478    }
479
480    /// Returns the slice of bytes to accelerate.
481    ///
482    /// If this accelerator is empty, then this returns an empty slice.
483    fn needles(&self) -> &[u8] {
484        &self.bytes[1..1 + self.len()]
485    }
486
487    /// Returns true if and only if this accelerator will accelerate the given
488    /// byte.
489    #[cfg(feature = "dfa-build")]
490    fn contains(&self, byte: u8) -> bool {
491        self.needles().iter().position(|&b| b == byte).is_some()
492    }
493
494    /// Returns the accelerator bytes as an array of AccelTys.
495    #[cfg(feature = "dfa-build")]
496    fn as_accel_tys(&self) -> [AccelTy; 2] {
497        assert_eq!(ACCEL_CAP, 8);
498        // These unwraps are OK since ACCEL_CAP is set to 8.
499        let first =
500            AccelTy::from_ne_bytes(self.bytes[0..4].try_into().unwrap());
501        let second =
502            AccelTy::from_ne_bytes(self.bytes[4..8].try_into().unwrap());
503        [first, second]
504    }
505}
506
507impl core::fmt::Debug for Accel {
508    fn fmt(&self, f: &mut core::fmt::Formatter) -> core::fmt::Result {
509        write!(f, "Accel(")?;
510        let mut set = f.debug_set();
511        for &b in self.needles() {
512            set.entry(&crate::util::escape::DebugByte(b));
513        }
514        set.finish()?;
515        write!(f, ")")
516    }
517}