regex_syntax/
utf8.rs

1/*!
2Converts ranges of Unicode scalar values to equivalent ranges of UTF-8 bytes.
3
4This is sub-module is useful for constructing byte based automatons that need
5to embed UTF-8 decoding. The most common use of this module is in conjunction
6with the [`hir::ClassUnicodeRange`](crate::hir::ClassUnicodeRange) type.
7
8See the documentation on the `Utf8Sequences` iterator for more details and
9an example.
10
11# Wait, what is this?
12
13This is simplest to explain with an example. Let's say you wanted to test
14whether a particular byte sequence was a Cyrillic character. One possible
15scalar value range is `[0400-04FF]`. The set of allowed bytes for this
16range can be expressed as a sequence of byte ranges:
17
18```text
19[D0-D3][80-BF]
20```
21
22This is simple enough: simply encode the boundaries, `0400` encodes to
23`D0 80` and `04FF` encodes to `D3 BF`, and create ranges from each
24corresponding pair of bytes: `D0` to `D3` and `80` to `BF`.
25
26However, what if you wanted to add the Cyrillic Supplementary characters to
27your range? Your range might then become `[0400-052F]`. The same procedure
28as above doesn't quite work because `052F` encodes to `D4 AF`. The byte ranges
29you'd get from the previous transformation would be `[D0-D4][80-AF]`. However,
30this isn't quite correct because this range doesn't capture many characters,
31for example, `04FF` (because its last byte, `BF` isn't in the range `80-AF`).
32
33Instead, you need multiple sequences of byte ranges:
34
35```text
36[D0-D3][80-BF]  # matches codepoints 0400-04FF
37[D4][80-AF]     # matches codepoints 0500-052F
38```
39
40This gets even more complicated if you want bigger ranges, particularly if
41they naively contain surrogate codepoints. For example, the sequence of byte
42ranges for the basic multilingual plane (`[0000-FFFF]`) look like this:
43
44```text
45[0-7F]
46[C2-DF][80-BF]
47[E0][A0-BF][80-BF]
48[E1-EC][80-BF][80-BF]
49[ED][80-9F][80-BF]
50[EE-EF][80-BF][80-BF]
51```
52
53Note that the byte ranges above will *not* match any erroneous encoding of
54UTF-8, including encodings of surrogate codepoints.
55
56And, of course, for all of Unicode (`[000000-10FFFF]`):
57
58```text
59[0-7F]
60[C2-DF][80-BF]
61[E0][A0-BF][80-BF]
62[E1-EC][80-BF][80-BF]
63[ED][80-9F][80-BF]
64[EE-EF][80-BF][80-BF]
65[F0][90-BF][80-BF][80-BF]
66[F1-F3][80-BF][80-BF][80-BF]
67[F4][80-8F][80-BF][80-BF]
68```
69
70This module automates the process of creating these byte ranges from ranges of
71Unicode scalar values.
72
73# Lineage
74
75I got the idea and general implementation strategy from Russ Cox in his
76[article on regexps](https://web.archive.org/web/20160404141123/https://swtch.com/~rsc/regexp/regexp3.html) and RE2.
77Russ Cox got it from Ken Thompson's `grep` (no source, folk lore?).
78I also got the idea from
79[Lucene](https://github.com/apache/lucene-solr/blob/ae93f4e7ac6a3908046391de35d4f50a0d3c59ca/lucene/core/src/java/org/apache/lucene/util/automaton/UTF32ToUTF8.java),
80which uses it for executing automata on their term index.
81*/
82
83use core::{char, fmt, iter::FusedIterator, slice};
84
85use alloc::{vec, vec::Vec};
86
87const MAX_UTF8_BYTES: usize = 4;
88
89/// Utf8Sequence represents a sequence of byte ranges.
90///
91/// To match a Utf8Sequence, a candidate byte sequence must match each
92/// successive range.
93///
94/// For example, if there are two ranges, `[C2-DF][80-BF]`, then the byte
95/// sequence `\xDD\x61` would not match because `0x61 < 0x80`.
96#[derive(Copy, Clone, Eq, PartialEq, PartialOrd, Ord)]
97pub enum Utf8Sequence {
98    /// One byte range.
99    One(Utf8Range),
100    /// Two successive byte ranges.
101    Two([Utf8Range; 2]),
102    /// Three successive byte ranges.
103    Three([Utf8Range; 3]),
104    /// Four successive byte ranges.
105    Four([Utf8Range; 4]),
106}
107
108impl Utf8Sequence {
109    /// Creates a new UTF-8 sequence from the encoded bytes of a scalar value
110    /// range.
111    ///
112    /// This assumes that `start` and `end` have the same length.
113    fn from_encoded_range(start: &[u8], end: &[u8]) -> Self {
114        assert_eq!(start.len(), end.len());
115        match start.len() {
116            2 => Utf8Sequence::Two([
117                Utf8Range::new(start[0], end[0]),
118                Utf8Range::new(start[1], end[1]),
119            ]),
120            3 => Utf8Sequence::Three([
121                Utf8Range::new(start[0], end[0]),
122                Utf8Range::new(start[1], end[1]),
123                Utf8Range::new(start[2], end[2]),
124            ]),
125            4 => Utf8Sequence::Four([
126                Utf8Range::new(start[0], end[0]),
127                Utf8Range::new(start[1], end[1]),
128                Utf8Range::new(start[2], end[2]),
129                Utf8Range::new(start[3], end[3]),
130            ]),
131            n => unreachable!("invalid encoded length: {n}"),
132        }
133    }
134
135    /// Returns the underlying sequence of byte ranges as a slice.
136    pub fn as_slice(&self) -> &[Utf8Range] {
137        use self::Utf8Sequence::*;
138        match *self {
139            One(ref r) => slice::from_ref(r),
140            Two(ref r) => &r[..],
141            Three(ref r) => &r[..],
142            Four(ref r) => &r[..],
143        }
144    }
145
146    /// Returns the number of byte ranges in this sequence.
147    ///
148    /// The length is guaranteed to be in the closed interval `[1, 4]`.
149    pub fn len(&self) -> usize {
150        self.as_slice().len()
151    }
152
153    /// Reverses the ranges in this sequence.
154    ///
155    /// For example, if this corresponds to the following sequence:
156    ///
157    /// ```text
158    /// [D0-D3][80-BF]
159    /// ```
160    ///
161    /// Then after reversal, it will be
162    ///
163    /// ```text
164    /// [80-BF][D0-D3]
165    /// ```
166    ///
167    /// This is useful when one is constructing a UTF-8 automaton to match
168    /// character classes in reverse.
169    pub fn reverse(&mut self) {
170        match *self {
171            Utf8Sequence::One(_) => {}
172            Utf8Sequence::Two(ref mut x) => x.reverse(),
173            Utf8Sequence::Three(ref mut x) => x.reverse(),
174            Utf8Sequence::Four(ref mut x) => x.reverse(),
175        }
176    }
177
178    /// Returns true if and only if a prefix of `bytes` matches this sequence
179    /// of byte ranges.
180    pub fn matches(&self, bytes: &[u8]) -> bool {
181        if bytes.len() < self.len() {
182            return false;
183        }
184        for (&b, r) in bytes.iter().zip(self) {
185            if !r.matches(b) {
186                return false;
187            }
188        }
189        true
190    }
191}
192
193impl<'a> IntoIterator for &'a Utf8Sequence {
194    type IntoIter = slice::Iter<'a, Utf8Range>;
195    type Item = &'a Utf8Range;
196
197    fn into_iter(self) -> Self::IntoIter {
198        self.as_slice().iter()
199    }
200}
201
202impl fmt::Debug for Utf8Sequence {
203    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
204        use self::Utf8Sequence::*;
205        match *self {
206            One(ref r) => write!(f, "{r:?}"),
207            Two(ref r) => write!(f, "{:?}{:?}", r[0], r[1]),
208            Three(ref r) => write!(f, "{:?}{:?}{:?}", r[0], r[1], r[2]),
209            Four(ref r) => {
210                write!(f, "{:?}{:?}{:?}{:?}", r[0], r[1], r[2], r[3])
211            }
212        }
213    }
214}
215
216/// A single inclusive range of UTF-8 bytes.
217#[derive(Clone, Copy, Eq, PartialEq, PartialOrd, Ord)]
218pub struct Utf8Range {
219    /// Start of byte range (inclusive).
220    pub start: u8,
221    /// End of byte range (inclusive).
222    pub end: u8,
223}
224
225impl Utf8Range {
226    fn new(start: u8, end: u8) -> Self {
227        Utf8Range { start, end }
228    }
229
230    /// Returns true if and only if the given byte is in this range.
231    pub fn matches(&self, b: u8) -> bool {
232        self.start <= b && b <= self.end
233    }
234}
235
236impl fmt::Debug for Utf8Range {
237    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
238        if self.start == self.end {
239            write!(f, "[{:X}]", self.start)
240        } else {
241            write!(f, "[{:X}-{:X}]", self.start, self.end)
242        }
243    }
244}
245
246/// An iterator over ranges of matching UTF-8 byte sequences.
247///
248/// The iteration represents an alternation of comprehensive byte sequences
249/// that match precisely the set of UTF-8 encoded scalar values.
250///
251/// A byte sequence corresponds to one of the scalar values in the range given
252/// if and only if it completely matches exactly one of the sequences of byte
253/// ranges produced by this iterator.
254///
255/// Each sequence of byte ranges matches a unique set of bytes. That is, no two
256/// sequences will match the same bytes.
257///
258/// # Example
259///
260/// This shows how to match an arbitrary byte sequence against a range of
261/// scalar values.
262///
263/// ```rust
264/// use regex_syntax::utf8::{Utf8Sequences, Utf8Sequence};
265///
266/// fn matches(seqs: &[Utf8Sequence], bytes: &[u8]) -> bool {
267///     for range in seqs {
268///         if range.matches(bytes) {
269///             return true;
270///         }
271///     }
272///     false
273/// }
274///
275/// // Test the basic multilingual plane.
276/// let seqs: Vec<_> = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect();
277///
278/// // UTF-8 encoding of 'a'.
279/// assert!(matches(&seqs, &[0x61]));
280/// // UTF-8 encoding of '☃' (`\u{2603}`).
281/// assert!(matches(&seqs, &[0xE2, 0x98, 0x83]));
282/// // UTF-8 encoding of `\u{10348}` (outside the BMP).
283/// assert!(!matches(&seqs, &[0xF0, 0x90, 0x8D, 0x88]));
284/// // Tries to match against a UTF-8 encoding of a surrogate codepoint,
285/// // which is invalid UTF-8, and therefore fails, despite the fact that
286/// // the corresponding codepoint (0xD800) falls in the range given.
287/// assert!(!matches(&seqs, &[0xED, 0xA0, 0x80]));
288/// // And fails against plain old invalid UTF-8.
289/// assert!(!matches(&seqs, &[0xFF, 0xFF]));
290/// ```
291///
292/// If this example seems circuitous, that's because it is! It's meant to be
293/// illustrative. In practice, you could just try to decode your byte sequence
294/// and compare it with the scalar value range directly. However, this is not
295/// always possible (for example, in a byte based automaton).
296#[derive(Debug)]
297pub struct Utf8Sequences {
298    range_stack: Vec<ScalarRange>,
299}
300
301impl Utf8Sequences {
302    /// Create a new iterator over UTF-8 byte ranges for the scalar value range
303    /// given.
304    pub fn new(start: char, end: char) -> Self {
305        let range =
306            ScalarRange { start: u32::from(start), end: u32::from(end) };
307        Utf8Sequences { range_stack: vec![range] }
308    }
309
310    /// reset resets the scalar value range.
311    /// Any existing state is cleared, but resources may be reused.
312    ///
313    /// N.B. Benchmarks say that this method is dubious.
314    #[doc(hidden)]
315    pub fn reset(&mut self, start: char, end: char) {
316        self.range_stack.clear();
317        self.push(u32::from(start), u32::from(end));
318    }
319
320    fn push(&mut self, start: u32, end: u32) {
321        self.range_stack.push(ScalarRange { start, end });
322    }
323}
324
325struct ScalarRange {
326    start: u32,
327    end: u32,
328}
329
330impl fmt::Debug for ScalarRange {
331    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
332        write!(f, "ScalarRange({:X}, {:X})", self.start, self.end)
333    }
334}
335
336impl Iterator for Utf8Sequences {
337    type Item = Utf8Sequence;
338
339    fn next(&mut self) -> Option<Self::Item> {
340        'TOP: while let Some(mut r) = self.range_stack.pop() {
341            'INNER: loop {
342                if let Some((r1, r2)) = r.split() {
343                    self.push(r2.start, r2.end);
344                    r.start = r1.start;
345                    r.end = r1.end;
346                    continue 'INNER;
347                }
348                if !r.is_valid() {
349                    continue 'TOP;
350                }
351                for i in 1..MAX_UTF8_BYTES {
352                    let max = max_scalar_value(i);
353                    if r.start <= max && max < r.end {
354                        self.push(max + 1, r.end);
355                        r.end = max;
356                        continue 'INNER;
357                    }
358                }
359                if let Some(ascii_range) = r.as_ascii() {
360                    return Some(Utf8Sequence::One(ascii_range));
361                }
362                for i in 1..MAX_UTF8_BYTES {
363                    let m = (1 << (6 * i)) - 1;
364                    if (r.start & !m) != (r.end & !m) {
365                        if (r.start & m) != 0 {
366                            self.push((r.start | m) + 1, r.end);
367                            r.end = r.start | m;
368                            continue 'INNER;
369                        }
370                        if (r.end & m) != m {
371                            self.push(r.end & !m, r.end);
372                            r.end = (r.end & !m) - 1;
373                            continue 'INNER;
374                        }
375                    }
376                }
377                let mut start = [0; MAX_UTF8_BYTES];
378                let mut end = [0; MAX_UTF8_BYTES];
379                let n = r.encode(&mut start, &mut end);
380                return Some(Utf8Sequence::from_encoded_range(
381                    &start[0..n],
382                    &end[0..n],
383                ));
384            }
385        }
386        None
387    }
388}
389
390impl FusedIterator for Utf8Sequences {}
391
392impl ScalarRange {
393    /// split splits this range if it overlaps with a surrogate codepoint.
394    ///
395    /// Either or both ranges may be invalid.
396    fn split(&self) -> Option<(ScalarRange, ScalarRange)> {
397        if self.start < 0xE000 && self.end > 0xD7FF {
398            Some((
399                ScalarRange { start: self.start, end: 0xD7FF },
400                ScalarRange { start: 0xE000, end: self.end },
401            ))
402        } else {
403            None
404        }
405    }
406
407    /// is_valid returns true if and only if start <= end.
408    fn is_valid(&self) -> bool {
409        self.start <= self.end
410    }
411
412    /// as_ascii returns this range as a Utf8Range if and only if all scalar
413    /// values in this range can be encoded as a single byte.
414    fn as_ascii(&self) -> Option<Utf8Range> {
415        if self.is_ascii() {
416            let start = u8::try_from(self.start).unwrap();
417            let end = u8::try_from(self.end).unwrap();
418            Some(Utf8Range::new(start, end))
419        } else {
420            None
421        }
422    }
423
424    /// is_ascii returns true if the range is ASCII only (i.e., takes a single
425    /// byte to encode any scalar value).
426    fn is_ascii(&self) -> bool {
427        self.is_valid() && self.end <= 0x7f
428    }
429
430    /// encode writes the UTF-8 encoding of the start and end of this range
431    /// to the corresponding destination slices, and returns the number of
432    /// bytes written.
433    ///
434    /// The slices should have room for at least `MAX_UTF8_BYTES`.
435    fn encode(&self, start: &mut [u8], end: &mut [u8]) -> usize {
436        let cs = char::from_u32(self.start).unwrap();
437        let ce = char::from_u32(self.end).unwrap();
438        let ss = cs.encode_utf8(start);
439        let se = ce.encode_utf8(end);
440        assert_eq!(ss.len(), se.len());
441        ss.len()
442    }
443}
444
445fn max_scalar_value(nbytes: usize) -> u32 {
446    match nbytes {
447        1 => 0x007F,
448        2 => 0x07FF,
449        3 => 0xFFFF,
450        4 => 0x0010_FFFF,
451        _ => unreachable!("invalid UTF-8 byte sequence size"),
452    }
453}
454
455#[cfg(test)]
456mod tests {
457    use core::char;
458
459    use alloc::{vec, vec::Vec};
460
461    use crate::utf8::{Utf8Range, Utf8Sequences};
462
463    fn rutf8(s: u8, e: u8) -> Utf8Range {
464        Utf8Range::new(s, e)
465    }
466
467    fn never_accepts_surrogate_codepoints(start: char, end: char) {
468        for cp in 0xD800..0xE000 {
469            let buf = encode_surrogate(cp);
470            for r in Utf8Sequences::new(start, end) {
471                if r.matches(&buf) {
472                    panic!(
473                        "Sequence ({:X}, {:X}) contains range {:?}, \
474                         which matches surrogate code point {:X} \
475                         with encoded bytes {:?}",
476                        u32::from(start),
477                        u32::from(end),
478                        r,
479                        cp,
480                        buf,
481                    );
482                }
483            }
484        }
485    }
486
487    #[test]
488    fn codepoints_no_surrogates() {
489        never_accepts_surrogate_codepoints('\u{0}', '\u{FFFF}');
490        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFF}');
491        never_accepts_surrogate_codepoints('\u{0}', '\u{10FFFE}');
492        never_accepts_surrogate_codepoints('\u{80}', '\u{10FFFF}');
493        never_accepts_surrogate_codepoints('\u{D7FF}', '\u{E000}');
494    }
495
496    #[test]
497    fn single_codepoint_one_sequence() {
498        // Tests that every range of scalar values that contains a single
499        // scalar value is recognized by one sequence of byte ranges.
500        for i in 0x0..=0x0010_FFFF {
501            let c = match char::from_u32(i) {
502                None => continue,
503                Some(c) => c,
504            };
505            let seqs: Vec<_> = Utf8Sequences::new(c, c).collect();
506            assert_eq!(seqs.len(), 1);
507        }
508    }
509
510    #[test]
511    fn bmp() {
512        use crate::utf8::Utf8Sequence::*;
513
514        let seqs = Utf8Sequences::new('\u{0}', '\u{FFFF}').collect::<Vec<_>>();
515        assert_eq!(
516            seqs,
517            vec![
518                One(rutf8(0x0, 0x7F)),
519                Two([rutf8(0xC2, 0xDF), rutf8(0x80, 0xBF)]),
520                Three([
521                    rutf8(0xE0, 0xE0),
522                    rutf8(0xA0, 0xBF),
523                    rutf8(0x80, 0xBF)
524                ]),
525                Three([
526                    rutf8(0xE1, 0xEC),
527                    rutf8(0x80, 0xBF),
528                    rutf8(0x80, 0xBF)
529                ]),
530                Three([
531                    rutf8(0xED, 0xED),
532                    rutf8(0x80, 0x9F),
533                    rutf8(0x80, 0xBF)
534                ]),
535                Three([
536                    rutf8(0xEE, 0xEF),
537                    rutf8(0x80, 0xBF),
538                    rutf8(0x80, 0xBF)
539                ]),
540            ]
541        );
542    }
543
544    #[test]
545    fn reverse() {
546        use crate::utf8::Utf8Sequence::*;
547
548        let mut s = One(rutf8(0xA, 0xB));
549        s.reverse();
550        assert_eq!(s.as_slice(), &[rutf8(0xA, 0xB)]);
551
552        let mut s = Two([rutf8(0xA, 0xB), rutf8(0xB, 0xC)]);
553        s.reverse();
554        assert_eq!(s.as_slice(), &[rutf8(0xB, 0xC), rutf8(0xA, 0xB)]);
555
556        let mut s = Three([rutf8(0xA, 0xB), rutf8(0xB, 0xC), rutf8(0xC, 0xD)]);
557        s.reverse();
558        assert_eq!(
559            s.as_slice(),
560            &[rutf8(0xC, 0xD), rutf8(0xB, 0xC), rutf8(0xA, 0xB)]
561        );
562
563        let mut s = Four([
564            rutf8(0xA, 0xB),
565            rutf8(0xB, 0xC),
566            rutf8(0xC, 0xD),
567            rutf8(0xD, 0xE),
568        ]);
569        s.reverse();
570        assert_eq!(
571            s.as_slice(),
572            &[
573                rutf8(0xD, 0xE),
574                rutf8(0xC, 0xD),
575                rutf8(0xB, 0xC),
576                rutf8(0xA, 0xB)
577            ]
578        );
579    }
580
581    fn encode_surrogate(cp: u32) -> [u8; 3] {
582        const TAG_CONT: u8 = 0b1000_0000;
583        const TAG_THREE_B: u8 = 0b1110_0000;
584
585        assert!(0xD800 <= cp && cp < 0xE000);
586        let mut dst = [0; 3];
587        dst[0] = u8::try_from(cp >> 12 & 0x0F).unwrap() | TAG_THREE_B;
588        dst[1] = u8::try_from(cp >> 6 & 0x3F).unwrap() | TAG_CONT;
589        dst[2] = u8::try_from(cp & 0x3F).unwrap() | TAG_CONT;
590        dst
591    }
592}
regex_syntax/utf8.rs

regex_syntax/
utf8.rs