regex_lite/
utf8.rs

1/// Returns true if and only if the given byte is considered a word character.
2/// This only applies to ASCII.
3pub(crate) fn is_word_byte(b: u8) -> bool {
4    const fn mkwordset() -> [bool; 256] {
5        // FIXME: Use as_usize() once const functions in traits are stable.
6        let mut set = [false; 256];
7        set[b'_' as usize] = true;
8
9        let mut byte = b'0';
10        while byte <= b'9' {
11            set[byte as usize] = true;
12            byte += 1;
13        }
14        byte = b'A';
15        while byte <= b'Z' {
16            set[byte as usize] = true;
17            byte += 1;
18        }
19        byte = b'a';
20        while byte <= b'z' {
21            set[byte as usize] = true;
22            byte += 1;
23        }
24        set
25    }
26    const WORD: [bool; 256] = mkwordset();
27    WORD[b as usize]
28}
29
30/// The accept state index. When we enter this state, we know we've found a
31/// valid Unicode scalar value.
32const ACCEPT: usize = 12;
33/// The reject state index. When we enter this state, we know that we've found
34/// invalid UTF-8.
35const REJECT: usize = 0;
36
37/// Like `decode`, but automatically converts the `None` case to the
38/// replacement codepoint.
39pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
40    match decode(slice) {
41        (Some(ch), size) => (ch, size),
42        (None, size) => ('\u{FFFD}', size),
43    }
44}
45
46/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
47///
48/// When successful, the corresponding Unicode scalar value is returned along
49/// with the number of bytes it was encoded with. The number of bytes consumed
50/// for a successful decode is always between 1 and 4, inclusive.
51///
52/// When unsuccessful, `None` is returned along with the number of bytes that
53/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
54/// the number of bytes consumed is always between 0 and 3, inclusive, where
55/// 0 is only returned when `slice` is empty.
56pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
57    let slice = slice.as_ref();
58    match slice.get(0) {
59        None => return (None, 0),
60        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
61        _ => {}
62    }
63
64    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
65    while i < slice.len() {
66        decode_step(&mut state, &mut cp, slice[i]);
67        i += 1;
68
69        if state == ACCEPT {
70            // OK since `decode_step` guarantees that `cp` is a valid Unicode
71            // scalar value in an ACCEPT state.
72            //
73            // We don't have to use safe code here, but do so because perf
74            // isn't our primary objective in regex-lite.
75            let ch = char::from_u32(cp).unwrap();
76            return (Some(ch), i);
77        } else if state == REJECT {
78            // At this point, we always want to advance at least one byte.
79            return (None, core::cmp::max(1, i.saturating_sub(1)));
80        }
81    }
82    (None, i)
83}
84
85/// Transitions to the next state and updates `cp` while it does.
86fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
87    // Splits the space of all bytes into equivalence classes, such that
88    // any byte in the same class can never discriminate between whether a
89    // particular sequence is valid UTF-8 or not.
90    #[cfg_attr(rustfmt, rustfmt::skip)]
91    const CLASSES: [u8; 256] = [
92       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
93       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
94       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
95       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
96       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
97       7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
98       8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
99      10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
100    ];
101
102    // A state machine taken from `bstr` which was in turn adapted from:
103    // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
104    #[cfg_attr(rustfmt, rustfmt::skip)]
105    const STATES_FORWARD: &'static [u8] = &[
106      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
107      12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
108      0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
109      0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
110      0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
111      0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
112      0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
113      0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
114      0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115    ];
116
117    let class = CLASSES[usize::from(b)];
118    if *state == ACCEPT {
119        *cp = (0xFF >> class) & (b as u32);
120    } else {
121        *cp = (b as u32 & 0b111111) | (*cp << 6);
122    }
123    *state = usize::from(STATES_FORWARD[*state + usize::from(class)]);
124}
125
126#[cfg(test)]
127mod tests {
128    use alloc::{vec, vec::Vec};
129
130    use super::*;
131
132    #[test]
133    fn decode_valid() {
134        fn d(mut s: &str) -> Vec<char> {
135            let mut chars = vec![];
136            while !s.is_empty() {
137                let (ch, size) = decode(s.as_bytes());
138                s = &s[size..];
139                chars.push(ch.unwrap());
140            }
141            chars
142        }
143
144        assert_eq!(vec!['☃'], d("☃"));
145        assert_eq!(vec!['☃', '☃'], d("☃☃"));
146        assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
147        assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
148        assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
149    }
150
151    #[test]
152    fn decode_invalid() {
153        let (ch, size) = decode(b"");
154        assert_eq!(None, ch);
155        assert_eq!(0, size);
156
157        let (ch, size) = decode(b"\xFF");
158        assert_eq!(None, ch);
159        assert_eq!(1, size);
160
161        let (ch, size) = decode(b"\xCE\xF0");
162        assert_eq!(None, ch);
163        assert_eq!(1, size);
164
165        let (ch, size) = decode(b"\xE2\x98\xF0");
166        assert_eq!(None, ch);
167        assert_eq!(2, size);
168
169        let (ch, size) = decode(b"\xF0\x9D\x9D");
170        assert_eq!(None, ch);
171        assert_eq!(3, size);
172
173        let (ch, size) = decode(b"\xF0\x9D\x9D\xF0");
174        assert_eq!(None, ch);
175        assert_eq!(3, size);
176
177        let (ch, size) = decode(b"\xF0\x82\x82\xAC");
178        assert_eq!(None, ch);
179        assert_eq!(1, size);
180
181        let (ch, size) = decode(b"\xED\xA0\x80");
182        assert_eq!(None, ch);
183        assert_eq!(1, size);
184
185        let (ch, size) = decode(b"\xCEa");
186        assert_eq!(None, ch);
187        assert_eq!(1, size);
188
189        let (ch, size) = decode(b"\xE2\x98a");
190        assert_eq!(None, ch);
191        assert_eq!(2, size);
192
193        let (ch, size) = decode(b"\xF0\x9D\x9Ca");
194        assert_eq!(None, ch);
195        assert_eq!(3, size);
196    }
197
198    #[test]
199    fn decode_lossily() {
200        let (ch, size) = decode_lossy(b"");
201        assert_eq!('\u{FFFD}', ch);
202        assert_eq!(0, size);
203
204        let (ch, size) = decode_lossy(b"\xFF");
205        assert_eq!('\u{FFFD}', ch);
206        assert_eq!(1, size);
207
208        let (ch, size) = decode_lossy(b"\xCE\xF0");
209        assert_eq!('\u{FFFD}', ch);
210        assert_eq!(1, size);
211
212        let (ch, size) = decode_lossy(b"\xE2\x98\xF0");
213        assert_eq!('\u{FFFD}', ch);
214        assert_eq!(2, size);
215
216        let (ch, size) = decode_lossy(b"\xF0\x9D\x9D\xF0");
217        assert_eq!('\u{FFFD}', ch);
218        assert_eq!(3, size);
219
220        let (ch, size) = decode_lossy(b"\xF0\x82\x82\xAC");
221        assert_eq!('\u{FFFD}', ch);
222        assert_eq!(1, size);
223
224        let (ch, size) = decode_lossy(b"\xED\xA0\x80");
225        assert_eq!('\u{FFFD}', ch);
226        assert_eq!(1, size);
227
228        let (ch, size) = decode_lossy(b"\xCEa");
229        assert_eq!('\u{FFFD}', ch);
230        assert_eq!(1, size);
231
232        let (ch, size) = decode_lossy(b"\xE2\x98a");
233        assert_eq!('\u{FFFD}', ch);
234        assert_eq!(2, size);
235
236        let (ch, size) = decode_lossy(b"\xF0\x9D\x9Ca");
237        assert_eq!('\u{FFFD}', ch);
238        assert_eq!(3, size);
239    }
240}