1pub(crate) fn is_word_byte(b: u8) -> bool {
4 const fn mkwordset() -> [bool; 256] {
5 let mut set = [false; 256];
7 set[b'_' as usize] = true;
8
9 let mut byte = b'0';
10 while byte <= b'9' {
11 set[byte as usize] = true;
12 byte += 1;
13 }
14 byte = b'A';
15 while byte <= b'Z' {
16 set[byte as usize] = true;
17 byte += 1;
18 }
19 byte = b'a';
20 while byte <= b'z' {
21 set[byte as usize] = true;
22 byte += 1;
23 }
24 set
25 }
26 const WORD: [bool; 256] = mkwordset();
27 WORD[b as usize]
28}
29
30const ACCEPT: usize = 12;
33const REJECT: usize = 0;
36
37pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
40 match decode(slice) {
41 (Some(ch), size) => (ch, size),
42 (None, size) => ('\u{FFFD}', size),
43 }
44}
45
46pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
57 let slice = slice.as_ref();
58 match slice.get(0) {
59 None => return (None, 0),
60 Some(&b) if b <= 0x7F => return (Some(b as char), 1),
61 _ => {}
62 }
63
64 let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
65 while i < slice.len() {
66 decode_step(&mut state, &mut cp, slice[i]);
67 i += 1;
68
69 if state == ACCEPT {
70 let ch = char::from_u32(cp).unwrap();
76 return (Some(ch), i);
77 } else if state == REJECT {
78 return (None, core::cmp::max(1, i.saturating_sub(1)));
80 }
81 }
82 (None, i)
83}
84
85fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
87 #[cfg_attr(rustfmt, rustfmt::skip)]
91 const CLASSES: [u8; 256] = [
92 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
93 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
94 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
95 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0, 0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
96 1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1, 9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
97 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7, 7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
98 8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2, 2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
99 10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
100 ];
101
102 #[cfg_attr(rustfmt, rustfmt::skip)]
105 const STATES_FORWARD: &'static [u8] = &[
106 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
107 12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
108 0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
109 0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
110 0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
111 0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
112 0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
113 0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
114 0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
115 ];
116
117 let class = CLASSES[usize::from(b)];
118 if *state == ACCEPT {
119 *cp = (0xFF >> class) & (b as u32);
120 } else {
121 *cp = (b as u32 & 0b111111) | (*cp << 6);
122 }
123 *state = usize::from(STATES_FORWARD[*state + usize::from(class)]);
124}
125
126#[cfg(test)]
127mod tests {
128 use alloc::{vec, vec::Vec};
129
130 use super::*;
131
132 #[test]
133 fn decode_valid() {
134 fn d(mut s: &str) -> Vec<char> {
135 let mut chars = vec![];
136 while !s.is_empty() {
137 let (ch, size) = decode(s.as_bytes());
138 s = &s[size..];
139 chars.push(ch.unwrap());
140 }
141 chars
142 }
143
144 assert_eq!(vec!['☃'], d("☃"));
145 assert_eq!(vec!['☃', '☃'], d("☃☃"));
146 assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
147 assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
148 assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
149 }
150
151 #[test]
152 fn decode_invalid() {
153 let (ch, size) = decode(b"");
154 assert_eq!(None, ch);
155 assert_eq!(0, size);
156
157 let (ch, size) = decode(b"\xFF");
158 assert_eq!(None, ch);
159 assert_eq!(1, size);
160
161 let (ch, size) = decode(b"\xCE\xF0");
162 assert_eq!(None, ch);
163 assert_eq!(1, size);
164
165 let (ch, size) = decode(b"\xE2\x98\xF0");
166 assert_eq!(None, ch);
167 assert_eq!(2, size);
168
169 let (ch, size) = decode(b"\xF0\x9D\x9D");
170 assert_eq!(None, ch);
171 assert_eq!(3, size);
172
173 let (ch, size) = decode(b"\xF0\x9D\x9D\xF0");
174 assert_eq!(None, ch);
175 assert_eq!(3, size);
176
177 let (ch, size) = decode(b"\xF0\x82\x82\xAC");
178 assert_eq!(None, ch);
179 assert_eq!(1, size);
180
181 let (ch, size) = decode(b"\xED\xA0\x80");
182 assert_eq!(None, ch);
183 assert_eq!(1, size);
184
185 let (ch, size) = decode(b"\xCEa");
186 assert_eq!(None, ch);
187 assert_eq!(1, size);
188
189 let (ch, size) = decode(b"\xE2\x98a");
190 assert_eq!(None, ch);
191 assert_eq!(2, size);
192
193 let (ch, size) = decode(b"\xF0\x9D\x9Ca");
194 assert_eq!(None, ch);
195 assert_eq!(3, size);
196 }
197
198 #[test]
199 fn decode_lossily() {
200 let (ch, size) = decode_lossy(b"");
201 assert_eq!('\u{FFFD}', ch);
202 assert_eq!(0, size);
203
204 let (ch, size) = decode_lossy(b"\xFF");
205 assert_eq!('\u{FFFD}', ch);
206 assert_eq!(1, size);
207
208 let (ch, size) = decode_lossy(b"\xCE\xF0");
209 assert_eq!('\u{FFFD}', ch);
210 assert_eq!(1, size);
211
212 let (ch, size) = decode_lossy(b"\xE2\x98\xF0");
213 assert_eq!('\u{FFFD}', ch);
214 assert_eq!(2, size);
215
216 let (ch, size) = decode_lossy(b"\xF0\x9D\x9D\xF0");
217 assert_eq!('\u{FFFD}', ch);
218 assert_eq!(3, size);
219
220 let (ch, size) = decode_lossy(b"\xF0\x82\x82\xAC");
221 assert_eq!('\u{FFFD}', ch);
222 assert_eq!(1, size);
223
224 let (ch, size) = decode_lossy(b"\xED\xA0\x80");
225 assert_eq!('\u{FFFD}', ch);
226 assert_eq!(1, size);
227
228 let (ch, size) = decode_lossy(b"\xCEa");
229 assert_eq!('\u{FFFD}', ch);
230 assert_eq!(1, size);
231
232 let (ch, size) = decode_lossy(b"\xE2\x98a");
233 assert_eq!('\u{FFFD}', ch);
234 assert_eq!(2, size);
235
236 let (ch, size) = decode_lossy(b"\xF0\x9D\x9Ca");
237 assert_eq!('\u{FFFD}', ch);
238 assert_eq!(3, size);
239 }
240}