//! Types supporting the UTF-8 parser.
/// Action to take when receiving a byte.
///
/// Each variant carries an explicit discriminant (0 through 7), so the numeric
/// value of an action does not depend on declaration order.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Action {
    /// Unexpected byte; sequence is invalid.
    InvalidSequence = 0,
    /// Received valid 7-bit ASCII byte which can be directly emitted.
    EmitByte = 1,
    /// Set the bottom (last) continuation byte.
    SetByte1 = 2,
    /// Set the 2nd-from-last continuation byte.
    SetByte2 = 3,
    /// Set the 2nd-from-last byte, which is the lead byte of a two byte sequence.
    SetByte2Top = 4,
    /// Set the 3rd-from-last continuation byte.
    SetByte3 = 5,
    /// Set the 3rd-from-last byte, which is the lead byte of a three byte sequence.
    SetByte3Top = 6,
    /// Set the top (lead) byte of a four byte sequence.
    SetByte4 = 7,
}
/// States the parser can be in.
///
/// There is a state for each initial input of the 3 and 4 byte sequences since
/// the following bytes are subject to different conditions than a tail byte.
// The variant names embed the hex lead byte (`e0`, `ed`, `f0`, `f4`) separated by
// underscores, which is not CamelCase. The lint is silenced instead of renaming
// the variants because they are part of the public interface.
#[allow(non_camel_case_types)]
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum State {
    /// Ground state; expect anything.
    Ground = 0,
    /// Expecting 3 more tail bytes.
    Tail3 = 1,
    /// Expecting 2 more tail bytes.
    Tail2 = 2,
    /// Expecting 1 more tail byte.
    Tail1 = 3,
    /// Three byte sequence starting with `E0`; the second byte is restricted.
    U3_2_e0 = 4,
    /// Three byte sequence starting with `ED`; the second byte is restricted.
    U3_2_ed = 5,
    /// Four byte sequence starting with `F0`; the second byte is restricted.
    Utf8_4_3_f0 = 6,
    /// Four byte sequence starting with `F4`; the second byte is restricted.
    Utf8_4_3_f4 = 7,
}
4849impl Default for State {
50fn default() -> State {
51 State::Ground
52 }
53}
5455impl State {
56/// Advance the parser state.
57 ///
58 /// This takes the current state and input byte into consideration, to determine the next state
59 /// and any action that should be taken.
60#[inline]
61pub fn advance(self, byte: u8) -> (State, Action) {
62match self {
63 State::Ground => match byte {
640x00..=0x7f => (State::Ground, Action::EmitByte),
650xc2..=0xdf => (State::Tail1, Action::SetByte2Top),
660xe0 => (State::U3_2_e0, Action::SetByte3Top),
670xe1..=0xec => (State::Tail2, Action::SetByte3Top),
680xed => (State::U3_2_ed, Action::SetByte3Top),
690xee..=0xef => (State::Tail2, Action::SetByte3Top),
700xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
710xf1..=0xf3 => (State::Tail3, Action::SetByte4),
720xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
73_ => (State::Ground, Action::InvalidSequence),
74 },
75 State::U3_2_e0 => match byte {
760xa0..=0xbf => (State::Tail1, Action::SetByte2),
77_ => (State::Ground, Action::InvalidSequence),
78 },
79 State::U3_2_ed => match byte {
800x80..=0x9f => (State::Tail1, Action::SetByte2),
81_ => (State::Ground, Action::InvalidSequence),
82 },
83 State::Utf8_4_3_f0 => match byte {
840x90..=0xbf => (State::Tail2, Action::SetByte3),
85_ => (State::Ground, Action::InvalidSequence),
86 },
87 State::Utf8_4_3_f4 => match byte {
880x80..=0x8f => (State::Tail2, Action::SetByte3),
89_ => (State::Ground, Action::InvalidSequence),
90 },
91 State::Tail3 => match byte {
920x80..=0xbf => (State::Tail2, Action::SetByte3),
93_ => (State::Ground, Action::InvalidSequence),
94 },
95 State::Tail2 => match byte {
960x80..=0xbf => (State::Tail1, Action::SetByte2),
97_ => (State::Ground, Action::InvalidSequence),
98 },
99 State::Tail1 => match byte {
1000x80..=0xbf => (State::Ground, Action::SetByte1),
101_ => (State::Ground, Action::InvalidSequence),
102 },
103 }
104 }
105}