utf8parse/
types.rs

1//! Types supporting the UTF-8 parser
2
/// Action the caller should take for a byte that was just fed to the parser.
///
/// Returned alongside the next [`State`] by [`State::advance`]. The `SetByteN` variants identify
/// which position of the sequence being decoded the byte occupies, counted from the end of the
/// sequence; the `Top` suffix marks the leading byte of a shorter sequence.
///
/// `PartialEq`/`Eq` are derived (matching [`State`]) so that actions, and the `(State, Action)`
/// pairs produced by [`State::advance`], can be compared directly.
#[derive(Debug, Copy, Clone, PartialEq, Eq)]
pub enum Action {
    /// The byte is not acceptable in the current state; the sequence is invalid.
    InvalidSequence = 0,
    /// The byte is valid 7-bit ASCII and can be emitted directly.
    EmitByte = 1,
    /// The byte is the final (bottom) continuation byte of a sequence.
    SetByte1 = 2,
    /// The byte is the 2nd-from-last byte and is a continuation byte.
    SetByte2 = 3,
    /// The byte is the 2nd-from-last byte and is the leading byte of a two byte sequence.
    SetByte2Top = 4,
    /// The byte is the 3rd-from-last byte and is a continuation byte.
    SetByte3 = 5,
    /// The byte is the 3rd-from-last byte and is the leading byte of a three byte sequence.
    SetByte3Top = 6,
    /// The byte is the leading (top) byte of a four byte sequence.
    SetByte4 = 7,
}
23
/// The states of the UTF-8 parser's state machine.
///
/// Besides the generic "N tail bytes remaining" states, the lead bytes `E0`, `ED`, `F0` and `F4`
/// each get a dedicated state: the byte that follows them is restricted to a narrower range than
/// an ordinary tail byte (`80..=BF`).
// The underscore-separated variant names are part of the public interface, so the naming lint is
// silenced instead of renaming them.
#[allow(non_camel_case_types)]
#[derive(Copy, Clone, Debug, Eq, PartialEq)]
pub enum State {
    /// Not inside a sequence; any byte may come next.
    Ground = 0,
    /// Three tail bytes are still expected.
    Tail3 = 1,
    /// Two tail bytes are still expected.
    Tail2 = 2,
    /// One tail byte is still expected.
    Tail1 = 3,
    /// Inside a three byte sequence whose lead byte was `E0`; the next byte must be `A0..=BF`.
    U3_2_e0 = 4,
    /// Inside a three byte sequence whose lead byte was `ED`; the next byte must be `80..=9F`.
    U3_2_ed = 5,
    /// Inside a four byte sequence whose lead byte was `F0`; the next byte must be `90..=BF`.
    Utf8_4_3_f0 = 6,
    /// Inside a four byte sequence whose lead byte was `F4`; the next byte must be `80..=8F`.
    Utf8_4_3_f4 = 7,
}
48
49impl Default for State {
50    fn default() -> State {
51        State::Ground
52    }
53}
54
55impl State {
56    /// Advance the parser state.
57    ///
58    /// This takes the current state and input byte into consideration, to determine the next state
59    /// and any action that should be taken.
60    #[inline]
61    pub fn advance(self, byte: u8) -> (State, Action) {
62        match self {
63            State::Ground => match byte {
64                0x00..=0x7f => (State::Ground, Action::EmitByte),
65                0xc2..=0xdf => (State::Tail1, Action::SetByte2Top),
66                0xe0 => (State::U3_2_e0, Action::SetByte3Top),
67                0xe1..=0xec => (State::Tail2, Action::SetByte3Top),
68                0xed => (State::U3_2_ed, Action::SetByte3Top),
69                0xee..=0xef => (State::Tail2, Action::SetByte3Top),
70                0xf0 => (State::Utf8_4_3_f0, Action::SetByte4),
71                0xf1..=0xf3 => (State::Tail3, Action::SetByte4),
72                0xf4 => (State::Utf8_4_3_f4, Action::SetByte4),
73                _ => (State::Ground, Action::InvalidSequence),
74            },
75            State::U3_2_e0 => match byte {
76                0xa0..=0xbf => (State::Tail1, Action::SetByte2),
77                _ => (State::Ground, Action::InvalidSequence),
78            },
79            State::U3_2_ed => match byte {
80                0x80..=0x9f => (State::Tail1, Action::SetByte2),
81                _ => (State::Ground, Action::InvalidSequence),
82            },
83            State::Utf8_4_3_f0 => match byte {
84                0x90..=0xbf => (State::Tail2, Action::SetByte3),
85                _ => (State::Ground, Action::InvalidSequence),
86            },
87            State::Utf8_4_3_f4 => match byte {
88                0x80..=0x8f => (State::Tail2, Action::SetByte3),
89                _ => (State::Ground, Action::InvalidSequence),
90            },
91            State::Tail3 => match byte {
92                0x80..=0xbf => (State::Tail2, Action::SetByte3),
93                _ => (State::Ground, Action::InvalidSequence),
94            },
95            State::Tail2 => match byte {
96                0x80..=0xbf => (State::Tail1, Action::SetByte2),
97                _ => (State::Ground, Action::InvalidSequence),
98            },
99            State::Tail1 => match byte {
100                0x80..=0xbf => (State::Ground, Action::SetByte1),
101                _ => (State::Ground, Action::InvalidSequence),
102            },
103        }
104    }
105}