regex_lite/
utf8.rs

/// Returns true if and only if the given byte is considered a word character.
/// This only applies to ASCII.
pub(crate) fn is_word_byte(b: u8) -> bool {
    const fn mkwordset() -> [bool; 256] {
        // FIXME: Use as_usize() once const functions in traits are stable.
        let mut set = [false; 256];
        set[b'_' as usize] = true;

        let mut byte = b'0';
        while byte <= b'9' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'A';
        while byte <= b'Z' {
            set[byte as usize] = true;
            byte += 1;
        }
        byte = b'a';
        while byte <= b'z' {
            set[byte as usize] = true;
            byte += 1;
        }
        set
    }
    const WORD: [bool; 256] = mkwordset();
    WORD[b as usize]
}

/// The accept state index. When we enter this state, we know we've found a
/// valid Unicode scalar value.
const ACCEPT: usize = 12;
/// The reject state index. When we enter this state, we know that we've found
/// invalid UTF-8.
const REJECT: usize = 0;

/// Like `decode`, but automatically converts the `None` case to the
/// replacement codepoint.
pub(crate) fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
    match decode(slice) {
        (Some(ch), size) => (ch, size),
        (None, size) => ('\u{FFFD}', size),
    }
}

/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
///
/// When successful, the corresponding Unicode scalar value is returned along
/// with the number of bytes it was encoded with. The number of bytes consumed
/// for a successful decode is always between 1 and 4, inclusive.
///
/// When unsuccessful, `None` is returned along with the number of bytes that
/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
/// the number of bytes consumed is always between 0 and 3, inclusive, where
/// 0 is only returned when `slice` is empty.
pub(crate) fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
    let slice = slice.as_ref();
    match slice.get(0) {
        None => return (None, 0),
        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
        _ => {}
    }

    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
    while i < slice.len() {
        decode_step(&mut state, &mut cp, slice[i]);
        i += 1;

        if state == ACCEPT {
            // OK since `decode_step` guarantees that `cp` is a valid Unicode
            // scalar value in an ACCEPT state.
            //
            // We don't have to use safe code here, but do so because perf
            // isn't our primary objective in regex-lite.
            let ch = char::from_u32(cp).unwrap();
            return (Some(ch), i);
        } else if state == REJECT {
            // At this point, we always want to advance at least one byte.
            return (None, core::cmp::max(1, i.saturating_sub(1)));
        }
    }
    (None, i)
}

/// Transitions to the next state and updates `cp` while it does.
fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
    // Splits the space of all bytes into equivalence classes, such that
    // any byte in the same class can never discriminate between whether a
    // particular sequence is valid UTF-8 or not.
    #[cfg_attr(rustfmt, rustfmt::skip)]
    const CLASSES: [u8; 256] = [
       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
       1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
       7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
       8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
      10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
    ];

    // A state machine taken from `bstr` which was in turn adapted from:
    // https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
    #[cfg_attr(rustfmt, rustfmt::skip)]
    const STATES_FORWARD: &'static [u8] = &[
      0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
      12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
      0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
      0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
      0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
      0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
      0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
      0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    ];

    let class = CLASSES[usize::from(b)];
    if *state == ACCEPT {
        *cp = (0xFF >> class) & (b as u32);
    } else {
        *cp = (b as u32 & 0b111111) | (*cp << 6);
    }
    *state = usize::from(STATES_FORWARD[*state + usize::from(class)]);
}

#[cfg(test)]
mod tests {
    use alloc::{vec, vec::Vec};

    use super::*;

    #[test]
    fn decode_valid() {
        fn d(mut s: &str) -> Vec<char> {
            let mut chars = vec![];
            while !s.is_empty() {
                let (ch, size) = decode(s.as_bytes());
                s = &s[size..];
                chars.push(ch.unwrap());
            }
            chars
        }

        assert_eq!(vec!['☃'], d("☃"));
        assert_eq!(vec!['☃', '☃'], d("☃☃"));
        assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
        assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
        assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
    }

    #[test]
    fn decode_invalid() {
        let (ch, size) = decode(b"");
        assert_eq!(None, ch);
        assert_eq!(0, size);

        let (ch, size) = decode(b"\xFF");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = decode(b"\xCE\xF0");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = decode(b"\xE2\x98\xF0");
        assert_eq!(None, ch);
        assert_eq!(2, size);

        let (ch, size) = decode(b"\xF0\x9D\x9D");
        assert_eq!(None, ch);
        assert_eq!(3, size);

        let (ch, size) = decode(b"\xF0\x9D\x9D\xF0");
        assert_eq!(None, ch);
        assert_eq!(3, size);

        let (ch, size) = decode(b"\xF0\x82\x82\xAC");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = decode(b"\xED\xA0\x80");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = decode(b"\xCEa");
        assert_eq!(None, ch);
        assert_eq!(1, size);

        let (ch, size) = decode(b"\xE2\x98a");
        assert_eq!(None, ch);
        assert_eq!(2, size);

        let (ch, size) = decode(b"\xF0\x9D\x9Ca");
        assert_eq!(None, ch);
        assert_eq!(3, size);
    }

    #[test]
    fn decode_lossily() {
        let (ch, size) = decode_lossy(b"");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(0, size);

        let (ch, size) = decode_lossy(b"\xFF");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = decode_lossy(b"\xCE\xF0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = decode_lossy(b"\xE2\x98\xF0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(2, size);

        let (ch, size) = decode_lossy(b"\xF0\x9D\x9D\xF0");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(3, size);

        let (ch, size) = decode_lossy(b"\xF0\x82\x82\xAC");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = decode_lossy(b"\xED\xA0\x80");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = decode_lossy(b"\xCEa");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(1, size);

        let (ch, size) = decode_lossy(b"\xE2\x98a");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(2, size);

        let (ch, size) = decode_lossy(b"\xF0\x9D\x9Ca");
        assert_eq!('\u{FFFD}', ch);
        assert_eq!(3, size);
    }
}
regex_lite/utf8.rs

regex_lite/
utf8.rs