brotli/enc/
utf8_util.rs

1use crate::enc::floatX;
2
/// Decodes a single UTF-8 encoded symbol from the front of `input`.
///
/// `size` is the number of bytes the caller allows to be examined. Continuation
/// bytes are only read when they lie both within `size` and within `input`, so a
/// `size` that overstates the slice length is treated as a truncated sequence
/// rather than causing an out-of-bounds panic. The leading byte `input[0]` is
/// always read, regardless of `size`.
///
/// Returns `(bytes_consumed, symbol)`:
/// * a well-formed, shortest-form sequence of 1 to 4 bytes yields its length and
///   the decoded value (at most `0x10_ffff`);
/// * anything else (a NUL byte, a stray continuation byte, an overlong or
///   truncated sequence, a value above `0x10_ffff`) yields
///   `(1, 0x11_0000 | input[0])`, i.e. a symbol `>= 0x11_0000` marks the leading
///   byte as not valid UTF-8.
///
/// Note: three-byte values in the surrogate range are not rejected here.
///
/// # Panics
///
/// Panics if `input` is empty.
fn parse_as_utf8(input: &[u8], size: usize) -> (usize, i32) {
    let lead = input[0];
    // Never look past the end of the slice, even when `size` claims more bytes.
    let limit = size.min(input.len());
    let is_continuation = |idx: usize| idx < limit && (input[idx] & 0xc0) == 0x80;
    let payload = |idx: usize| i32::from(input[idx]) & 0x3f;

    // Single byte (ASCII). NUL is deliberately not accepted and falls through
    // to the invalid-byte result at the bottom.
    if (lead & 0x80) == 0 && lead > 0 {
        return (1, i32::from(lead));
    }

    // Two-byte sequence: 110xxxxx 10xxxxxx.
    if (lead & 0xe0) == 0xc0 && is_continuation(1) {
        let symbol = ((i32::from(lead) & 0x1f) << 6) | payload(1);
        // Reject overlong encodings of values that fit in one byte.
        if symbol > 0x7f {
            return (2, symbol);
        }
    }

    // Three-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx.
    if (lead & 0xf0) == 0xe0 && is_continuation(1) && is_continuation(2) {
        let symbol = ((i32::from(lead) & 0x0f) << 12) | (payload(1) << 6) | payload(2);
        // Reject overlong encodings of values that fit in two bytes.
        if symbol > 0x7ff {
            return (3, symbol);
        }
    }

    // Four-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx.
    if (lead & 0xf8) == 0xf0 && is_continuation(1) && is_continuation(2) && is_continuation(3) {
        let symbol = ((i32::from(lead) & 0x07) << 18)
            | (payload(1) << 12)
            | (payload(2) << 6)
            | payload(3);
        // Reject overlong encodings and values beyond the last code point.
        if symbol > 0xffff && symbol <= 0x10_ffff {
            return (4, symbol);
        }
    }

    // Not a valid sequence: consume one byte and tag it as invalid.
    (1, 0x11_0000 | i32::from(lead))
}
44
45#[deprecated(note = "Use is_mostly_utf8 instead")]
46pub fn BrotliIsMostlyUTF8(
47    data: &[u8],
48    pos: usize,
49    mask: usize,
50    length: usize,
51    min_fraction: floatX,
52) -> i32 {
53    is_mostly_utf8(data, pos, mask, length, min_fraction).into()
54}
55
56pub(crate) fn is_mostly_utf8(
57    data: &[u8],
58    pos: usize,
59    mask: usize,
60    length: usize,
61    min_fraction: floatX,
62) -> bool {
63    let mut size_utf8: usize = 0;
64    let mut i: usize = 0;
65    while i < length {
66        let (bytes_read, symbol) = parse_as_utf8(&data[(pos.wrapping_add(i) & mask)..], length - i);
67        i = i.wrapping_add(bytes_read);
68        if symbol < 0x11_0000 {
69            size_utf8 = size_utf8.wrapping_add(bytes_read);
70        }
71    }
72    size_utf8 as floatX > min_fraction * length as floatX
73}