base64_simd/
ascii.rs
1use vsimd::isa::AVX2;
2use vsimd::tools::slice_parts;
3use vsimd::{matches_isa, Scalable, POD, SIMD256};
4
5use core::ops::Not;
6
/// Classifies a single byte through a 256-entry lookup table.
///
/// Returns `0xff` when `c` is ASCII whitespace according to
/// [`u8::is_ascii_whitespace`], and `0` otherwise.
#[inline(always)]
#[must_use]
fn lookup_ascii_whitespace(c: u8) -> u8 {
    /// Builds the classification table at compile time.
    const fn build_table() -> [u8; 256] {
        let mut table = [0u8; 256];
        let mut idx = 0usize;
        while idx < 256 {
            if (idx as u8).is_ascii_whitespace() {
                table[idx] = 0xff;
            }
            idx += 1;
        }
        table
    }

    const TABLE: &[u8; 256] = &build_table();

    // SAFETY: `c as usize` is at most 255 and the table has 256 entries.
    unsafe { *TABLE.get_unchecked(c as usize) }
}
24
25#[inline(always)]
26fn has_ascii_whitespace<S: Scalable<V>, V: POD>(s: S, x: V) -> bool {
27 let m1 = s.i8xn_lt(s.u8xn_sub(x, s.u8xn_splat(0x89)), s.i8xn_splat(-128 + 5));
37
38 let m2 = s.u8xn_eq(x, s.u8xn_splat(0x0b));
40
41 let m3 = s.u8xn_eq(x, s.u8xn_splat(0x20));
43
44 s.mask8xn_any(s.or(s.andnot(m1, m2), m3))
46}
47
48#[inline(always)]
49unsafe fn find_non_ascii_whitespace_short(mut src: *const u8, len: usize) -> usize {
50 let base = src;
51 let end = base.add(len);
52 while src < end {
53 if lookup_ascii_whitespace(src.read()) != 0 {
54 break;
55 }
56 src = src.add(1);
57 }
58
59 src.offset_from(base) as usize
60}
61
62#[inline(always)]
63pub unsafe fn find_non_ascii_whitespace_fallback(src: *const u8, len: usize) -> usize {
64 find_non_ascii_whitespace_short(src, len)
65}
66
67#[inline(always)]
68pub unsafe fn find_non_ascii_whitespace_simd<S: SIMD256>(s: S, mut src: *const u8, len: usize) -> usize {
69 let base = src;
70
71 if matches_isa!(S, AVX2) {
72 let end = src.add(len / 32 * 32);
73 while src < end {
74 let x = s.v256_load_unaligned(src);
75 if has_ascii_whitespace(s, x) {
76 break;
77 }
78 src = src.add(32);
79 }
80 if (len % 32) >= 16 {
81 let x = s.v128_load_unaligned(src);
82 if has_ascii_whitespace(s, x).not() {
83 src = src.add(16);
84 }
85 }
86 } else {
87 let end = src.add(len / 16 * 16);
88 while src < end {
89 let x = s.v128_load_unaligned(src);
90 if has_ascii_whitespace(s, x) {
91 break;
92 }
93 src = src.add(16);
94 }
95 }
96
97 let checked_len = src.offset_from(base) as usize;
98 let pos = find_non_ascii_whitespace_short(src, len - checked_len);
99 checked_len + pos
100}
101
102#[inline(always)]
103#[must_use]
104pub fn find_non_ascii_whitespace(data: &[u8]) -> usize {
105 let (src, len) = slice_parts(data);
106 unsafe { crate::multiversion::find_non_ascii_whitespace::auto(src, len) }
107}
108
109#[inline(always)]
110#[must_use]
111pub unsafe fn remove_ascii_whitespace_fallback(mut src: *const u8, len: usize, mut dst: *mut u8) -> usize {
112 let dst_base = dst;
113
114 let end = src.add(len);
115 while src < end {
116 let x = src.read();
117 if lookup_ascii_whitespace(x) == 0 {
118 dst.write(x);
119 dst = dst.add(1);
120 }
121 src = src.add(1);
122 }
123
124 dst.offset_from(dst_base) as usize
125}
126
127#[inline(always)]
128#[must_use]
129pub fn remove_ascii_whitespace_inplace(data: &mut [u8]) -> &mut [u8] {
130 let pos = find_non_ascii_whitespace(data);
131 debug_assert!(pos <= data.len());
132
133 if pos == data.len() {
134 return data;
135 }
136
137 unsafe {
138 let len = data.len() - pos;
139 let dst = data.as_mut_ptr().add(pos);
140 let src = dst;
141
142 let rem = remove_ascii_whitespace_fallback(src, len, dst);
143 debug_assert!(rem <= len);
144
145 data.get_unchecked_mut(..(pos + rem))
146 }
147}
148
#[cfg(test)]
mod tests {
    use super::*;

    /// Compares `remove_ascii_whitespace_inplace` against a straightforward
    /// `retain`-based reference on a set of inputs, both as-is and repeated.
    #[cfg_attr(not(target_arch = "wasm32"), test)]
    #[cfg_attr(target_arch = "wasm32", wasm_bindgen_test::wasm_bindgen_test)]
    fn test_remove_ascii_whitespace() {
        const CASES: [&str; 9] = [
            "\0\0\0\0",
            "abcd",
            "ab\tcd",
            "ab\ncd",
            "ab\x0Ccd",
            "ab\rcd",
            "ab cd",
            "ab\t\n\x0C\r cd",
            "ab\t\n\x0C\r =\t\n\x0C\r =\t\n\x0C\r ",
        ];

        /// Runs one comparison on `case` repeated `repeat` times.
        fn check(case: &str, repeat: usize) {
            let mut buf = case.repeat(repeat).into_bytes();

            // Reference result computed with the standard library.
            let mut expected = buf.clone();
            expected.retain(|byte| !byte.is_ascii_whitespace());

            let ans = remove_ascii_whitespace_inplace(&mut buf);
            assert_eq!(ans, &*expected, "case = {case:?}");
        }

        for case in CASES {
            check(case, 1);

            // The repeated input is skipped under Miri.
            if cfg!(not(miri)) {
                check(case, 10);
            }
        }
    }
}
188
#[cfg(test)]
mod algorithm {
    /// Scalar model of the mask arithmetic in `has_ascii_whitespace`,
    /// checked against the standard library for every byte value.
    #[test]
    #[ignore]
    fn is_ascii_whitespace() {
        for byte in u8::MIN..=u8::MAX {
            // 0x09..=0x0d wraps to the signed range -128..=-124.
            let shifted = byte.wrapping_sub(0x89) as i8;
            let in_tab_to_cr = shifted < -128 + 5;

            // VT sits inside that range but is not whitespace.
            let is_vt = byte == 0x0b;
            let is_space = byte == 0x20;

            let predicted = is_space || (in_tab_to_cr && !is_vt);
            assert_eq!(predicted, byte.is_ascii_whitespace());
        }
    }
}