str_indices/
chars.rs

1//! Index by chars.
2
3use crate::byte_chunk::{ByteChunk, Chunk};
4
5/// Counts the chars in a string slice.
6///
7/// Runs in O(N) time.
8#[inline]
9pub fn count(text: &str) -> usize {
10    count_impl::<Chunk>(text.as_bytes())
11}
12
13/// Converts from byte-index to char-index in a string slice.
14///
15/// If the byte is in the middle of a multi-byte char, returns the index of
16/// the char that the byte belongs to.
17///
18/// Any past-the-end index will return the one-past-the-end char index.
19///
20/// Runs in O(N) time.
21#[inline]
22pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize {
23    let bytes = text.as_bytes();
24
25    // Ensure the index is either a char boundary or is off the end of
26    // the text.
27    let mut i = byte_idx;
28    while Some(true) == bytes.get(i).map(|byte| (*byte & 0xC0) == 0x80) {
29        i -= 1;
30    }
31
32    count_impl::<Chunk>(&bytes[0..i.min(bytes.len())])
33}
34
35/// Converts from char-index to byte-index in a string slice.
36///
37/// Any past-the-end index will return the one-past-the-end byte index.
38///
39/// Runs in O(N) time.
40#[inline]
41pub fn to_byte_idx(text: &str, char_idx: usize) -> usize {
42    to_byte_idx_impl::<Chunk>(text, char_idx)
43}
44
45//-------------------------------------------------------------
46
47#[inline(always)]
48fn to_byte_idx_impl<T: ByteChunk>(text: &str, char_idx: usize) -> usize {
49    // Get `middle` so we can do more efficient chunk-based counting.
50    // We can't use this to get `end`, however, because the start index of
51    // `end` actually depends on the accumulating char counts during the
52    // counting process.
53    let (start, middle, _) = unsafe { text.as_bytes().align_to::<T>() };
54
55    let mut byte_count = 0;
56    let mut char_count = 0;
57
58    // Take care of any unaligned bytes at the beginning.
59    for byte in start.iter() {
60        char_count += ((*byte & 0xC0) != 0x80) as usize;
61        if char_count > char_idx {
62            break;
63        }
64        byte_count += 1;
65    }
66
67    // Process chunks in the fast path.
68    let mut chunks = middle;
69    let mut max_round_len = char_idx.saturating_sub(char_count) / T::MAX_ACC;
70    while max_round_len > 0 && !chunks.is_empty() {
71        // Choose the largest number of chunks we can do this round
72        // that will neither overflow `max_acc` nor blast past the
73        // char we're looking for.
74        let round_len = T::MAX_ACC.min(max_round_len).min(chunks.len());
75        max_round_len -= round_len;
76        let round = &chunks[..round_len];
77        chunks = &chunks[round_len..];
78
79        // Process the chunks in this round.
80        let mut acc = T::zero();
81        for chunk in round.iter() {
82            acc = acc.add(chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80));
83        }
84        char_count += (T::SIZE * round_len) - acc.sum_bytes();
85        byte_count += T::SIZE * round_len;
86    }
87
88    // Process chunks in the slow path.
89    for chunk in chunks.iter() {
90        let new_char_count =
91            char_count + T::SIZE - chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80).sum_bytes();
92        if new_char_count >= char_idx {
93            break;
94        }
95        char_count = new_char_count;
96        byte_count += T::SIZE;
97    }
98
99    // Take care of any unaligned bytes at the end.
100    let end = &text.as_bytes()[byte_count..];
101    for byte in end.iter() {
102        char_count += ((*byte & 0xC0) != 0x80) as usize;
103        if char_count > char_idx {
104            break;
105        }
106        byte_count += 1;
107    }
108
109    byte_count
110}
111
112#[inline(always)]
113pub(crate) fn count_impl<T: ByteChunk>(text: &[u8]) -> usize {
114    if text.len() < T::SIZE {
115        // Bypass the more complex routine for short strings, where the
116        // complexity hurts performance.
117        text.iter()
118            .map(|byte| ((byte & 0xC0) != 0x80) as usize)
119            .sum()
120    } else {
121        // Get `middle` for more efficient chunk-based counting.
122        let (start, middle, end) = unsafe { text.align_to::<T>() };
123
124        let mut inv_count = 0;
125
126        // Take care of unaligned bytes at the beginning.
127        for byte in start.iter() {
128            inv_count += ((byte & 0xC0) == 0x80) as usize;
129        }
130
131        // Take care of the middle bytes in big chunks.
132        for chunks in middle.chunks(T::MAX_ACC) {
133            let mut acc = T::zero();
134            for chunk in chunks.iter() {
135                acc = acc.add(chunk.bitand(T::splat(0xc0)).cmp_eq_byte(0x80));
136            }
137            inv_count += acc.sum_bytes();
138        }
139
140        // Take care of unaligned bytes at the end.
141        for byte in end.iter() {
142            inv_count += ((byte & 0xC0) == 0x80) as usize;
143        }
144
145        text.len() - inv_count
146    }
147}
148
149//=============================================================
150
151#[cfg(test)]
152mod tests {
153    use super::*;
154
155    // 124 bytes, 100 chars, 4 lines
156    const TEXT_LINES: &str = "Hello there!  How're you doing?\nIt's \
157                              a fine day, isn't it?\nAren't you glad \
158                              we're alive?\nこんにちは、みんなさん!";
159
160    #[test]
161    fn count_01() {
162        let text = "Hello せかい! Hello せかい! Hello せかい! Hello せかい! Hello せかい!";
163
164        assert_eq!(54, count(text));
165    }
166
167    #[test]
168    fn count_02() {
169        assert_eq!(100, count(TEXT_LINES));
170    }
171
172    #[test]
173    fn from_byte_idx_01() {
174        let text = "Hello せかい!";
175        assert_eq!(0, from_byte_idx(text, 0));
176        assert_eq!(1, from_byte_idx(text, 1));
177        assert_eq!(6, from_byte_idx(text, 6));
178        assert_eq!(6, from_byte_idx(text, 7));
179        assert_eq!(6, from_byte_idx(text, 8));
180        assert_eq!(7, from_byte_idx(text, 9));
181        assert_eq!(7, from_byte_idx(text, 10));
182        assert_eq!(7, from_byte_idx(text, 11));
183        assert_eq!(8, from_byte_idx(text, 12));
184        assert_eq!(8, from_byte_idx(text, 13));
185        assert_eq!(8, from_byte_idx(text, 14));
186        assert_eq!(9, from_byte_idx(text, 15));
187        assert_eq!(10, from_byte_idx(text, 16));
188        assert_eq!(10, from_byte_idx(text, 17));
189        assert_eq!(10, from_byte_idx(text, 18));
190        assert_eq!(10, from_byte_idx(text, 19));
191    }
192
193    #[test]
194    fn from_byte_idx_02() {
195        let text = "";
196        assert_eq!(0, from_byte_idx(text, 0));
197        assert_eq!(0, from_byte_idx(text, 1));
198
199        let text = "h";
200        assert_eq!(0, from_byte_idx(text, 0));
201        assert_eq!(1, from_byte_idx(text, 1));
202        assert_eq!(1, from_byte_idx(text, 2));
203
204        let text = "hi";
205        assert_eq!(0, from_byte_idx(text, 0));
206        assert_eq!(1, from_byte_idx(text, 1));
207        assert_eq!(2, from_byte_idx(text, 2));
208        assert_eq!(2, from_byte_idx(text, 3));
209    }
210
211    #[test]
212    fn from_byte_idx_03() {
213        let text = "せかい";
214        assert_eq!(0, from_byte_idx(text, 0));
215        assert_eq!(0, from_byte_idx(text, 1));
216        assert_eq!(0, from_byte_idx(text, 2));
217        assert_eq!(1, from_byte_idx(text, 3));
218        assert_eq!(1, from_byte_idx(text, 4));
219        assert_eq!(1, from_byte_idx(text, 5));
220        assert_eq!(2, from_byte_idx(text, 6));
221        assert_eq!(2, from_byte_idx(text, 7));
222        assert_eq!(2, from_byte_idx(text, 8));
223        assert_eq!(3, from_byte_idx(text, 9));
224        assert_eq!(3, from_byte_idx(text, 10));
225        assert_eq!(3, from_byte_idx(text, 11));
226        assert_eq!(3, from_byte_idx(text, 12));
227    }
228
229    #[test]
230    fn from_byte_idx_04() {
231        // Ascii range
232        for i in 0..88 {
233            assert_eq!(i, from_byte_idx(TEXT_LINES, i));
234        }
235
236        // Hiragana characters
237        for i in 88..125 {
238            assert_eq!(88 + ((i - 88) / 3), from_byte_idx(TEXT_LINES, i));
239        }
240
241        // Past the end
242        for i in 125..130 {
243            assert_eq!(100, from_byte_idx(TEXT_LINES, i));
244        }
245    }
246
247    #[test]
248    fn to_byte_idx_01() {
249        let text = "Hello せかい!";
250        assert_eq!(0, to_byte_idx(text, 0));
251        assert_eq!(1, to_byte_idx(text, 1));
252        assert_eq!(2, to_byte_idx(text, 2));
253        assert_eq!(5, to_byte_idx(text, 5));
254        assert_eq!(6, to_byte_idx(text, 6));
255        assert_eq!(12, to_byte_idx(text, 8));
256        assert_eq!(15, to_byte_idx(text, 9));
257        assert_eq!(16, to_byte_idx(text, 10));
258    }
259
260    #[test]
261    fn to_byte_idx_02() {
262        let text = "せかい";
263        assert_eq!(0, to_byte_idx(text, 0));
264        assert_eq!(3, to_byte_idx(text, 1));
265        assert_eq!(6, to_byte_idx(text, 2));
266        assert_eq!(9, to_byte_idx(text, 3));
267    }
268
269    #[test]
270    fn to_byte_idx_03() {
271        let text = "Hello world!";
272        assert_eq!(0, to_byte_idx(text, 0));
273        assert_eq!(1, to_byte_idx(text, 1));
274        assert_eq!(8, to_byte_idx(text, 8));
275        assert_eq!(11, to_byte_idx(text, 11));
276        assert_eq!(12, to_byte_idx(text, 12));
277    }
278
279    #[test]
280    fn to_byte_idx_04() {
281        let text = "Hello world! Hello せかい! Hello world! Hello せかい! \
282                    Hello world! Hello せかい! Hello world! Hello せかい! \
283                    Hello world! Hello せかい! Hello world! Hello せかい! \
284                    Hello world! Hello せかい! Hello world! Hello せかい!";
285        assert_eq!(0, to_byte_idx(text, 0));
286        assert_eq!(30, to_byte_idx(text, 24));
287        assert_eq!(60, to_byte_idx(text, 48));
288        assert_eq!(90, to_byte_idx(text, 72));
289        assert_eq!(115, to_byte_idx(text, 93));
290        assert_eq!(120, to_byte_idx(text, 96));
291        assert_eq!(150, to_byte_idx(text, 120));
292        assert_eq!(180, to_byte_idx(text, 144));
293        assert_eq!(210, to_byte_idx(text, 168));
294        assert_eq!(239, to_byte_idx(text, 191));
295    }
296
297    #[test]
298    fn to_byte_idx_05() {
299        // Ascii range
300        for i in 0..88 {
301            assert_eq!(i, to_byte_idx(TEXT_LINES, i));
302        }
303
304        // Hiragana characters
305        for i in 88..100 {
306            assert_eq!(88 + ((i - 88) * 3), to_byte_idx(TEXT_LINES, i));
307        }
308
309        // Past the end
310        for i in 100..110 {
311            assert_eq!(124, to_byte_idx(TEXT_LINES, i));
312        }
313    }
314}