str_indices/
lines_lf.rs

1//! Index by lines (line feed only).
2//!
3//! This module recognizes the following as line breaks:
4//!
5//! - `U+000A`          — LF (Line Feed)
6//! - `U+000D` `U+000A` — CRLF (Carriage Return + Line Feed)
7//!   — by coincidence due to ignoring CR.
8
9use crate::byte_chunk::{ByteChunk, Chunk};
10
11/// Counts the line breaks in a string slice.
12///
13/// Runs in O(N) time.
14#[inline]
15pub fn count_breaks(text: &str) -> usize {
16    count_breaks_impl::<Chunk>(text.as_bytes())
17}
18
19/// Converts from byte-index to line-index in a string slice.
20///
21/// Line break characters are considered to be a part of the line they
22/// end.  And a string that ends with a line break is considered to have
23/// a final empty line.  So this function is equivalent to counting the
24/// line breaks before the specified byte.
25///
26/// Any past-the-end index will return the last line index.
27///
28/// Runs in O(N) time.
29#[inline]
30pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize {
31    let i = byte_idx.min(text.len());
32    count_breaks_impl::<Chunk>(&text.as_bytes()[..i])
33}
34
35/// Converts from line-index to byte-index in a string slice.
36///
37/// Returns the byte index of the start of the specified line.  Line 0 is
38/// the start of the string, and subsequent lines start immediately
39/// *after* each line break character.
40///
41/// Any past-the-end index will return the one-past-the-end byte index.
42///
43/// Runs in O(N) time.
44#[inline]
45pub fn to_byte_idx(text: &str, line_idx: usize) -> usize {
46    to_byte_idx_impl::<Chunk>(text.as_bytes(), line_idx)
47}
48
49//-------------------------------------------------------------
50
51#[inline(always)]
52fn to_byte_idx_impl<T: ByteChunk>(text: &[u8], line_idx: usize) -> usize {
53    // Get `middle` so we can do more efficient chunk-based counting.
54    // We can't use this to get `end`, however, because the start index of
55    // `end` actually depends on the accumulating line counts during the
56    // counting process.
57    let (start, middle, _) = unsafe { text.align_to::<T>() };
58
59    let mut byte_count = 0;
60    let mut lf_count = 0;
61
62    // Take care of any unaligned bytes at the beginning.
63    for byte in start.iter() {
64        if lf_count == line_idx {
65            break;
66        }
67        lf_count += (*byte == 0x0A) as usize;
68        byte_count += 1;
69    }
70
71    // Process chunks in the fast path.
72    let mut chunks = middle;
73    let mut max_round_len = (line_idx - lf_count) / T::MAX_ACC;
74    while max_round_len > 0 && !chunks.is_empty() {
75        // Choose the largest number of chunks we can do this round
76        // that will neither overflow `max_acc` nor blast past the
77        // remaining line breaks we're looking for.
78        let round_len = T::MAX_ACC.min(max_round_len).min(chunks.len());
79        max_round_len -= round_len;
80        let round = &chunks[..round_len];
81        chunks = &chunks[round_len..];
82
83        // Process the chunks in this round.
84        let mut acc = T::zero();
85        for chunk in round.iter() {
86            acc = acc.add(chunk.cmp_eq_byte(0x0A));
87        }
88        lf_count += acc.sum_bytes();
89        byte_count += T::SIZE * round_len;
90    }
91
92    // Process chunks in the slow path.
93    for chunk in chunks.iter() {
94        let new_lf_count = lf_count + chunk.cmp_eq_byte(0x0A).sum_bytes();
95        if new_lf_count >= line_idx {
96            break;
97        }
98        lf_count = new_lf_count;
99        byte_count += T::SIZE;
100    }
101
102    // Take care of any unaligned bytes at the end.
103    let end = &text[byte_count..];
104    for byte in end.iter() {
105        if lf_count == line_idx {
106            break;
107        }
108        lf_count += (*byte == 0x0A) as usize;
109        byte_count += 1;
110    }
111
112    // Finish up
113    byte_count
114}
115
116/// Counts the line breaks in a utf8 encoded string.
117///
118/// The following unicode sequences are considered newlines by this function:
119/// - u{000A}        (Line Feed)
120#[inline(always)]
121fn count_breaks_impl<T: ByteChunk>(text: &[u8]) -> usize {
122    if text.len() < T::SIZE {
123        // Bypass the more complex routine for short strings, where the
124        // complexity hurts performance.
125        text.iter().map(|byte| (*byte == 0x0A) as usize).sum()
126    } else {
127        // Get `middle` so we can do more efficient chunk-based counting.
128        let (start, middle, end) = unsafe { text.align_to::<T>() };
129
130        let mut count = 0;
131
132        // Take care of unaligned bytes at the beginning.
133        for byte in start.iter() {
134            count += (*byte == 0x0A) as usize;
135        }
136
137        // Take care of the middle bytes in big chunks.
138        for chunks in middle.chunks(T::MAX_ACC) {
139            let mut acc = T::zero();
140            for chunk in chunks.iter() {
141                acc = acc.add(chunk.cmp_eq_byte(0x0A));
142            }
143            count += acc.sum_bytes();
144        }
145
146        // Take care of unaligned bytes at the end.
147        for byte in end.iter() {
148            count += (*byte == 0x0A) as usize;
149        }
150
151        count
152    }
153}
154
155//=============================================================
156
157#[cfg(test)]
158mod tests {
159    use super::*;
160
161    // 124 bytes, 100 chars, 4 lines
162    const TEXT_LINES: &str = "Hello there!  How're you doing?\nIt's \
163                              a fine day, isn't it?\nAren't you glad \
164                              we're alive?\nこんにちは、みんなさん!";
165
166    #[test]
167    fn count_breaks_01() {
168        let text = "\nHello\u{000D}\nせ\u{000B}か\u{000C}い\u{0085}. \
169                    There\nis something.\u{2029}";
170        assert_eq!(45, text.len());
171        assert_eq!(3, count_breaks(text));
172    }
173
174    #[test]
175    fn from_byte_idx_01() {
176        let text = "Here\nare\nsome\nwords";
177        assert_eq!(0, from_byte_idx(text, 0));
178        assert_eq!(0, from_byte_idx(text, 4));
179        assert_eq!(1, from_byte_idx(text, 5));
180        assert_eq!(1, from_byte_idx(text, 8));
181        assert_eq!(2, from_byte_idx(text, 9));
182        assert_eq!(2, from_byte_idx(text, 13));
183        assert_eq!(3, from_byte_idx(text, 14));
184        assert_eq!(3, from_byte_idx(text, 19));
185    }
186
187    #[test]
188    fn from_byte_idx_02() {
189        let text = "\nHere\nare\nsome\nwords\n";
190        assert_eq!(0, from_byte_idx(text, 0));
191        assert_eq!(1, from_byte_idx(text, 1));
192        assert_eq!(1, from_byte_idx(text, 5));
193        assert_eq!(2, from_byte_idx(text, 6));
194        assert_eq!(2, from_byte_idx(text, 9));
195        assert_eq!(3, from_byte_idx(text, 10));
196        assert_eq!(3, from_byte_idx(text, 14));
197        assert_eq!(4, from_byte_idx(text, 15));
198        assert_eq!(4, from_byte_idx(text, 20));
199        assert_eq!(5, from_byte_idx(text, 21));
200    }
201
202    #[test]
203    fn from_byte_idx_03() {
204        let text = "Here\r\nare\r\nsome\r\nwords";
205        assert_eq!(0, from_byte_idx(text, 0));
206        assert_eq!(0, from_byte_idx(text, 4));
207        assert_eq!(0, from_byte_idx(text, 5));
208        assert_eq!(1, from_byte_idx(text, 6));
209        assert_eq!(1, from_byte_idx(text, 9));
210        assert_eq!(1, from_byte_idx(text, 10));
211        assert_eq!(2, from_byte_idx(text, 11));
212        assert_eq!(2, from_byte_idx(text, 15));
213        assert_eq!(2, from_byte_idx(text, 16));
214        assert_eq!(3, from_byte_idx(text, 17));
215    }
216
217    #[test]
218    fn from_byte_idx_04() {
219        // Line 0
220        for i in 0..32 {
221            assert_eq!(0, from_byte_idx(TEXT_LINES, i));
222        }
223
224        // Line 1
225        for i in 32..59 {
226            assert_eq!(1, from_byte_idx(TEXT_LINES, i));
227        }
228
229        // Line 2
230        for i in 59..88 {
231            assert_eq!(2, from_byte_idx(TEXT_LINES, i));
232        }
233
234        // Line 3
235        for i in 88..125 {
236            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
237        }
238
239        // Past the end
240        for i in 125..130 {
241            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
242        }
243    }
244
245    #[test]
246    fn to_byte_idx_01() {
247        let text = "Here\r\nare\r\nsome\r\nwords";
248        assert_eq!(0, to_byte_idx(text, 0));
249        assert_eq!(6, to_byte_idx(text, 1));
250        assert_eq!(11, to_byte_idx(text, 2));
251        assert_eq!(17, to_byte_idx(text, 3));
252    }
253
254    #[test]
255    fn to_byte_idx_02() {
256        let text = "\nHere\nare\nsome\nwords\n";
257        assert_eq!(0, to_byte_idx(text, 0));
258        assert_eq!(1, to_byte_idx(text, 1));
259        assert_eq!(6, to_byte_idx(text, 2));
260        assert_eq!(10, to_byte_idx(text, 3));
261        assert_eq!(15, to_byte_idx(text, 4));
262        assert_eq!(21, to_byte_idx(text, 5));
263    }
264
265    #[test]
266    fn to_byte_idx_03() {
267        assert_eq!(0, to_byte_idx(TEXT_LINES, 0));
268        assert_eq!(32, to_byte_idx(TEXT_LINES, 1));
269        assert_eq!(59, to_byte_idx(TEXT_LINES, 2));
270        assert_eq!(88, to_byte_idx(TEXT_LINES, 3));
271
272        // Past end
273        assert_eq!(124, to_byte_idx(TEXT_LINES, 4));
274        assert_eq!(124, to_byte_idx(TEXT_LINES, 5));
275        assert_eq!(124, to_byte_idx(TEXT_LINES, 6));
276    }
277
278    #[test]
279    fn line_byte_round_trip() {
280        let text = "\nHere\nare\nsome\nwords\n";
281        assert_eq!(6, to_byte_idx(text, from_byte_idx(text, 6)));
282        assert_eq!(2, from_byte_idx(text, to_byte_idx(text, 2)));
283
284        assert_eq!(0, to_byte_idx(text, from_byte_idx(text, 0)));
285        assert_eq!(0, from_byte_idx(text, to_byte_idx(text, 0)));
286
287        assert_eq!(21, to_byte_idx(text, from_byte_idx(text, 21)));
288        assert_eq!(5, from_byte_idx(text, to_byte_idx(text, 5)));
289    }
290}