str_indices/
lines_crlf.rs

1//! Index by lines (carriage return and line feed).
2//!
3//! This module recognizes the following as line breaks:
4//!
5//! - `U+000A`          &mdash; LF (Line Feed)
6//! - `U+000D`          &mdash; CR (Carriage Return)
7//! - `U+000D` `U+000A` &mdash; CRLF (Carriage Return + Line Feed)
8//!
9//! (Note: if you only want to recognize LF and CRLF, without
10//! recognizing CR individually, see the [`lines_lf`](crate::lines_lf) module.)
11
12use crate::byte_chunk::{ByteChunk, Chunk};
13
14/// Counts the line breaks in a string slice.
15///
16/// Runs in O(N) time.
17#[inline]
18pub fn count_breaks(text: &str) -> usize {
19    count_breaks_impl::<Chunk>(text.as_bytes())
20}
21
22/// Converts from byte-index to line-index in a string slice.
23///
24/// Line break characters are considered to be a part of the line they
25/// end.  And a string that ends with a line break is considered to have
26/// a final empty line.  So this function is equivalent to counting the
27/// line breaks before the specified byte.
28///
29/// Any past-the-end index will return the last line index.
30///
31/// Runs in O(N) time.
32#[inline]
33pub fn from_byte_idx(text: &str, byte_idx: usize) -> usize {
34    let i = byte_idx.min(text.len());
35    let nl_count = count_breaks_impl::<Chunk>(&text.as_bytes()[..i]);
36    if crate::is_not_crlf_middle(i, text.as_bytes()) {
37        nl_count
38    } else {
39        nl_count - 1
40    }
41}
42
43/// Converts from line-index to byte-index in a string slice.
44///
45/// Returns the byte index of the start of the specified line.  Line 0 is
46/// the start of the string, and subsequent lines start immediately
47/// *after* each line break character.
48///
49/// Any past-the-end index will return the one-past-the-end byte index.
50///
51/// Runs in O(N) time.
52#[inline]
53pub fn to_byte_idx(text: &str, line_idx: usize) -> usize {
54    to_byte_idx_impl::<Chunk>(text.as_bytes(), line_idx)
55}
56
57//-------------------------------------------------------------
58
59#[inline(always)]
60fn to_byte_idx_impl<T: ByteChunk>(text: &[u8], line_idx: usize) -> usize {
61    // Get `middle` so we can do more efficient chunk-based counting.
62    // We can't use this to get `end`, however, because the start index of
63    // `end` actually depends on the accumulating line counts during the
64    // counting process.
65    let (start, middle, _) = unsafe { text.align_to::<T>() };
66
67    let mut byte_count = 0;
68    let mut break_count = 0;
69
70    // Take care of any unaligned bytes at the beginning.
71    for byte in start.iter() {
72        if break_count == line_idx {
73            break;
74        }
75        break_count +=
76            (*byte == 0x0A || (*byte == 0x0D && text.get(byte_count + 1) != Some(&0x0A))) as usize;
77        byte_count += 1;
78    }
79
80    // Process chunks in the fast path.
81    let mut chunks = middle;
82    let mut max_round_len = (line_idx - break_count) / T::MAX_ACC;
83    while max_round_len > 0 && !chunks.is_empty() {
84        // Choose the largest number of chunks we can do this round
85        // that will neither overflow `max_acc` nor blast past the
86        // remaining line breaks we're looking for.
87        let round_len = T::MAX_ACC.min(max_round_len).min(chunks.len());
88        max_round_len -= round_len;
89        let round = &chunks[..round_len];
90        chunks = &chunks[round_len..];
91
92        // Process the chunks in this round.
93        let mut acc = T::zero();
94        for chunk in round.iter() {
95            let lf_flags = chunk.cmp_eq_byte(0x0A);
96            let cr_flags = chunk.cmp_eq_byte(0x0D);
97            let crlf_flags = cr_flags.bitand(lf_flags.shift_back_lex(1));
98            acc = acc.add(lf_flags).add(cr_flags.sub(crlf_flags));
99        }
100        break_count += acc.sum_bytes();
101
102        // Handle CRLFs at chunk boundaries in this round.
103        let mut i = byte_count;
104        while i < (byte_count + T::SIZE * round_len) {
105            i += T::SIZE;
106            break_count -= (text[i - 1] == 0x0D && text.get(i) == Some(&0x0A)) as usize;
107        }
108
109        byte_count += T::SIZE * round_len;
110    }
111
112    // Process chunks in the slow path.
113    for chunk in chunks.iter() {
114        let breaks = {
115            let lf_flags = chunk.cmp_eq_byte(0x0A);
116            let cr_flags = chunk.cmp_eq_byte(0x0D);
117            let crlf_flags = cr_flags.bitand(lf_flags.shift_back_lex(1));
118            lf_flags.add(cr_flags.sub(crlf_flags)).sum_bytes()
119        };
120        let boundary_crlf = {
121            let i = byte_count + T::SIZE;
122            (text[i - 1] == 0x0D && text.get(i) == Some(&0x0A)) as usize
123        };
124        let new_break_count = break_count + breaks - boundary_crlf;
125        if new_break_count >= line_idx {
126            break;
127        }
128        break_count = new_break_count;
129        byte_count += T::SIZE;
130    }
131
132    // Take care of any unaligned bytes at the end.
133    let end = &text[byte_count..];
134    for byte in end.iter() {
135        if break_count == line_idx {
136            break;
137        }
138        break_count +=
139            (*byte == 0x0A || (*byte == 0x0D && text.get(byte_count + 1) != Some(&0x0A))) as usize;
140        byte_count += 1;
141    }
142
143    // Finish up
144    byte_count
145}
146
147/// Counts the line breaks in a utf8 encoded string.
148///
149/// The following unicode sequences are considered newlines by this function:
150/// - u{000A}        (Line Feed)
151#[inline(always)]
152fn count_breaks_impl<T: ByteChunk>(text: &[u8]) -> usize {
153    // Get `middle` so we can do more efficient chunk-based counting.
154    let (start, middle, end) = unsafe { text.align_to::<T>() };
155
156    let mut count = 0;
157
158    // Take care of unaligned bytes at the beginning.
159    let mut last_was_cr = false;
160    for byte in start.iter().copied() {
161        let is_lf = byte == 0x0A;
162        let is_cr = byte == 0x0D;
163        count += (is_cr | (is_lf & !last_was_cr)) as usize;
164        last_was_cr = is_cr;
165    }
166
167    // Take care of the middle bytes in big chunks.
168    for chunks in middle.chunks(T::MAX_ACC) {
169        let mut acc = T::zero();
170        for chunk in chunks.iter() {
171            let lf_flags = chunk.cmp_eq_byte(0x0A);
172            let cr_flags = chunk.cmp_eq_byte(0x0D);
173            let crlf_flags = cr_flags.bitand(lf_flags.shift_back_lex(1));
174            acc = acc.add(lf_flags).add(cr_flags.sub(crlf_flags));
175        }
176        count += acc.sum_bytes();
177    }
178
179    // Check chunk boundaries for CRLF.
180    let mut i = start.len();
181    while i < (text.len() - end.len()) {
182        if text[i] == 0x0A {
183            count -= (text.get(i.saturating_sub(1)) == Some(&0x0D)) as usize;
184        }
185        i += T::SIZE;
186    }
187
188    // Take care of unaligned bytes at the end.
189    let mut last_was_cr = text.get((text.len() - end.len()).saturating_sub(1)) == Some(&0x0D);
190    for byte in end.iter().copied() {
191        let is_lf = byte == 0x0A;
192        let is_cr = byte == 0x0D;
193        count += (is_cr | (is_lf & !last_was_cr)) as usize;
194        last_was_cr = is_cr;
195    }
196
197    count
198}
199
200//=============================================================
201
202#[cfg(test)]
203mod tests {
204    use super::*;
205
206    // 124 bytes, 100 chars, 4 lines
207    const TEXT_LINES: &str = "Hello there!  How're you doing?\nIt's \
208                              a fine day, isn't it?\nAren't you glad \
209                              we're alive?\nこんにちは、みんなさん！";
210
211    #[test]
212    fn count_breaks_01() {
213        let text = "\u{000A}Hello\u{000D}\u{000A}せ\u{000B}か\u{000C}い\u{0085}. \
214                    There\u{000A}is something.\u{2029}";
215        assert_eq!(45, text.len());
216        assert_eq!(3, count_breaks(text));
217    }
218
219    #[test]
220    fn from_byte_idx_01() {
221        let text = "Here\nare\nsome\nwords";
222        assert_eq!(0, from_byte_idx(text, 0));
223        assert_eq!(0, from_byte_idx(text, 4));
224        assert_eq!(1, from_byte_idx(text, 5));
225        assert_eq!(1, from_byte_idx(text, 8));
226        assert_eq!(2, from_byte_idx(text, 9));
227        assert_eq!(2, from_byte_idx(text, 13));
228        assert_eq!(3, from_byte_idx(text, 14));
229        assert_eq!(3, from_byte_idx(text, 19));
230    }
231
232    #[test]
233    fn from_byte_idx_02() {
234        let text = "\nHere\nare\nsome\nwords\n";
235        assert_eq!(0, from_byte_idx(text, 0));
236        assert_eq!(1, from_byte_idx(text, 1));
237        assert_eq!(1, from_byte_idx(text, 5));
238        assert_eq!(2, from_byte_idx(text, 6));
239        assert_eq!(2, from_byte_idx(text, 9));
240        assert_eq!(3, from_byte_idx(text, 10));
241        assert_eq!(3, from_byte_idx(text, 14));
242        assert_eq!(4, from_byte_idx(text, 15));
243        assert_eq!(4, from_byte_idx(text, 20));
244        assert_eq!(5, from_byte_idx(text, 21));
245    }
246
247    #[test]
248    fn from_byte_idx_03() {
249        let text = "Here\r\nare\r\nsome\r\nwords";
250        assert_eq!(0, from_byte_idx(text, 0));
251        assert_eq!(0, from_byte_idx(text, 4));
252        assert_eq!(0, from_byte_idx(text, 5));
253        assert_eq!(1, from_byte_idx(text, 6));
254        assert_eq!(1, from_byte_idx(text, 9));
255        assert_eq!(1, from_byte_idx(text, 10));
256        assert_eq!(2, from_byte_idx(text, 11));
257        assert_eq!(2, from_byte_idx(text, 15));
258        assert_eq!(2, from_byte_idx(text, 16));
259        assert_eq!(3, from_byte_idx(text, 17));
260    }
261
262    #[test]
263    fn from_byte_idx_04() {
264        // Line 0
265        for i in 0..32 {
266            assert_eq!(0, from_byte_idx(TEXT_LINES, i));
267        }
268
269        // Line 1
270        for i in 32..59 {
271            assert_eq!(1, from_byte_idx(TEXT_LINES, i));
272        }
273
274        // Line 2
275        for i in 59..88 {
276            assert_eq!(2, from_byte_idx(TEXT_LINES, i));
277        }
278
279        // Line 3
280        for i in 88..125 {
281            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
282        }
283
284        // Past the end
285        for i in 125..130 {
286            assert_eq!(3, from_byte_idx(TEXT_LINES, i));
287        }
288    }
289
290    #[test]
291    fn to_byte_idx_01() {
292        let text = "Here\r\nare\r\nsome\r\nwords";
293        assert_eq!(0, to_byte_idx(text, 0));
294        assert_eq!(6, to_byte_idx(text, 1));
295        assert_eq!(11, to_byte_idx(text, 2));
296        assert_eq!(17, to_byte_idx(text, 3));
297    }
298
299    #[test]
300    fn to_byte_idx_02() {
301        let text = "\nHere\nare\nsome\nwords\n";
302        assert_eq!(0, to_byte_idx(text, 0));
303        assert_eq!(1, to_byte_idx(text, 1));
304        assert_eq!(6, to_byte_idx(text, 2));
305        assert_eq!(10, to_byte_idx(text, 3));
306        assert_eq!(15, to_byte_idx(text, 4));
307        assert_eq!(21, to_byte_idx(text, 5));
308    }
309
310    #[test]
311    fn to_byte_idx_03() {
312        assert_eq!(0, to_byte_idx(TEXT_LINES, 0));
313        assert_eq!(32, to_byte_idx(TEXT_LINES, 1));
314        assert_eq!(59, to_byte_idx(TEXT_LINES, 2));
315        assert_eq!(88, to_byte_idx(TEXT_LINES, 3));
316
317        // Past end
318        assert_eq!(124, to_byte_idx(TEXT_LINES, 4));
319        assert_eq!(124, to_byte_idx(TEXT_LINES, 5));
320        assert_eq!(124, to_byte_idx(TEXT_LINES, 6));
321    }
322
323    #[test]
324    fn line_byte_round_trip() {
325        let text = "\nHere\nare\nsome\nwords\n";
326        assert_eq!(6, to_byte_idx(text, from_byte_idx(text, 6)));
327        assert_eq!(2, from_byte_idx(text, to_byte_idx(text, 2)));
328
329        assert_eq!(0, to_byte_idx(text, from_byte_idx(text, 0)));
330        assert_eq!(0, from_byte_idx(text, to_byte_idx(text, 0)));
331
332        assert_eq!(21, to_byte_idx(text, from_byte_idx(text, 21)));
333        assert_eq!(5, from_byte_idx(text, to_byte_idx(text, 5)));
334    }
335}
str_indices/lines_crlf.rs

str_indices/
lines_crlf.rs