bstr/
io.rs

1/*!
2Utilities for working with I/O using byte strings.
3
4This module currently only exports a single trait, `BufReadExt`, which provides
5facilities for conveniently and efficiently working with lines as byte strings.
6
7More APIs may be added in the future.
8*/
9
10use alloc::{vec, vec::Vec};
11
12use std::io;
13
14use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
15
16/// An extension trait for
17/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
18/// which provides convenience APIs for dealing with byte strings.
19pub trait BufReadExt: io::BufRead {
20    /// Returns an iterator over the lines of this reader, where each line
21    /// is represented as a byte string.
22    ///
23    /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
24    /// an error is yielded if there was a problem reading from the underlying
25    /// reader.
26    ///
27    /// On success, the next line in the iterator is returned. The line does
28    /// *not* contain a trailing `\n` or `\r\n`.
29    ///
30    /// # Examples
31    ///
32    /// Basic usage:
33    ///
34    /// ```
35    /// use std::io;
36    ///
37    /// use bstr::io::BufReadExt;
38    ///
39    /// # fn example() -> Result<(), io::Error> {
40    /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
41    ///
42    /// let mut lines = vec![];
43    /// for result in cursor.byte_lines() {
44    ///     let line = result?;
45    ///     lines.push(line);
46    /// }
47    /// assert_eq!(lines.len(), 3);
48    /// assert_eq!(lines[0], "lorem".as_bytes());
49    /// assert_eq!(lines[1], "ipsum".as_bytes());
50    /// assert_eq!(lines[2], "dolor".as_bytes());
51    /// # Ok(()) }; example().unwrap()
52    /// ```
53    fn byte_lines(self) -> ByteLines<Self>
54    where
55        Self: Sized,
56    {
57        ByteLines { buf: self }
58    }
59
60    /// Returns an iterator over byte-terminated records of this reader, where
61    /// each record is represented as a byte string.
62    ///
63    /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
64    /// an error is yielded if there was a problem reading from the underlying
65    /// reader.
66    ///
67    /// On success, the next record in the iterator is returned. The record
68    /// does *not* contain its trailing terminator.
69    ///
70    /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
71    /// that it has no special handling for `\r`.
72    ///
73    /// # Examples
74    ///
75    /// Basic usage:
76    ///
77    /// ```
78    /// use std::io;
79    ///
80    /// use bstr::io::BufReadExt;
81    ///
82    /// # fn example() -> Result<(), io::Error> {
83    /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
84    ///
85    /// let mut records = vec![];
86    /// for result in cursor.byte_records(b'\x00') {
87    ///     let record = result?;
88    ///     records.push(record);
89    /// }
90    /// assert_eq!(records.len(), 3);
91    /// assert_eq!(records[0], "lorem".as_bytes());
92    /// assert_eq!(records[1], "ipsum".as_bytes());
93    /// assert_eq!(records[2], "dolor".as_bytes());
94    /// # Ok(()) }; example().unwrap()
95    /// ```
96    fn byte_records(self, terminator: u8) -> ByteRecords<Self>
97    where
98        Self: Sized,
99    {
100        ByteRecords { terminator, buf: self }
101    }
102
103    /// Executes the given closure on each line in the underlying reader.
104    ///
105    /// If the closure returns an error (or if the underlying reader returns an
106    /// error), then iteration is stopped and the error is returned. If false
107    /// is returned, then iteration is stopped and no error is returned.
108    ///
109    /// The closure given is called on exactly the same values as yielded by
110    /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
111    /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
112    ///
113    /// This routine is useful for iterating over lines as quickly as
114    /// possible. Namely, a single allocation is reused for each line.
115    ///
116    /// # Examples
117    ///
118    /// Basic usage:
119    ///
120    /// ```
121    /// use std::io;
122    ///
123    /// use bstr::io::BufReadExt;
124    ///
125    /// # fn example() -> Result<(), io::Error> {
126    /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
127    ///
128    /// let mut lines = vec![];
129    /// cursor.for_byte_line(|line| {
130    ///     lines.push(line.to_vec());
131    ///     Ok(true)
132    /// })?;
133    /// assert_eq!(lines.len(), 3);
134    /// assert_eq!(lines[0], "lorem".as_bytes());
135    /// assert_eq!(lines[1], "ipsum".as_bytes());
136    /// assert_eq!(lines[2], "dolor".as_bytes());
137    /// # Ok(()) }; example().unwrap()
138    /// ```
139    fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
140    where
141        Self: Sized,
142        F: FnMut(&[u8]) -> io::Result<bool>,
143    {
144        self.for_byte_line_with_terminator(|line| {
145            for_each_line(trim_line_slice(line))
146        })
147    }
148
149    /// Executes the given closure on each byte-terminated record in the
150    /// underlying reader.
151    ///
152    /// If the closure returns an error (or if the underlying reader returns an
153    /// error), then iteration is stopped and the error is returned. If false
154    /// is returned, then iteration is stopped and no error is returned.
155    ///
156    /// The closure given is called on exactly the same values as yielded by
157    /// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
158    /// iterator. Namely, records do _not_ contain a trailing terminator byte.
159    ///
160    /// This routine is useful for iterating over records as quickly as
161    /// possible. Namely, a single allocation is reused for each record.
162    ///
163    /// # Examples
164    ///
165    /// Basic usage:
166    ///
167    /// ```
168    /// use std::io;
169    ///
170    /// use bstr::io::BufReadExt;
171    ///
172    /// # fn example() -> Result<(), io::Error> {
173    /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
174    ///
175    /// let mut records = vec![];
176    /// cursor.for_byte_record(b'\x00', |record| {
177    ///     records.push(record.to_vec());
178    ///     Ok(true)
179    /// })?;
180    /// assert_eq!(records.len(), 3);
181    /// assert_eq!(records[0], "lorem".as_bytes());
182    /// assert_eq!(records[1], "ipsum".as_bytes());
183    /// assert_eq!(records[2], "dolor".as_bytes());
184    /// # Ok(()) }; example().unwrap()
185    /// ```
186    fn for_byte_record<F>(
187        &mut self,
188        terminator: u8,
189        mut for_each_record: F,
190    ) -> io::Result<()>
191    where
192        Self: Sized,
193        F: FnMut(&[u8]) -> io::Result<bool>,
194    {
195        self.for_byte_record_with_terminator(terminator, |chunk| {
196            for_each_record(trim_record_slice(chunk, terminator))
197        })
198    }
199
200    /// Executes the given closure on each line in the underlying reader.
201    ///
202    /// If the closure returns an error (or if the underlying reader returns an
203    /// error), then iteration is stopped and the error is returned. If false
204    /// is returned, then iteration is stopped and no error is returned.
205    ///
206    /// Unlike
207    /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
208    /// the lines given to the closure *do* include the line terminator, if one
209    /// exists.
210    ///
211    /// This routine is useful for iterating over lines as quickly as
212    /// possible. Namely, a single allocation is reused for each line.
213    ///
214    /// This is identical to `for_byte_record_with_terminator` with a
215    /// terminator of `\n`.
216    ///
217    /// # Examples
218    ///
219    /// Basic usage:
220    ///
221    /// ```
222    /// use std::io;
223    ///
224    /// use bstr::io::BufReadExt;
225    ///
226    /// # fn example() -> Result<(), io::Error> {
227    /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
228    ///
229    /// let mut lines = vec![];
230    /// cursor.for_byte_line_with_terminator(|line| {
231    ///     lines.push(line.to_vec());
232    ///     Ok(true)
233    /// })?;
234    /// assert_eq!(lines.len(), 3);
235    /// assert_eq!(lines[0], "lorem\n".as_bytes());
236    /// assert_eq!(lines[1], "ipsum\r\n".as_bytes());
237    /// assert_eq!(lines[2], "dolor".as_bytes());
238    /// # Ok(()) }; example().unwrap()
239    /// ```
240    fn for_byte_line_with_terminator<F>(
241        &mut self,
242        for_each_line: F,
243    ) -> io::Result<()>
244    where
245        Self: Sized,
246        F: FnMut(&[u8]) -> io::Result<bool>,
247    {
248        self.for_byte_record_with_terminator(b'\n', for_each_line)
249    }
250
251    /// Executes the given closure on each byte-terminated record in the
252    /// underlying reader.
253    ///
254    /// If the closure returns an error (or if the underlying reader returns an
255    /// error), then iteration is stopped and the error is returned. If false
256    /// is returned, then iteration is stopped and no error is returned.
257    ///
258    /// Unlike
259    /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
260    /// the lines given to the closure *do* include the record terminator, if
261    /// one exists.
262    ///
263    /// This routine is useful for iterating over records as quickly as
264    /// possible. Namely, a single allocation is reused for each record.
265    ///
266    /// # Examples
267    ///
268    /// Basic usage:
269    ///
270    /// ```
271    /// use std::io;
272    ///
273    /// use bstr::{io::BufReadExt, B};
274    ///
275    /// # fn example() -> Result<(), io::Error> {
276    /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
277    ///
278    /// let mut records = vec![];
279    /// cursor.for_byte_record_with_terminator(b'\x00', |record| {
280    ///     records.push(record.to_vec());
281    ///     Ok(true)
282    /// })?;
283    /// assert_eq!(records.len(), 3);
284    /// assert_eq!(records[0], B(b"lorem\x00"));
285    /// assert_eq!(records[1], B("ipsum\x00"));
286    /// assert_eq!(records[2], B("dolor"));
287    /// # Ok(()) }; example().unwrap()
288    /// ```
289    fn for_byte_record_with_terminator<F>(
290        &mut self,
291        terminator: u8,
292        mut for_each_record: F,
293    ) -> io::Result<()>
294    where
295        Self: Sized,
296        F: FnMut(&[u8]) -> io::Result<bool>,
297    {
298        let mut bytes = vec![];
299        let mut res = Ok(());
300        let mut consumed = 0;
301        'outer: loop {
302            // Lend out complete record slices from our buffer
303            {
304                let mut buf = self.fill_buf()?;
305                if buf.is_empty() {
306                    break;
307                }
308                while let Some(index) = buf.find_byte(terminator) {
309                    let (record, rest) = buf.split_at(index + 1);
310                    buf = rest;
311                    consumed += record.len();
312                    match for_each_record(record) {
313                        Ok(false) => break 'outer,
314                        Err(err) => {
315                            res = Err(err);
316                            break 'outer;
317                        }
318                        _ => (),
319                    }
320                }
321
322                // Copy the final record fragment to our local buffer. This
323                // saves read_until() from re-scanning a buffer we know
324                // contains no remaining terminators.
325                bytes.extend_from_slice(buf);
326                consumed += buf.len();
327            }
328
329            self.consume(consumed);
330            consumed = 0;
331
332            // N.B. read_until uses a different version of memchr that may
333            // be slower than the memchr crate that bstr uses. However, this
334            // should only run for a fairly small number of records, assuming a
335            // decent buffer size.
336            self.read_until(terminator, &mut bytes)?;
337            if bytes.is_empty() || !for_each_record(&bytes)? {
338                break;
339            }
340            bytes.clear();
341        }
342        self.consume(consumed);
343        res
344    }
345}
346
// Blanket implementation: every type implementing `io::BufRead` gets the
// `BufReadExt` methods through the trait's default method bodies.
impl<B: io::BufRead> BufReadExt for B {}
348
/// An iterator over lines from an instance of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// Each item is an `io::Result<Vec<u8>>` holding one line with its trailing
/// `\n` or `\r\n` removed.
///
/// This iterator is generally created by calling the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method on the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteLines<B> {
    // The reader that lines are pulled from, one per call to `next`.
    buf: B,
}
361
/// An iterator over records from an instance of
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html).
///
/// A byte record is any sequence of bytes terminated by a particular byte
/// chosen by the caller. For example, NUL separated byte strings are said to
/// be NUL-terminated byte records.
///
/// Each item is an `io::Result<Vec<u8>>` holding one record with its trailing
/// terminator byte removed.
///
/// This iterator is generally created by calling the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method on the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteRecords<B> {
    // The reader that records are pulled from, one per call to `next`.
    buf: B,
    // The byte that marks the end of a record.
    terminator: u8,
}
379
380impl<B: io::BufRead> Iterator for ByteLines<B> {
381    type Item = io::Result<Vec<u8>>;
382
383    fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
384        let mut bytes = vec![];
385        match self.buf.read_until(b'\n', &mut bytes) {
386            Err(e) => Some(Err(e)),
387            Ok(0) => None,
388            Ok(_) => {
389                trim_line(&mut bytes);
390                Some(Ok(bytes))
391            }
392        }
393    }
394}
395
396impl<B: io::BufRead> Iterator for ByteRecords<B> {
397    type Item = io::Result<Vec<u8>>;
398
399    fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
400        let mut bytes = vec![];
401        match self.buf.read_until(self.terminator, &mut bytes) {
402            Err(e) => Some(Err(e)),
403            Ok(0) => None,
404            Ok(_) => {
405                trim_record(&mut bytes, self.terminator);
406                Some(Ok(bytes))
407            }
408        }
409    }
410}
411
412fn trim_line(line: &mut Vec<u8>) {
413    if line.last_byte() == Some(b'\n') {
414        line.pop_byte();
415        if line.last_byte() == Some(b'\r') {
416            line.pop_byte();
417        }
418    }
419}
420
421fn trim_line_slice(mut line: &[u8]) -> &[u8] {
422    if line.last_byte() == Some(b'\n') {
423        line = &line[..line.len() - 1];
424        if line.last_byte() == Some(b'\r') {
425            line = &line[..line.len() - 1];
426        }
427    }
428    line
429}
430
431fn trim_record(record: &mut Vec<u8>, terminator: u8) {
432    if record.last_byte() == Some(terminator) {
433        record.pop_byte();
434    }
435}
436
437fn trim_record_slice(mut record: &[u8], terminator: u8) -> &[u8] {
438    if record.last_byte() == Some(terminator) {
439        record = &record[..record.len() - 1];
440    }
441    record
442}
443
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::vec::Vec;

    use crate::bstring::BString;

    use super::BufReadExt;

    /// Gathers every line of `slice` as reported by `for_byte_line`, i.e.
    /// with line terminators stripped.
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut reader: &[u8] = slice.as_ref();
        let mut collected = Vec::new();
        reader
            .for_byte_line(|line| {
                collected.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        collected
    }

    /// Gathers every line of `slice` as reported by
    /// `for_byte_line_with_terminator`, i.e. with line terminators kept.
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut reader: &[u8] = slice.as_ref();
        let mut collected = Vec::new();
        reader
            .for_byte_line_with_terminator(|line| {
                collected.push(BString::from(line.to_vec()));
                Ok(true)
            })
            .unwrap();
        collected
    }

    #[test]
    fn lines_without_terminator() {
        // (input, expected lines) pairs.
        let cases: &[(&str, &[&str])] = &[
            ("", &[]),
            // `\n` terminated input.
            ("\n", &[""]),
            ("\n\n", &["", ""]),
            ("a\nb\n", &["a", "b"]),
            ("a\nb", &["a", "b"]),
            ("abc\nxyz\n", &["abc", "xyz"]),
            ("abc\nxyz", &["abc", "xyz"]),
            // `\r\n` terminated input.
            ("\r\n", &[""]),
            ("\r\n\r\n", &["", ""]),
            ("a\r\nb\r\n", &["a", "b"]),
            ("a\r\nb", &["a", "b"]),
            ("abc\r\nxyz\r\n", &["abc", "xyz"]),
            ("abc\r\nxyz", &["abc", "xyz"]),
            // A lone `\r` is not a line terminator.
            ("abc\rxyz", &["abc\rxyz"]),
        ];
        for &(input, expected) in cases {
            assert_eq!(
                collect_lines(input),
                expected.to_vec(),
                "input: {:?}",
                input
            );
        }
    }

    #[test]
    fn lines_with_terminator() {
        // (input, expected lines) pairs.
        let cases: &[(&str, &[&str])] = &[
            ("", &[]),
            // `\n` terminated input.
            ("\n", &["\n"]),
            ("\n\n", &["\n", "\n"]),
            ("a\nb\n", &["a\n", "b\n"]),
            ("a\nb", &["a\n", "b"]),
            ("abc\nxyz\n", &["abc\n", "xyz\n"]),
            ("abc\nxyz", &["abc\n", "xyz"]),
            // `\r\n` terminated input.
            ("\r\n", &["\r\n"]),
            ("\r\n\r\n", &["\r\n", "\r\n"]),
            ("a\r\nb\r\n", &["a\r\n", "b\r\n"]),
            ("a\r\nb", &["a\r\n", "b"]),
            ("abc\r\nxyz\r\n", &["abc\r\n", "xyz\r\n"]),
            ("abc\r\nxyz", &["abc\r\n", "xyz"]),
            // A lone `\r` is not a line terminator.
            ("abc\rxyz", &["abc\rxyz"]),
        ];
        for &(input, expected) in cases {
            assert_eq!(
                collect_lines_term(input),
                expected.to_vec(),
                "input: {:?}",
                input
            );
        }
    }
}
bstr/io.rs

bstr/
io.rs