bstr/io.rs
/*!
Utilities for working with I/O using byte strings.

This module currently only exports a single trait, `BufReadExt`, which provides
facilities for conveniently and efficiently working with lines (and other
byte-terminated records) as byte strings.

More APIs may be added in the future.
*/
9
10use alloc::{vec, vec::Vec};
11
12use std::io;
13
14use crate::{ext_slice::ByteSlice, ext_vec::ByteVec};
15
16/// An extension trait for
17/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
18/// which provides convenience APIs for dealing with byte strings.
19pub trait BufReadExt: io::BufRead {
20 /// Returns an iterator over the lines of this reader, where each line
21 /// is represented as a byte string.
22 ///
23 /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
24 /// an error is yielded if there was a problem reading from the underlying
25 /// reader.
26 ///
27 /// On success, the next line in the iterator is returned. The line does
28 /// *not* contain a trailing `\n` or `\r\n`.
29 ///
30 /// # Examples
31 ///
32 /// Basic usage:
33 ///
34 /// ```
35 /// use std::io;
36 ///
37 /// use bstr::io::BufReadExt;
38 ///
39 /// # fn example() -> Result<(), io::Error> {
40 /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
41 ///
42 /// let mut lines = vec![];
43 /// for result in cursor.byte_lines() {
44 /// let line = result?;
45 /// lines.push(line);
46 /// }
47 /// assert_eq!(lines.len(), 3);
48 /// assert_eq!(lines[0], "lorem".as_bytes());
49 /// assert_eq!(lines[1], "ipsum".as_bytes());
50 /// assert_eq!(lines[2], "dolor".as_bytes());
51 /// # Ok(()) }; example().unwrap()
52 /// ```
53 fn byte_lines(self) -> ByteLines<Self>
54 where
55 Self: Sized,
56 {
57 ByteLines { buf: self }
58 }
59
60 /// Returns an iterator over byte-terminated records of this reader, where
61 /// each record is represented as a byte string.
62 ///
63 /// Each item yielded by this iterator is a `io::Result<Vec<u8>>`, where
64 /// an error is yielded if there was a problem reading from the underlying
65 /// reader.
66 ///
67 /// On success, the next record in the iterator is returned. The record
68 /// does *not* contain its trailing terminator.
69 ///
70 /// Note that calling `byte_records(b'\n')` differs from `byte_lines()` in
71 /// that it has no special handling for `\r`.
72 ///
73 /// # Examples
74 ///
75 /// Basic usage:
76 ///
77 /// ```
78 /// use std::io;
79 ///
80 /// use bstr::io::BufReadExt;
81 ///
82 /// # fn example() -> Result<(), io::Error> {
83 /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
84 ///
85 /// let mut records = vec![];
86 /// for result in cursor.byte_records(b'\x00') {
87 /// let record = result?;
88 /// records.push(record);
89 /// }
90 /// assert_eq!(records.len(), 3);
91 /// assert_eq!(records[0], "lorem".as_bytes());
92 /// assert_eq!(records[1], "ipsum".as_bytes());
93 /// assert_eq!(records[2], "dolor".as_bytes());
94 /// # Ok(()) }; example().unwrap()
95 /// ```
96 fn byte_records(self, terminator: u8) -> ByteRecords<Self>
97 where
98 Self: Sized,
99 {
100 ByteRecords { terminator, buf: self }
101 }
102
103 /// Executes the given closure on each line in the underlying reader.
104 ///
105 /// If the closure returns an error (or if the underlying reader returns an
106 /// error), then iteration is stopped and the error is returned. If false
107 /// is returned, then iteration is stopped and no error is returned.
108 ///
109 /// The closure given is called on exactly the same values as yielded by
110 /// the [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
111 /// iterator. Namely, lines do _not_ contain trailing `\n` or `\r\n` bytes.
112 ///
113 /// This routine is useful for iterating over lines as quickly as
114 /// possible. Namely, a single allocation is reused for each line.
115 ///
116 /// # Examples
117 ///
118 /// Basic usage:
119 ///
120 /// ```
121 /// use std::io;
122 ///
123 /// use bstr::io::BufReadExt;
124 ///
125 /// # fn example() -> Result<(), io::Error> {
126 /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
127 ///
128 /// let mut lines = vec![];
129 /// cursor.for_byte_line(|line| {
130 /// lines.push(line.to_vec());
131 /// Ok(true)
132 /// })?;
133 /// assert_eq!(lines.len(), 3);
134 /// assert_eq!(lines[0], "lorem".as_bytes());
135 /// assert_eq!(lines[1], "ipsum".as_bytes());
136 /// assert_eq!(lines[2], "dolor".as_bytes());
137 /// # Ok(()) }; example().unwrap()
138 /// ```
139 fn for_byte_line<F>(&mut self, mut for_each_line: F) -> io::Result<()>
140 where
141 Self: Sized,
142 F: FnMut(&[u8]) -> io::Result<bool>,
143 {
144 self.for_byte_line_with_terminator(|line| {
145 for_each_line(&trim_line_slice(&line))
146 })
147 }
148
149 /// Executes the given closure on each byte-terminated record in the
150 /// underlying reader.
151 ///
152 /// If the closure returns an error (or if the underlying reader returns an
153 /// error), then iteration is stopped and the error is returned. If false
154 /// is returned, then iteration is stopped and no error is returned.
155 ///
156 /// The closure given is called on exactly the same values as yielded by
157 /// the [`byte_records`](trait.BufReadExt.html#method.byte_records)
158 /// iterator. Namely, records do _not_ contain a trailing terminator byte.
159 ///
160 /// This routine is useful for iterating over records as quickly as
161 /// possible. Namely, a single allocation is reused for each record.
162 ///
163 /// # Examples
164 ///
165 /// Basic usage:
166 ///
167 /// ```
168 /// use std::io;
169 ///
170 /// use bstr::io::BufReadExt;
171 ///
172 /// # fn example() -> Result<(), io::Error> {
173 /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
174 ///
175 /// let mut records = vec![];
176 /// cursor.for_byte_record(b'\x00', |record| {
177 /// records.push(record.to_vec());
178 /// Ok(true)
179 /// })?;
180 /// assert_eq!(records.len(), 3);
181 /// assert_eq!(records[0], "lorem".as_bytes());
182 /// assert_eq!(records[1], "ipsum".as_bytes());
183 /// assert_eq!(records[2], "dolor".as_bytes());
184 /// # Ok(()) }; example().unwrap()
185 /// ```
186 fn for_byte_record<F>(
187 &mut self,
188 terminator: u8,
189 mut for_each_record: F,
190 ) -> io::Result<()>
191 where
192 Self: Sized,
193 F: FnMut(&[u8]) -> io::Result<bool>,
194 {
195 self.for_byte_record_with_terminator(terminator, |chunk| {
196 for_each_record(&trim_record_slice(&chunk, terminator))
197 })
198 }
199
200 /// Executes the given closure on each line in the underlying reader.
201 ///
202 /// If the closure returns an error (or if the underlying reader returns an
203 /// error), then iteration is stopped and the error is returned. If false
204 /// is returned, then iteration is stopped and no error is returned.
205 ///
206 /// Unlike
207 /// [`for_byte_line`](trait.BufReadExt.html#method.for_byte_line),
208 /// the lines given to the closure *do* include the line terminator, if one
209 /// exists.
210 ///
211 /// This routine is useful for iterating over lines as quickly as
212 /// possible. Namely, a single allocation is reused for each line.
213 ///
214 /// This is identical to `for_byte_record_with_terminator` with a
215 /// terminator of `\n`.
216 ///
217 /// # Examples
218 ///
219 /// Basic usage:
220 ///
221 /// ```
222 /// use std::io;
223 ///
224 /// use bstr::io::BufReadExt;
225 ///
226 /// # fn example() -> Result<(), io::Error> {
227 /// let mut cursor = io::Cursor::new(b"lorem\nipsum\r\ndolor");
228 ///
229 /// let mut lines = vec![];
230 /// cursor.for_byte_line_with_terminator(|line| {
231 /// lines.push(line.to_vec());
232 /// Ok(true)
233 /// })?;
234 /// assert_eq!(lines.len(), 3);
235 /// assert_eq!(lines[0], "lorem\n".as_bytes());
236 /// assert_eq!(lines[1], "ipsum\r\n".as_bytes());
237 /// assert_eq!(lines[2], "dolor".as_bytes());
238 /// # Ok(()) }; example().unwrap()
239 /// ```
240 fn for_byte_line_with_terminator<F>(
241 &mut self,
242 for_each_line: F,
243 ) -> io::Result<()>
244 where
245 Self: Sized,
246 F: FnMut(&[u8]) -> io::Result<bool>,
247 {
248 self.for_byte_record_with_terminator(b'\n', for_each_line)
249 }
250
251 /// Executes the given closure on each byte-terminated record in the
252 /// underlying reader.
253 ///
254 /// If the closure returns an error (or if the underlying reader returns an
255 /// error), then iteration is stopped and the error is returned. If false
256 /// is returned, then iteration is stopped and no error is returned.
257 ///
258 /// Unlike
259 /// [`for_byte_record`](trait.BufReadExt.html#method.for_byte_record),
260 /// the lines given to the closure *do* include the record terminator, if
261 /// one exists.
262 ///
263 /// This routine is useful for iterating over records as quickly as
264 /// possible. Namely, a single allocation is reused for each record.
265 ///
266 /// # Examples
267 ///
268 /// Basic usage:
269 ///
270 /// ```
271 /// use std::io;
272 ///
273 /// use bstr::{io::BufReadExt, B};
274 ///
275 /// # fn example() -> Result<(), io::Error> {
276 /// let mut cursor = io::Cursor::new(b"lorem\x00ipsum\x00dolor");
277 ///
278 /// let mut records = vec![];
279 /// cursor.for_byte_record_with_terminator(b'\x00', |record| {
280 /// records.push(record.to_vec());
281 /// Ok(true)
282 /// })?;
283 /// assert_eq!(records.len(), 3);
284 /// assert_eq!(records[0], B(b"lorem\x00"));
285 /// assert_eq!(records[1], B("ipsum\x00"));
286 /// assert_eq!(records[2], B("dolor"));
287 /// # Ok(()) }; example().unwrap()
288 /// ```
289 fn for_byte_record_with_terminator<F>(
290 &mut self,
291 terminator: u8,
292 mut for_each_record: F,
293 ) -> io::Result<()>
294 where
295 Self: Sized,
296 F: FnMut(&[u8]) -> io::Result<bool>,
297 {
298 let mut bytes = vec![];
299 let mut res = Ok(());
300 let mut consumed = 0;
301 'outer: loop {
302 // Lend out complete record slices from our buffer
303 {
304 let mut buf = self.fill_buf()?;
305 if buf.is_empty() {
306 break;
307 }
308 while let Some(index) = buf.find_byte(terminator) {
309 let (record, rest) = buf.split_at(index + 1);
310 buf = rest;
311 consumed += record.len();
312 match for_each_record(&record) {
313 Ok(false) => break 'outer,
314 Err(err) => {
315 res = Err(err);
316 break 'outer;
317 }
318 _ => (),
319 }
320 }
321
322 // Copy the final record fragment to our local buffer. This
323 // saves read_until() from re-scanning a buffer we know
324 // contains no remaining terminators.
325 bytes.extend_from_slice(&buf);
326 consumed += buf.len();
327 }
328
329 self.consume(consumed);
330 consumed = 0;
331
332 // N.B. read_until uses a different version of memchr that may
333 // be slower than the memchr crate that bstr uses. However, this
334 // should only run for a fairly small number of records, assuming a
335 // decent buffer size.
336 self.read_until(terminator, &mut bytes)?;
337 if bytes.is_empty() || !for_each_record(&bytes)? {
338 break;
339 }
340 bytes.clear();
341 }
342 self.consume(consumed);
343 res
344 }
345}
346
347impl<B: io::BufRead> BufReadExt for B {}
348
/// An iterator that yields the lines of a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// as byte strings.
///
/// Values of this type are normally obtained from the
/// [`byte_lines`](trait.BufReadExt.html#method.byte_lines)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteLines<B> {
    /// The wrapped reader that lines are pulled from.
    buf: B,
}
361
/// An iterator that yields the byte-terminated records of a
/// [`std::io::BufRead`](https://doc.rust-lang.org/std/io/trait.BufRead.html)
/// as byte strings.
///
/// A byte record is any run of bytes that ends with one particular byte
/// picked by the caller. NUL separated byte strings, for instance, are
/// NUL-terminated byte records.
///
/// Values of this type are normally obtained from the
/// [`byte_records`](trait.BufReadExt.html#method.byte_records)
/// method of the
/// [`BufReadExt`](trait.BufReadExt.html)
/// trait.
#[derive(Debug)]
pub struct ByteRecords<B> {
    /// The wrapped reader that records are pulled from.
    buf: B,
    /// The byte that marks the end of each record.
    terminator: u8,
}
379
380impl<B: io::BufRead> Iterator for ByteLines<B> {
381 type Item = io::Result<Vec<u8>>;
382
383 fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
384 let mut bytes = vec![];
385 match self.buf.read_until(b'\n', &mut bytes) {
386 Err(e) => Some(Err(e)),
387 Ok(0) => None,
388 Ok(_) => {
389 trim_line(&mut bytes);
390 Some(Ok(bytes))
391 }
392 }
393 }
394}
395
396impl<B: io::BufRead> Iterator for ByteRecords<B> {
397 type Item = io::Result<Vec<u8>>;
398
399 fn next(&mut self) -> Option<io::Result<Vec<u8>>> {
400 let mut bytes = vec![];
401 match self.buf.read_until(self.terminator, &mut bytes) {
402 Err(e) => Some(Err(e)),
403 Ok(0) => None,
404 Ok(_) => {
405 trim_record(&mut bytes, self.terminator);
406 Some(Ok(bytes))
407 }
408 }
409 }
410}
411
412fn trim_line(line: &mut Vec<u8>) {
413 if line.last_byte() == Some(b'\n') {
414 line.pop_byte();
415 if line.last_byte() == Some(b'\r') {
416 line.pop_byte();
417 }
418 }
419}
420
/// Returns `line` without its trailing `\n`, also dropping a `\r` that
/// immediately precedes that `\n`.
///
/// A slice that does not end in `\n` is returned unchanged, so a lone
/// trailing `\r` is preserved. At most one line terminator is removed.
fn trim_line_slice(line: &[u8]) -> &[u8] {
    match line.strip_suffix(b"\n") {
        // No line feed at the end: nothing to trim.
        None => line,
        // The `\r` is only part of the terminator when it precedes `\n`.
        Some(rest) => rest.strip_suffix(b"\r").unwrap_or(rest),
    }
}
430
431fn trim_record(record: &mut Vec<u8>, terminator: u8) {
432 if record.last_byte() == Some(terminator) {
433 record.pop_byte();
434 }
435}
436
/// Returns `record` without its single trailing `terminator` byte.
///
/// A slice that does not end with `terminator` (including the empty slice)
/// is returned unchanged. At most one terminator byte is removed.
fn trim_record_slice(record: &[u8], terminator: u8) -> &[u8] {
    record.strip_suffix(&[terminator]).unwrap_or(record)
}
443
#[cfg(all(test, feature = "std"))]
mod tests {
    use alloc::{vec, vec::Vec};

    use crate::bstring::BString;

    use super::BufReadExt;

    /// Runs `for_byte_line` over `slice` and gathers every line it reports
    /// (terminators stripped) into owned byte strings.
    fn collect_lines<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut reader = slice.as_ref();
        let mut collected = Vec::new();
        let outcome = reader.for_byte_line(|line| {
            collected.push(BString::from(line.to_vec()));
            Ok(true)
        });
        outcome.unwrap();
        collected
    }

    /// Runs `for_byte_line_with_terminator` over `slice` and gathers every
    /// line it reports (terminators kept) into owned byte strings.
    fn collect_lines_term<B: AsRef<[u8]>>(slice: B) -> Vec<BString> {
        let mut reader = slice.as_ref();
        let mut collected = Vec::new();
        let outcome = reader.for_byte_line_with_terminator(|line| {
            collected.push(BString::from(line.to_vec()));
            Ok(true)
        });
        outcome.unwrap();
        collected
    }

    #[test]
    fn lines_without_terminator() {
        assert_eq!(collect_lines(""), Vec::<BString>::new());

        // `\n` terminated input.
        assert_eq!(collect_lines("\n"), vec![""]);
        assert_eq!(collect_lines("\n\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\nb\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\nxyz\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\nxyz"), vec!["abc", "xyz"]);

        // `\r\n` terminated input.
        assert_eq!(collect_lines("\r\n"), vec![""]);
        assert_eq!(collect_lines("\r\n\r\n"), vec!["", ""]);
        assert_eq!(collect_lines("a\r\nb\r\n"), vec!["a", "b"]);
        assert_eq!(collect_lines("a\r\nb"), vec!["a", "b"]);
        assert_eq!(collect_lines("abc\r\nxyz\r\n"), vec!["abc", "xyz"]);
        assert_eq!(collect_lines("abc\r\nxyz"), vec!["abc", "xyz"]);

        // A bare `\r` is not a line terminator.
        assert_eq!(collect_lines("abc\rxyz"), vec!["abc\rxyz"]);
    }

    #[test]
    fn lines_with_terminator() {
        assert_eq!(collect_lines_term(""), Vec::<BString>::new());

        // `\n` terminated input.
        assert_eq!(collect_lines_term("\n"), vec!["\n"]);
        assert_eq!(collect_lines_term("\n\n"), vec!["\n", "\n"]);
        assert_eq!(collect_lines_term("a\nb\n"), vec!["a\n", "b\n"]);
        assert_eq!(collect_lines_term("a\nb"), vec!["a\n", "b"]);
        assert_eq!(collect_lines_term("abc\nxyz\n"), vec!["abc\n", "xyz\n"]);
        assert_eq!(collect_lines_term("abc\nxyz"), vec!["abc\n", "xyz"]);

        // `\r\n` terminated input.
        assert_eq!(collect_lines_term("\r\n"), vec!["\r\n"]);
        assert_eq!(collect_lines_term("\r\n\r\n"), vec!["\r\n", "\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb\r\n"), vec!["a\r\n", "b\r\n"]);
        assert_eq!(collect_lines_term("a\r\nb"), vec!["a\r\n", "b"]);
        assert_eq!(
            collect_lines_term("abc\r\nxyz\r\n"),
            vec!["abc\r\n", "xyz\r\n"]
        );
        assert_eq!(collect_lines_term("abc\r\nxyz"), vec!["abc\r\n", "xyz"]);

        // A bare `\r` is not a line terminator.
        assert_eq!(collect_lines_term("abc\rxyz"), vec!["abc\rxyz"]);
    }
}