utf8/
read.rs

1use std::io::{self, BufRead};
2use std::error::Error;
3use std::fmt;
4use std::str;
5use super::*;
6
7/// Wraps a `std::io::BufRead` buffered byte stream and decode it as UTF-8.
8pub struct BufReadDecoder<B: BufRead> {
9    buf_read: B,
10    bytes_consumed: usize,
11    incomplete: Incomplete,
12}
13
/// Error produced while decoding a buffered byte stream as UTF-8.
///
/// The lifetime `'a` is that of the invalid bytes borrowed by
/// `InvalidByteSequence`.
#[derive(Debug)]
pub enum BufReadDecoderError<'a> {
    /// Represents one UTF-8 error in the byte stream; the payload is the
    /// offending bytes.
    ///
    /// In lossy decoding, each such error should be replaced with U+FFFD.
    /// (See `BufReadDecoder::next_lossy` and `BufReadDecoderError::lossy`.)
    InvalidByteSequence(&'a [u8]),

    /// An I/O error from the underlying byte stream.
    Io(io::Error),
}
25
26impl<'a> BufReadDecoderError<'a> {
27    /// Replace UTF-8 errors with U+FFFD
28    pub fn lossy(self) -> Result<&'static str, io::Error> {
29        match self {
30            BufReadDecoderError::Io(error) => Err(error),
31            BufReadDecoderError::InvalidByteSequence(_) => Ok(REPLACEMENT_CHARACTER),
32        }
33    }
34}
35
36impl<'a> fmt::Display for BufReadDecoderError<'a> {
37    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
38        match *self {
39            BufReadDecoderError::InvalidByteSequence(bytes) => {
40                write!(f, "invalid byte sequence: {:02x?}", bytes)
41            }
42            BufReadDecoderError::Io(ref err) => write!(f, "underlying bytestream error: {}", err),
43        }
44    }
45}
46
47impl<'a> Error for BufReadDecoderError<'a> {
48    fn source(&self) -> Option<&(dyn Error + 'static)> {
49        match *self {
50            BufReadDecoderError::InvalidByteSequence(_) => None,
51            BufReadDecoderError::Io(ref err) => Some(err),
52        }
53    }
54}
55
56impl<B: BufRead> BufReadDecoder<B> {
57    /// This is to `Read::read_to_string` what `String::from_utf8_lossy` is to `String::from_utf8`.
58    pub fn read_to_string_lossy(buf_read: B) -> io::Result<String> {
59        let mut decoder = Self::new(buf_read);
60        let mut string = String::new();
61        while let Some(result) = decoder.next_lossy() {
62            string.push_str(result?)
63        }
64        Ok(string)
65    }
66
67    pub fn new(buf_read: B) -> Self {
68        Self {
69            buf_read,
70            bytes_consumed: 0,
71            incomplete: Incomplete::empty(),
72        }
73    }
74
75    /// Same as `BufReadDecoder::next_strict`, but replace UTF-8 errors with U+FFFD.
76    pub fn next_lossy(&mut self) -> Option<io::Result<&str>> {
77        self.next_strict().map(|result| result.or_else(|e| e.lossy()))
78    }
79
80    /// Decode and consume the next chunk of UTF-8 input.
81    ///
82    /// This method is intended to be called repeatedly until it returns `None`,
83    /// which represents EOF from the underlying byte stream.
84    /// This is similar to `Iterator::next`,
85    /// except that decoded chunks borrow the decoder (~iterator)
86    /// so they need to be handled or copied before the next chunk can start decoding.
87    pub fn next_strict(&mut self) -> Option<Result<&str, BufReadDecoderError>> {
88        enum BytesSource {
89            BufRead(usize),
90            Incomplete,
91        }
92        macro_rules! try_io {
93            ($io_result: expr) => {
94                match $io_result {
95                    Ok(value) => value,
96                    Err(error) => return Some(Err(BufReadDecoderError::Io(error)))
97                }
98            }
99        }
100        let (source, result) = loop {
101            if self.bytes_consumed > 0 {
102                self.buf_read.consume(self.bytes_consumed);
103                self.bytes_consumed = 0;
104            }
105            let buf = try_io!(self.buf_read.fill_buf());
106
107            // Force loop iteration to go through an explicit `continue`
108            enum Unreachable {}
109            let _: Unreachable = if self.incomplete.is_empty() {
110                if buf.is_empty() {
111                    return None  // EOF
112                }
113                match str::from_utf8(buf) {
114                    Ok(_) => {
115                        break (BytesSource::BufRead(buf.len()), Ok(()))
116                    }
117                    Err(error) => {
118                        let valid_up_to = error.valid_up_to();
119                        if valid_up_to > 0 {
120                            break (BytesSource::BufRead(valid_up_to), Ok(()))
121                        }
122                        match error.error_len() {
123                            Some(invalid_sequence_length) => {
124                                break (BytesSource::BufRead(invalid_sequence_length), Err(()))
125                            }
126                            None => {
127                                self.bytes_consumed = buf.len();
128                                self.incomplete = Incomplete::new(buf);
129                                // need more input bytes
130                                continue
131                            }
132                        }
133                    }
134                }
135            } else {
136                if buf.is_empty() {
137                    break (BytesSource::Incomplete, Err(()))  // EOF with incomplete code point
138                }
139                let (consumed, opt_result) = self.incomplete.try_complete_offsets(buf);
140                self.bytes_consumed = consumed;
141                match opt_result {
142                    None => {
143                        // need more input bytes
144                        continue
145                    }
146                    Some(result) => {
147                        break (BytesSource::Incomplete, result)
148                    }
149                }
150            };
151        };
152        let bytes = match source {
153            BytesSource::BufRead(byte_count) => {
154                self.bytes_consumed = byte_count;
155                let buf = try_io!(self.buf_read.fill_buf());
156                &buf[..byte_count]
157            }
158            BytesSource::Incomplete => {
159                self.incomplete.take_buffer()
160            }
161        };
162        match result {
163            Ok(()) => Some(Ok(unsafe { str::from_utf8_unchecked(bytes) })),
164            Err(()) => Some(Err(BufReadDecoderError::InvalidByteSequence(bytes))),
165        }
166    }
167}