// utf8/lib.rs

1mod lossy;
2mod read;
3
4pub use lossy::LossyDecoder;
5pub use read::{BufReadDecoder, BufReadDecoderError};
6
7use std::cmp;
8use std::error::Error;
9use std::fmt;
10use std::str;
11
/// The replacement character, U+FFFD, as a string slice.
///
/// In lossy decoding, insert it once for every decoding error.
pub const REPLACEMENT_CHARACTER: &str = "\u{FFFD}";
14
15#[derive(Debug, Copy, Clone)]
16pub enum DecodeError<'a> {
17    /// In lossy decoding insert `valid_prefix`, then `"\u{FFFD}"`,
18    /// then call `decode()` again with `remaining_input`.
19    Invalid {
20        valid_prefix: &'a str,
21        invalid_sequence: &'a [u8],
22        remaining_input: &'a [u8],
23    },
24
25    /// Call the `incomplete_suffix.try_complete` method with more input when available.
26    /// If no more input is available, this is an invalid byte sequence.
27    Incomplete {
28        valid_prefix: &'a str,
29        incomplete_suffix: Incomplete,
30    },
31}
32
33impl<'a> fmt::Display for DecodeError<'a> {
34    fn fmt(&self, f: &mut fmt::Formatter) -> fmt::Result {
35        match *self {
36            DecodeError::Invalid {
37                valid_prefix,
38                invalid_sequence,
39                remaining_input,
40            } => write!(
41                f,
42                "found invalid byte sequence {invalid_sequence:02x?} after \
43                 {valid_byte_count} valid bytes, followed by {unprocessed_byte_count} more \
44                 unprocessed bytes",
45                invalid_sequence = invalid_sequence,
46                valid_byte_count = valid_prefix.len(),
47                unprocessed_byte_count = remaining_input.len()
48            ),
49            DecodeError::Incomplete {
50                valid_prefix,
51                incomplete_suffix,
52            } => write!(
53                f,
54                "found incomplete byte sequence {incomplete_suffix:02x?} after \
55                 {valid_byte_count} bytes",
56                incomplete_suffix = incomplete_suffix,
57                valid_byte_count = valid_prefix.len()
58            ),
59        }
60    }
61}
62
63impl<'a> Error for DecodeError<'a> {}
64
/// Fixed-size holding area for the bytes of a sequence that is not complete yet.
///
/// Only the first `buffer_len` bytes of `buffer` are meaningful.
#[derive(Clone, Copy, Debug)]
pub struct Incomplete {
    /// Backing storage; four bytes long.
    pub buffer: [u8; 4],
    /// Number of bytes at the start of `buffer` that are currently in use.
    pub buffer_len: u8,
}
70
71pub fn decode(input: &[u8]) -> Result<&str, DecodeError> {
72    let error = match str::from_utf8(input) {
73        Ok(valid) => return Ok(valid),
74        Err(error) => error,
75    };
76
77    // FIXME: separate function from here to guide inlining?
78    let (valid, after_valid) = input.split_at(error.valid_up_to());
79    let valid = unsafe {
80        str::from_utf8_unchecked(valid)
81    };
82
83    match error.error_len() {
84        Some(invalid_sequence_length) => {
85            let (invalid, rest) = after_valid.split_at(invalid_sequence_length);
86            Err(DecodeError::Invalid {
87                valid_prefix: valid,
88                invalid_sequence: invalid,
89                remaining_input: rest
90            })
91        }
92        None => {
93            Err(DecodeError::Incomplete {
94                valid_prefix: valid,
95                incomplete_suffix: Incomplete::new(after_valid),
96            })
97        }
98    }
99}
100
101impl Incomplete {
102    pub fn empty() -> Self {
103        Incomplete {
104            buffer: [0, 0, 0, 0],
105            buffer_len: 0,
106        }
107    }
108
109    pub fn is_empty(&self) -> bool {
110        self.buffer_len == 0
111    }
112
113    pub fn new(bytes: &[u8]) -> Self {
114        let mut buffer = [0, 0, 0, 0];
115        let len = bytes.len();
116        buffer[..len].copy_from_slice(bytes);
117        Incomplete {
118            buffer: buffer,
119            buffer_len: len as u8,
120        }
121    }
122
123    /// * `None`: still incomplete, call `try_complete` again with more input.
124    ///   If no more input is available, this is invalid byte sequence.
125    /// * `Some((result, remaining_input))`: We’re done with this `Incomplete`.
126    ///   To keep decoding, pass `remaining_input` to `decode()`.
127    pub fn try_complete<'input>(&mut self, input: &'input [u8])
128                                -> Option<(Result<&str, &[u8]>, &'input [u8])> {
129        let (consumed, opt_result) = self.try_complete_offsets(input);
130        let result = opt_result?;
131        let remaining_input = &input[consumed..];
132        let result_bytes = self.take_buffer();
133        let result = match result {
134            Ok(()) => Ok(unsafe { str::from_utf8_unchecked(result_bytes) }),
135            Err(()) => Err(result_bytes),
136        };
137        Some((result, remaining_input))
138    }
139
140    fn take_buffer(&mut self) -> &[u8] {
141        let len = self.buffer_len as usize;
142        self.buffer_len = 0;
143        &self.buffer[..len as usize]
144    }
145
146    /// (consumed_from_input, None): not enough input
147    /// (consumed_from_input, Some(Err(()))): error bytes in buffer
148    /// (consumed_from_input, Some(Ok(()))): UTF-8 string in buffer
149    fn try_complete_offsets(&mut self, input: &[u8]) -> (usize, Option<Result<(), ()>>) {
150        let initial_buffer_len = self.buffer_len as usize;
151        let copied_from_input;
152        {
153            let unwritten = &mut self.buffer[initial_buffer_len..];
154            copied_from_input = cmp::min(unwritten.len(), input.len());
155            unwritten[..copied_from_input].copy_from_slice(&input[..copied_from_input]);
156        }
157        let spliced = &self.buffer[..initial_buffer_len + copied_from_input];
158        match str::from_utf8(spliced) {
159            Ok(_) => {
160                self.buffer_len = spliced.len() as u8;
161                (copied_from_input, Some(Ok(())))
162            }
163            Err(error) => {
164                let valid_up_to = error.valid_up_to();
165                if valid_up_to > 0 {
166                    let consumed = valid_up_to.checked_sub(initial_buffer_len).unwrap();
167                    self.buffer_len = valid_up_to as u8;
168                    (consumed, Some(Ok(())))
169                } else {
170                    match error.error_len() {
171                        Some(invalid_sequence_length) => {
172                            let consumed = invalid_sequence_length
173                                .checked_sub(initial_buffer_len).unwrap();
174                            self.buffer_len = invalid_sequence_length as u8;
175                            (consumed, Some(Err(())))
176                        }
177                        None => {
178                            self.buffer_len = spliced.len() as u8;
179                            (copied_from_input, None)
180                        }
181                    }
182                }
183            }
184        }
185    }
186}