bstr/
utf8.rs

Help
1use core::{char, cmp, fmt, str};
2
3use crate::{ascii, bstr::BStr, ext_slice::ByteSlice};
4
5// The UTF-8 decoder provided here is based on the one presented here:
6// https://bjoern.hoehrmann.de/utf-8/decoder/dfa/
7//
8// We *could* have done UTF-8 decoding by using a DFA generated by `\p{any}`
9// using regex-automata that is roughly the same size. The real benefit of
10// Hoehrmann's formulation is that the byte class mapping below is manually
11// tailored such that each byte's class doubles as a shift to mask out the
12// bits necessary for constructing the leading bits of each codepoint value
13// from the initial byte.
14//
15// There are some minor differences between this implementation and Hoehrmann's
16// formulation.
17//
18// Firstly, we make REJECT have state ID 0, since it makes the state table
19// itself a little easier to read and is consistent with the notion that 0
20// means "false" or "bad."
21//
22// Secondly, when doing bulk decoding, we add a SIMD accelerated ASCII fast
23// path.
24//
25// Thirdly, we pre-multiply the state IDs to avoid a multiplication instruction
26// in the core decoding loop. (Which is what regex-automata would do by
27// default.)
28//
29// Fourthly, we split the byte class mapping and transition table into two
30// arrays because it's clearer.
31//
// This is unlikely to be the fastest way to do UTF-8 decoding, but it is
// fairly simple.
34
// Pre-multiplied ID of the accepting state: row 1 of `STATES_FORWARD`, where
// every row holds 12 entries (one per byte class). Decoding and validation
// both start in this state and return to it each time a complete codepoint
// has been consumed.
const ACCEPT: usize = 12;
// Pre-multiplied ID of the dead state (row 0 of `STATES_FORWARD`). Every
// transition out of it leads back to it, so reaching it means the input is
// not valid UTF-8.
const REJECT: usize = 0;
37
/// Maps every possible byte value to one of 12 equivalence classes
/// (`0..=11`). The class is used as the column index into `STATES_FORWARD`,
/// and `decode_step` additionally uses it as a shift amount to mask the
/// payload bits out of a leading byte.
///
/// Each row below covers 32 consecutive byte values:
///
/// * `0x00..=0x7F` -> 0 (ASCII)
/// * `0x80..=0x8F` -> 1, `0x90..=0x9F` -> 9, `0xA0..=0xBF` -> 7
/// * `0xC0..=0xC1` -> 8, `0xC2..=0xDF` -> 2
/// * `0xE0` -> 10, `0xE1..=0xEC` -> 3, `0xED` -> 4, `0xEE..=0xEF` -> 3
/// * `0xF0` -> 11, `0xF1..=0xF3` -> 6, `0xF4` -> 5, `0xF5..=0xFF` -> 8
///
/// SAFETY: The decode function below relies on the correctness of these
/// equivalence classes.
#[cfg_attr(rustfmt, rustfmt::skip)]
const CLASSES: [u8; 256] = [
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,  0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,
   1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,1,  9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,9,
   7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,  7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,7,
   8,8,2,2,2,2,2,2,2,2,2,2,2,2,2,2,  2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,2,
  10,3,3,3,3,3,3,3,3,3,3,3,3,4,3,3, 11,6,6,6,5,8,8,8,8,8,8,8,8,8,8,8,
];
51
/// Transition table of the forward UTF-8 automaton.
///
/// The table is a flattened 9x12 matrix: row `k` (entries `12*k..12*k + 12`)
/// holds the transitions out of the state whose pre-multiplied ID is `12*k`,
/// and the column is the byte class taken from `CLASSES`. Every entry is
/// itself a pre-multiplied state ID, so a transition is the single lookup
/// `STATES_FORWARD[state + class]`. Row 0 is the dead `REJECT` state and row 1
/// is `ACCEPT`.
///
/// SAFETY: The decode function below relies on the correctness of this state
/// machine. In particular, the unchecked lookup in `validate` depends on the
/// table having exactly 108 entries and on every entry being at most 96.
#[cfg_attr(rustfmt, rustfmt::skip)]
const STATES_FORWARD: &[u8] = &[
  0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
  12, 0, 24, 36, 60, 96, 84, 0, 0, 0, 48, 72,
  0, 12, 0, 0, 0, 0, 0, 12, 0, 12, 0, 0,
  0, 24, 0, 0, 0, 0, 0, 24, 0, 24, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 24, 0, 0, 0, 0,
  0, 24, 0, 0, 0, 0, 0, 0, 0, 24, 0, 0,
  0, 0, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
  0, 36, 0, 0, 0, 0, 0, 36, 0, 36, 0, 0,
  0, 36, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
];
66
/// An iterator over Unicode scalar values in a byte string.
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// This iterator is created by the
/// [`chars`](trait.ByteSlice.html#method.chars) method provided by the
/// [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct Chars<'a> {
    // The bytes that have not yet been yielded. Forward iteration trims this
    // slice from the front and reverse iteration trims it from the back.
    bs: &'a [u8],
}
80
81impl<'a> Chars<'a> {
82    pub(crate) fn new(bs: &'a [u8]) -> Chars<'a> {
83        Chars { bs }
84    }
85
86    /// View the underlying data as a subslice of the original data.
87    ///
88    /// The slice returned has the same lifetime as the original slice, and so
89    /// the iterator can continue to be used while this exists.
90    ///
91    /// # Examples
92    ///
93    /// ```
94    /// use bstr::ByteSlice;
95    ///
96    /// let mut chars = b"abc".chars();
97    ///
98    /// assert_eq!(b"abc", chars.as_bytes());
99    /// chars.next();
100    /// assert_eq!(b"bc", chars.as_bytes());
101    /// chars.next();
102    /// chars.next();
103    /// assert_eq!(b"", chars.as_bytes());
104    /// ```
105    #[inline]
106    pub fn as_bytes(&self) -> &'a [u8] {
107        self.bs
108    }
109}
110
111impl<'a> Iterator for Chars<'a> {
112    type Item = char;
113
114    #[inline]
115    fn next(&mut self) -> Option<char> {
116        let (ch, size) = decode_lossy(self.bs);
117        if size == 0 {
118            return None;
119        }
120        self.bs = &self.bs[size..];
121        Some(ch)
122    }
123}
124
125impl<'a> DoubleEndedIterator for Chars<'a> {
126    #[inline]
127    fn next_back(&mut self) -> Option<char> {
128        let (ch, size) = decode_last_lossy(self.bs);
129        if size == 0 {
130            return None;
131        }
132        self.bs = &self.bs[..self.bs.len() - size];
133        Some(ch)
134    }
135}
136
/// An iterator over Unicode scalar values in a byte string and their
/// byte index positions.
///
/// When invalid UTF-8 byte sequences are found, they are substituted with the
/// Unicode replacement codepoint (`U+FFFD`) using the
/// ["maximal subpart" strategy](https://www.unicode.org/review/pr-121.html).
///
/// Note that this is slightly different from the `CharIndices` iterator
/// provided by the standard library. Aside from working on possibly invalid
/// UTF-8, this iterator provides both the corresponding starting and ending
/// byte indices of each codepoint yielded. The ending position is necessary to
/// slice the original byte string when invalid UTF-8 bytes are converted into
/// a Unicode replacement codepoint, since a single replacement codepoint can
/// substitute anywhere from 1 to 3 invalid bytes (inclusive).
///
/// This iterator is created by the
/// [`char_indices`](trait.ByteSlice.html#method.char_indices) method provided
/// by the [`ByteSlice`](trait.ByteSlice.html) extension trait for `&[u8]`.
#[derive(Clone, Debug)]
pub struct CharIndices<'a> {
    // The bytes that have not yet been yielded from either end.
    bs: &'a [u8],
    // Offset, relative to the original input, of the first byte of `bs`.
    forward_index: usize,
    // Offset, relative to the original input, one past the last byte of `bs`.
    reverse_index: usize,
}
161
162impl<'a> CharIndices<'a> {
163    pub(crate) fn new(bs: &'a [u8]) -> CharIndices<'a> {
164        CharIndices { bs, forward_index: 0, reverse_index: bs.len() }
165    }
166
167    /// View the underlying data as a subslice of the original data.
168    ///
169    /// The slice returned has the same lifetime as the original slice, and so
170    /// the iterator can continue to be used while this exists.
171    ///
172    /// # Examples
173    ///
174    /// ```
175    /// use bstr::ByteSlice;
176    ///
177    /// let mut it = b"abc".char_indices();
178    ///
179    /// assert_eq!(b"abc", it.as_bytes());
180    /// it.next();
181    /// assert_eq!(b"bc", it.as_bytes());
182    /// it.next();
183    /// it.next();
184    /// assert_eq!(b"", it.as_bytes());
185    /// ```
186    #[inline]
187    pub fn as_bytes(&self) -> &'a [u8] {
188        self.bs
189    }
190}
191
192impl<'a> Iterator for CharIndices<'a> {
193    type Item = (usize, usize, char);
194
195    #[inline]
196    fn next(&mut self) -> Option<(usize, usize, char)> {
197        let index = self.forward_index;
198        let (ch, size) = decode_lossy(self.bs);
199        if size == 0 {
200            return None;
201        }
202        self.bs = &self.bs[size..];
203        self.forward_index += size;
204        Some((index, index + size, ch))
205    }
206}
207
208impl<'a> DoubleEndedIterator for CharIndices<'a> {
209    #[inline]
210    fn next_back(&mut self) -> Option<(usize, usize, char)> {
211        let (ch, size) = decode_last_lossy(self.bs);
212        if size == 0 {
213            return None;
214        }
215        self.bs = &self.bs[..self.bs.len() - size];
216        self.reverse_index -= size;
217        Some((self.reverse_index, self.reverse_index + size, ch))
218    }
219}
220
// `next` only returns `None` when the remaining slice decodes to zero bytes,
// and it leaves the slice untouched in that case, so every later call keeps
// returning `None`.
impl<'a> ::core::iter::FusedIterator for CharIndices<'a> {}
222
/// An iterator over chunks of valid UTF-8 in a byte slice.
///
/// Each item pairs a (possibly empty) run of valid UTF-8 with the invalid
/// bytes that immediately follow it.
///
/// See [`utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks).
#[derive(Clone, Debug)]
pub struct Utf8Chunks<'a> {
    // The bytes that have not yet been split into chunks.
    pub(super) bytes: &'a [u8],
}
230
/// A chunk of valid UTF-8, possibly followed by invalid UTF-8 bytes.
///
/// This is yielded by the
/// [`Utf8Chunks`](struct.Utf8Chunks.html)
/// iterator, which can be created via the
/// [`ByteSlice::utf8_chunks`](trait.ByteSlice.html#method.utf8_chunks)
/// method.
///
/// The `'a` lifetime parameter corresponds to the lifetime of the bytes that
/// are being iterated over.
#[cfg_attr(test, derive(Debug, PartialEq))]
pub struct Utf8Chunk<'a> {
    /// A valid UTF-8 piece, at the start, end, or between invalid UTF-8 bytes.
    ///
    /// This is empty between adjacent invalid UTF-8 byte sequences.
    valid: &'a str,
    /// A sequence of invalid UTF-8 bytes that directly follows `valid`.
    ///
    /// Can only be empty in the last chunk.
    ///
    /// Should be replaced by a single Unicode replacement character, if not
    /// empty.
    invalid: &'a BStr,
    /// Indicates whether the invalid sequence could've been valid if there
    /// were more bytes.
    ///
    /// Can only be true in the last chunk.
    incomplete: bool,
}
260
261impl<'a> Utf8Chunk<'a> {
262    /// Returns the (possibly empty) valid UTF-8 bytes in this chunk.
263    ///
264    /// This may be empty if there are consecutive sequences of invalid UTF-8
265    /// bytes.
266    #[inline]
267    pub fn valid(&self) -> &'a str {
268        self.valid
269    }
270
271    /// Returns the (possibly empty) invalid UTF-8 bytes in this chunk that
272    /// immediately follow the valid UTF-8 bytes in this chunk.
273    ///
274    /// This is only empty when this chunk corresponds to the last chunk in
275    /// the original bytes.
276    ///
277    /// The maximum length of this slice is 3. That is, invalid UTF-8 byte
278    /// sequences greater than 1 always correspond to a valid _prefix_ of
279    /// a valid UTF-8 encoded codepoint. This corresponds to the "substitution
280    /// of maximal subparts" strategy that is described in more detail in the
281    /// docs for the
282    /// [`ByteSlice::to_str_lossy`](trait.ByteSlice.html#method.to_str_lossy)
283    /// method.
284    #[inline]
285    pub fn invalid(&self) -> &'a [u8] {
286        self.invalid.as_bytes()
287    }
288
289    /// Returns whether the invalid sequence might still become valid if more
290    /// bytes are added.
291    ///
292    /// Returns true if the end of the input was reached unexpectedly,
293    /// without encountering an unexpected byte.
294    ///
295    /// This can only be the case for the last chunk.
296    #[inline]
297    pub fn incomplete(&self) -> bool {
298        self.incomplete
299    }
300}
301
302impl<'a> Iterator for Utf8Chunks<'a> {
303    type Item = Utf8Chunk<'a>;
304
305    #[inline]
306    fn next(&mut self) -> Option<Utf8Chunk<'a>> {
307        if self.bytes.is_empty() {
308            return None;
309        }
310        match validate(self.bytes) {
311            Ok(()) => {
312                let valid = self.bytes;
313                self.bytes = &[];
314                Some(Utf8Chunk {
315                    // SAFETY: This is safe because of the guarantees provided
316                    // by utf8::validate.
317                    valid: unsafe { str::from_utf8_unchecked(valid) },
318                    invalid: [].as_bstr(),
319                    incomplete: false,
320                })
321            }
322            Err(e) => {
323                let (valid, rest) = self.bytes.split_at(e.valid_up_to());
324                // SAFETY: This is safe because of the guarantees provided by
325                // utf8::validate.
326                let valid = unsafe { str::from_utf8_unchecked(valid) };
327                let (invalid_len, incomplete) = match e.error_len() {
328                    Some(n) => (n, false),
329                    None => (rest.len(), true),
330                };
331                let (invalid, rest) = rest.split_at(invalid_len);
332                self.bytes = rest;
333                Some(Utf8Chunk {
334                    valid,
335                    invalid: invalid.as_bstr(),
336                    incomplete,
337                })
338            }
339        }
340    }
341
342    #[inline]
343    fn size_hint(&self) -> (usize, Option<usize>) {
344        if self.bytes.is_empty() {
345            (0, Some(0))
346        } else {
347            (1, Some(self.bytes.len()))
348        }
349    }
350}
351
// `next` returns `None` only when no bytes remain and never adds bytes back,
// so every later call keeps returning `None`.
impl<'a> ::core::iter::FusedIterator for Utf8Chunks<'a> {}
353
/// An error that occurs when UTF-8 decoding fails.
///
/// This error occurs when attempting to convert a non-UTF-8 byte
/// string to a Rust string that must be valid UTF-8. For example,
/// [`to_str`](trait.ByteSlice.html#method.to_str) is one such method.
///
/// # Example
///
/// This example shows what happens when a given byte sequence is invalid,
/// but ends with a sequence that is a possible prefix of valid UTF-8.
///
/// ```
/// use bstr::{B, ByteSlice};
///
/// let s = B(b"foobar\xF1\x80\x80");
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), None);
/// ```
///
/// This example shows what happens when a given byte sequence contains
/// invalid UTF-8.
///
/// ```
/// use bstr::ByteSlice;
///
/// let s = b"foobar\xF1\x80\x80quux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// // The error length reports the maximum number of bytes that correspond to
/// // a valid prefix of a UTF-8 encoded codepoint.
/// assert_eq!(err.error_len(), Some(3));
///
/// // In contrast to the above which contains a single invalid prefix,
/// // consider the case of multiple individual bytes that are never valid
/// // prefixes. Note how the value of error_len changes!
/// let s = b"foobar\xFF\xFFquux";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
///
/// // The fact that it's an invalid prefix does not change error_len even
/// // when it immediately precedes the end of the string.
/// let s = b"foobar\xFF";
/// let err = s.to_str().unwrap_err();
/// assert_eq!(err.valid_up_to(), 6);
/// assert_eq!(err.error_len(), Some(1));
/// ```
#[derive(Clone, Debug, Eq, PartialEq)]
pub struct Utf8Error {
    // Length in bytes of the longest prefix of the input that is valid UTF-8.
    valid_up_to: usize,
    // Number of invalid bytes starting at `valid_up_to`, or `None` when the
    // input ended before the current sequence could be completed.
    error_len: Option<usize>,
}
407
408impl Utf8Error {
409    /// Returns the byte index of the position immediately following the last
410    /// valid UTF-8 byte.
411    ///
412    /// # Example
413    ///
414    /// This examples shows how `valid_up_to` can be used to retrieve a
415    /// possibly empty prefix that is guaranteed to be valid UTF-8:
416    ///
417    /// ```
418    /// use bstr::ByteSlice;
419    ///
420    /// let s = b"foobar\xF1\x80\x80quux";
421    /// let err = s.to_str().unwrap_err();
422    ///
423    /// // This is guaranteed to never panic.
424    /// let string = s[..err.valid_up_to()].to_str().unwrap();
425    /// assert_eq!(string, "foobar");
426    /// ```
427    #[inline]
428    pub fn valid_up_to(&self) -> usize {
429        self.valid_up_to
430    }
431
432    /// Returns the total number of invalid UTF-8 bytes immediately following
433    /// the position returned by `valid_up_to`. This value is always at least
434    /// `1`, but can be up to `3` if bytes form a valid prefix of some UTF-8
435    /// encoded codepoint.
436    ///
437    /// If the end of the original input was found before a valid UTF-8 encoded
438    /// codepoint could be completed, then this returns `None`. This is useful
439    /// when processing streams, where a `None` value signals that more input
440    /// might be needed.
441    #[inline]
442    pub fn error_len(&self) -> Option<usize> {
443        self.error_len
444    }
445}
446
// Only available with the `std` feature, since the `Error` trait is taken
// from `std` here.
#[cfg(feature = "std")]
impl std::error::Error for Utf8Error {
    // Returns a fixed, position-independent summary; the `Display` impl is
    // the one that reports the byte offset.
    // NOTE(review): `Error::description` is deprecated in current Rust in
    // favor of `Display`; presumably kept for older callers, so confirm
    // before removing.
    fn description(&self) -> &str {
        "invalid UTF-8"
    }
}
453
454impl fmt::Display for Utf8Error {
455    fn fmt(&self, f: &mut fmt::Formatter<'_>) -> fmt::Result {
456        write!(f, "invalid UTF-8 found at byte offset {}", self.valid_up_to)
457    }
458}
459
/// Returns OK if and only if the given slice is completely valid UTF-8.
///
/// If the slice isn't valid UTF-8, then an error is returned that explains
/// the first location at which invalid UTF-8 was detected.
///
/// # Errors
///
/// The returned `Utf8Error` reports the length of the longest valid prefix
/// (`valid_up_to`) and, unless the input merely ended in the middle of a
/// sequence, the number of invalid bytes that follow it (`error_len`).
pub fn validate(slice: &[u8]) -> Result<(), Utf8Error> {
    // The fast path for validating UTF-8. It steps through a UTF-8 automaton
    // and uses a SIMD accelerated ASCII fast path on x86_64. If an error is
    // detected, it backs up and runs the slower version of the UTF-8 automaton
    // to determine correct error information.
    fn fast(slice: &[u8]) -> Result<(), Utf8Error> {
        let mut state = ACCEPT;
        let mut i = 0;

        while i < slice.len() {
            let b = slice[i];

            // ASCII fast path. If we see two consecutive ASCII bytes, then try
            // to validate as much ASCII as possible very quickly.
            if state == ACCEPT
                && b <= 0x7F
                && slice.get(i + 1).map_or(false, |&b| b <= 0x7F)
            {
                // Skip ahead by the offset of the first non-ASCII byte in the
                // rest of the input. The state is ACCEPT here and ASCII
                // bytes leave it in ACCEPT, so no transitions are needed.
                i += ascii::first_non_ascii_byte(&slice[i..]);
                continue;
            }

            state = step(state, b);
            if state == REJECT {
                // The byte at `i` cannot continue the current sequence.
                return Err(find_valid_up_to(slice, i));
            }
            i += 1;
        }
        if state != ACCEPT {
            // The input ended in the middle of a multi-byte sequence.
            Err(find_valid_up_to(slice, slice.len()))
        } else {
            Ok(())
        }
    }

    // Given the first position at which a UTF-8 sequence was determined to be
    // invalid, return an error that correctly reports the position at which
    // the last complete UTF-8 sequence ends.
    #[inline(never)]
    fn find_valid_up_to(slice: &[u8], rejected_at: usize) -> Utf8Error {
        // In order to find the last valid byte, we need to back up an amount
        // that guarantees every preceding byte is part of a valid UTF-8
        // code unit sequence. To do this, we simply locate the last leading
        // byte that occurs before rejected_at.
        let mut backup = rejected_at.saturating_sub(1);
        while backup > 0 && !is_leading_or_invalid_utf8_byte(slice[backup]) {
            backup -= 1;
        }
        // Re-scan only the window from that leading byte through the
        // rejected byte (clamped to the end of the input).
        let upto = cmp::min(slice.len(), rejected_at.saturating_add(1));
        let mut err = slow(&slice[backup..upto]).unwrap_err();
        // `slow` reports offsets relative to the window, so translate the
        // result back into an offset in the full slice.
        err.valid_up_to += backup;
        err
    }

    // Like top-level UTF-8 decoding, except it correctly reports a UTF-8 error
    // when an invalid sequence is found. This is split out from validate so
    // that the fast path doesn't need to keep track of the position of the
    // last valid UTF-8 byte. In particular, tracking this requires checking
    // for an ACCEPT state on each byte, which degrades throughput pretty
    // badly.
    fn slow(slice: &[u8]) -> Result<(), Utf8Error> {
        let mut state = ACCEPT;
        let mut valid_up_to = 0;
        for (i, &b) in slice.iter().enumerate() {
            state = step(state, b);
            if state == ACCEPT {
                // A codepoint was completed by the byte at `i`.
                valid_up_to = i + 1;
            } else if state == REJECT {
                // Our error length must always be at least 1.
                let error_len = Some(cmp::max(1, i - valid_up_to));
                return Err(Utf8Error { valid_up_to, error_len });
            }
        }
        if state != ACCEPT {
            // Ran out of input mid-sequence: no error length is reported.
            Err(Utf8Error { valid_up_to, error_len: None })
        } else {
            Ok(())
        }
    }

    // Advance to the next state given the current state and current byte.
    fn step(state: usize, b: u8) -> usize {
        let class = CLASSES[b as usize];
        // SAFETY: This is safe because 'class' is always <=11 and 'state' is
        // always <=96. Therefore, the maximal index is 96+11 = 107, where
        // STATES_FORWARD.len() = 108 such that every index is guaranteed to be
        // valid by construction of the state machine and the byte equivalence
        // classes.
        unsafe {
            *STATES_FORWARD.get_unchecked(state + class as usize) as usize
        }
    }

    fast(slice)
}
559
560/// UTF-8 decode a single Unicode scalar value from the beginning of a slice.
561///
562/// When successful, the corresponding Unicode scalar value is returned along
563/// with the number of bytes it was encoded with. The number of bytes consumed
564/// for a successful decode is always between 1 and 4, inclusive.
565///
566/// When unsuccessful, `None` is returned along with the number of bytes that
567/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
568/// the number of bytes consumed is always between 0 and 3, inclusive, where
569/// 0 is only returned when `slice` is empty.
570///
571/// # Examples
572///
573/// Basic usage:
574///
575/// ```
576/// use bstr::decode_utf8;
577///
578/// // Decoding a valid codepoint.
579/// let (ch, size) = decode_utf8(b"\xE2\x98\x83");
580/// assert_eq!(Some('☃'), ch);
581/// assert_eq!(3, size);
582///
583/// // Decoding an incomplete codepoint.
584/// let (ch, size) = decode_utf8(b"\xE2\x98");
585/// assert_eq!(None, ch);
586/// assert_eq!(2, size);
587/// ```
588///
589/// This example shows how to iterate over all codepoints in UTF-8 encoded
590/// bytes, while replacing invalid UTF-8 sequences with the replacement
591/// codepoint:
592///
593/// ```
594/// use bstr::{B, decode_utf8};
595///
596/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
597/// let mut chars = vec![];
598/// while !bytes.is_empty() {
599///     let (ch, size) = decode_utf8(bytes);
600///     bytes = &bytes[size..];
601///     chars.push(ch.unwrap_or('\u{FFFD}'));
602/// }
603/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
604/// ```
605#[inline]
606pub fn decode<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
607    let slice = slice.as_ref();
608    match slice.get(0) {
609        None => return (None, 0),
610        Some(&b) if b <= 0x7F => return (Some(b as char), 1),
611        _ => {}
612    }
613
614    let (mut state, mut cp, mut i) = (ACCEPT, 0, 0);
615    while i < slice.len() {
616        decode_step(&mut state, &mut cp, slice[i]);
617        i += 1;
618
619        if state == ACCEPT {
620            // SAFETY: This is safe because `decode_step` guarantees that
621            // `cp` is a valid Unicode scalar value in an ACCEPT state.
622            let ch = unsafe { char::from_u32_unchecked(cp) };
623            return (Some(ch), i);
624        } else if state == REJECT {
625            // At this point, we always want to advance at least one byte.
626            return (None, cmp::max(1, i.saturating_sub(1)));
627        }
628    }
629    (None, i)
630}
631
632/// Lossily UTF-8 decode a single Unicode scalar value from the beginning of a
633/// slice.
634///
635/// When successful, the corresponding Unicode scalar value is returned along
636/// with the number of bytes it was encoded with. The number of bytes consumed
637/// for a successful decode is always between 1 and 4, inclusive.
638///
639/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
640/// along with the number of bytes that make up a maximal prefix of a valid
641/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
642/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
643/// empty.
644///
645/// # Examples
646///
647/// Basic usage:
648///
649/// ```ignore
650/// use bstr::decode_utf8_lossy;
651///
652/// // Decoding a valid codepoint.
653/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98\x83");
654/// assert_eq!('☃', ch);
655/// assert_eq!(3, size);
656///
657/// // Decoding an incomplete codepoint.
658/// let (ch, size) = decode_utf8_lossy(b"\xE2\x98");
659/// assert_eq!('\u{FFFD}', ch);
660/// assert_eq!(2, size);
661/// ```
662///
663/// This example shows how to iterate over all codepoints in UTF-8 encoded
664/// bytes, while replacing invalid UTF-8 sequences with the replacement
665/// codepoint:
666///
667/// ```ignore
668/// use bstr::{B, decode_utf8_lossy};
669///
670/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
671/// let mut chars = vec![];
672/// while !bytes.is_empty() {
673///     let (ch, size) = decode_utf8_lossy(bytes);
674///     bytes = &bytes[size..];
675///     chars.push(ch);
676/// }
677/// assert_eq!(vec!['☃', '\u{FFFD}', '𝞃', '\u{FFFD}', 'a'], chars);
678/// ```
679#[inline]
680pub fn decode_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
681    match decode(slice) {
682        (Some(ch), size) => (ch, size),
683        (None, size) => ('\u{FFFD}', size),
684    }
685}
686
687/// UTF-8 decode a single Unicode scalar value from the end of a slice.
688///
689/// When successful, the corresponding Unicode scalar value is returned along
690/// with the number of bytes it was encoded with. The number of bytes consumed
691/// for a successful decode is always between 1 and 4, inclusive.
692///
693/// When unsuccessful, `None` is returned along with the number of bytes that
694/// make up a maximal prefix of a valid UTF-8 code unit sequence. In this case,
695/// the number of bytes consumed is always between 0 and 3, inclusive, where
696/// 0 is only returned when `slice` is empty.
697///
698/// # Examples
699///
700/// Basic usage:
701///
702/// ```
703/// use bstr::decode_last_utf8;
704///
705/// // Decoding a valid codepoint.
706/// let (ch, size) = decode_last_utf8(b"\xE2\x98\x83");
707/// assert_eq!(Some('☃'), ch);
708/// assert_eq!(3, size);
709///
710/// // Decoding an incomplete codepoint.
711/// let (ch, size) = decode_last_utf8(b"\xE2\x98");
712/// assert_eq!(None, ch);
713/// assert_eq!(2, size);
714/// ```
715///
716/// This example shows how to iterate over all codepoints in UTF-8 encoded
717/// bytes in reverse, while replacing invalid UTF-8 sequences with the
718/// replacement codepoint:
719///
720/// ```
721/// use bstr::{B, decode_last_utf8};
722///
723/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
724/// let mut chars = vec![];
725/// while !bytes.is_empty() {
726///     let (ch, size) = decode_last_utf8(bytes);
727///     bytes = &bytes[..bytes.len()-size];
728///     chars.push(ch.unwrap_or('\u{FFFD}'));
729/// }
730/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
731/// ```
732#[inline]
733pub fn decode_last<B: AsRef<[u8]>>(slice: B) -> (Option<char>, usize) {
734    // TODO: We could implement this by reversing the UTF-8 automaton, but for
735    // now, we do it the slow way by using the forward automaton.
736
737    let slice = slice.as_ref();
738    if slice.is_empty() {
739        return (None, 0);
740    }
741    let mut start = slice.len() - 1;
742    let limit = slice.len().saturating_sub(4);
743    while start > limit && !is_leading_or_invalid_utf8_byte(slice[start]) {
744        start -= 1;
745    }
746    let (ch, size) = decode(&slice[start..]);
747    // If we didn't consume all of the bytes, then that means there's at least
748    // one stray byte that never occurs in a valid code unit prefix, so we can
749    // advance by one byte.
750    if start + size != slice.len() {
751        (None, 1)
752    } else {
753        (ch, size)
754    }
755}
756
757/// Lossily UTF-8 decode a single Unicode scalar value from the end of a slice.
758///
759/// When successful, the corresponding Unicode scalar value is returned along
760/// with the number of bytes it was encoded with. The number of bytes consumed
761/// for a successful decode is always between 1 and 4, inclusive.
762///
763/// When unsuccessful, the Unicode replacement codepoint (`U+FFFD`) is returned
764/// along with the number of bytes that make up a maximal prefix of a valid
765/// UTF-8 code unit sequence. In this case, the number of bytes consumed is
766/// always between 0 and 3, inclusive, where 0 is only returned when `slice` is
767/// empty.
768///
769/// # Examples
770///
771/// Basic usage:
772///
773/// ```ignore
774/// use bstr::decode_last_utf8_lossy;
775///
776/// // Decoding a valid codepoint.
777/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98\x83");
778/// assert_eq!('☃', ch);
779/// assert_eq!(3, size);
780///
781/// // Decoding an incomplete codepoint.
782/// let (ch, size) = decode_last_utf8_lossy(b"\xE2\x98");
783/// assert_eq!('\u{FFFD}', ch);
784/// assert_eq!(2, size);
785/// ```
786///
787/// This example shows how to iterate over all codepoints in UTF-8 encoded
788/// bytes in reverse, while replacing invalid UTF-8 sequences with the
789/// replacement codepoint:
790///
791/// ```ignore
792/// use bstr::decode_last_utf8_lossy;
793///
794/// let mut bytes = B(b"\xE2\x98\x83\xFF\xF0\x9D\x9E\x83\xE2\x98\x61");
795/// let mut chars = vec![];
796/// while !bytes.is_empty() {
797///     let (ch, size) = decode_last_utf8_lossy(bytes);
798///     bytes = &bytes[..bytes.len()-size];
799///     chars.push(ch);
800/// }
801/// assert_eq!(vec!['a', '\u{FFFD}', '𝞃', '\u{FFFD}', '☃'], chars);
802/// ```
803#[inline]
804pub fn decode_last_lossy<B: AsRef<[u8]>>(slice: B) -> (char, usize) {
805    match decode_last(slice) {
806        (Some(ch), size) => (ch, size),
807        (None, size) => ('\u{FFFD}', size),
808    }
809}
810
811/// SAFETY: The decode function relies on state being equal to ACCEPT only if
812/// cp is a valid Unicode scalar value.
813#[inline]
814pub fn decode_step(state: &mut usize, cp: &mut u32, b: u8) {
815    let class = CLASSES[b as usize];
816    if *state == ACCEPT {
817        *cp = (0xFF >> class) & (b as u32);
818    } else {
819        *cp = (b as u32 & 0b111111) | (*cp << 6);
820    }
821    *state = STATES_FORWARD[*state + class as usize] as usize;
822}
823
/// Returns true if and only if the given byte is not a UTF-8 continuation
/// byte. That is, it is either a byte that can start a UTF-8 sequence
/// (including ASCII), or a byte that can never appear anywhere in valid
/// UTF-8.
fn is_leading_or_invalid_utf8_byte(b: u8) -> bool {
    // Continuation bytes are exactly those of the form 10xxxxxx, so looking
    // at the top two bits is enough:
    //
    //     00 / 01 :: ASCII
    //     10      :: continuation byte
    //     11      :: leading byte of a 2/3/4-byte sequence, or one of the
    //                bytes that never occur in valid UTF-8 (\xC0, \xC1 and
    //                \xF5 through \xFF)
    let top_two_bits = b >> 6;
    top_two_bits != 0b10
}
849
850#[cfg(all(test, feature = "std"))]
851mod tests {
852    use core::char;
853
854    use alloc::{string::String, vec, vec::Vec};
855
856    use crate::{
857        ext_slice::{ByteSlice, B},
858        tests::LOSSY_TESTS,
859        utf8::{self, Utf8Error},
860    };
861
862    fn utf8e(valid_up_to: usize) -> Utf8Error {
863        Utf8Error { valid_up_to, error_len: None }
864    }
865
866    fn utf8e2(valid_up_to: usize, error_len: usize) -> Utf8Error {
867        Utf8Error { valid_up_to, error_len: Some(error_len) }
868    }
869
870    #[test]
871    #[cfg(not(miri))]
872    fn validate_all_codepoints() {
873        for i in 0..(0x10FFFF + 1) {
874            let cp = match char::from_u32(i) {
875                None => continue,
876                Some(cp) => cp,
877            };
878            let mut buf = [0; 4];
879            let s = cp.encode_utf8(&mut buf);
880            assert_eq!(Ok(()), utf8::validate(s.as_bytes()));
881        }
882    }
883
884    #[test]
885    fn validate_multiple_codepoints() {
886        assert_eq!(Ok(()), utf8::validate(b"abc"));
887        assert_eq!(Ok(()), utf8::validate(b"a\xE2\x98\x83a"));
888        assert_eq!(Ok(()), utf8::validate(b"a\xF0\x9D\x9C\xB7a"));
889        assert_eq!(Ok(()), utf8::validate(b"\xE2\x98\x83\xF0\x9D\x9C\xB7",));
890        assert_eq!(
891            Ok(()),
892            utf8::validate(b"a\xE2\x98\x83a\xF0\x9D\x9C\xB7a",)
893        );
894        assert_eq!(
895            Ok(()),
896            utf8::validate(b"\xEF\xBF\xBD\xE2\x98\x83\xEF\xBF\xBD",)
897        );
898    }
899
900    #[test]
901    fn validate_errors() {
902        // single invalid byte
903        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xFF"));
904        // single invalid byte after ASCII
905        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xFF"));
906        // single invalid byte after 2 byte sequence
907        assert_eq!(Err(utf8e2(2, 1)), utf8::validate(b"\xCE\xB2\xFF"));
908        // single invalid byte after 3 byte sequence
909        assert_eq!(Err(utf8e2(3, 1)), utf8::validate(b"\xE2\x98\x83\xFF"));
910        // single invalid byte after 4 byte sequence
911        assert_eq!(Err(utf8e2(4, 1)), utf8::validate(b"\xF0\x9D\x9D\xB1\xFF"));
912
913        // An invalid 2-byte sequence with a valid 1-byte prefix.
914        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCE\xF0"));
915        // An invalid 3-byte sequence with a valid 2-byte prefix.
916        assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98\xF0"));
917        // An invalid 4-byte sequence with a valid 3-byte prefix.
918        assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9D\xF0"));
919
920        // An overlong sequence. Should be \xE2\x82\xAC, but we encode the
921        // same codepoint value in 4 bytes. This not only tests that we reject
922        // overlong sequences, but that we get valid_up_to correct.
923        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xF0\x82\x82\xAC"));
924        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xF0\x82\x82\xAC"));
925        assert_eq!(
926            Err(utf8e2(3, 1)),
927            utf8::validate(b"\xE2\x98\x83\xF0\x82\x82\xAC",)
928        );
929
930        // Check that encoding a surrogate codepoint using the UTF-8 scheme
931        // fails validation.
932        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xED\xA0\x80"));
933        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xED\xA0\x80"));
934        assert_eq!(
935            Err(utf8e2(3, 1)),
936            utf8::validate(b"\xE2\x98\x83\xED\xA0\x80",)
937        );
938
939        // Check that an incomplete 2-byte sequence fails.
940        assert_eq!(Err(utf8e2(0, 1)), utf8::validate(b"\xCEa"));
941        assert_eq!(Err(utf8e2(1, 1)), utf8::validate(b"a\xCEa"));
942        assert_eq!(
943            Err(utf8e2(3, 1)),
944            utf8::validate(b"\xE2\x98\x83\xCE\xE2\x98\x83",)
945        );
946        // Check that an incomplete 3-byte sequence fails.
947        assert_eq!(Err(utf8e2(0, 2)), utf8::validate(b"\xE2\x98a"));
948        assert_eq!(Err(utf8e2(1, 2)), utf8::validate(b"a\xE2\x98a"));
949        assert_eq!(
950            Err(utf8e2(3, 2)),
951            utf8::validate(b"\xE2\x98\x83\xE2\x98\xE2\x98\x83",)
952        );
953        // Check that an incomplete 4-byte sequence fails.
954        assert_eq!(Err(utf8e2(0, 3)), utf8::validate(b"\xF0\x9D\x9Ca"));
955        assert_eq!(Err(utf8e2(1, 3)), utf8::validate(b"a\xF0\x9D\x9Ca"));
956        assert_eq!(
957            Err(utf8e2(4, 3)),
958            utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C\xE2\x98\x83",)
959        );
960        assert_eq!(
961            Err(utf8e2(6, 3)),
962            utf8::validate(b"foobar\xF1\x80\x80quux",)
963        );
964
965        // Check that an incomplete (EOF) 2-byte sequence fails.
966        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xCE"));
967        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xCE"));
968        assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xCE"));
969        // Check that an incomplete (EOF) 3-byte sequence fails.
970        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xE2\x98"));
971        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xE2\x98"));
972        assert_eq!(Err(utf8e(3)), utf8::validate(b"\xE2\x98\x83\xE2\x98"));
973        // Check that an incomplete (EOF) 4-byte sequence fails.
974        assert_eq!(Err(utf8e(0)), utf8::validate(b"\xF0\x9D\x9C"));
975        assert_eq!(Err(utf8e(1)), utf8::validate(b"a\xF0\x9D\x9C"));
976        assert_eq!(
977            Err(utf8e(4)),
978            utf8::validate(b"\xF0\x9D\x9C\xB1\xF0\x9D\x9C",)
979        );
980
981        // Test that we errors correct even after long valid sequences. This
982        // checks that our "backup" logic for detecting errors is correct.
983        assert_eq!(
984            Err(utf8e2(8, 1)),
985            utf8::validate(b"\xe2\x98\x83\xce\xb2\xe3\x83\x84\xFF",)
986        );
987    }
988
989    #[test]
990    fn decode_valid() {
991        fn d(mut s: &str) -> Vec<char> {
992            let mut chars = vec![];
993            while !s.is_empty() {
994                let (ch, size) = utf8::decode(s.as_bytes());
995                s = &s[size..];
996                chars.push(ch.unwrap());
997            }
998            chars
999        }
1000
1001        assert_eq!(vec!['☃'], d("☃"));
1002        assert_eq!(vec!['☃', '☃'], d("☃☃"));
1003        assert_eq!(vec!['α', 'β', 'γ', 'δ', 'ε'], d("αβγδε"));
1004        assert_eq!(vec!['☃', '⛄', '⛇'], d("☃⛄⛇"));
1005        assert_eq!(vec!['𝗮', '𝗯', '𝗰', '𝗱', '𝗲'], d("𝗮𝗯𝗰𝗱𝗲"));
1006    }
1007
1008    #[test]
1009    fn decode_invalid() {
1010        let (ch, size) = utf8::decode(b"");
1011        assert_eq!(None, ch);
1012        assert_eq!(0, size);
1013
1014        let (ch, size) = utf8::decode(b"\xFF");
1015        assert_eq!(None, ch);
1016        assert_eq!(1, size);
1017
1018        let (ch, size) = utf8::decode(b"\xCE\xF0");
1019        assert_eq!(None, ch);
1020        assert_eq!(1, size);
1021
1022        let (ch, size) = utf8::decode(b"\xE2\x98\xF0");
1023        assert_eq!(None, ch);
1024        assert_eq!(2, size);
1025
1026        let (ch, size) = utf8::decode(b"\xF0\x9D\x9D");
1027        assert_eq!(None, ch);
1028        assert_eq!(3, size);
1029
1030        let (ch, size) = utf8::decode(b"\xF0\x9D\x9D\xF0");
1031        assert_eq!(None, ch);
1032        assert_eq!(3, size);
1033
1034        let (ch, size) = utf8::decode(b"\xF0\x82\x82\xAC");
1035        assert_eq!(None, ch);
1036        assert_eq!(1, size);
1037
1038        let (ch, size) = utf8::decode(b"\xED\xA0\x80");
1039        assert_eq!(None, ch);
1040        assert_eq!(1, size);
1041
1042        let (ch, size) = utf8::decode(b"\xCEa");
1043        assert_eq!(None, ch);
1044        assert_eq!(1, size);
1045
1046        let (ch, size) = utf8::decode(b"\xE2\x98a");
1047        assert_eq!(None, ch);
1048        assert_eq!(2, size);
1049
1050        let (ch, size) = utf8::decode(b"\xF0\x9D\x9Ca");
1051        assert_eq!(None, ch);
1052        assert_eq!(3, size);
1053    }
1054
1055    #[test]
1056    fn decode_lossy() {
1057        let (ch, size) = utf8::decode_lossy(b"");
1058        assert_eq!('\u{FFFD}', ch);
1059        assert_eq!(0, size);
1060
1061        let (ch, size) = utf8::decode_lossy(b"\xFF");
1062        assert_eq!('\u{FFFD}', ch);
1063        assert_eq!(1, size);
1064
1065        let (ch, size) = utf8::decode_lossy(b"\xCE\xF0");
1066        assert_eq!('\u{FFFD}', ch);
1067        assert_eq!(1, size);
1068
1069        let (ch, size) = utf8::decode_lossy(b"\xE2\x98\xF0");
1070        assert_eq!('\u{FFFD}', ch);
1071        assert_eq!(2, size);
1072
1073        let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9D\xF0");
1074        assert_eq!('\u{FFFD}', ch);
1075        assert_eq!(3, size);
1076
1077        let (ch, size) = utf8::decode_lossy(b"\xF0\x82\x82\xAC");
1078        assert_eq!('\u{FFFD}', ch);
1079        assert_eq!(1, size);
1080
1081        let (ch, size) = utf8::decode_lossy(b"\xED\xA0\x80");
1082        assert_eq!('\u{FFFD}', ch);
1083        assert_eq!(1, size);
1084
1085        let (ch, size) = utf8::decode_lossy(b"\xCEa");
1086        assert_eq!('\u{FFFD}', ch);
1087        assert_eq!(1, size);
1088
1089        let (ch, size) = utf8::decode_lossy(b"\xE2\x98a");
1090        assert_eq!('\u{FFFD}', ch);
1091        assert_eq!(2, size);
1092
1093        let (ch, size) = utf8::decode_lossy(b"\xF0\x9D\x9Ca");
1094        assert_eq!('\u{FFFD}', ch);
1095        assert_eq!(3, size);
1096    }
1097
1098    #[test]
1099    fn decode_last_valid() {
1100        fn d(mut s: &str) -> Vec<char> {
1101            let mut chars = vec![];
1102            while !s.is_empty() {
1103                let (ch, size) = utf8::decode_last(s.as_bytes());
1104                s = &s[..s.len() - size];
1105                chars.push(ch.unwrap());
1106            }
1107            chars
1108        }
1109
1110        assert_eq!(vec!['☃'], d("☃"));
1111        assert_eq!(vec!['☃', '☃'], d("☃☃"));
1112        assert_eq!(vec!['ε', 'δ', 'γ', 'β', 'α'], d("αβγδε"));
1113        assert_eq!(vec!['⛇', '⛄', '☃'], d("☃⛄⛇"));
1114        assert_eq!(vec!['𝗲', '𝗱', '𝗰', '𝗯', '𝗮'], d("𝗮𝗯𝗰𝗱𝗲"));
1115    }
1116
1117    #[test]
1118    fn decode_last_invalid() {
1119        let (ch, size) = utf8::decode_last(b"");
1120        assert_eq!(None, ch);
1121        assert_eq!(0, size);
1122
1123        let (ch, size) = utf8::decode_last(b"\xFF");
1124        assert_eq!(None, ch);
1125        assert_eq!(1, size);
1126
1127        let (ch, size) = utf8::decode_last(b"\xCE\xF0");
1128        assert_eq!(None, ch);
1129        assert_eq!(1, size);
1130
1131        let (ch, size) = utf8::decode_last(b"\xCE");
1132        assert_eq!(None, ch);
1133        assert_eq!(1, size);
1134
1135        let (ch, size) = utf8::decode_last(b"\xE2\x98\xF0");
1136        assert_eq!(None, ch);
1137        assert_eq!(1, size);
1138
1139        let (ch, size) = utf8::decode_last(b"\xE2\x98");
1140        assert_eq!(None, ch);
1141        assert_eq!(2, size);
1142
1143        let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D\xF0");
1144        assert_eq!(None, ch);
1145        assert_eq!(1, size);
1146
1147        let (ch, size) = utf8::decode_last(b"\xF0\x9D\x9D");
1148        assert_eq!(None, ch);
1149        assert_eq!(3, size);
1150
1151        let (ch, size) = utf8::decode_last(b"\xF0\x82\x82\xAC");
1152        assert_eq!(None, ch);
1153        assert_eq!(1, size);
1154
1155        let (ch, size) = utf8::decode_last(b"\xED\xA0\x80");
1156        assert_eq!(None, ch);
1157        assert_eq!(1, size);
1158
1159        let (ch, size) = utf8::decode_last(b"\xED\xA0");
1160        assert_eq!(None, ch);
1161        assert_eq!(1, size);
1162
1163        let (ch, size) = utf8::decode_last(b"\xED");
1164        assert_eq!(None, ch);
1165        assert_eq!(1, size);
1166
1167        let (ch, size) = utf8::decode_last(b"a\xCE");
1168        assert_eq!(None, ch);
1169        assert_eq!(1, size);
1170
1171        let (ch, size) = utf8::decode_last(b"a\xE2\x98");
1172        assert_eq!(None, ch);
1173        assert_eq!(2, size);
1174
1175        let (ch, size) = utf8::decode_last(b"a\xF0\x9D\x9C");
1176        assert_eq!(None, ch);
1177        assert_eq!(3, size);
1178    }
1179
1180    #[test]
1181    fn decode_last_lossy() {
1182        let (ch, size) = utf8::decode_last_lossy(b"");
1183        assert_eq!('\u{FFFD}', ch);
1184        assert_eq!(0, size);
1185
1186        let (ch, size) = utf8::decode_last_lossy(b"\xFF");
1187        assert_eq!('\u{FFFD}', ch);
1188        assert_eq!(1, size);
1189
1190        let (ch, size) = utf8::decode_last_lossy(b"\xCE\xF0");
1191        assert_eq!('\u{FFFD}', ch);
1192        assert_eq!(1, size);
1193
1194        let (ch, size) = utf8::decode_last_lossy(b"\xCE");
1195        assert_eq!('\u{FFFD}', ch);
1196        assert_eq!(1, size);
1197
1198        let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98\xF0");
1199        assert_eq!('\u{FFFD}', ch);
1200        assert_eq!(1, size);
1201
1202        let (ch, size) = utf8::decode_last_lossy(b"\xE2\x98");
1203        assert_eq!('\u{FFFD}', ch);
1204        assert_eq!(2, size);
1205
1206        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D\xF0");
1207        assert_eq!('\u{FFFD}', ch);
1208        assert_eq!(1, size);
1209
1210        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x9D\x9D");
1211        assert_eq!('\u{FFFD}', ch);
1212        assert_eq!(3, size);
1213
1214        let (ch, size) = utf8::decode_last_lossy(b"\xF0\x82\x82\xAC");
1215        assert_eq!('\u{FFFD}', ch);
1216        assert_eq!(1, size);
1217
1218        let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0\x80");
1219        assert_eq!('\u{FFFD}', ch);
1220        assert_eq!(1, size);
1221
1222        let (ch, size) = utf8::decode_last_lossy(b"\xED\xA0");
1223        assert_eq!('\u{FFFD}', ch);
1224        assert_eq!(1, size);
1225
1226        let (ch, size) = utf8::decode_last_lossy(b"\xED");
1227        assert_eq!('\u{FFFD}', ch);
1228        assert_eq!(1, size);
1229
1230        let (ch, size) = utf8::decode_last_lossy(b"a\xCE");
1231        assert_eq!('\u{FFFD}', ch);
1232        assert_eq!(1, size);
1233
1234        let (ch, size) = utf8::decode_last_lossy(b"a\xE2\x98");
1235        assert_eq!('\u{FFFD}', ch);
1236        assert_eq!(2, size);
1237
1238        let (ch, size) = utf8::decode_last_lossy(b"a\xF0\x9D\x9C");
1239        assert_eq!('\u{FFFD}', ch);
1240        assert_eq!(3, size);
1241    }
1242
1243    #[test]
1244    fn chars() {
1245        for (i, &(expected, input)) in LOSSY_TESTS.iter().enumerate() {
1246            let got: String = B(input).chars().collect();
1247            assert_eq!(
1248                expected, got,
1249                "chars(ith: {:?}, given: {:?})",
1250                i, input,
1251            );
1252            let got: String =
1253                B(input).char_indices().map(|(_, _, ch)| ch).collect();
1254            assert_eq!(
1255                expected, got,
1256                "char_indices(ith: {:?}, given: {:?})",
1257                i, input,
1258            );
1259
1260            let expected: String = expected.chars().rev().collect();
1261
1262            let got: String = B(input).chars().rev().collect();
1263            assert_eq!(
1264                expected, got,
1265                "chars.rev(ith: {:?}, given: {:?})",
1266                i, input,
1267            );
1268            let got: String =
1269                B(input).char_indices().rev().map(|(_, _, ch)| ch).collect();
1270            assert_eq!(
1271                expected, got,
1272                "char_indices.rev(ith: {:?}, given: {:?})",
1273                i, input,
1274            );
1275        }
1276    }
1277
1278    #[test]
1279    fn utf8_chunks() {
1280        let mut c = utf8::Utf8Chunks { bytes: b"123\xC0" };
1281        assert_eq!(
1282            (c.next(), c.next()),
1283            (
1284                Some(utf8::Utf8Chunk {
1285                    valid: "123",
1286                    invalid: b"\xC0".as_bstr(),
1287                    incomplete: false,
1288                }),
1289                None,
1290            )
1291        );
1292
1293        let mut c = utf8::Utf8Chunks { bytes: b"123\xFF\xFF" };
1294        assert_eq!(
1295            (c.next(), c.next(), c.next()),
1296            (
1297                Some(utf8::Utf8Chunk {
1298                    valid: "123",
1299                    invalid: b"\xFF".as_bstr(),
1300                    incomplete: false,
1301                }),
1302                Some(utf8::Utf8Chunk {
1303                    valid: "",
1304                    invalid: b"\xFF".as_bstr(),
1305                    incomplete: false,
1306                }),
1307                None,
1308            )
1309        );
1310
1311        let mut c = utf8::Utf8Chunks { bytes: b"123\xD0" };
1312        assert_eq!(
1313            (c.next(), c.next()),
1314            (
1315                Some(utf8::Utf8Chunk {
1316                    valid: "123",
1317                    invalid: b"\xD0".as_bstr(),
1318                    incomplete: true,
1319                }),
1320                None,
1321            )
1322        );
1323
1324        let mut c = utf8::Utf8Chunks { bytes: b"123\xD0456" };
1325        assert_eq!(
1326            (c.next(), c.next(), c.next()),
1327            (
1328                Some(utf8::Utf8Chunk {
1329                    valid: "123",
1330                    invalid: b"\xD0".as_bstr(),
1331                    incomplete: false,
1332                }),
1333                Some(utf8::Utf8Chunk {
1334                    valid: "456",
1335                    invalid: b"".as_bstr(),
1336                    incomplete: false,
1337                }),
1338                None,
1339            )
1340        );
1341
1342        let mut c = utf8::Utf8Chunks { bytes: b"123\xE2\x98" };
1343        assert_eq!(
1344            (c.next(), c.next()),
1345            (
1346                Some(utf8::Utf8Chunk {
1347                    valid: "123",
1348                    invalid: b"\xE2\x98".as_bstr(),
1349                    incomplete: true,
1350                }),
1351                None,
1352            )
1353        );
1354
1355        let mut c = utf8::Utf8Chunks { bytes: b"123\xF4\x8F\xBF" };
1356        assert_eq!(
1357            (c.next(), c.next()),
1358            (
1359                Some(utf8::Utf8Chunk {
1360                    valid: "123",
1361                    invalid: b"\xF4\x8F\xBF".as_bstr(),
1362                    incomplete: true,
1363                }),
1364                None,
1365            )
1366        );
1367    }
1368}
bstr/utf8.rs

bstr/
utf8.rs