encoding_rs/
utf_8.rs

Help
1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10use super::*;
11use crate::ascii::ascii_to_basic_latin;
12use crate::ascii::basic_latin_to_ascii;
13use crate::ascii::validate_ascii;
14use crate::handles::*;
15use crate::mem::convert_utf16_to_utf8_partial;
16use crate::variant::*;
17
18cfg_if! {
19    if #[cfg(feature = "simd-accel")] {
20        use ::std::intrinsics::unlikely;
21        use ::std::intrinsics::likely;
22    } else {
23        #[inline(always)]
24        // Unsafe to match the intrinsic, which is needlessly unsafe.
25        unsafe fn unlikely(b: bool) -> bool {
26            b
27        }
28        #[inline(always)]
29        // Unsafe to match the intrinsic, which is needlessly unsafe.
30        unsafe fn likely(b: bool) -> bool {
31            b
32        }
33    }
34}
35
/// Container for the lookup table used by the UTF-8 validation code in
/// this file.
///
/// The wrapper struct exists so that the table can carry an alignment
/// attribute.
#[repr(align(64))] // Align to cache lines
pub struct Utf8Data {
    /// 384 entries. The code in this file indexes it in two ways:
    /// `table[usize::from(second)]` with the second byte of a sequence, and
    /// `table[byte as usize + 0x80]` with a non-ASCII lead byte. The two
    /// looked-up values are combined with bitwise AND.
    pub table: [u8; 384],
}
40
41// BEGIN GENERATED CODE. PLEASE DO NOT EDIT.
42// Instead, please regenerate using generate-encoding-data.py
43
// Layout, as used by the validation code in this file:
//
// * Entries 0x000..=0x0FF are looked up by the *second* byte of a
//   multi-byte sequence.
// * Entries 0x100..=0x17F are looked up by a non-ASCII *lead* byte plus
//   0x80 (lead bytes 0x80..=0xFF).
//
// Every entry is a multiple of 4. A lead/second pair is accepted only
// when the bitwise AND of the two entries is zero, which leaves the two
// lowest bits free for callers to OR in the top two bits of the third
// byte and compare the result against 2 (binary 10, i.e. a continuation
// byte).
pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    ],
};
67
68// END GENERATED CODE
69
/// Returns the length in bytes of the longest prefix of `src` that is
/// valid UTF-8.
///
/// The result is `src.len()` when the whole slice is valid. Otherwise it
/// is the index of the first byte that does not begin a complete,
/// well-formed sequence; a sequence cut short by the end of `src` is
/// treated like an invalid one.
///
/// Structure:
/// * `'outer` skips a run of ASCII via `validate_ascii` and obtains the
///   first non-ASCII byte.
/// * `'inner` runs while at least four bytes remain from `read`, so trail
///   bytes can be fetched without bounds checks.
/// * `'tail` handles the final one to three bytes with bounds checks.
pub fn utf8_valid_up_to(src: &[u8]) -> usize {
    let mut read = 0;
    'outer: loop {
        let mut byte = {
            let src_remaining = &src[read..];
            match validate_ascii(src_remaining) {
                None => {
                    // No non-ASCII byte was reported: everything is valid.
                    return src.len();
                }
                Some((non_ascii, consumed)) => {
                    // NOTE(review): the code below treats `non_ascii` as the
                    // byte at `src[read]` after this adjustment -- confirm
                    // against `validate_ascii`.
                    read += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if unsafe { likely(read + 4 <= src.len()) } {
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that a) the UTF-8 sequence is valid and b) that there
                // is output space if it is an astral sequence.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                //
                // SAFETY (all `src.get_unchecked` calls in `'inner`): every
                // path into and around this loop has just established
                // `read + 4 <= src.len()`, so `read..=read + 3` are in bounds.
                // SAFETY (all `UTF8_DATA.table.get_unchecked` calls): `byte`
                // is a `u8`, so `byte as usize + 0x80` is at most 0x17F,
                // which is below the table length of 384.
                if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
                    // Two-byte
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    read += 2;

                    // Next lead (manually inlined)
                    if unsafe { likely(read + 4 <= src.len()) } {
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            // ASCII: consume it and go back to the ASCII
                            // fast path.
                            read += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if unsafe { likely(byte < 0xF0) } {
                    'three: loop {
                        // Three-byte
                        // Bytes 0x80..=0xC1 also land here; their table
                        // entry (4) overlaps every second-byte entry, so
                        // the check below rejects them.
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // Valid only if the lead/second table entries share
                        // no bit and the top two bits of `third` are 10.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        read += 3;

                        // Next lead (manually inlined)
                        if unsafe { likely(read + 4 <= src.len()) } {
                            byte = unsafe { *(src.get_unchecked(read)) };
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                // Another three-byte lead: stay in the
                                // tight three-byte loop.
                                continue 'three;
                            }
                            if unsafe { likely(byte < 0x80) } {
                                read += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // Same idea as the three-byte check, widened to 16 bits so
                // that the top two bits of `fourth` (shifted to bits 8-9)
                // can be tested in the same comparison: 0x202 means both
                // `third` and `fourth` have top bits 10 and the table
                // entries share no bit.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                read += 4;

                // Next lead
                if unsafe { likely(read + 4 <= src.len()) } {
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        read += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() {
                break 'outer;
            }
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that a) the UTF-8 sequence is valid and b) that there
            // is output space if it is an astral sequence.
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                read += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Truncated sequence at the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                read += 2;
                continue 'tail;
            }
            // Four-byte lead bytes must be kept out of the three-byte branch
            // explicitly: the table also has entries for them, and a
            // four-byte sequence cannot be complete here because fewer than
            // four bytes remain.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    // Truncated sequence at the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // SAFETY: `byte as usize + 0x80` is at most 0x17F, which is
                // below the table length of 384.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                read += 3;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            // Lead byte 0xF0 or above with fewer than four bytes left.
            break 'outer;
        }
    }
    read
}
230
/// Converts the leading valid UTF-8 portion of `src` to UTF-16 in `dst`.
///
/// Conversion stops at the first byte that does not begin a complete,
/// well-formed UTF-8 sequence, when `src` is exhausted, or when `dst` has
/// no room for the next code point (an astral code point needs two
/// `u16` slots). A sequence is never partially consumed.
///
/// Returns `(read, written)`: the number of bytes consumed from `src` and
/// the number of code units stored in `dst`.
///
/// Structure mirrors `utf8_valid_up_to`: `'outer` runs the ASCII fast
/// path, `'inner` runs while at least four source bytes remain and uses
/// unchecked accesses, and `'tail` handles the last one to three bytes
/// with bounds checks.
#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))]
pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        let mut byte = {
            let src_remaining = &src[read..];
            let dst_remaining = &mut dst[written..];
            // Only as many units as fit in both buffers may be processed.
            let length = ::std::cmp::min(src_remaining.len(), dst_remaining.len());
            // NOTE(review): `ascii_to_basic_latin` is treated here as
            // returning `None` after copying all `length` units, or
            // `Some((non_ascii, consumed))` with `consumed < length` --
            // confirm against its definition.
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    read += length;
                    written += length;
                    break 'outer;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // Check for the longest sequence to avoid checking twice for the
        // multi-byte sequences. This can't overflow with 64-bit address space,
        // because full 64 bits aren't in use. In the 32-bit PAE case, for this
        // to overflow would mean that the source slice would be so large that
        // the address space of the process would not have space for any code.
        // Therefore, the slice cannot be so long that this would overflow.
        if unsafe { likely(read + 4 <= src.len()) } {
            'inner: loop {
                // At this point, `byte` is not included in `read`, because we
                // don't yet know that a) the UTF-8 sequence is valid and b) that there
                // is output space if it is an astral sequence.
                // We know, thanks to `ascii_to_basic_latin` that there is output
                // space for at least one UTF-16 code unit, so no need to check
                // for output space in the BMP cases.
                // Inspecting the lead byte directly is faster than what the
                // std lib does!
                //
                // SAFETY (all `src.get_unchecked` calls in `'inner`): every
                // path into and around this loop has just established
                // `read + 4 <= src.len()`.
                // SAFETY (all `dst.get_unchecked_mut` calls in `'inner`):
                // `written < dst.len()` holds at the top of each iteration
                // (see above, and the `written == dst.len()` checks before
                // each `continue`); the astral case additionally rules out
                // `written + 1 == dst.len()` before writing two units.
                // SAFETY (all `UTF8_DATA.table.get_unchecked` calls):
                // `byte as usize + 0x80` is at most 0x17F, below the table
                // length of 384.
                if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
                    // Two-byte
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    // Five payload bits from the lead, six from the trail.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) =
                            ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)
                    };
                    read += 2;
                    written += 1;

                    // Next lead (manually inlined)
                    if written == dst.len() {
                        break 'outer;
                    }
                    if unsafe { likely(read + 4 <= src.len()) } {
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                            read += 1;
                            written += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if unsafe { likely(byte < 0xF0) } {
                    'three: loop {
                        // Three-byte
                        // Bytes 0x80..=0xC1 also land here and are rejected
                        // by the table check below.
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // Valid only if the lead/second table entries share
                        // no bit and the top two bits of `third` are 10.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        // Four payload bits from the lead, six from each trail.
                        let point = ((u16::from(byte) & 0xF) << 12)
                            | ((u16::from(second) & 0x3F) << 6)
                            | (u16::from(third) & 0x3F);
                        unsafe { *(dst.get_unchecked_mut(written)) = point };
                        read += 3;
                        written += 1;

                        // Next lead (manually inlined)
                        if written == dst.len() {
                            break 'outer;
                        }
                        if unsafe { likely(read + 4 <= src.len()) } {
                            byte = unsafe { *(src.get_unchecked(read)) };
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                // Another three-byte lead: stay in the
                                // tight three-byte loop.
                                continue 'three;
                            }
                            if unsafe { likely(byte < 0x80) } {
                                unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                                read += 1;
                                written += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte
                // An astral code point needs a surrogate pair; stop without
                // consuming anything if only one output slot is left.
                if written + 1 == dst.len() {
                    break 'outer;
                }
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // 0x202 means: table entries share no bit, and both `third`
                // and `fourth` have top bits 10.
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // 0xD7C0 is 0xD800 - (0x10000 >> 10), so this subtracts the
                // 0x10000 offset and adds the high-surrogate base in one step.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;

                // Next lead
                if written == dst.len() {
                    break 'outer;
                }
                if unsafe { likely(read + 4 <= src.len()) } {
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                        read += 1;
                        written += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // We can't have a complete 4-byte sequence, but we could still have
        // one to three shorter sequences.
        'tail: loop {
            // >= is better for bound check elision than ==
            if read >= src.len() || written >= dst.len() {
                break 'outer;
            }
            byte = src[read];
            // At this point, `byte` is not included in `read`, because we
            // don't yet know that a) the UTF-8 sequence is valid and b) that there
            // is output space if it is an astral sequence.
            // Inspecting the lead byte directly is faster than what the
            // std lib does!
            if byte < 0x80 {
                dst[written] = u16::from(byte);
                read += 1;
                written += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte
                let new_read = read + 2;
                if new_read > src.len() {
                    // Truncated sequence at the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                read += 2;
                written += 1;
                continue 'tail;
            }
            // Four-byte lead bytes must be kept out of the three-byte branch
            // explicitly: the table also has entries for them, and a
            // four-byte sequence cannot be complete here because fewer than
            // four bytes remain.
            if byte < 0xF0 {
                // Three-byte
                let new_read = read + 3;
                if new_read > src.len() {
                    // Truncated sequence at the end of the input.
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                // SAFETY: `byte as usize + 0x80` is at most 0x17F, which is
                // below the table length of 384.
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                dst[written] = point;
                read += 3;
                written += 1;
                // `'tail` handles sequences shorter than 4, so
                // there can't be another sequence after this one.
                break 'outer;
            }
            // Lead byte 0xF0 or above with fewer than four bytes left.
            break 'outer;
        }
    }
    (read, written)
}
449
/// Streaming UTF-8 decoder state.
///
/// The fields record a multi-byte sequence that has been started but not
/// yet completed, so that decoding can resume when more input arrives.
pub struct Utf8Decoder {
    /// Payload bits accumulated so far from the lead byte and the
    /// continuation bytes of the sequence in progress.
    code_point: u32,
    /// Continuation bytes accepted so far for the sequence in progress;
    /// reset to 0 when a sequence completes or is rejected.
    bytes_seen: usize,   // 1, 2 or 3: counts continuations only
    /// Continuation bytes required by the current lead byte; 0 means no
    /// sequence is in progress (the neutral state).
    bytes_needed: usize, // 1, 2 or 3: counts continuations only
    /// Smallest value accepted for the next continuation byte. Normally
    /// 0x80; raised after the lead bytes 0xE0 and 0xF0.
    lower_boundary: u8,
    /// Largest value accepted for the next continuation byte. Normally
    /// 0xBF; lowered after the lead bytes 0xED and 0xF4.
    upper_boundary: u8,
}
457
458impl Utf8Decoder {
459    pub fn new_inner() -> Utf8Decoder {
460        Utf8Decoder {
461            code_point: 0,
462            bytes_seen: 0,
463            bytes_needed: 0,
464            lower_boundary: 0x80u8,
465            upper_boundary: 0xBFu8,
466        }
467    }
468
469    pub fn new() -> VariantDecoder {
470        VariantDecoder::Utf8(Utf8Decoder::new_inner())
471    }
472
473    pub fn in_neutral_state(&self) -> bool {
474        self.bytes_needed == 0
475    }
476
477    fn extra_from_state(&self) -> usize {
478        if self.bytes_needed == 0 {
479            0
480        } else {
481            self.bytes_seen + 1
482        }
483    }
484
485    pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
486        byte_length.checked_add(1 + self.extra_from_state())
487    }
488
489    pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
490        byte_length.checked_add(3 + self.extra_from_state())
491    }
492
493    pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
494        checked_add(
495            3,
496            checked_mul(3, byte_length.checked_add(self.extra_from_state())),
497        )
498    }
499
500    decoder_functions!(
501        {},
502        {
503            // This is the fast path. The rest runs only at the
504            // start and end for partial sequences.
505            if self.bytes_needed == 0 {
506                dest.copy_utf8_up_to_invalid_from(&mut source);
507            }
508        },
509        {
510            if self.bytes_needed != 0 {
511                let bad_bytes = (self.bytes_seen + 1) as u8;
512                self.code_point = 0;
513                self.bytes_needed = 0;
514                self.bytes_seen = 0;
515                return (
516                    DecoderResult::Malformed(bad_bytes, 0),
517                    src_consumed,
518                    dest.written(),
519                );
520            }
521        },
522        {
523            if self.bytes_needed == 0 {
524                if b < 0x80u8 {
525                    destination_handle.write_ascii(b);
526                    continue;
527                }
528                if b < 0xC2u8 {
529                    return (
530                        DecoderResult::Malformed(1, 0),
531                        unread_handle.consumed(),
532                        destination_handle.written(),
533                    );
534                }
535                if b < 0xE0u8 {
536                    self.bytes_needed = 1;
537                    self.code_point = u32::from(b) & 0x1F;
538                    continue;
539                }
540                if b < 0xF0u8 {
541                    if b == 0xE0u8 {
542                        self.lower_boundary = 0xA0u8;
543                    } else if b == 0xEDu8 {
544                        self.upper_boundary = 0x9Fu8;
545                    }
546                    self.bytes_needed = 2;
547                    self.code_point = u32::from(b) & 0xF;
548                    continue;
549                }
550                if b < 0xF5u8 {
551                    if b == 0xF0u8 {
552                        self.lower_boundary = 0x90u8;
553                    } else if b == 0xF4u8 {
554                        self.upper_boundary = 0x8Fu8;
555                    }
556                    self.bytes_needed = 3;
557                    self.code_point = u32::from(b) & 0x7;
558                    continue;
559                }
560                return (
561                    DecoderResult::Malformed(1, 0),
562                    unread_handle.consumed(),
563                    destination_handle.written(),
564                );
565            }
566            // self.bytes_needed != 0
567            if !(b >= self.lower_boundary && b <= self.upper_boundary) {
568                let bad_bytes = (self.bytes_seen + 1) as u8;
569                self.code_point = 0;
570                self.bytes_needed = 0;
571                self.bytes_seen = 0;
572                self.lower_boundary = 0x80u8;
573                self.upper_boundary = 0xBFu8;
574                return (
575                    DecoderResult::Malformed(bad_bytes, 0),
576                    unread_handle.unread(),
577                    destination_handle.written(),
578                );
579            }
580            self.lower_boundary = 0x80u8;
581            self.upper_boundary = 0xBFu8;
582            self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F);
583            self.bytes_seen += 1;
584            if self.bytes_seen != self.bytes_needed {
585                continue;
586            }
587            if self.bytes_needed == 3 {
588                destination_handle.write_astral(self.code_point);
589            } else {
590                destination_handle.write_bmp_excl_ascii(self.code_point as u16);
591            }
592            self.code_point = 0;
593            self.bytes_needed = 0;
594            self.bytes_seen = 0;
595            continue;
596        },
597        self,
598        src_consumed,
599        dest,
600        source,
601        b,
602        destination_handle,
603        unread_handle,
604        check_space_astral
605    );
606}
607
608#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
609#[inline(never)]
610pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
611    let mut read = 0;
612    let mut written = 0;
613    'outer: loop {
614        let mut unit = {
615            let src_remaining = &src[read..];
616            let dst_remaining = &mut dst[written..];
617            let length = if dst_remaining.len() < src_remaining.len() {
618                dst_remaining.len()
619            } else {
620                src_remaining.len()
621            };
622            match unsafe {
623                basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
624            } {
625                None => {
626                    read += length;
627                    written += length;
628                    return (read, written);
629                }
630                Some((non_ascii, consumed)) => {
631                    read += consumed;
632                    written += consumed;
633                    non_ascii
634                }
635            }
636        };
637        'inner: loop {
638            // The following loop is only broken out of as a goto forward.
639            loop {
640                // Unfortunately, this check isn't enough for the compiler to elide
641                // the bound checks on writes to dst, which is why they are manually
642                // elided, which makes a measurable difference.
643                if written.checked_add(4).unwrap() > dst.len() {
644                    return (read, written);
645                }
646                read += 1;
647                if unit < 0x800 {
648                    unsafe {
649                        *(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8;
650                        written += 1;
651                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
652                        written += 1;
653                    }
654                    break;
655                }
656                let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
657                if unsafe { likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) } {
658                    unsafe {
659                        *(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
660                        written += 1;
661                        *(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
662                        written += 1;
663                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
664                        written += 1;
665                    }
666                    break;
667                }
668                if unsafe { likely(unit_minus_surrogate_start <= (0xDBFF - 0xD800)) } {
669                    // high surrogate
670                    // read > src.len() is impossible, but using
671                    // >= instead of == allows the compiler to elide a bound check.
672                    if read >= src.len() {
673                        debug_assert_eq!(read, src.len());
674                        // Unpaired surrogate at the end of the buffer.
675                        unsafe {
676                            *(dst.get_unchecked_mut(written)) = 0xEFu8;
677                            written += 1;
678                            *(dst.get_unchecked_mut(written)) = 0xBFu8;
679                            written += 1;
680                            *(dst.get_unchecked_mut(written)) = 0xBDu8;
681                            written += 1;
682                        }
683                        return (read, written);
684                    }
685                    let second = src[read];
686                    let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
687                    if unsafe { likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) } {
688                        // The next code unit is a low surrogate. Advance position.
689                        read += 1;
690                        let astral = (u32::from(unit) << 10) + u32::from(second)
691                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
692                        unsafe {
693                            *(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
694                            written += 1;
695                            *(dst.get_unchecked_mut(written)) =
696                                ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
697                            written += 1;
698                            *(dst.get_unchecked_mut(written)) =
699                                ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
700                            written += 1;
701                            *(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
702                            written += 1;
703                        }
704                        break;
705                    }
706                    // The next code unit is not a low surrogate. Don't advance
707                    // position and treat the high surrogate as unpaired.
708                    // Fall through
709                }
710                // Unpaired low surrogate
711                unsafe {
712                    *(dst.get_unchecked_mut(written)) = 0xEFu8;
713                    written += 1;
714                    *(dst.get_unchecked_mut(written)) = 0xBFu8;
715                    written += 1;
716                    *(dst.get_unchecked_mut(written)) = 0xBDu8;
717                    written += 1;
718                }
719                break;
720            }
721            // Now see if the next unit is Basic Latin
722            // read > src.len() is impossible, but using
723            // >= instead of == allows the compiler to elide a bound check.
724            if read >= src.len() {
725                debug_assert_eq!(read, src.len());
726                return (read, written);
727            }
728            unit = src[read];
729            if unsafe { unlikely(unit < 0x80) } {
730                // written > dst.len() is impossible, but using
731                // >= instead of == allows the compiler to elide a bound check.
732                if written >= dst.len() {
733                    debug_assert_eq!(written, dst.len());
734                    return (read, written);
735                }
736                dst[written] = unit as u8;
737                read += 1;
738                written += 1;
739                // Mysteriously, adding a punctuation check here makes
740                // the expected beneficiary cases *slower*!
741                continue 'outer;
742            }
743            continue 'inner;
744        }
745    }
746}
747
/// Cold-path continuation of UTF-16 to UTF-8 conversion, used once the
/// output buffer is nearly exhausted.
///
/// Converts as many leading code units of `src` as fit into `dst` without
/// ever splitting a UTF-8 sequence. The code assumes that `dst` offers at
/// most three free slots (see the comment in the body), so a valid surrogate
/// pair, which needs four bytes, is never written: conversion stops in front
/// of it instead. An unpaired surrogate is emitted as U+FFFD (three bytes).
///
/// Returns `(read, written)`: the number of `u16` code units consumed from
/// `src` and the number of bytes stored into `dst`. An empty `src` yields
/// `(0, 0)`.
///
/// In debug builds, writing a three-byte sequence asserts that the sequence
/// filled `dst` exactly.
#[inline(never)]
pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    // Everything below is cold code!
    let mut read = 0;
    let mut written = 0;
    // With nothing to convert, report "no progress" instead of indexing out
    // of bounds.
    let mut unit = match src.first() {
        Some(&first) => first,
        None => return (read, written),
    };
    // We now have up to 3 output slots, so an astral character
    // will not fit.
    if unit < 0x800 {
        // Run of one- and two-byte characters. Every way out of this loop is
        // a `return`, so the code after the `if` only runs when the very
        // first unit needs three bytes.
        loop {
            if unit < 0x80 {
                if written >= dst.len() {
                    return (read, written);
                }
                read += 1;
                dst[written] = unit as u8;
                written += 1;
            } else if unit < 0x800 {
                if written + 2 > dst.len() {
                    return (read, written);
                }
                read += 1;
                dst[written] = (unit >> 6) as u8 | 0xC0u8;
                written += 1;
                dst[written] = (unit & 0x3F) as u8 | 0x80u8;
                written += 1;
            } else {
                // Three-byte character or surrogate after some output has
                // already been produced: leave it to the caller.
                return (read, written);
            }
            // read > src.len() is impossible, but using
            // >= instead of == allows the compiler to elide a bound check.
            if read >= src.len() {
                debug_assert_eq!(read, src.len());
                return (read, written);
            }
            unit = src[read];
        }
    }
    // Could be an unpaired surrogate, but we'll need 3 output
    // slots in any case.
    if written + 3 > dst.len() {
        return (read, written);
    }
    read += 1;
    let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
    if unit_minus_surrogate_start <= (0xDFFF - 0xD800) {
        // Got surrogate
        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
            // Got high surrogate
            if read >= src.len() {
                // Unpaired high surrogate
                unit = 0xFFFD;
            } else {
                let second = src[read];
                // Same wrapping range test as above, applied to the low
                // surrogate range 0xDC00..=0xDFFF.
                if second.wrapping_sub(0xDC00) <= (0xDFFF - 0xDC00) {
                    // Valid surrogate pair, but we know it won't fit.
                    // Un-consume the high surrogate before reporting.
                    read -= 1;
                    return (read, written);
                }
                // Unpaired high
                unit = 0xFFFD;
            }
        } else {
            // Unpaired low
            unit = 0xFFFD;
        }
    }
    dst[written] = (unit >> 12) as u8 | 0xE0u8;
    written += 1;
    dst[written] = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
    written += 1;
    dst[written] = (unit & 0x3F) as u8 | 0x80u8;
    written += 1;
    debug_assert_eq!(written, dst.len());
    (read, written)
}
824
825pub struct Utf8Encoder;
826
827impl Utf8Encoder {
828    pub fn new(encoding: &'static Encoding) -> Encoder {
829        Encoder::new(encoding, VariantEncoder::Utf8(Utf8Encoder))
830    }
831
832    pub fn max_buffer_length_from_utf16_without_replacement(
833        &self,
834        u16_length: usize,
835    ) -> Option<usize> {
836        u16_length.checked_mul(3)
837    }
838
839    pub fn max_buffer_length_from_utf8_without_replacement(
840        &self,
841        byte_length: usize,
842    ) -> Option<usize> {
843        Some(byte_length)
844    }
845
846    pub fn encode_from_utf16_raw(
847        &mut self,
848        src: &[u16],
849        dst: &mut [u8],
850        _last: bool,
851    ) -> (EncoderResult, usize, usize) {
852        let (read, written) = convert_utf16_to_utf8_partial(src, dst);
853        (
854            if read == src.len() {
855                EncoderResult::InputEmpty
856            } else {
857                EncoderResult::OutputFull
858            },
859            read,
860            written,
861        )
862    }
863
864    pub fn encode_from_utf8_raw(
865        &mut self,
866        src: &str,
867        dst: &mut [u8],
868        _last: bool,
869    ) -> (EncoderResult, usize, usize) {
870        let bytes = src.as_bytes();
871        let mut to_write = bytes.len();
872        if to_write <= dst.len() {
873            (&mut dst[..to_write]).copy_from_slice(bytes);
874            return (EncoderResult::InputEmpty, to_write, to_write);
875        }
876        to_write = dst.len();
877        // Move back until we find a UTF-8 sequence boundary.
878        while (bytes[to_write] & 0xC0) == 0x80 {
879            to_write -= 1;
880        }
881        (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]);
882        (EncoderResult::OutputFull, to_write, to_write)
883    }
884}
885
886// Any copyright to the test code below this comment is dedicated to the
887// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
888
889#[cfg(test)]
890mod tests {
891    use super::super::testing::*;
892    use super::super::*;
893
894    //    fn decode_utf8_to_utf16(bytes: &[u8], expect: &[u16]) {
895    //        decode_to_utf16_without_replacement(UTF_8, bytes, expect);
896    //    }
897
898    fn decode_utf8_to_utf8(bytes: &[u8], expect: &str) {
899        decode_to_utf8(UTF_8, bytes, expect);
900    }
901
902    fn decode_valid_utf8(string: &str) {
903        decode_utf8_to_utf8(string.as_bytes(), string);
904    }
905
906    fn encode_utf8_from_utf16(string: &[u16], expect: &[u8]) {
907        encode_from_utf16(UTF_8, string, expect);
908    }
909
910    fn encode_utf8_from_utf8(string: &str, expect: &[u8]) {
911        encode_from_utf8(UTF_8, string, expect);
912    }
913
914    fn encode_utf8_from_utf16_with_output_limit(
915        string: &[u16],
916        expect: &str,
917        limit: usize,
918        expect_result: EncoderResult,
919    ) {
920        let mut dst = Vec::new();
921        {
922            dst.resize(limit, 0u8);
923            let mut encoder = UTF_8.new_encoder();
924            let (result, read, written) =
925                encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
926            assert_eq!(result, expect_result);
927            if expect_result == EncoderResult::InputEmpty {
928                assert_eq!(read, string.len());
929            }
930            assert_eq!(&dst[..written], expect.as_bytes());
931        }
932        {
933            dst.resize(64, 0u8);
934            for (i, elem) in dst.iter_mut().enumerate() {
935                *elem = i as u8;
936            }
937            let mut encoder = UTF_8.new_encoder();
938            let (_, _, mut j) =
939                encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
940            while j < dst.len() {
941                assert_eq!(usize::from(dst[j]), j);
942                j += 1;
943            }
944        }
945    }
946
947    #[test]
948    fn test_utf8_decode() {
949        // Empty
950        decode_valid_utf8("");
951        // ASCII
952        decode_valid_utf8("ab");
953        // Low BMP
954        decode_valid_utf8("a\u{E4}Z");
955        // High BMP
956        decode_valid_utf8("a\u{2603}Z");
957        // Astral
958        decode_valid_utf8("a\u{1F4A9}Z");
959        // Low BMP with last byte missing
960        decode_utf8_to_utf8(b"a\xC3Z", "a\u{FFFD}Z");
961        decode_utf8_to_utf8(b"a\xC3", "a\u{FFFD}");
962        // High BMP with last byte missing
963        decode_utf8_to_utf8(b"a\xE2\x98Z", "a\u{FFFD}Z");
964        decode_utf8_to_utf8(b"a\xE2\x98", "a\u{FFFD}");
965        // Astral with last byte missing
966        decode_utf8_to_utf8(b"a\xF0\x9F\x92Z", "a\u{FFFD}Z");
967        decode_utf8_to_utf8(b"a\xF0\x9F\x92", "a\u{FFFD}");
968        // Lone highest continuation
969        decode_utf8_to_utf8(b"a\xBFZ", "a\u{FFFD}Z");
970        decode_utf8_to_utf8(b"a\xBF", "a\u{FFFD}");
971        // Two lone highest continuations
972        decode_utf8_to_utf8(b"a\xBF\xBFZ", "a\u{FFFD}\u{FFFD}Z");
973        decode_utf8_to_utf8(b"a\xBF\xBF", "a\u{FFFD}\u{FFFD}");
974        // Low BMP followed by lowest lone continuation
975        decode_utf8_to_utf8(b"a\xC3\xA4\x80Z", "a\u{E4}\u{FFFD}Z");
976        decode_utf8_to_utf8(b"a\xC3\xA4\x80", "a\u{E4}\u{FFFD}");
977        // Low BMP followed by highest lone continuation
978        decode_utf8_to_utf8(b"a\xC3\xA4\xBFZ", "a\u{E4}\u{FFFD}Z");
979        decode_utf8_to_utf8(b"a\xC3\xA4\xBF", "a\u{E4}\u{FFFD}");
980        // High BMP followed by lowest lone continuation
981        decode_utf8_to_utf8(b"a\xE2\x98\x83\x80Z", "a\u{2603}\u{FFFD}Z");
982        decode_utf8_to_utf8(b"a\xE2\x98\x83\x80", "a\u{2603}\u{FFFD}");
983        // High BMP followed by highest lone continuation
984        decode_utf8_to_utf8(b"a\xE2\x98\x83\xBFZ", "a\u{2603}\u{FFFD}Z");
985        decode_utf8_to_utf8(b"a\xE2\x98\x83\xBF", "a\u{2603}\u{FFFD}");
986        // Astral followed by lowest lone continuation
987        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80Z", "a\u{1F4A9}\u{FFFD}Z");
988        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80", "a\u{1F4A9}\u{FFFD}");
989        // Astral followed by highest lone continuation
990        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBFZ", "a\u{1F4A9}\u{FFFD}Z");
991        decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBF", "a\u{1F4A9}\u{FFFD}");
992
993        // Boundary conditions
994        // Lowest single-byte
995        decode_valid_utf8("Z\x00");
996        decode_valid_utf8("Z\x00Z");
997        // Lowest single-byte as two-byte overlong sequence
998        decode_utf8_to_utf8(b"a\xC0\x80", "a\u{FFFD}\u{FFFD}");
999        decode_utf8_to_utf8(b"a\xC0\x80Z", "a\u{FFFD}\u{FFFD}Z");
1000        // Lowest single-byte as three-byte overlong sequence
1001        decode_utf8_to_utf8(b"a\xE0\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1002        decode_utf8_to_utf8(b"a\xE0\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1003        // Lowest single-byte as four-byte overlong sequence
1004        decode_utf8_to_utf8(b"a\xF0\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1005        decode_utf8_to_utf8(b"a\xF0\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1006        // One below lowest single-byte
1007        decode_utf8_to_utf8(b"a\xFF", "a\u{FFFD}");
1008        decode_utf8_to_utf8(b"a\xFFZ", "a\u{FFFD}Z");
1009        // Highest single-byte
1010        decode_valid_utf8("a\x7F");
1011        decode_valid_utf8("a\x7FZ");
1012        // Highest single-byte as two-byte overlong sequence
1013        decode_utf8_to_utf8(b"a\xC1\xBF", "a\u{FFFD}\u{FFFD}");
1014        decode_utf8_to_utf8(b"a\xC1\xBFZ", "a\u{FFFD}\u{FFFD}Z");
1015        // Highest single-byte as three-byte overlong sequence
1016        decode_utf8_to_utf8(b"a\xE0\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1017        decode_utf8_to_utf8(b"a\xE0\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1018        // Highest single-byte as four-byte overlong sequence
1019        decode_utf8_to_utf8(b"a\xF0\x80\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1020        decode_utf8_to_utf8(b"a\xF0\x80\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1021        // One past highest single byte (also lone continuation)
1022        decode_utf8_to_utf8(b"a\x80Z", "a\u{FFFD}Z");
1023        decode_utf8_to_utf8(b"a\x80", "a\u{FFFD}");
1024        // Two lone continuations
1025        decode_utf8_to_utf8(b"a\x80\x80Z", "a\u{FFFD}\u{FFFD}Z");
1026        decode_utf8_to_utf8(b"a\x80\x80", "a\u{FFFD}\u{FFFD}");
1027        // Three lone continuations
1028        decode_utf8_to_utf8(b"a\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1029        decode_utf8_to_utf8(b"a\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1030        // Four lone continuations
1031        decode_utf8_to_utf8(b"a\x80\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1032        decode_utf8_to_utf8(b"a\x80\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1033        // Lowest two-byte
1034        decode_utf8_to_utf8(b"a\xC2\x80", "a\u{0080}");
1035        decode_utf8_to_utf8(b"a\xC2\x80Z", "a\u{0080}Z");
1036        // Lowest two-byte as three-byte overlong sequence
1037        decode_utf8_to_utf8(b"a\xE0\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1038        decode_utf8_to_utf8(b"a\xE0\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1039        // Lowest two-byte as four-byte overlong sequence
1040        decode_utf8_to_utf8(b"a\xF0\x80\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1041        decode_utf8_to_utf8(b"a\xF0\x80\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1042        // Lead one below lowest two-byte
1043        decode_utf8_to_utf8(b"a\xC1\x80", "a\u{FFFD}\u{FFFD}");
1044        decode_utf8_to_utf8(b"a\xC1\x80Z", "a\u{FFFD}\u{FFFD}Z");
1045        // Trail one below lowest two-byte
1046        decode_utf8_to_utf8(b"a\xC2\x7F", "a\u{FFFD}\u{007F}");
1047        decode_utf8_to_utf8(b"a\xC2\x7FZ", "a\u{FFFD}\u{007F}Z");
1048        // Highest two-byte
1049        decode_utf8_to_utf8(b"a\xDF\xBF", "a\u{07FF}");
1050        decode_utf8_to_utf8(b"a\xDF\xBFZ", "a\u{07FF}Z");
1051        // Highest two-byte as three-byte overlong sequence
1052        decode_utf8_to_utf8(b"a\xE0\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1053        decode_utf8_to_utf8(b"a\xE0\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1054        // Highest two-byte as four-byte overlong sequence
1055        decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1056        decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1057        // Lowest three-byte
1058        decode_utf8_to_utf8(b"a\xE0\xA0\x80", "a\u{0800}");
1059        decode_utf8_to_utf8(b"a\xE0\xA0\x80Z", "a\u{0800}Z");
1060        // Lowest three-byte as four-byte overlong sequence
1061        decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1062        decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1063        // Highest below surrogates
1064        decode_utf8_to_utf8(b"a\xED\x9F\xBF", "a\u{D7FF}");
1065        decode_utf8_to_utf8(b"a\xED\x9F\xBFZ", "a\u{D7FF}Z");
1066        // Highest below surrogates as four-byte overlong sequence
1067        decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1068        decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1069        // First surrogate
1070        decode_utf8_to_utf8(b"a\xED\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1071        decode_utf8_to_utf8(b"a\xED\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1072        // First surrogate as four-byte overlong sequence
1073        decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1074        decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1075        // Last surrogate
1076        decode_utf8_to_utf8(b"a\xED\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1077        decode_utf8_to_utf8(b"a\xED\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1078        // Last surrogate as four-byte overlong sequence
1079        decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1080        decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1081        // Lowest above surrogates
1082        decode_utf8_to_utf8(b"a\xEE\x80\x80", "a\u{E000}");
1083        decode_utf8_to_utf8(b"a\xEE\x80\x80Z", "a\u{E000}Z");
1084        // Lowest above surrogates as four-byte overlong sequence
1085        decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1086        decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1087        // Highest three-byte
1088        decode_utf8_to_utf8(b"a\xEF\xBF\xBF", "a\u{FFFF}");
1089        decode_utf8_to_utf8(b"a\xEF\xBF\xBFZ", "a\u{FFFF}Z");
1090        // Highest three-byte as four-byte overlong sequence
1091        decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1092        decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1093        // Lowest four-byte
1094        decode_utf8_to_utf8(b"a\xF0\x90\x80\x80", "a\u{10000}");
1095        decode_utf8_to_utf8(b"a\xF0\x90\x80\x80Z", "a\u{10000}Z");
1096        // Highest four-byte
1097        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBF", "a\u{10FFFF}");
1098        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBFZ", "a\u{10FFFF}Z");
1099        // One past highest four-byte
1100        decode_utf8_to_utf8(b"a\xF4\x90\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1101        decode_utf8_to_utf8(b"a\xF4\x90\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1102
1103        // Highest four-byte with last byte replaced with 0xFF
1104        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFF", "a\u{FFFD}\u{FFFD}");
1105        decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFFZ", "a\u{FFFD}\u{FFFD}Z");
1106    }
1107
1108    #[test]
1109    fn test_utf8_encode() {
1110        // Empty
1111        encode_utf8_from_utf16(&[], b"");
1112        encode_utf8_from_utf8("", b"");
1113
1114        encode_utf8_from_utf16(&[0x0000], "\u{0000}".as_bytes());
1115        encode_utf8_from_utf16(&[0x007F], "\u{007F}".as_bytes());
1116        encode_utf8_from_utf16(&[0x0080], "\u{0080}".as_bytes());
1117        encode_utf8_from_utf16(&[0x07FF], "\u{07FF}".as_bytes());
1118        encode_utf8_from_utf16(&[0x0800], "\u{0800}".as_bytes());
1119        encode_utf8_from_utf16(&[0xD7FF], "\u{D7FF}".as_bytes());
1120        encode_utf8_from_utf16(&[0xD800], "\u{FFFD}".as_bytes());
1121        encode_utf8_from_utf16(&[0xD800, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
1122        encode_utf8_from_utf16(&[0xDFFF], "\u{FFFD}".as_bytes());
1123        encode_utf8_from_utf16(&[0xDFFF, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
1124        encode_utf8_from_utf16(&[0xE000], "\u{E000}".as_bytes());
1125        encode_utf8_from_utf16(&[0xFFFF], "\u{FFFF}".as_bytes());
1126        encode_utf8_from_utf16(&[0xD800, 0xDC00], "\u{10000}".as_bytes());
1127        encode_utf8_from_utf16(&[0xDBFF, 0xDFFF], "\u{10FFFF}".as_bytes());
1128        encode_utf8_from_utf16(&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}".as_bytes());
1129    }
1130
1131    #[test]
1132    fn test_encode_utf8_from_utf16_with_output_limit() {
1133        encode_utf8_from_utf16_with_output_limit(&[0x0062], "\u{62}", 1, EncoderResult::InputEmpty);
1134        encode_utf8_from_utf16_with_output_limit(&[0x00A7], "\u{A7}", 2, EncoderResult::InputEmpty);
1135        encode_utf8_from_utf16_with_output_limit(
1136            &[0x2603],
1137            "\u{2603}",
1138            3,
1139            EncoderResult::InputEmpty,
1140        );
1141        encode_utf8_from_utf16_with_output_limit(
1142            &[0xD83D, 0xDCA9],
1143            "\u{1F4A9}",
1144            4,
1145            EncoderResult::InputEmpty,
1146        );
1147
1148        encode_utf8_from_utf16_with_output_limit(&[0x00A7], "", 1, EncoderResult::OutputFull);
1149        encode_utf8_from_utf16_with_output_limit(&[0x2603], "", 2, EncoderResult::OutputFull);
1150        encode_utf8_from_utf16_with_output_limit(
1151            &[0xD83D, 0xDCA9],
1152            "",
1153            3,
1154            EncoderResult::OutputFull,
1155        );
1156
1157        encode_utf8_from_utf16_with_output_limit(
1158            &[0x0063, 0x0062],
1159            "\u{63}\u{62}",
1160            2,
1161            EncoderResult::InputEmpty,
1162        );
1163        encode_utf8_from_utf16_with_output_limit(
1164            &[0x0063, 0x00A7],
1165            "\u{63}\u{A7}",
1166            3,
1167            EncoderResult::InputEmpty,
1168        );
1169        encode_utf8_from_utf16_with_output_limit(
1170            &[0x0063, 0x2603],
1171            "\u{63}\u{2603}",
1172            4,
1173            EncoderResult::InputEmpty,
1174        );
1175        encode_utf8_from_utf16_with_output_limit(
1176            &[0x0063, 0xD83D, 0xDCA9],
1177            "\u{63}\u{1F4A9}",
1178            5,
1179            EncoderResult::InputEmpty,
1180        );
1181
1182        encode_utf8_from_utf16_with_output_limit(
1183            &[0x0063, 0x00A7],
1184            "\u{63}",
1185            2,
1186            EncoderResult::OutputFull,
1187        );
1188        encode_utf8_from_utf16_with_output_limit(
1189            &[0x0063, 0x2603],
1190            "\u{63}",
1191            3,
1192            EncoderResult::OutputFull,
1193        );
1194        encode_utf8_from_utf16_with_output_limit(
1195            &[0x0063, 0xD83D, 0xDCA9],
1196            "\u{63}",
1197            4,
1198            EncoderResult::OutputFull,
1199        );
1200
1201        encode_utf8_from_utf16_with_output_limit(
1202            &[0x00B6, 0x0062],
1203            "\u{B6}\u{62}",
1204            3,
1205            EncoderResult::InputEmpty,
1206        );
1207        encode_utf8_from_utf16_with_output_limit(
1208            &[0x00B6, 0x00A7],
1209            "\u{B6}\u{A7}",
1210            4,
1211            EncoderResult::InputEmpty,
1212        );
1213        encode_utf8_from_utf16_with_output_limit(
1214            &[0x00B6, 0x2603],
1215            "\u{B6}\u{2603}",
1216            5,
1217            EncoderResult::InputEmpty,
1218        );
1219        encode_utf8_from_utf16_with_output_limit(
1220            &[0x00B6, 0xD83D, 0xDCA9],
1221            "\u{B6}\u{1F4A9}",
1222            6,
1223            EncoderResult::InputEmpty,
1224        );
1225
1226        encode_utf8_from_utf16_with_output_limit(
1227            &[0x00B6, 0x00A7],
1228            "\u{B6}",
1229            3,
1230            EncoderResult::OutputFull,
1231        );
1232        encode_utf8_from_utf16_with_output_limit(
1233            &[0x00B6, 0x2603],
1234            "\u{B6}",
1235            4,
1236            EncoderResult::OutputFull,
1237        );
1238        encode_utf8_from_utf16_with_output_limit(
1239            &[0x00B6, 0xD83D, 0xDCA9],
1240            "\u{B6}",
1241            5,
1242            EncoderResult::OutputFull,
1243        );
1244
1245        encode_utf8_from_utf16_with_output_limit(
1246            &[0x263A, 0x0062],
1247            "\u{263A}\u{62}",
1248            4,
1249            EncoderResult::InputEmpty,
1250        );
1251        encode_utf8_from_utf16_with_output_limit(
1252            &[0x263A, 0x00A7],
1253            "\u{263A}\u{A7}",
1254            5,
1255            EncoderResult::InputEmpty,
1256        );
1257        encode_utf8_from_utf16_with_output_limit(
1258            &[0x263A, 0x2603],
1259            "\u{263A}\u{2603}",
1260            6,
1261            EncoderResult::InputEmpty,
1262        );
1263        encode_utf8_from_utf16_with_output_limit(
1264            &[0x263A, 0xD83D, 0xDCA9],
1265            "\u{263A}\u{1F4A9}",
1266            7,
1267            EncoderResult::InputEmpty,
1268        );
1269
1270        encode_utf8_from_utf16_with_output_limit(
1271            &[0x263A, 0x00A7],
1272            "\u{263A}",
1273            4,
1274            EncoderResult::OutputFull,
1275        );
1276        encode_utf8_from_utf16_with_output_limit(
1277            &[0x263A, 0x2603],
1278            "\u{263A}",
1279            5,
1280            EncoderResult::OutputFull,
1281        );
1282        encode_utf8_from_utf16_with_output_limit(
1283            &[0x263A, 0xD83D, 0xDCA9],
1284            "\u{263A}",
1285            6,
1286            EncoderResult::OutputFull,
1287        );
1288
1289        encode_utf8_from_utf16_with_output_limit(
1290            &[0xD83D, 0xDE0E, 0x0062],
1291            "\u{1F60E}\u{62}",
1292            5,
1293            EncoderResult::InputEmpty,
1294        );
1295        encode_utf8_from_utf16_with_output_limit(
1296            &[0xD83D, 0xDE0E, 0x00A7],
1297            "\u{1F60E}\u{A7}",
1298            6,
1299            EncoderResult::InputEmpty,
1300        );
1301        encode_utf8_from_utf16_with_output_limit(
1302            &[0xD83D, 0xDE0E, 0x2603],
1303            "\u{1F60E}\u{2603}",
1304            7,
1305            EncoderResult::InputEmpty,
1306        );
1307        encode_utf8_from_utf16_with_output_limit(
1308            &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
1309            "\u{1F60E}\u{1F4A9}",
1310            8,
1311            EncoderResult::InputEmpty,
1312        );
1313
1314        encode_utf8_from_utf16_with_output_limit(
1315            &[0xD83D, 0xDE0E, 0x00A7],
1316            "\u{1F60E}",
1317            5,
1318            EncoderResult::OutputFull,
1319        );
1320        encode_utf8_from_utf16_with_output_limit(
1321            &[0xD83D, 0xDE0E, 0x2603],
1322            "\u{1F60E}",
1323            6,
1324            EncoderResult::OutputFull,
1325        );
1326        encode_utf8_from_utf16_with_output_limit(
1327            &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
1328            "\u{1F60E}",
1329            7,
1330            EncoderResult::OutputFull,
1331        );
1332
1333        encode_utf8_from_utf16_with_output_limit(
1334            &[0x0063, 0x00B6, 0x0062, 0x0062],
1335            "\u{63}\u{B6}\u{62}\u{62}",
1336            5,
1337            EncoderResult::InputEmpty,
1338        );
1339        encode_utf8_from_utf16_with_output_limit(
1340            &[0x0063, 0x00B6, 0x0062, 0x0062],
1341            "\u{63}\u{B6}\u{62}",
1342            4,
1343            EncoderResult::OutputFull,
1344        );
1345
1346        encode_utf8_from_utf16_with_output_limit(
1347            &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
1348            "\u{63}\u{B6}\u{62}\u{62}\u{62}",
1349            6,
1350            EncoderResult::InputEmpty,
1351        );
1352        encode_utf8_from_utf16_with_output_limit(
1353            &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
1354            "\u{63}\u{B6}\u{62}\u{62}",
1355            5,
1356            EncoderResult::OutputFull,
1357        );
1358
1359        encode_utf8_from_utf16_with_output_limit(
1360            &[0x263A, 0x0062, 0x0062],
1361            "\u{263A}\u{62}\u{62}",
1362            5,
1363            EncoderResult::InputEmpty,
1364        );
1365        encode_utf8_from_utf16_with_output_limit(
1366            &[0x263A, 0x0062, 0x0062],
1367            "\u{263A}\u{62}",
1368            4,
1369            EncoderResult::OutputFull,
1370        );
1371
1372        encode_utf8_from_utf16_with_output_limit(
1373            &[0x263A, 0x0062, 0x0062, 0x0062],
1374            "\u{263A}\u{62}\u{62}\u{62}",
1375            6,
1376            EncoderResult::InputEmpty,
1377        );
1378        encode_utf8_from_utf16_with_output_limit(
1379            &[0x263A, 0x0062, 0x0062, 0x0062],
1380            "\u{263A}\u{62}\u{62}",
1381            5,
1382            EncoderResult::OutputFull,
1383        );
1384
1385        encode_utf8_from_utf16_with_output_limit(
1386            &[0x0063, 0x00B6, 0x00A7],
1387            "\u{63}\u{B6}\u{A7}",
1388            5,
1389            EncoderResult::InputEmpty,
1390        );
1391        encode_utf8_from_utf16_with_output_limit(
1392            &[0x0063, 0x00B6, 0x00A7],
1393            "\u{63}\u{B6}",
1394            4,
1395            EncoderResult::OutputFull,
1396        );
1397
1398        encode_utf8_from_utf16_with_output_limit(
1399            &[0x0063, 0x00B6, 0x00A7, 0x0062],
1400            "\u{63}\u{B6}\u{A7}\u{62}",
1401            6,
1402            EncoderResult::InputEmpty,
1403        );
1404        encode_utf8_from_utf16_with_output_limit(
1405            &[0x0063, 0x00B6, 0x00A7, 0x0062],
1406            "\u{63}\u{B6}\u{A7}",
1407            5,
1408            EncoderResult::OutputFull,
1409        );
1410
1411        encode_utf8_from_utf16_with_output_limit(
1412            &[0x263A, 0x00A7, 0x0062],
1413            "\u{263A}\u{A7}\u{62}",
1414            6,
1415            EncoderResult::InputEmpty,
1416        );
1417        encode_utf8_from_utf16_with_output_limit(
1418            &[0x263A, 0x00A7, 0x0062],
1419            "\u{263A}\u{A7}",
1420            5,
1421            EncoderResult::OutputFull,
1422        );
1423
1424        encode_utf8_from_utf16_with_output_limit(
1425            &[0x0063, 0x00B6, 0x0062, 0x00A7],
1426            "\u{63}\u{B6}\u{62}\u{A7}",
1427            6,
1428            EncoderResult::InputEmpty,
1429        );
1430        encode_utf8_from_utf16_with_output_limit(
1431            &[0x0063, 0x00B6, 0x0062, 0x00A7],
1432            "\u{63}\u{B6}\u{62}",
1433            5,
1434            EncoderResult::OutputFull,
1435        );
1436
1437        encode_utf8_from_utf16_with_output_limit(
1438            &[0x263A, 0x0062, 0x00A7],
1439            "\u{263A}\u{62}\u{A7}",
1440            6,
1441            EncoderResult::InputEmpty,
1442        );
1443        encode_utf8_from_utf16_with_output_limit(
1444            &[0x263A, 0x0062, 0x00A7],
1445            "\u{263A}\u{62}",
1446            5,
1447            EncoderResult::OutputFull,
1448        );
1449
1450        encode_utf8_from_utf16_with_output_limit(
1451            &[0x0063, 0x00B6, 0x2603],
1452            "\u{63}\u{B6}\u{2603}",
1453            6,
1454            EncoderResult::InputEmpty,
1455        );
1456        encode_utf8_from_utf16_with_output_limit(
1457            &[0x0063, 0x00B6, 0x2603],
1458            "\u{63}\u{B6}",
1459            5,
1460            EncoderResult::OutputFull,
1461        );
1462
1463        encode_utf8_from_utf16_with_output_limit(
1464            &[0x263A, 0x2603],
1465            "\u{263A}\u{2603}",
1466            6,
1467            EncoderResult::InputEmpty,
1468        );
1469        encode_utf8_from_utf16_with_output_limit(
1470            &[0x263A, 0x2603],
1471            "\u{263A}",
1472            5,
1473            EncoderResult::OutputFull,
1474        );
1475
1476        encode_utf8_from_utf16_with_output_limit(
1477            &[0x0063, 0x00B6, 0xD83D],
1478            "\u{63}\u{B6}\u{FFFD}",
1479            6,
1480            EncoderResult::InputEmpty,
1481        );
1482        encode_utf8_from_utf16_with_output_limit(
1483            &[0x0063, 0x00B6, 0xD83D],
1484            "\u{63}\u{B6}",
1485            5,
1486            EncoderResult::OutputFull,
1487        );
1488
1489        encode_utf8_from_utf16_with_output_limit(
1490            &[0x263A, 0xD83D],
1491            "\u{263A}\u{FFFD}",
1492            6,
1493            EncoderResult::InputEmpty,
1494        );
1495        encode_utf8_from_utf16_with_output_limit(
1496            &[0x263A, 0xD83D],
1497            "\u{263A}",
1498            5,
1499            EncoderResult::OutputFull,
1500        );
1501
1502        encode_utf8_from_utf16_with_output_limit(
1503            &[0x0063, 0x00B6, 0xDCA9],
1504            "\u{63}\u{B6}\u{FFFD}",
1505            6,
1506            EncoderResult::InputEmpty,
1507        );
1508        encode_utf8_from_utf16_with_output_limit(
1509            &[0x0063, 0x00B6, 0xDCA9],
1510            "\u{63}\u{B6}",
1511            5,
1512            EncoderResult::OutputFull,
1513        );
1514
1515        encode_utf8_from_utf16_with_output_limit(
1516            &[0x263A, 0xDCA9],
1517            "\u{263A}\u{FFFD}",
1518            6,
1519            EncoderResult::InputEmpty,
1520        );
1521        encode_utf8_from_utf16_with_output_limit(
1522            &[0x263A, 0xDCA9],
1523            "\u{263A}",
1524            5,
1525            EncoderResult::OutputFull,
1526        );
1527    }
1528
1529    #[test]
1530    fn test_utf8_max_length_from_utf16() {
1531        let mut encoder = UTF_8.new_encoder();
1532        let mut output = [0u8; 13];
1533        let input = &[0x2C9Fu16, 0x2CA9u16, 0x2CA3u16, 0x2C9Fu16];
1534        let needed = encoder
1535            .max_buffer_length_from_utf16_without_replacement(input.len())
1536            .unwrap();
1537        let (result, _, _) =
1538            encoder.encode_from_utf16_without_replacement(input, &mut output[..needed], true);
1539        assert_eq!(result, EncoderResult::InputEmpty);
1540    }
1541
1542    #[test]
1543    fn test_decode_bom_prefixed_split_byte_triple() {
1544        let mut output = [0u16; 20];
1545        let mut decoder = UTF_8.new_decoder();
1546        {
1547            let needed = decoder.max_utf16_buffer_length(1).unwrap();
1548            let (result, read, written, had_errors) =
1549                decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false);
1550            assert_eq!(result, CoderResult::InputEmpty);
1551            assert_eq!(read, 1);
1552            assert_eq!(written, 0);
1553            assert!(!had_errors);
1554        }
1555        {
1556            let needed = decoder.max_utf16_buffer_length(1).unwrap();
1557            let (result, read, written, had_errors) =
1558                decoder.decode_to_utf16(b"\xBF", &mut output[..needed], false);
1559            assert_eq!(result, CoderResult::InputEmpty);
1560            assert_eq!(read, 1);
1561            assert_eq!(written, 0);
1562            assert!(!had_errors);
1563        }
1564        {
1565            let needed = decoder.max_utf16_buffer_length(1).unwrap();
1566            let (result, read, written, had_errors) =
1567                decoder.decode_to_utf16(b"\xBE", &mut output[..needed], true);
1568            assert_eq!(result, CoderResult::InputEmpty);
1569            assert_eq!(read, 1);
1570            assert_eq!(written, 1);
1571            assert!(!had_errors);
1572            assert_eq!(output[0], 0xFFFE);
1573        }
1574    }
1575
1576    #[test]
1577    fn test_decode_bom_prefixed_split_byte_pair() {
1578        let mut output = [0u16; 20];
1579        let mut decoder = UTF_8.new_decoder();
1580        {
1581            let needed = decoder.max_utf16_buffer_length(1).unwrap();
1582            let (result, read, written, had_errors) =
1583                decoder.decode_to_utf16(b"\xEF", &mut output[..needed], false);
1584            assert_eq!(result, CoderResult::InputEmpty);
1585            assert_eq!(read, 1);
1586            assert_eq!(written, 0);
1587            assert!(!had_errors);
1588        }
1589        {
1590            let needed = decoder.max_utf16_buffer_length(1).unwrap();
1591            let (result, read, written, had_errors) =
1592                decoder.decode_to_utf16(b"\xBC", &mut output[..needed], true);
1593            assert_eq!(result, CoderResult::InputEmpty);
1594            assert_eq!(read, 1);
1595            assert_eq!(written, 1);
1596            assert!(had_errors);
1597            assert_eq!(output[0], 0xFFFD);
1598        }
1599    }
1600
1601    #[test]
1602    fn test_decode_bom_prefix() {
1603        let mut output = [0u16; 20];
1604        let mut decoder = UTF_8.new_decoder();
1605        {
1606            let needed = decoder.max_utf16_buffer_length(1).unwrap();
1607            let (result, read, written, had_errors) =
1608                decoder.decode_to_utf16(b"\xEF", &mut output[..needed], true);
1609            assert_eq!(result, CoderResult::InputEmpty);
1610            assert_eq!(read, 1);
1611            assert_eq!(written, 1);
1612            assert!(had_errors);
1613            assert_eq!(output[0], 0xFFFD);
1614        }
1615    }
1616
1617    #[test]
1618    fn test_tail() {
1619        let mut output = [0u16; 1];
1620        let mut decoder = UTF_8.new_decoder_without_bom_handling();
1621        {
1622            let (result, read, written, had_errors) =
1623                decoder.decode_to_utf16("\u{E4}a".as_bytes(), &mut output[..], false);
1624            assert_eq!(result, CoderResult::OutputFull);
1625            assert_eq!(read, 2);
1626            assert_eq!(written, 1);
1627            assert!(!had_errors);
1628            assert_eq!(output[0], 0x00E4);
1629        }
1630    }
1631}
encoding_rs/utf_8.rs

encoding_rs/
utf_8.rs