1use super::*;
11use crate::ascii::ascii_to_basic_latin;
12use crate::ascii::basic_latin_to_ascii;
13use crate::ascii::validate_ascii;
14use crate::handles::*;
15use crate::mem::convert_utf16_to_utf8_partial;
16use crate::variant::*;
17
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        // With the `simd-accel` feature enabled, the branch-hint functions
        // come from `std::intrinsics`.
        use ::std::intrinsics::unlikely;
        use ::std::intrinsics::likely;
    } else {
        // Without `simd-accel`, provide same-named stand-ins that return their
        // argument unchanged. Call sites in this file wrap every call in an
        // `unsafe` block; the stand-ins are declared `unsafe`, presumably to
        // mirror the signatures of the intrinsics (confirm against the
        // toolchain in use).
        #[inline(always)]
        unsafe fn unlikely(b: bool) -> bool {
            b
        }
        #[inline(always)]
        unsafe fn likely(b: bool) -> bool {
            b
        }
    }
}
35
/// Wrapper that gives the UTF-8 validation lookup table a 64-byte alignment
/// (presumably to keep it on cache-line boundaries; confirm for the targets
/// of interest).
#[repr(align(64))]
pub struct Utf8Data {
    /// 384 one-byte entries. The code in this file indexes it with a trail
    /// byte directly (`0..=0xFF`) and with `lead byte + 0x80`.
    pub table: [u8; 384],
}
40
/// Lookup table used to validate the second byte of multi-byte UTF-8
/// sequences against the lead byte.
///
/// Layout, as read from the values below and from the call sites in this file:
///
/// * Entries `0x00..=0xFF` are indexed by the *second* byte of a sequence.
///   Each entry is a set of "forbidden" bits:
///   `0x00..=0x7F` and `0xC0..=0xFF` -> 252, `0x80..=0x8F` -> 84,
///   `0x90..=0x9F` -> 148, `0xA0..=0xBF` -> 164.
/// * Entries `0x100..=0x17F` are indexed by `lead + 0x80` for a lead byte
///   `>= 0x80` and hold a single selector bit:
///   4 for `0x80..=0xC1` and `0xF5..=0xFF` (bit 4 is set in every second-byte
///   entry, so these leads never pass), 8 for the leads with an unrestricted
///   `0x80..=0xBF` second byte, 16 for `0xE0`, 32 for `0xED`, 64 for `0xF0`
///   and 128 for `0xF4`.
///
/// A lead/second pair is acceptable exactly when
/// `table[second] & table[lead + 0x80] == 0`.
pub static UTF8_DATA: Utf8Data = Utf8Data {
    table: [
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 84, 148, 148, 148,
        148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 148, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164, 164,
        164, 164, 164, 164, 164, 164, 164, 164, 164, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252, 252,
        252, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
        4, 4, 4, 4, 4, 4, 4, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8,
        8, 8, 8, 8, 8, 8, 8, 16, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 8, 32, 8, 8, 64, 8, 8, 8, 128, 4,
        4, 4, 4, 4, 4, 4, 4, 4, 4, 4,
    ],
};
67
/// Returns how many leading bytes of `src` are accepted as well-formed UTF-8.
///
/// The result is `src.len()` when nothing is rejected. Otherwise it is the
/// index of the lead byte of the first sequence that is rejected, either
/// because a byte fails the checks below or because the sequence is cut off
/// by the end of `src`.
///
/// Multi-byte sequences are checked with `UTF8_DATA.table`: the entry for the
/// second byte is ANDed with the entry at `lead + 0x80`, and any surviving bit
/// marks the pair as invalid. The remaining trail bytes are checked by
/// testing that their top two bits are `10`.
pub fn utf8_valid_up_to(src: &[u8]) -> usize {
    let mut read = 0;
    'outer: loop {
        // Let `validate_ascii` skip over the ASCII run at the current
        // position. `None` is treated as "everything that was left is fine";
        // otherwise it yields the offending byte and the number of bytes
        // before it (presumably `non_ascii == src[read]` after the update;
        // confirm against `validate_ascii`).
        let mut byte = {
            let src_remaining = &src[read..];
            match validate_ascii(src_remaining) {
                None => {
                    return src.len();
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    non_ascii
                }
            }
        };
        // Fast path: while at least four bytes remain from `read`, even the
        // longest sequence fits, so the reads at `read + 1 ..= read + 3` can
        // skip bounds checks.
        if unsafe { likely(read + 4 <= src.len()) } {
            'inner: loop {
                // `byte` is the lead at `src[read]`; `read` is advanced only
                // after the whole sequence has been accepted.
                if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
                    // Two-byte sequence.
                    // SAFETY: `read + 4 <= src.len()` holds here.
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    read += 2;

                    // Fetch the next lead; ASCII goes back to the ASCII
                    // skipper, anything else is classified again.
                    if unsafe { likely(read + 4 <= src.len()) } {
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            read += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if unsafe { likely(byte < 0xF0) } {
                    'three: loop {
                        // Three-byte sequence. Leads in 0x80..=0xC1 also land
                        // here; their table entry (4) makes the check fail.
                        // SAFETY: `read + 4 <= src.len()` holds here.
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // The table AND must be zero and `third >> 6` must be
                        // 2 (top bits `10`) for the result to equal 2.
                        // `byte as usize + 0x80` is at most 0x17F, inside the
                        // 384-entry table.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        read += 3;

                        // Fetch the next lead; stay in this loop for another
                        // three-byte lead.
                        if unsafe { likely(read + 4 <= src.len()) } {
                            byte = unsafe { *(src.get_unchecked(read)) };
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                continue 'three;
                            }
                            if unsafe { likely(byte < 0x80) } {
                                read += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte sequence (lead >= 0xF0; leads above 0xF4 fail via
                // their table entry).
                // SAFETY: `read + 4 <= src.len()` holds here.
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // 0x202 requires: table AND == 0, `third >> 6 == 2`, and
                // `fourth & 0xC0 == 0x80` (shifted into bits 8-9).
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                read += 4;

                // Fetch the next lead.
                if unsafe { likely(read + 4 <= src.len()) } {
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        read += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // Fewer than four bytes remain, so only sequences of up to three
        // bytes can still be complete. This path uses checked indexing.
        'tail: loop {
            if read >= src.len() {
                break 'outer;
            }
            byte = src[read];
            if byte < 0x80 {
                read += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte sequence; stop if it is cut off by the end.
                let new_read = read + 2;
                if new_read > src.len() {
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                read += 2;
                continue 'tail;
            }
            // Four-byte leads are excluded here: a four-byte sequence cannot
            // fit in the tail, and the three-byte check would not cover its
            // fourth byte.
            if byte < 0xF0 {
                // Three-byte sequence; stop if it is cut off by the end.
                let new_read = read + 3;
                if new_read > src.len() {
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                read += 3;
                // The tail starts with at most three bytes, so nothing can
                // follow a complete three-byte sequence.
                break 'outer;
            }
            break 'outer;
        }
    }
    read
}
230
/// Converts UTF-8 in `src` to UTF-16 in `dst` until the input ends, the
/// output is full, or a sequence is rejected.
///
/// Returns `(read, written)`: the number of bytes consumed from `src` and the
/// number of `u16` code units stored in `dst`. `read` never includes a
/// sequence that was rejected or did not fit; a four-byte sequence needs two
/// free output slots because it becomes a surrogate pair.
///
/// Sequence validation uses `UTF8_DATA.table`: the entry for the second byte
/// is ANDed with the entry at `lead + 0x80`, and any surviving bit marks the
/// pair as invalid.
#[cfg_attr(feature = "cargo-clippy", allow(never_loop, cyclomatic_complexity))]
pub fn convert_utf8_to_utf16_up_to_invalid(src: &[u8], dst: &mut [u16]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Hand the ASCII run at the current position to
        // `ascii_to_basic_latin`, limited to whichever of input or output is
        // shorter. `None` is treated as "all `length` units were copied";
        // otherwise it yields the non-ASCII byte and how many units were
        // copied before it (presumably fewer than `length`, which is what
        // keeps `written < dst.len()` below; confirm against the helper).
        let mut byte = {
            let src_remaining = &src[read..];
            let dst_remaining = &mut dst[written..];
            let length = ::std::cmp::min(src_remaining.len(), dst_remaining.len());
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    read += length;
                    written += length;
                    break 'outer;
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        // Fast path: while at least four bytes remain from `read`, even the
        // longest sequence fits, so the reads at `read + 1 ..= read + 3` can
        // skip bounds checks.
        if unsafe { likely(read + 4 <= src.len()) } {
            'inner: loop {
                // `byte` is the lead at `src[read]`; `read` and `written` are
                // advanced only after the sequence was accepted and stored.
                if unsafe { likely(in_inclusive_range8(byte, 0xC2, 0xDF)) } {
                    // Two-byte sequence -> one code unit.
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        break 'outer;
                    }
                    unsafe {
                        *(dst.get_unchecked_mut(written)) =
                            ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F)
                    };
                    read += 2;
                    written += 1;

                    // Re-establish "one free output slot" before the next
                    // unchecked write.
                    if written == dst.len() {
                        break 'outer;
                    }
                    if unsafe { likely(read + 4 <= src.len()) } {
                        byte = unsafe { *(src.get_unchecked(read)) };
                        if byte < 0x80 {
                            unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                            read += 1;
                            written += 1;
                            continue 'outer;
                        }
                        continue 'inner;
                    }
                    break 'inner;
                }
                if unsafe { likely(byte < 0xF0) } {
                    'three: loop {
                        // Three-byte sequence -> one code unit. Leads in
                        // 0x80..=0xC1 also land here and fail the table check.
                        let second = unsafe { *(src.get_unchecked(read + 1)) };
                        let third = unsafe { *(src.get_unchecked(read + 2)) };
                        // Equals 2 only if the table AND is zero and `third`
                        // has top bits `10`.
                        if ((UTF8_DATA.table[usize::from(second)]
                            & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                            | (third >> 6))
                            != 2
                        {
                            break 'outer;
                        }
                        let point = ((u16::from(byte) & 0xF) << 12)
                            | ((u16::from(second) & 0x3F) << 6)
                            | (u16::from(third) & 0x3F);
                        unsafe { *(dst.get_unchecked_mut(written)) = point };
                        read += 3;
                        written += 1;

                        // Re-establish "one free output slot".
                        if written == dst.len() {
                            break 'outer;
                        }
                        if unsafe { likely(read + 4 <= src.len()) } {
                            byte = unsafe { *(src.get_unchecked(read)) };
                            if in_inclusive_range8(byte, 0xE0, 0xEF) {
                                continue 'three;
                            }
                            if unsafe { likely(byte < 0x80) } {
                                unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                                read += 1;
                                written += 1;
                                continue 'outer;
                            }
                            continue 'inner;
                        }
                        break 'inner;
                    }
                }
                // Four-byte sequence -> surrogate pair, which needs two slots;
                // with exactly one slot left, stop without consuming it.
                if written + 1 == dst.len() {
                    break 'outer;
                }
                let second = unsafe { *(src.get_unchecked(read + 1)) };
                let third = unsafe { *(src.get_unchecked(read + 2)) };
                let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                // 0x202 requires: table AND == 0, `third >> 6 == 2`, and
                // `fourth & 0xC0 == 0x80` (shifted into bits 8-9).
                if (u16::from(
                    UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) },
                ) | u16::from(third >> 6)
                    | (u16::from(fourth & 0xC0) << 2))
                    != 0x202
                {
                    break 'outer;
                }
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // 0xD7C0 is 0xD800 - (0x10000 >> 10): subtracting 0x10000 from
                // the code point is folded into the high-surrogate base.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;

                // Re-establish "one free output slot".
                if written == dst.len() {
                    break 'outer;
                }
                if unsafe { likely(read + 4 <= src.len()) } {
                    byte = unsafe { *(src.get_unchecked(read)) };
                    if byte < 0x80 {
                        unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                        read += 1;
                        written += 1;
                        continue 'outer;
                    }
                    continue 'inner;
                }
                break 'inner;
            }
        }
        // Fewer than four bytes remain, so only sequences of up to three
        // bytes can still be complete. This path uses checked indexing.
        'tail: loop {
            if read >= src.len() || written >= dst.len() {
                break 'outer;
            }
            byte = src[read];
            if byte < 0x80 {
                dst[written] = u16::from(byte);
                read += 1;
                written += 1;
                continue 'tail;
            }
            if in_inclusive_range8(byte, 0xC2, 0xDF) {
                // Two-byte sequence; stop if it is cut off by the end.
                let new_read = read + 2;
                if new_read > src.len() {
                    break 'outer;
                }
                let second = src[read + 1];
                if !in_inclusive_range8(second, 0x80, 0xBF) {
                    break 'outer;
                }
                dst[written] = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                read += 2;
                written += 1;
                continue 'tail;
            }
            // Four-byte leads are excluded: such a sequence cannot fit in the
            // tail.
            if byte < 0xF0 {
                // Three-byte sequence; stop if it is cut off by the end.
                let new_read = read + 3;
                if new_read > src.len() {
                    break 'outer;
                }
                let second = src[read + 1];
                let third = src[read + 2];
                if ((UTF8_DATA.table[usize::from(second)]
                    & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                    | (third >> 6))
                    != 2
                {
                    break 'outer;
                }
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                dst[written] = point;
                read += 3;
                written += 1;
                // The tail starts with at most three bytes, so nothing can
                // follow a complete three-byte sequence.
                break 'outer;
            }
            break 'outer;
        }
    }
    (read, written)
}
449
/// State of the byte-at-a-time UTF-8 decoder used for sequences that straddle
/// buffer boundaries.
pub struct Utf8Decoder {
    /// Code point bits accumulated so far for the sequence in progress.
    code_point: u32,
    /// Number of trail bytes accepted so far in the current sequence.
    bytes_seen: usize,
    /// Number of trail bytes the current lead byte calls for (1, 2 or 3);
    /// 0 means no sequence is in progress.
    bytes_needed: usize,
    /// Smallest value accepted for the next trail byte.
    lower_boundary: u8,
    /// Largest value accepted for the next trail byte.
    upper_boundary: u8,
}
457
458impl Utf8Decoder {
459 pub fn new_inner() -> Utf8Decoder {
460 Utf8Decoder {
461 code_point: 0,
462 bytes_seen: 0,
463 bytes_needed: 0,
464 lower_boundary: 0x80u8,
465 upper_boundary: 0xBFu8,
466 }
467 }
468
469 pub fn new() -> VariantDecoder {
470 VariantDecoder::Utf8(Utf8Decoder::new_inner())
471 }
472
473 pub fn in_neutral_state(&self) -> bool {
474 self.bytes_needed == 0
475 }
476
477 fn extra_from_state(&self) -> usize {
478 if self.bytes_needed == 0 {
479 0
480 } else {
481 self.bytes_seen + 1
482 }
483 }
484
485 pub fn max_utf16_buffer_length(&self, byte_length: usize) -> Option<usize> {
486 byte_length.checked_add(1 + self.extra_from_state())
487 }
488
489 pub fn max_utf8_buffer_length_without_replacement(&self, byte_length: usize) -> Option<usize> {
490 byte_length.checked_add(3 + self.extra_from_state())
491 }
492
493 pub fn max_utf8_buffer_length(&self, byte_length: usize) -> Option<usize> {
494 checked_add(
495 3,
496 checked_mul(3, byte_length.checked_add(self.extra_from_state())),
497 )
498 }
499
500 decoder_functions!(
501 {},
502 {
503 if self.bytes_needed == 0 {
506 dest.copy_utf8_up_to_invalid_from(&mut source);
507 }
508 },
509 {
510 if self.bytes_needed != 0 {
511 let bad_bytes = (self.bytes_seen + 1) as u8;
512 self.code_point = 0;
513 self.bytes_needed = 0;
514 self.bytes_seen = 0;
515 return (
516 DecoderResult::Malformed(bad_bytes, 0),
517 src_consumed,
518 dest.written(),
519 );
520 }
521 },
522 {
523 if self.bytes_needed == 0 {
524 if b < 0x80u8 {
525 destination_handle.write_ascii(b);
526 continue;
527 }
528 if b < 0xC2u8 {
529 return (
530 DecoderResult::Malformed(1, 0),
531 unread_handle.consumed(),
532 destination_handle.written(),
533 );
534 }
535 if b < 0xE0u8 {
536 self.bytes_needed = 1;
537 self.code_point = u32::from(b) & 0x1F;
538 continue;
539 }
540 if b < 0xF0u8 {
541 if b == 0xE0u8 {
542 self.lower_boundary = 0xA0u8;
543 } else if b == 0xEDu8 {
544 self.upper_boundary = 0x9Fu8;
545 }
546 self.bytes_needed = 2;
547 self.code_point = u32::from(b) & 0xF;
548 continue;
549 }
550 if b < 0xF5u8 {
551 if b == 0xF0u8 {
552 self.lower_boundary = 0x90u8;
553 } else if b == 0xF4u8 {
554 self.upper_boundary = 0x8Fu8;
555 }
556 self.bytes_needed = 3;
557 self.code_point = u32::from(b) & 0x7;
558 continue;
559 }
560 return (
561 DecoderResult::Malformed(1, 0),
562 unread_handle.consumed(),
563 destination_handle.written(),
564 );
565 }
566 if !(b >= self.lower_boundary && b <= self.upper_boundary) {
568 let bad_bytes = (self.bytes_seen + 1) as u8;
569 self.code_point = 0;
570 self.bytes_needed = 0;
571 self.bytes_seen = 0;
572 self.lower_boundary = 0x80u8;
573 self.upper_boundary = 0xBFu8;
574 return (
575 DecoderResult::Malformed(bad_bytes, 0),
576 unread_handle.unread(),
577 destination_handle.written(),
578 );
579 }
580 self.lower_boundary = 0x80u8;
581 self.upper_boundary = 0xBFu8;
582 self.code_point = (self.code_point << 6) | (u32::from(b) & 0x3F);
583 self.bytes_seen += 1;
584 if self.bytes_seen != self.bytes_needed {
585 continue;
586 }
587 if self.bytes_needed == 3 {
588 destination_handle.write_astral(self.code_point);
589 } else {
590 destination_handle.write_bmp_excl_ascii(self.code_point as u16);
591 }
592 self.code_point = 0;
593 self.bytes_needed = 0;
594 self.bytes_seen = 0;
595 continue;
596 },
597 self,
598 src_consumed,
599 dest,
600 source,
601 b,
602 destination_handle,
603 unread_handle,
604 check_space_astral
605 );
606}
607
/// Converts UTF-16 in `src` to UTF-8 in `dst` for as long as at least four
/// bytes of output space remain before each non-ASCII code unit.
///
/// Returns `(read, written)`: the number of code units consumed and the
/// number of bytes stored. Unpaired surrogates are written as `EF BF BD`
/// (U+FFFD). When fewer than four bytes of output remain at a non-ASCII
/// unit, the function returns with that unit unconsumed.
///
/// # Panics
///
/// Panics if `written + 4` overflows `usize`.
#[cfg_attr(feature = "cargo-clippy", allow(never_loop))]
#[inline(never)]
pub fn convert_utf16_to_utf8_partial_inner(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Hand the Basic Latin run at the current position to
        // `basic_latin_to_ascii`, limited to whichever of input or output is
        // shorter. `None` is treated as "all `length` units were copied";
        // otherwise it yields the non-ASCII unit and how many units were
        // copied before it (presumably `non_ascii == src[read]` after the
        // update; confirm against the helper).
        let mut unit = {
            let src_remaining = &src[read..];
            let dst_remaining = &mut dst[written..];
            let length = if dst_remaining.len() < src_remaining.len() {
                dst_remaining.len()
            } else {
                src_remaining.len()
            };
            match unsafe {
                basic_latin_to_ascii(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    read += length;
                    written += length;
                    return (read, written);
                }
                Some((non_ascii, consumed)) => {
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // This loop never iterates; `break` is used as a forward jump to
            // the code after it.
            loop {
                // Require room for the longest (four-byte) output up front.
                // SAFETY of the unchecked writes below: each branch writes at
                // most four bytes starting at `written`.
                if written.checked_add(4).unwrap() > dst.len() {
                    return (read, written);
                }
                read += 1;
                if unit < 0x800 {
                    // Two-byte output.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) = (unit >> 6) as u8 | 0xC0u8;
                        written += 1;
                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
                        written += 1;
                    }
                    break;
                }
                let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
                if unsafe { likely(unit_minus_surrogate_start > (0xDFFF - 0xD800)) } {
                    // Not a surrogate: three-byte output.
                    unsafe {
                        *(dst.get_unchecked_mut(written)) = (unit >> 12) as u8 | 0xE0u8;
                        written += 1;
                        *(dst.get_unchecked_mut(written)) = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
                        written += 1;
                        *(dst.get_unchecked_mut(written)) = (unit & 0x3F) as u8 | 0x80u8;
                        written += 1;
                    }
                    break;
                }
                if unsafe { likely(unit_minus_surrogate_start <= (0xDBFF - 0xD800)) } {
                    // High surrogate.
                    if read >= src.len() {
                        debug_assert_eq!(read, src.len());
                        // High surrogate is the last unit of `src`: emit
                        // U+FFFD and finish.
                        unsafe {
                            *(dst.get_unchecked_mut(written)) = 0xEFu8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) = 0xBFu8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) = 0xBDu8;
                            written += 1;
                        }
                        return (read, written);
                    }
                    let second = src[read];
                    let second_minus_low_surrogate_start = second.wrapping_sub(0xDC00);
                    if unsafe { likely(second_minus_low_surrogate_start <= (0xDFFF - 0xDC00)) } {
                        // Valid pair: consume the low surrogate as well.
                        read += 1;
                        // Same as ((unit - 0xD800) << 10) + (second - 0xDC00)
                        // + 0x10000, with the constants folded together.
                        let astral = (u32::from(unit) << 10) + u32::from(second)
                            - (((0xD800u32 << 10) - 0x10000u32) + 0xDC00u32);
                        unsafe {
                            *(dst.get_unchecked_mut(written)) = (astral >> 18) as u8 | 0xF0u8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) =
                                ((astral & 0x3F000u32) >> 12) as u8 | 0x80u8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) =
                                ((astral & 0xFC0u32) >> 6) as u8 | 0x80u8;
                            written += 1;
                            *(dst.get_unchecked_mut(written)) = (astral & 0x3F) as u8 | 0x80u8;
                            written += 1;
                        }
                        break;
                    }
                    // Not followed by a low surrogate: leave the next unit
                    // unconsumed and fall through to the U+FFFD output.
                }
                // Unpaired surrogate (lone low, or high without a low): emit
                // U+FFFD.
                unsafe {
                    *(dst.get_unchecked_mut(written)) = 0xEFu8;
                    written += 1;
                    *(dst.get_unchecked_mut(written)) = 0xBFu8;
                    written += 1;
                    *(dst.get_unchecked_mut(written)) = 0xBDu8;
                    written += 1;
                }
                break;
            }
            // Look at the next unit: ASCII goes back to the bulk helper via
            // `'outer`, anything else is handled by another `'inner` pass.
            if read >= src.len() {
                debug_assert_eq!(read, src.len());
                return (read, written);
            }
            unit = src[read];
            if unsafe { unlikely(unit < 0x80) } {
                if written >= dst.len() {
                    debug_assert_eq!(written, dst.len());
                    return (read, written);
                }
                dst[written] = unit as u8;
                read += 1;
                written += 1;
                continue 'outer;
            }
            continue 'inner;
        }
    }
}
747
/// Converts as much of `src` (UTF-16) as fits into a short `dst` (UTF-8).
///
/// This is the bounds-checked counterpart to
/// `convert_utf16_to_utf8_partial_inner`, for the case where fewer than four
/// bytes of output remain, so a four-byte (astral) sequence can never fit.
///
/// Behavior:
/// * If the first unit is below `0x800`, one- and two-byte sequences are
///   written until the input ends, the output is too short for the next
///   one, or a unit `>= 0x800` is reached.
/// * Otherwise exactly one unit is converted to a three-byte sequence,
///   provided three bytes are free. Unpaired surrogates become U+FFFD. A
///   valid surrogate pair is left unconsumed because it would need four bytes.
///
/// Returns `(read, written)`: code units consumed and bytes stored. An empty
/// `src` yields `(0, 0)`.
///
/// In debug builds the three-byte path asserts that it filled `dst` exactly,
/// i.e. callers are expected to pass at most three bytes of output there.
#[inline(never)]
pub fn convert_utf16_to_utf8_partial_tail(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    let mut read = 0;
    let mut written = 0;
    // Nothing to convert: report zero progress instead of panicking on an
    // out-of-bounds `src[0]`.
    let mut unit = match src.first() {
        Some(&first) => first,
        None => return (read, written),
    };
    if unit < 0x800 {
        // Run of one- and two-byte sequences.
        loop {
            if unit < 0x80 {
                if written >= dst.len() {
                    return (read, written);
                }
                read += 1;
                dst[written] = unit as u8;
                written += 1;
            } else if unit < 0x800 {
                if written + 2 > dst.len() {
                    return (read, written);
                }
                read += 1;
                dst[written] = (unit >> 6) as u8 | 0xC0u8;
                written += 1;
                dst[written] = (unit & 0x3F) as u8 | 0x80u8;
                written += 1;
            } else {
                // A three-byte (or astral) unit after shorter ones is left
                // for the caller to retry with more output space.
                return (read, written);
            }
            if read >= src.len() {
                debug_assert_eq!(read, src.len());
                return (read, written);
            }
            unit = src[read];
        }
    }
    // The unit needs three bytes, whether it is a BMP character or an
    // unpaired surrogate replaced by U+FFFD.
    if written + 3 > dst.len() {
        return (read, written);
    }
    read += 1;
    let unit_minus_surrogate_start = unit.wrapping_sub(0xD800);
    if unit_minus_surrogate_start <= (0xDFFF - 0xD800) {
        // Surrogate.
        if unit_minus_surrogate_start <= (0xDBFF - 0xD800) {
            // High surrogate.
            if read >= src.len() {
                // Nothing follows it: unpaired.
                unit = 0xFFFD;
            } else {
                let second = src[read];
                if second.wrapping_sub(0xDC00) <= (0xDFFF - 0xDC00) {
                    // Valid pair, but four bytes cannot fit here: undo the
                    // consumption of the high surrogate and stop.
                    read -= 1;
                    return (read, written);
                }
                // Followed by something else: unpaired.
                unit = 0xFFFD;
            }
        } else {
            // Lone low surrogate.
            unit = 0xFFFD;
        }
    }
    dst[written] = (unit >> 12) as u8 | 0xE0u8;
    written += 1;
    dst[written] = ((unit & 0xFC0) >> 6) as u8 | 0x80u8;
    written += 1;
    dst[written] = (unit & 0x3F) as u8 | 0x80u8;
    written += 1;
    debug_assert_eq!(written, dst.len());
    (read, written)
}
824
/// UTF-8 encoder. It is a unit struct: the encoder carries no state between
/// calls.
pub struct Utf8Encoder;
826
827impl Utf8Encoder {
828 pub fn new(encoding: &'static Encoding) -> Encoder {
829 Encoder::new(encoding, VariantEncoder::Utf8(Utf8Encoder))
830 }
831
832 pub fn max_buffer_length_from_utf16_without_replacement(
833 &self,
834 u16_length: usize,
835 ) -> Option<usize> {
836 u16_length.checked_mul(3)
837 }
838
839 pub fn max_buffer_length_from_utf8_without_replacement(
840 &self,
841 byte_length: usize,
842 ) -> Option<usize> {
843 Some(byte_length)
844 }
845
846 pub fn encode_from_utf16_raw(
847 &mut self,
848 src: &[u16],
849 dst: &mut [u8],
850 _last: bool,
851 ) -> (EncoderResult, usize, usize) {
852 let (read, written) = convert_utf16_to_utf8_partial(src, dst);
853 (
854 if read == src.len() {
855 EncoderResult::InputEmpty
856 } else {
857 EncoderResult::OutputFull
858 },
859 read,
860 written,
861 )
862 }
863
864 pub fn encode_from_utf8_raw(
865 &mut self,
866 src: &str,
867 dst: &mut [u8],
868 _last: bool,
869 ) -> (EncoderResult, usize, usize) {
870 let bytes = src.as_bytes();
871 let mut to_write = bytes.len();
872 if to_write <= dst.len() {
873 (&mut dst[..to_write]).copy_from_slice(bytes);
874 return (EncoderResult::InputEmpty, to_write, to_write);
875 }
876 to_write = dst.len();
877 while (bytes[to_write] & 0xC0) == 0x80 {
879 to_write -= 1;
880 }
881 (&mut dst[..to_write]).copy_from_slice(&bytes[..to_write]);
882 (EncoderResult::OutputFull, to_write, to_write)
883 }
884}
885
886#[cfg(test)]
890mod tests {
891 use super::super::testing::*;
892 use super::super::*;
893
894 fn decode_utf8_to_utf8(bytes: &[u8], expect: &str) {
899 decode_to_utf8(UTF_8, bytes, expect);
900 }
901
902 fn decode_valid_utf8(string: &str) {
903 decode_utf8_to_utf8(string.as_bytes(), string);
904 }
905
906 fn encode_utf8_from_utf16(string: &[u16], expect: &[u8]) {
907 encode_from_utf16(UTF_8, string, expect);
908 }
909
910 fn encode_utf8_from_utf8(string: &str, expect: &[u8]) {
911 encode_from_utf8(UTF_8, string, expect);
912 }
913
914 fn encode_utf8_from_utf16_with_output_limit(
915 string: &[u16],
916 expect: &str,
917 limit: usize,
918 expect_result: EncoderResult,
919 ) {
920 let mut dst = Vec::new();
921 {
922 dst.resize(limit, 0u8);
923 let mut encoder = UTF_8.new_encoder();
924 let (result, read, written) =
925 encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
926 assert_eq!(result, expect_result);
927 if expect_result == EncoderResult::InputEmpty {
928 assert_eq!(read, string.len());
929 }
930 assert_eq!(&dst[..written], expect.as_bytes());
931 }
932 {
933 dst.resize(64, 0u8);
934 for (i, elem) in dst.iter_mut().enumerate() {
935 *elem = i as u8;
936 }
937 let mut encoder = UTF_8.new_encoder();
938 let (_, _, mut j) =
939 encoder.encode_from_utf16_without_replacement(string, &mut dst, false);
940 while j < dst.len() {
941 assert_eq!(usize::from(dst[j]), j);
942 j += 1;
943 }
944 }
945 }
946
947 #[test]
948 fn test_utf8_decode() {
949 decode_valid_utf8("");
951 decode_valid_utf8("ab");
953 decode_valid_utf8("a\u{E4}Z");
955 decode_valid_utf8("a\u{2603}Z");
957 decode_valid_utf8("a\u{1F4A9}Z");
959 decode_utf8_to_utf8(b"a\xC3Z", "a\u{FFFD}Z");
961 decode_utf8_to_utf8(b"a\xC3", "a\u{FFFD}");
962 decode_utf8_to_utf8(b"a\xE2\x98Z", "a\u{FFFD}Z");
964 decode_utf8_to_utf8(b"a\xE2\x98", "a\u{FFFD}");
965 decode_utf8_to_utf8(b"a\xF0\x9F\x92Z", "a\u{FFFD}Z");
967 decode_utf8_to_utf8(b"a\xF0\x9F\x92", "a\u{FFFD}");
968 decode_utf8_to_utf8(b"a\xBFZ", "a\u{FFFD}Z");
970 decode_utf8_to_utf8(b"a\xBF", "a\u{FFFD}");
971 decode_utf8_to_utf8(b"a\xBF\xBFZ", "a\u{FFFD}\u{FFFD}Z");
973 decode_utf8_to_utf8(b"a\xBF\xBF", "a\u{FFFD}\u{FFFD}");
974 decode_utf8_to_utf8(b"a\xC3\xA4\x80Z", "a\u{E4}\u{FFFD}Z");
976 decode_utf8_to_utf8(b"a\xC3\xA4\x80", "a\u{E4}\u{FFFD}");
977 decode_utf8_to_utf8(b"a\xC3\xA4\xBFZ", "a\u{E4}\u{FFFD}Z");
979 decode_utf8_to_utf8(b"a\xC3\xA4\xBF", "a\u{E4}\u{FFFD}");
980 decode_utf8_to_utf8(b"a\xE2\x98\x83\x80Z", "a\u{2603}\u{FFFD}Z");
982 decode_utf8_to_utf8(b"a\xE2\x98\x83\x80", "a\u{2603}\u{FFFD}");
983 decode_utf8_to_utf8(b"a\xE2\x98\x83\xBFZ", "a\u{2603}\u{FFFD}Z");
985 decode_utf8_to_utf8(b"a\xE2\x98\x83\xBF", "a\u{2603}\u{FFFD}");
986 decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80Z", "a\u{1F4A9}\u{FFFD}Z");
988 decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\x80", "a\u{1F4A9}\u{FFFD}");
989 decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBFZ", "a\u{1F4A9}\u{FFFD}Z");
991 decode_utf8_to_utf8(b"a\xF0\x9F\x92\xA9\xBF", "a\u{1F4A9}\u{FFFD}");
992
993 decode_valid_utf8("Z\x00");
996 decode_valid_utf8("Z\x00Z");
997 decode_utf8_to_utf8(b"a\xC0\x80", "a\u{FFFD}\u{FFFD}");
999 decode_utf8_to_utf8(b"a\xC0\x80Z", "a\u{FFFD}\u{FFFD}Z");
1000 decode_utf8_to_utf8(b"a\xE0\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1002 decode_utf8_to_utf8(b"a\xE0\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1003 decode_utf8_to_utf8(b"a\xF0\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1005 decode_utf8_to_utf8(b"a\xF0\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1006 decode_utf8_to_utf8(b"a\xFF", "a\u{FFFD}");
1008 decode_utf8_to_utf8(b"a\xFFZ", "a\u{FFFD}Z");
1009 decode_valid_utf8("a\x7F");
1011 decode_valid_utf8("a\x7FZ");
1012 decode_utf8_to_utf8(b"a\xC1\xBF", "a\u{FFFD}\u{FFFD}");
1014 decode_utf8_to_utf8(b"a\xC1\xBFZ", "a\u{FFFD}\u{FFFD}Z");
1015 decode_utf8_to_utf8(b"a\xE0\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1017 decode_utf8_to_utf8(b"a\xE0\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1018 decode_utf8_to_utf8(b"a\xF0\x80\x81\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1020 decode_utf8_to_utf8(b"a\xF0\x80\x81\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1021 decode_utf8_to_utf8(b"a\x80Z", "a\u{FFFD}Z");
1023 decode_utf8_to_utf8(b"a\x80", "a\u{FFFD}");
1024 decode_utf8_to_utf8(b"a\x80\x80Z", "a\u{FFFD}\u{FFFD}Z");
1026 decode_utf8_to_utf8(b"a\x80\x80", "a\u{FFFD}\u{FFFD}");
1027 decode_utf8_to_utf8(b"a\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1029 decode_utf8_to_utf8(b"a\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1030 decode_utf8_to_utf8(b"a\x80\x80\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1032 decode_utf8_to_utf8(b"a\x80\x80\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1033 decode_utf8_to_utf8(b"a\xC2\x80", "a\u{0080}");
1035 decode_utf8_to_utf8(b"a\xC2\x80Z", "a\u{0080}Z");
1036 decode_utf8_to_utf8(b"a\xE0\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1038 decode_utf8_to_utf8(b"a\xE0\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1039 decode_utf8_to_utf8(b"a\xF0\x80\x82\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1041 decode_utf8_to_utf8(b"a\xF0\x80\x82\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1042 decode_utf8_to_utf8(b"a\xC1\x80", "a\u{FFFD}\u{FFFD}");
1044 decode_utf8_to_utf8(b"a\xC1\x80Z", "a\u{FFFD}\u{FFFD}Z");
1045 decode_utf8_to_utf8(b"a\xC2\x7F", "a\u{FFFD}\u{007F}");
1047 decode_utf8_to_utf8(b"a\xC2\x7FZ", "a\u{FFFD}\u{007F}Z");
1048 decode_utf8_to_utf8(b"a\xDF\xBF", "a\u{07FF}");
1050 decode_utf8_to_utf8(b"a\xDF\xBFZ", "a\u{07FF}Z");
1051 decode_utf8_to_utf8(b"a\xE0\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1053 decode_utf8_to_utf8(b"a\xE0\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1054 decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1056 decode_utf8_to_utf8(b"a\xF0\x80\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1057 decode_utf8_to_utf8(b"a\xE0\xA0\x80", "a\u{0800}");
1059 decode_utf8_to_utf8(b"a\xE0\xA0\x80Z", "a\u{0800}Z");
1060 decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1062 decode_utf8_to_utf8(b"a\xF0\x80\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1063 decode_utf8_to_utf8(b"a\xED\x9F\xBF", "a\u{D7FF}");
1065 decode_utf8_to_utf8(b"a\xED\x9F\xBFZ", "a\u{D7FF}Z");
1066 decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1068 decode_utf8_to_utf8(b"a\xF0\x8D\x9F\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1069 decode_utf8_to_utf8(b"a\xED\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1071 decode_utf8_to_utf8(b"a\xED\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1072 decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1074 decode_utf8_to_utf8(b"a\xF0\x8D\xA0\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1075 decode_utf8_to_utf8(b"a\xED\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}");
1077 decode_utf8_to_utf8(b"a\xED\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}Z");
1078 decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1080 decode_utf8_to_utf8(b"a\xF0\x8D\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1081 decode_utf8_to_utf8(b"a\xEE\x80\x80", "a\u{E000}");
1083 decode_utf8_to_utf8(b"a\xEE\x80\x80Z", "a\u{E000}Z");
1084 decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1086 decode_utf8_to_utf8(b"a\xF0\x8E\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1087 decode_utf8_to_utf8(b"a\xEF\xBF\xBF", "a\u{FFFF}");
1089 decode_utf8_to_utf8(b"a\xEF\xBF\xBFZ", "a\u{FFFF}Z");
1090 decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBF", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1092 decode_utf8_to_utf8(b"a\xF0\x8F\xBF\xBFZ", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1093 decode_utf8_to_utf8(b"a\xF0\x90\x80\x80", "a\u{10000}");
1095 decode_utf8_to_utf8(b"a\xF0\x90\x80\x80Z", "a\u{10000}Z");
1096 decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBF", "a\u{10FFFF}");
1098 decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xBFZ", "a\u{10FFFF}Z");
1099 decode_utf8_to_utf8(b"a\xF4\x90\x80\x80", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}");
1101 decode_utf8_to_utf8(b"a\xF4\x90\x80\x80Z", "a\u{FFFD}\u{FFFD}\u{FFFD}\u{FFFD}Z");
1102
1103 decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFF", "a\u{FFFD}\u{FFFD}");
1105 decode_utf8_to_utf8(b"a\xF4\x8F\xBF\xFFZ", "a\u{FFFD}\u{FFFD}Z");
1106 }
1107
1108 #[test]
1109 fn test_utf8_encode() {
1110 encode_utf8_from_utf16(&[], b"");
1112 encode_utf8_from_utf8("", b"");
1113
1114 encode_utf8_from_utf16(&[0x0000], "\u{0000}".as_bytes());
1115 encode_utf8_from_utf16(&[0x007F], "\u{007F}".as_bytes());
1116 encode_utf8_from_utf16(&[0x0080], "\u{0080}".as_bytes());
1117 encode_utf8_from_utf16(&[0x07FF], "\u{07FF}".as_bytes());
1118 encode_utf8_from_utf16(&[0x0800], "\u{0800}".as_bytes());
1119 encode_utf8_from_utf16(&[0xD7FF], "\u{D7FF}".as_bytes());
1120 encode_utf8_from_utf16(&[0xD800], "\u{FFFD}".as_bytes());
1121 encode_utf8_from_utf16(&[0xD800, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
1122 encode_utf8_from_utf16(&[0xDFFF], "\u{FFFD}".as_bytes());
1123 encode_utf8_from_utf16(&[0xDFFF, 0x0062], "\u{FFFD}\u{0062}".as_bytes());
1124 encode_utf8_from_utf16(&[0xE000], "\u{E000}".as_bytes());
1125 encode_utf8_from_utf16(&[0xFFFF], "\u{FFFF}".as_bytes());
1126 encode_utf8_from_utf16(&[0xD800, 0xDC00], "\u{10000}".as_bytes());
1127 encode_utf8_from_utf16(&[0xDBFF, 0xDFFF], "\u{10FFFF}".as_bytes());
1128 encode_utf8_from_utf16(&[0xDC00, 0xDEDE], "\u{FFFD}\u{FFFD}".as_bytes());
1129 }
1130
1131 #[test]
1132 fn test_encode_utf8_from_utf16_with_output_limit() {
1133 encode_utf8_from_utf16_with_output_limit(&[0x0062], "\u{62}", 1, EncoderResult::InputEmpty);
1134 encode_utf8_from_utf16_with_output_limit(&[0x00A7], "\u{A7}", 2, EncoderResult::InputEmpty);
1135 encode_utf8_from_utf16_with_output_limit(
1136 &[0x2603],
1137 "\u{2603}",
1138 3,
1139 EncoderResult::InputEmpty,
1140 );
1141 encode_utf8_from_utf16_with_output_limit(
1142 &[0xD83D, 0xDCA9],
1143 "\u{1F4A9}",
1144 4,
1145 EncoderResult::InputEmpty,
1146 );
1147
1148 encode_utf8_from_utf16_with_output_limit(&[0x00A7], "", 1, EncoderResult::OutputFull);
1149 encode_utf8_from_utf16_with_output_limit(&[0x2603], "", 2, EncoderResult::OutputFull);
1150 encode_utf8_from_utf16_with_output_limit(
1151 &[0xD83D, 0xDCA9],
1152 "",
1153 3,
1154 EncoderResult::OutputFull,
1155 );
1156
1157 encode_utf8_from_utf16_with_output_limit(
1158 &[0x0063, 0x0062],
1159 "\u{63}\u{62}",
1160 2,
1161 EncoderResult::InputEmpty,
1162 );
1163 encode_utf8_from_utf16_with_output_limit(
1164 &[0x0063, 0x00A7],
1165 "\u{63}\u{A7}",
1166 3,
1167 EncoderResult::InputEmpty,
1168 );
1169 encode_utf8_from_utf16_with_output_limit(
1170 &[0x0063, 0x2603],
1171 "\u{63}\u{2603}",
1172 4,
1173 EncoderResult::InputEmpty,
1174 );
1175 encode_utf8_from_utf16_with_output_limit(
1176 &[0x0063, 0xD83D, 0xDCA9],
1177 "\u{63}\u{1F4A9}",
1178 5,
1179 EncoderResult::InputEmpty,
1180 );
1181
1182 encode_utf8_from_utf16_with_output_limit(
1183 &[0x0063, 0x00A7],
1184 "\u{63}",
1185 2,
1186 EncoderResult::OutputFull,
1187 );
1188 encode_utf8_from_utf16_with_output_limit(
1189 &[0x0063, 0x2603],
1190 "\u{63}",
1191 3,
1192 EncoderResult::OutputFull,
1193 );
1194 encode_utf8_from_utf16_with_output_limit(
1195 &[0x0063, 0xD83D, 0xDCA9],
1196 "\u{63}",
1197 4,
1198 EncoderResult::OutputFull,
1199 );
1200
1201 encode_utf8_from_utf16_with_output_limit(
1202 &[0x00B6, 0x0062],
1203 "\u{B6}\u{62}",
1204 3,
1205 EncoderResult::InputEmpty,
1206 );
1207 encode_utf8_from_utf16_with_output_limit(
1208 &[0x00B6, 0x00A7],
1209 "\u{B6}\u{A7}",
1210 4,
1211 EncoderResult::InputEmpty,
1212 );
1213 encode_utf8_from_utf16_with_output_limit(
1214 &[0x00B6, 0x2603],
1215 "\u{B6}\u{2603}",
1216 5,
1217 EncoderResult::InputEmpty,
1218 );
1219 encode_utf8_from_utf16_with_output_limit(
1220 &[0x00B6, 0xD83D, 0xDCA9],
1221 "\u{B6}\u{1F4A9}",
1222 6,
1223 EncoderResult::InputEmpty,
1224 );
1225
1226 encode_utf8_from_utf16_with_output_limit(
1227 &[0x00B6, 0x00A7],
1228 "\u{B6}",
1229 3,
1230 EncoderResult::OutputFull,
1231 );
1232 encode_utf8_from_utf16_with_output_limit(
1233 &[0x00B6, 0x2603],
1234 "\u{B6}",
1235 4,
1236 EncoderResult::OutputFull,
1237 );
1238 encode_utf8_from_utf16_with_output_limit(
1239 &[0x00B6, 0xD83D, 0xDCA9],
1240 "\u{B6}",
1241 5,
1242 EncoderResult::OutputFull,
1243 );
1244
1245 encode_utf8_from_utf16_with_output_limit(
1246 &[0x263A, 0x0062],
1247 "\u{263A}\u{62}",
1248 4,
1249 EncoderResult::InputEmpty,
1250 );
1251 encode_utf8_from_utf16_with_output_limit(
1252 &[0x263A, 0x00A7],
1253 "\u{263A}\u{A7}",
1254 5,
1255 EncoderResult::InputEmpty,
1256 );
1257 encode_utf8_from_utf16_with_output_limit(
1258 &[0x263A, 0x2603],
1259 "\u{263A}\u{2603}",
1260 6,
1261 EncoderResult::InputEmpty,
1262 );
1263 encode_utf8_from_utf16_with_output_limit(
1264 &[0x263A, 0xD83D, 0xDCA9],
1265 "\u{263A}\u{1F4A9}",
1266 7,
1267 EncoderResult::InputEmpty,
1268 );
1269
1270 encode_utf8_from_utf16_with_output_limit(
1271 &[0x263A, 0x00A7],
1272 "\u{263A}",
1273 4,
1274 EncoderResult::OutputFull,
1275 );
1276 encode_utf8_from_utf16_with_output_limit(
1277 &[0x263A, 0x2603],
1278 "\u{263A}",
1279 5,
1280 EncoderResult::OutputFull,
1281 );
1282 encode_utf8_from_utf16_with_output_limit(
1283 &[0x263A, 0xD83D, 0xDCA9],
1284 "\u{263A}",
1285 6,
1286 EncoderResult::OutputFull,
1287 );
1288
1289 encode_utf8_from_utf16_with_output_limit(
1290 &[0xD83D, 0xDE0E, 0x0062],
1291 "\u{1F60E}\u{62}",
1292 5,
1293 EncoderResult::InputEmpty,
1294 );
1295 encode_utf8_from_utf16_with_output_limit(
1296 &[0xD83D, 0xDE0E, 0x00A7],
1297 "\u{1F60E}\u{A7}",
1298 6,
1299 EncoderResult::InputEmpty,
1300 );
1301 encode_utf8_from_utf16_with_output_limit(
1302 &[0xD83D, 0xDE0E, 0x2603],
1303 "\u{1F60E}\u{2603}",
1304 7,
1305 EncoderResult::InputEmpty,
1306 );
1307 encode_utf8_from_utf16_with_output_limit(
1308 &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
1309 "\u{1F60E}\u{1F4A9}",
1310 8,
1311 EncoderResult::InputEmpty,
1312 );
1313
1314 encode_utf8_from_utf16_with_output_limit(
1315 &[0xD83D, 0xDE0E, 0x00A7],
1316 "\u{1F60E}",
1317 5,
1318 EncoderResult::OutputFull,
1319 );
1320 encode_utf8_from_utf16_with_output_limit(
1321 &[0xD83D, 0xDE0E, 0x2603],
1322 "\u{1F60E}",
1323 6,
1324 EncoderResult::OutputFull,
1325 );
1326 encode_utf8_from_utf16_with_output_limit(
1327 &[0xD83D, 0xDE0E, 0xD83D, 0xDCA9],
1328 "\u{1F60E}",
1329 7,
1330 EncoderResult::OutputFull,
1331 );
1332
1333 encode_utf8_from_utf16_with_output_limit(
1334 &[0x0063, 0x00B6, 0x0062, 0x0062],
1335 "\u{63}\u{B6}\u{62}\u{62}",
1336 5,
1337 EncoderResult::InputEmpty,
1338 );
1339 encode_utf8_from_utf16_with_output_limit(
1340 &[0x0063, 0x00B6, 0x0062, 0x0062],
1341 "\u{63}\u{B6}\u{62}",
1342 4,
1343 EncoderResult::OutputFull,
1344 );
1345
1346 encode_utf8_from_utf16_with_output_limit(
1347 &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
1348 "\u{63}\u{B6}\u{62}\u{62}\u{62}",
1349 6,
1350 EncoderResult::InputEmpty,
1351 );
1352 encode_utf8_from_utf16_with_output_limit(
1353 &[0x0063, 0x00B6, 0x0062, 0x0062, 0x0062],
1354 "\u{63}\u{B6}\u{62}\u{62}",
1355 5,
1356 EncoderResult::OutputFull,
1357 );
1358
1359 encode_utf8_from_utf16_with_output_limit(
1360 &[0x263A, 0x0062, 0x0062],
1361 "\u{263A}\u{62}\u{62}",
1362 5,
1363 EncoderResult::InputEmpty,
1364 );
1365 encode_utf8_from_utf16_with_output_limit(
1366 &[0x263A, 0x0062, 0x0062],
1367 "\u{263A}\u{62}",
1368 4,
1369 EncoderResult::OutputFull,
1370 );
1371
1372 encode_utf8_from_utf16_with_output_limit(
1373 &[0x263A, 0x0062, 0x0062, 0x0062],
1374 "\u{263A}\u{62}\u{62}\u{62}",
1375 6,
1376 EncoderResult::InputEmpty,
1377 );
1378 encode_utf8_from_utf16_with_output_limit(
1379 &[0x263A, 0x0062, 0x0062, 0x0062],
1380 "\u{263A}\u{62}\u{62}",
1381 5,
1382 EncoderResult::OutputFull,
1383 );
1384
1385 encode_utf8_from_utf16_with_output_limit(
1386 &[0x0063, 0x00B6, 0x00A7],
1387 "\u{63}\u{B6}\u{A7}",
1388 5,
1389 EncoderResult::InputEmpty,
1390 );
1391 encode_utf8_from_utf16_with_output_limit(
1392 &[0x0063, 0x00B6, 0x00A7],
1393 "\u{63}\u{B6}",
1394 4,
1395 EncoderResult::OutputFull,
1396 );
1397
1398 encode_utf8_from_utf16_with_output_limit(
1399 &[0x0063, 0x00B6, 0x00A7, 0x0062],
1400 "\u{63}\u{B6}\u{A7}\u{62}",
1401 6,
1402 EncoderResult::InputEmpty,
1403 );
1404 encode_utf8_from_utf16_with_output_limit(
1405 &[0x0063, 0x00B6, 0x00A7, 0x0062],
1406 "\u{63}\u{B6}\u{A7}",
1407 5,
1408 EncoderResult::OutputFull,
1409 );
1410
1411 encode_utf8_from_utf16_with_output_limit(
1412 &[0x263A, 0x00A7, 0x0062],
1413 "\u{263A}\u{A7}\u{62}",
1414 6,
1415 EncoderResult::InputEmpty,
1416 );
1417 encode_utf8_from_utf16_with_output_limit(
1418 &[0x263A, 0x00A7, 0x0062],
1419 "\u{263A}\u{A7}",
1420 5,
1421 EncoderResult::OutputFull,
1422 );
1423
1424 encode_utf8_from_utf16_with_output_limit(
1425 &[0x0063, 0x00B6, 0x0062, 0x00A7],
1426 "\u{63}\u{B6}\u{62}\u{A7}",
1427 6,
1428 EncoderResult::InputEmpty,
1429 );
1430 encode_utf8_from_utf16_with_output_limit(
1431 &[0x0063, 0x00B6, 0x0062, 0x00A7],
1432 "\u{63}\u{B6}\u{62}",
1433 5,
1434 EncoderResult::OutputFull,
1435 );
1436
1437 encode_utf8_from_utf16_with_output_limit(
1438 &[0x263A, 0x0062, 0x00A7],
1439 "\u{263A}\u{62}\u{A7}",
1440 6,
1441 EncoderResult::InputEmpty,
1442 );
1443 encode_utf8_from_utf16_with_output_limit(
1444 &[0x263A, 0x0062, 0x00A7],
1445 "\u{263A}\u{62}",
1446 5,
1447 EncoderResult::OutputFull,
1448 );
1449
1450 encode_utf8_from_utf16_with_output_limit(
1451 &[0x0063, 0x00B6, 0x2603],
1452 "\u{63}\u{B6}\u{2603}",
1453 6,
1454 EncoderResult::InputEmpty,
1455 );
1456 encode_utf8_from_utf16_with_output_limit(
1457 &[0x0063, 0x00B6, 0x2603],
1458 "\u{63}\u{B6}",
1459 5,
1460 EncoderResult::OutputFull,
1461 );
1462
1463 encode_utf8_from_utf16_with_output_limit(
1464 &[0x263A, 0x2603],
1465 "\u{263A}\u{2603}",
1466 6,
1467 EncoderResult::InputEmpty,
1468 );
1469 encode_utf8_from_utf16_with_output_limit(
1470 &[0x263A, 0x2603],
1471 "\u{263A}",
1472 5,
1473 EncoderResult::OutputFull,
1474 );
1475
1476 encode_utf8_from_utf16_with_output_limit(
1477 &[0x0063, 0x00B6, 0xD83D],
1478 "\u{63}\u{B6}\u{FFFD}",
1479 6,
1480 EncoderResult::InputEmpty,
1481 );
1482 encode_utf8_from_utf16_with_output_limit(
1483 &[0x0063, 0x00B6, 0xD83D],
1484 "\u{63}\u{B6}",
1485 5,
1486 EncoderResult::OutputFull,
1487 );
1488
1489 encode_utf8_from_utf16_with_output_limit(
1490 &[0x263A, 0xD83D],
1491 "\u{263A}\u{FFFD}",
1492 6,
1493 EncoderResult::InputEmpty,
1494 );
1495 encode_utf8_from_utf16_with_output_limit(
1496 &[0x263A, 0xD83D],
1497 "\u{263A}",
1498 5,
1499 EncoderResult::OutputFull,
1500 );
1501
1502 encode_utf8_from_utf16_with_output_limit(
1503 &[0x0063, 0x00B6, 0xDCA9],
1504 "\u{63}\u{B6}\u{FFFD}",
1505 6,
1506 EncoderResult::InputEmpty,
1507 );
1508 encode_utf8_from_utf16_with_output_limit(
1509 &[0x0063, 0x00B6, 0xDCA9],
1510 "\u{63}\u{B6}",
1511 5,
1512 EncoderResult::OutputFull,
1513 );
1514
1515 encode_utf8_from_utf16_with_output_limit(
1516 &[0x263A, 0xDCA9],
1517 "\u{263A}\u{FFFD}",
1518 6,
1519 EncoderResult::InputEmpty,
1520 );
1521 encode_utf8_from_utf16_with_output_limit(
1522 &[0x263A, 0xDCA9],
1523 "\u{263A}",
1524 5,
1525 EncoderResult::OutputFull,
1526 );
1527 }
1528
// Verifies that the length reported by
// `max_buffer_length_from_utf16_without_replacement` is enough output space
// to encode the whole four-unit input in one call flagged as the last.
#[test]
fn test_utf8_max_length_from_utf16() {
    let units = &[0x2C9Fu16, 0x2CA9u16, 0x2CA3u16, 0x2C9Fu16];
    let mut buffer = [0u8; 13];
    let mut encoder = UTF_8.new_encoder();
    let capacity = encoder
        .max_buffer_length_from_utf16_without_replacement(units.len())
        .unwrap();
    // Only the status is checked; the read/written counts are not inspected.
    let outcome =
        encoder.encode_from_utf16_without_replacement(units, &mut buffer[..capacity], true);
    assert_eq!(outcome.0, EncoderResult::InputEmpty);
}
1541
// Feeds the bytes EF, BF, BE to a decoder from `UTF_8.new_decoder()` one byte
// per call. The first two calls must consume their byte without writing
// anything; the final call (flagged as last) must write exactly one code
// unit, 0xFFFE, and no call may report an error.
#[test]
fn test_decode_bom_prefixed_split_byte_triple() {
    let mut decoder = UTF_8.new_decoder();
    let mut buffer = [0u16; 20];
    // (input chunk, is this the last chunk, expected code units written)
    let steps = [
        (&b"\xEF"[..], false, 0usize),
        (&b"\xBF"[..], false, 0usize),
        (&b"\xBE"[..], true, 1usize),
    ];
    for &(chunk, last, expected_written) in steps.iter() {
        let capacity = decoder.max_utf16_buffer_length(1).unwrap();
        let (status, consumed, produced, errored) =
            decoder.decode_to_utf16(chunk, &mut buffer[..capacity], last);
        assert_eq!(status, CoderResult::InputEmpty);
        assert_eq!(consumed, 1);
        assert_eq!(produced, expected_written);
        assert!(!errored);
    }
    assert_eq!(buffer[0], 0xFFFE);
}
1575
// Feeds the byte EF and then the byte BC (flagged as last) to a decoder from
// `UTF_8.new_decoder()`. The first call must consume its byte silently; the
// second must write a single 0xFFFD and report an error.
#[test]
fn test_decode_bom_prefixed_split_byte_pair() {
    let mut decoder = UTF_8.new_decoder();
    let mut buffer = [0u16; 20];

    // First byte, more input announced: consumed, nothing written, no error.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    assert_eq!(
        decoder.decode_to_utf16(b"\xEF", &mut buffer[..capacity], false),
        (CoderResult::InputEmpty, 1, 0, false)
    );

    // Second byte, flagged as last: one unit written and the error flag set.
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    assert_eq!(
        decoder.decode_to_utf16(b"\xBC", &mut buffer[..capacity], true),
        (CoderResult::InputEmpty, 1, 1, true)
    );
    assert_eq!(buffer[0], 0xFFFD);
}
1600
// Feeds the lone byte EF, flagged as the last input, to a decoder from
// `UTF_8.new_decoder()`. The call must consume the byte, write a single
// 0xFFFD and report an error.
#[test]
fn test_decode_bom_prefix() {
    let mut decoder = UTF_8.new_decoder();
    let mut buffer = [0u16; 20];
    let capacity = decoder.max_utf16_buffer_length(1).unwrap();
    let outcome = decoder.decode_to_utf16(b"\xEF", &mut buffer[..capacity], true);
    assert_eq!(outcome, (CoderResult::InputEmpty, 1, 1, true));
    assert_eq!(buffer[0], 0xFFFD);
}
1616
// Decodes "\u{E4}a" into a one-unit output buffer using a decoder created
// without BOM handling. The call must stop with `OutputFull` after reading
// two bytes and writing the single code unit 0x00E4, with no error reported.
#[test]
fn test_tail() {
    let mut decoder = UTF_8.new_decoder_without_bom_handling();
    let mut buffer = [0u16; 1];
    let source = "\u{E4}a".as_bytes();
    let outcome = decoder.decode_to_utf16(source, &mut buffer[..], false);
    assert_eq!(outcome, (CoderResult::OutputFull, 2, 1, false));
    assert_eq!(buffer[0], 0x00E4);
}
1631}