encoding_rs/
mem.rs

1// Copyright Mozilla Foundation. See the COPYRIGHT
2// file at the top-level directory of this distribution.
3//
4// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
5// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
6// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
7// option. This file may not be copied, modified, or distributed
8// except according to those terms.
9
10//! Functions for converting between different in-RAM representations of text
11//! and for quickly checking if the Unicode Bidirectional Algorithm can be
12//! avoided.
13//!
14//! By using slices for output, the functions here seek to enable by-register
15//! (ALU register or SIMD register as available) operations in order to
16//! outperform iterator-based conversions available in the Rust standard
17//! library.
18//!
19//! _Note:_ "Latin1" in this module refers to the Unicode range from U+0000 to
20//! U+00FF, inclusive, and does not refer to the windows-1252 range. This
21//! in-memory encoding is sometimes used as a storage optimization of text
22//! when UTF-16 indexing and length semantics are exposed.
23//!
//! The FFI bindings for this module are in the
25//! [encoding_c_mem crate](https://github.com/hsivonen/encoding_c_mem).
26
27use std::borrow::Cow;
28
29use super::in_inclusive_range16;
30use super::in_inclusive_range32;
31use super::in_inclusive_range8;
32use super::in_range16;
33use super::in_range32;
34use super::DecoderResult;
35use crate::ascii::*;
36use crate::utf_8::*;
37
/// `debug_assert!` that is bypassed when the crate is built with
/// `--cfg fuzzing`.
///
/// Accepts exactly the arguments `debug_assert!` accepts and forwards them
/// unchanged.
macro_rules! non_fuzz_debug_assert {
    ($($tokens:tt)*) => {
        if !cfg!(fuzzing) {
            debug_assert!($($tokens)*);
        }
    };
}
41
// Branch hint helpers `likely` / `unlikely`.
//
// With the `simd-accel` feature the functions of the same name from
// `std::intrinsics` are imported. Without it, identity functions with the
// same names are defined so that call sites do not need to be conditional.
cfg_if! {
    if #[cfg(feature = "simd-accel")] {
        use ::std::intrinsics::likely;
        use ::std::intrinsics::unlikely;
    } else {
        /// Fallback for the `likely` intrinsic: returns `b` unchanged.
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn likely(b: bool) -> bool {
            b
        }
        /// Fallback for the `unlikely` intrinsic: returns `b` unchanged.
        #[inline(always)]
        // Unsafe to match the intrinsic, which is needlessly unsafe.
        unsafe fn unlikely(b: bool) -> bool {
            b
        }
    }
}
59
/// Classification of text as Latin1 (all code points are below U+0100),
/// left-to-right with some non-Latin1 characters or as containing at least
/// some right-to-left characters.
///
/// The type is a plain field-less `repr(C)` enum with explicit
/// discriminants, so it is `Copy`: a classification result can be compared,
/// stored and passed by value any number of times.
#[must_use]
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
#[repr(C)]
pub enum Latin1Bidi {
    /// Every character is below U+0100.
    Latin1 = 0,
    /// There is at least one character that's U+0100 or higher, but there
    /// are no right-to-left characters.
    LeftToRight = 1,
    /// There is at least one right-to-left character.
    Bidi = 2,
}
75
// Word-sized mask with the high byte of every 16-bit lane set. A word of
// packed UTF-16 code units ANDed with this mask is zero iff every code unit
// in the word is at most 0xFF.
//
// The literal is written as a 64-bit value; `as` truncates, so works on
// 32-bit, too.
//
// In the code visible here the constant is only referenced from the
// non-SIMD branches, hence `allow(dead_code)`.
#[allow(dead_code)]
const LATIN1_MASK: usize = 0xFF00_FF00_FF00_FF00u64 as usize;
79
// Generates `fn $name(buffer: &[$unit]) -> bool`, a whole-slice range check
// that examines the aligned middle of the slice one `usize` word at a time.
//
// * `$name`  - name of the generated function.
// * `$unit`  - code unit type of the slice.
// * `$bound` - exclusive upper bound used by the unit-by-unit early exits.
// * `$mask`  - word-sized mask applied to OR-accumulated words.
//   NOTE(review): presumably `$mask` has, in every `$unit`-sized lane,
//   exactly the bits that are set in values `>= $bound` (this holds for
//   `LATIN1_MASK` with `0x100`); confirm for masks defined elsewhere.
//
// The generated function returns `true` iff nothing it accumulated has a bit
// in common with `$mask`; an empty slice yields `true`. Apart from the early
// exits it keeps accumulating instead of testing each unit, so it may read
// the whole slice before answering.
#[allow(unused_macros)]
macro_rules! by_unit_check_alu {
    ($name:ident, $unit:ty, $bound:expr, $mask:ident) => {
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index of the next unit to examine.
            let mut offset = 0usize;
            // Bitwise OR of everything examined but not yet tested.
            let mut accu = 0usize;
            let unit_size = ::std::mem::size_of::<$unit>();
            let len = buffer.len();
            // Word-wise processing is only attempted when at least one
            // `ALU_ALIGNMENT`-byte stride of units is available.
            if len >= ALU_ALIGNMENT / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units between `src` and the next address whose low
                // bits under `ALU_ALIGNMENT_MASK` are zero (0 if already there).
                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK))
                    & ALU_ALIGNMENT_MASK)
                    / unit_size;
                // Only proceed if a full stride remains after the unaligned prefix.
                if until_alignment + ALU_ALIGNMENT / unit_size <= len {
                    if until_alignment != 0 {
                        // Fold the unaligned prefix unit by unit.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // `accu` holds only the OR of the prefix units here. The
                        // bounds used in this file (0x80, 0x100) are powers of
                        // two, so the OR reaches the bound iff some unit did.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one more full stride fits.
                    let len_minus_stride = len - ALU_ALIGNMENT / unit_size;
                    if offset + (4 * (ALU_ALIGNMENT / unit_size)) <= len {
                        // Largest offset at which four more strides fit.
                        let len_minus_unroll = len - (4 * (ALU_ALIGNMENT / unit_size));
                        loop {
                            // SAFETY: `offset + 4 * stride <= len` was checked before
                            // entering the loop and is re-established by the
                            // `offset > len_minus_unroll` break below, so all four
                            // reads stay inside `buffer`. `offset` was advanced to
                            // the alignment boundary computed above.
                            // NOTE(review): assumes `ALU_ALIGNMENT` equals the size
                            // and alignment of `usize` (defined elsewhere) - confirm.
                            let unroll_accu = unsafe { *(src.add(offset) as *const usize) }
                                | unsafe {
                                    *(src.add(offset + (ALU_ALIGNMENT / unit_size)) as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (ALU_ALIGNMENT / unit_size)))
                                        as *const usize)
                                };
                            // The unrolled part fails fast once per four strides.
                            if unroll_accu & $mask != 0 {
                                return false;
                            }
                            offset += 4 * (ALU_ALIGNMENT / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides (fewer than four) are only
                    // accumulated; they are tested by the final expression.
                    while offset <= len_minus_stride {
                        // SAFETY: `offset <= len_minus_stride` means one full
                        // stride starting at `offset` lies inside `buffer`; see the
                        // alignment note on the unrolled loop above.
                        accu |= unsafe { *(src.add(offset) as *const usize) };
                        offset += ALU_ALIGNMENT / unit_size;
                    }
                }
            }
            // Trailing units that do not fill a stride (or the whole slice when
            // the word-wise path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu & $mask == 0
        }
    };
}
152
// Generates `fn $name(buffer: &[$unit]) -> bool`, a whole-slice range check
// that examines the aligned middle of the slice one SIMD vector at a time.
//
// * `$name`    - name of the generated function.
// * `$unit`    - code unit type of the slice.
// * `$splat`   - expression producing the initial (all-zero at the call
//                sites visible here) vector accumulator.
// * `$simd_ty` - vector type that a stride is loaded as.
// * `$bound`   - exclusive upper bound for the unit-by-unit checks.
// * `$func`    - predicate applied to an OR-accumulated vector; a `false`
//                result makes the generated function return `false`.
//
// An empty slice yields `true`. Apart from the early exits the function
// accumulates instead of testing each unit, so it may read the whole slice.
#[allow(unused_macros)]
macro_rules! by_unit_check_simd {
    ($name:ident, $unit:ty, $splat:expr, $simd_ty:ty, $bound:expr, $func:ident) => {
        #[inline(always)]
        fn $name(buffer: &[$unit]) -> bool {
            // Index of the next unit to examine.
            let mut offset = 0usize;
            // Bitwise OR of the units examined one by one (prefix and tail).
            let mut accu = 0usize;
            let unit_size = ::std::mem::size_of::<$unit>();
            let len = buffer.len();
            // Vector processing is only attempted when at least one full
            // stride of units is available.
            if len >= SIMD_STRIDE_SIZE / unit_size {
                // The most common reason to return `false` is for the first code
                // unit to fail the test, so check that first.
                if buffer[0] >= $bound {
                    return false;
                }
                let src = buffer.as_ptr();
                // Number of units between `src` and the next address whose low
                // bits under `SIMD_ALIGNMENT_MASK` are zero (0 if already there).
                let mut until_alignment = ((SIMD_ALIGNMENT
                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
                    & SIMD_ALIGNMENT_MASK)
                    / unit_size;
                // Only proceed if a full stride remains after the unaligned prefix.
                if until_alignment + SIMD_STRIDE_SIZE / unit_size <= len {
                    if until_alignment != 0 {
                        // Fold the unaligned prefix unit by unit.
                        accu |= buffer[offset] as usize;
                        offset += 1;
                        until_alignment -= 1;
                        while until_alignment != 0 {
                            accu |= buffer[offset] as usize;
                            offset += 1;
                            until_alignment -= 1;
                        }
                        // `accu` holds only the OR of the prefix units here. The
                        // bounds used in this file (0x80, 0x100) are powers of
                        // two, so the OR reaches the bound iff some unit did.
                        if accu >= $bound {
                            return false;
                        }
                    }
                    // Largest offset at which one more full stride fits.
                    let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                    if offset + (4 * (SIMD_STRIDE_SIZE / unit_size)) <= len {
                        // Largest offset at which four more strides fit.
                        let len_minus_unroll = len - (4 * (SIMD_STRIDE_SIZE / unit_size));
                        loop {
                            // SAFETY: `offset + 4 * stride <= len` was checked before
                            // entering the loop and is re-established by the
                            // `offset > len_minus_unroll` break below, so all four
                            // loads stay inside `buffer`. `offset` was advanced to
                            // the alignment boundary computed above.
                            // NOTE(review): assumes `SIMD_STRIDE_SIZE` (defined
                            // elsewhere) is the size of `$simd_ty` and a multiple of
                            // `SIMD_ALIGNMENT` - confirm.
                            let unroll_accu = unsafe { *(src.add(offset) as *const $simd_ty) }
                                | unsafe {
                                    *(src.add(offset + (SIMD_STRIDE_SIZE / unit_size))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (2 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                }
                                | unsafe {
                                    *(src.add(offset + (3 * (SIMD_STRIDE_SIZE / unit_size)))
                                        as *const $simd_ty)
                                };
                            // The unrolled part fails fast once per four strides.
                            if !$func(unroll_accu) {
                                return false;
                            }
                            offset += 4 * (SIMD_STRIDE_SIZE / unit_size);
                            if offset > len_minus_unroll {
                                break;
                            }
                        }
                    }
                    // Remaining whole strides (fewer than four) are accumulated
                    // into one vector and tested once.
                    let mut simd_accu = $splat;
                    while offset <= len_minus_stride {
                        // SAFETY: `offset <= len_minus_stride` means one full
                        // stride starting at `offset` lies inside `buffer`; see the
                        // alignment note on the unrolled loop above.
                        simd_accu = simd_accu | unsafe { *(src.add(offset) as *const $simd_ty) };
                        offset += SIMD_STRIDE_SIZE / unit_size;
                    }
                    if !$func(simd_accu) {
                        return false;
                    }
                }
            }
            // Trailing units that do not fill a stride (or the whole slice when
            // the vector path was not taken).
            for &unit in &buffer[offset..] {
                accu |= unit as usize;
            }
            accu < $bound
        }
    };
}
230
// Instantiates the whole-buffer range checks (`is_ascii_impl`,
// `is_basic_latin_impl`, `is_utf16_latin1_impl`) and defines
// `utf16_valid_up_to_impl`, either with explicit SIMD or with the ALU
// fallback, depending on the build configuration.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        use crate::simd_funcs::*;
        use packed_simd::u8x16;
        use packed_simd::u16x8;

        // Byte alignment that the SIMD loops in this file establish before
        // dereferencing a pointer as a vector.
        const SIMD_ALIGNMENT: usize = 16;

        // `SIMD_ALIGNMENT - 1`: selects the misaligned low bits of an address.
        const SIMD_ALIGNMENT_MASK: usize = 15;

        // SIMD instantiations of the whole-buffer range checks.
        by_unit_check_simd!(is_ascii_impl, u8, u8x16::splat(0), u8x16, 0x80, simd_is_ascii);
        by_unit_check_simd!(is_basic_latin_impl, u16, u16x8::splat(0), u16x8, 0x80, simd_is_basic_latin);
        by_unit_check_simd!(is_utf16_latin1_impl, u16, u16x8::splat(0), u16x8, 0x100, simd_is_latin1);

        /// Returns the index of the first unpaired surrogate in `buffer`, or
        /// `buffer.len()` if there is none.
        ///
        /// Aligned strides are screened with `contains_surrogates`; only
        /// strides that contain a surrogate, the unaligned pieces and the
        /// tail are examined unit by unit with `utf16_valid_up_to_alu`.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            // This function is a mess, because it simultaneously tries to do
            // only aligned SIMD (perhaps misguidedly) and needs to deal with
            // the last code unit in a SIMD stride being part of a valid
            // surrogate pair.
            let unit_size = ::std::mem::size_of::<u16>();
            let src = buffer.as_ptr();
            let len = buffer.len();
            let mut offset = 0usize;
            // Each iteration of `'outer` (re-)establishes alignment at the
            // current `offset` and then runs the aligned inner loop. Breaking
            // out of it hands the rest of the buffer to the ALU code below.
            'outer: loop {
                // Units from `offset` to the next aligned address. Only the
                // address produced by `src.add(offset)` is used here; nothing
                // is read through it.
                let until_alignment = ((SIMD_ALIGNMENT - ((unsafe { src.add(offset) } as usize) & SIMD_ALIGNMENT_MASK)) &
                                        SIMD_ALIGNMENT_MASK) / unit_size;
                if until_alignment == 0 {
                    // Already aligned: need one full stride.
                    if offset + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                } else {
                    let offset_plus_until_alignment = offset + until_alignment;
                    // One extra unit is included so that a high surrogate that
                    // is last before the alignment boundary can be paired with
                    // the unit on the boundary.
                    let offset_plus_until_alignment_plus_one = offset_plus_until_alignment + 1;
                    if offset_plus_until_alignment_plus_one + SIMD_STRIDE_SIZE / unit_size > len {
                        break;
                    }
                    let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_until_alignment_plus_one]);
                    if up_to < until_alignment {
                        // Unpaired surrogate before the alignment boundary.
                        return offset + up_to;
                    }
                    if last_valid_low {
                        // The unit on the boundary was consumed as the low half
                        // of a pair; restart (and realign) after it.
                        offset = offset_plus_until_alignment_plus_one;
                        continue;
                    }
                    offset = offset_plus_until_alignment;
                }
                // Both branches above guarantee `offset + stride <= len` here.
                let len_minus_stride = len - SIMD_STRIDE_SIZE / unit_size;
                loop {
                    let offset_plus_stride = offset + SIMD_STRIDE_SIZE / unit_size;
                    // SAFETY: `offset` is at the alignment boundary computed
                    // above (or a whole number of strides past it) and
                    // `offset + stride <= len` holds on entry and after the
                    // `offset > len_minus_stride` check below.
                    // NOTE(review): assumes `SIMD_STRIDE_SIZE` (defined
                    // elsewhere) is the size of `u16x8` and a multiple of
                    // `SIMD_ALIGNMENT` - confirm.
                    if contains_surrogates(unsafe { *(src.add(offset) as *const u16x8) }) {
                        if offset_plus_stride == len {
                            // No lookahead unit exists; let the ALU tail handle
                            // this final stride.
                            break 'outer;
                        }
                        // Re-check the stride plus one lookahead unit by unit.
                        let offset_plus_stride_plus_one = offset_plus_stride + 1;
                        let (up_to, last_valid_low) = utf16_valid_up_to_alu(&buffer[offset..offset_plus_stride_plus_one]);
                        if up_to < SIMD_STRIDE_SIZE / unit_size {
                            // Unpaired surrogate inside the stride.
                            return offset + up_to;
                        }
                        if last_valid_low {
                            // The lookahead unit completed a pair that started
                            // on the last unit of the stride; skip it, which
                            // loses alignment, so go back to `'outer`.
                            offset = offset_plus_stride_plus_one;
                            continue 'outer;
                        }
                    }
                    offset = offset_plus_stride;
                    if offset > len_minus_stride {
                        break 'outer;
                    }
                }
            }
            // Whatever is left is too short for aligned SIMD.
            let (up_to, _) = utf16_valid_up_to_alu(&buffer[offset..]);
            offset + up_to
        }
    } else {
        // ALU (word-at-a-time) instantiations of the whole-buffer range checks.
        by_unit_check_alu!(is_ascii_impl, u8, 0x80, ASCII_MASK);
        by_unit_check_alu!(is_basic_latin_impl, u16, 0x80, BASIC_LATIN_MASK);
        by_unit_check_alu!(is_utf16_latin1_impl, u16, 0x100, LATIN1_MASK);

        /// Returns the index of the first unpaired surrogate in `buffer`, or
        /// `buffer.len()` if there is none.
        #[inline(always)]
        fn utf16_valid_up_to_impl(buffer: &[u16]) -> usize {
            let (up_to, _) = utf16_valid_up_to_alu(buffer);
            up_to
        }
    }
}
316
/// Scans `buffer` one code unit at a time and returns how many leading code
/// units form valid UTF-16, i.e. the index of the first unpaired surrogate
/// or `buffer.len()` if there is none.
///
/// The second return value is true iff the last code unit of the slice was
/// reached and turned out to be a low surrogate that is part of a valid pair.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline(always)]
fn utf16_valid_up_to_alu(buffer: &[u16]) -> (usize, bool) {
    let len = buffer.len();
    let mut consumed = 0usize;
    // Whether the most recently consumed code unit was the low half of a pair.
    let mut ended_with_pair = false;
    while consumed < len {
        match buffer[consumed] {
            0xD800..=0xDBFF => {
                // High surrogate: only valid when a low surrogate follows
                // immediately within the slice.
                match buffer.get(consumed + 1) {
                    Some(&low) if low >= 0xDC00 && low <= 0xDFFF => {
                        consumed += 2;
                        ended_with_pair = true;
                    }
                    _ => return (consumed, false),
                }
            }
            // Low surrogate that was not preceded by a high surrogate.
            0xDC00..=0xDFFF => return (consumed, false),
            // Not a surrogate.
            _ => {
                consumed += 1;
                ended_with_pair = false;
            }
        }
    }
    (consumed, ended_with_pair)
}
362
363cfg_if! {
364    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
365        #[inline(always)]
366        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
367            let mut offset = 0usize;
368            let bytes = buffer.as_bytes();
369            let len = bytes.len();
370            if len >= SIMD_STRIDE_SIZE {
371                let src = bytes.as_ptr();
372                let mut until_alignment = (SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
373                                           SIMD_ALIGNMENT_MASK;
374                if until_alignment + SIMD_STRIDE_SIZE <= len {
375                    while until_alignment != 0 {
376                        if bytes[offset] > 0xC3 {
377                            return Some(offset);
378                        }
379                        offset += 1;
380                        until_alignment -= 1;
381                    }
382                    let len_minus_stride = len - SIMD_STRIDE_SIZE;
383                    loop {
384                        if !simd_is_str_latin1(unsafe { *(src.add(offset) as *const u8x16) }) {
385                            // TODO: Ensure this compiles away when inlined into `is_str_latin1()`.
386                            while bytes[offset] & 0xC0 == 0x80 {
387                                offset += 1;
388                            }
389                            return Some(offset);
390                        }
391                        offset += SIMD_STRIDE_SIZE;
392                        if offset > len_minus_stride {
393                            break;
394                        }
395                    }
396                }
397            }
398            for i in offset..len {
399                if bytes[i] > 0xC3 {
400                    return Some(i);
401                }
402            }
403            None
404        }
405    } else {
406        #[inline(always)]
407        fn is_str_latin1_impl(buffer: &str) -> Option<usize> {
408            let mut bytes = buffer.as_bytes();
409            let mut total = 0;
410            loop {
411                if let Some((byte, offset)) = validate_ascii(bytes) {
412                    total += offset;
413                    if byte > 0xC3 {
414                        return Some(total);
415                    }
416                    bytes = &bytes[offset + 2..];
417                    total += 2;
418                } else {
419                    return None;
420                }
421            }
422        }
423    }
424}
425
426#[inline(always)]
427fn is_utf8_latin1_impl(buffer: &[u8]) -> Option<usize> {
428    let mut bytes = buffer;
429    let mut total = 0;
430    loop {
431        if let Some((byte, offset)) = validate_ascii(bytes) {
432            total += offset;
433            if in_inclusive_range8(byte, 0xC2, 0xC3) {
434                let next = offset + 1;
435                if next == bytes.len() {
436                    return Some(total);
437                }
438                if bytes[next] & 0xC0 != 0x80 {
439                    return Some(total);
440                }
441                bytes = &bytes[offset + 2..];
442                total += 2;
443            } else {
444                return Some(total);
445            }
446        } else {
447            return None;
448        }
449    }
450}
451
452cfg_if! {
453    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
454        #[inline(always)]
455        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
456            let mut offset = 0usize;
457            let len = buffer.len();
458            if len >= SIMD_STRIDE_SIZE / 2 {
459                let src = buffer.as_ptr();
460                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
461                                           SIMD_ALIGNMENT_MASK) / 2;
462                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
463                    while until_alignment != 0 {
464                        if is_utf16_code_unit_bidi(buffer[offset]) {
465                            return true;
466                        }
467                        offset += 1;
468                        until_alignment -= 1;
469                    }
470                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
471                    loop {
472                        if is_u16x8_bidi(unsafe { *(src.add(offset) as *const u16x8) }) {
473                            return true;
474                        }
475                        offset += SIMD_STRIDE_SIZE / 2;
476                        if offset > len_minus_stride {
477                            break;
478                        }
479                    }
480                }
481            }
482            for &u in &buffer[offset..] {
483                if is_utf16_code_unit_bidi(u) {
484                    return true;
485                }
486            }
487            false
488        }
489    } else {
490        #[inline(always)]
491        fn is_utf16_bidi_impl(buffer: &[u16]) -> bool {
492            for &u in buffer {
493                if is_utf16_code_unit_bidi(u) {
494                    return true;
495                }
496            }
497            false
498        }
499    }
500}
501
502cfg_if! {
503    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"), all(target_endian = "little", target_feature = "neon"))))] {
        /// SIMD classification of `buffer` as Latin1, left-to-right or bidi.
        ///
        /// The buffer is scanned for the first code unit above 0xFF. If there
        /// is none the result is `Latin1Bidi::Latin1`. Otherwise the rest of
        /// the buffer, starting at the point of discovery, is scanned for
        /// right-to-left content and the result is `Latin1Bidi::Bidi` or
        /// `Latin1Bidi::LeftToRight` accordingly.
        #[inline(always)]
        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
            let mut offset = 0usize;
            let len = buffer.len();
            // Vector processing is only attempted when at least one full
            // stride of code units is available.
            if len >= SIMD_STRIDE_SIZE / 2 {
                let src = buffer.as_ptr();
                // Code units in front of the first aligned address.
                let mut until_alignment = ((SIMD_ALIGNMENT - ((src as usize) & SIMD_ALIGNMENT_MASK)) &
                                           SIMD_ALIGNMENT_MASK) / 2;
                if until_alignment + (SIMD_STRIDE_SIZE / 2) <= len {
                    // Unaligned prefix, one code unit at a time.
                    while until_alignment != 0 {
                        if buffer[offset] > 0xFF {
                            // This transition isn't optimal, since the alignment
                            // is recomputed by the callee, but not tweaking
                            // further today.
                            if is_utf16_bidi_impl(&buffer[offset..]) {
                                return Latin1Bidi::Bidi;
                            }
                            return Latin1Bidi::LeftToRight;
                        }
                        offset += 1;
                        until_alignment -= 1;
                    }
                    // Largest offset at which one more full stride fits.
                    let len_minus_stride = len - (SIMD_STRIDE_SIZE / 2);
                    // Latin1 phase: aligned strides until one is not Latin1.
                    loop {
                        // SAFETY: `offset` sits on the alignment boundary
                        // computed above plus a whole number of strides, and
                        // `offset <= len_minus_stride` holds on entry and after
                        // the check at the bottom of this loop.
                        // NOTE(review): assumes `SIMD_STRIDE_SIZE` (defined
                        // elsewhere) is the size of `u16x8` and a multiple of
                        // `SIMD_ALIGNMENT` - confirm.
                        let mut s = unsafe { *(src.add(offset) as *const u16x8) };
                        if !simd_is_latin1(s) {
                            // Bidi phase: from here on the result can only be
                            // `Bidi` or `LeftToRight`. Keep the alignment and
                            // check this and all following strides for
                            // right-to-left content.
                            loop {
                                if is_u16x8_bidi(s) {
                                    return Latin1Bidi::Bidi;
                                }
                                offset += SIMD_STRIDE_SIZE / 2;
                                if offset > len_minus_stride {
                                    // Less than a stride left: finish unit by
                                    // unit.
                                    for &u in &buffer[offset..] {
                                        if is_utf16_code_unit_bidi(u) {
                                            return Latin1Bidi::Bidi;
                                        }
                                    }
                                    return Latin1Bidi::LeftToRight;
                                }
                                // SAFETY: `offset <= len_minus_stride` was just
                                // checked and `offset` advanced by a whole
                                // stride from an aligned position.
                                s = unsafe { *(src.add(offset) as *const u16x8) };
                            }
                        }
                        offset += SIMD_STRIDE_SIZE / 2;
                        if offset > len_minus_stride {
                            break;
                        }
                    }
                }
            }
            // Scalar tail (or the whole buffer when it was too short for
            // SIMD): everything before `offset` was Latin1.
            let mut iter = (&buffer[offset..]).iter();
            loop {
                if let Some(&u) = iter.next() {
                    if u > 0xFF {
                        // First non-Latin1 code unit: check it and every code
                        // unit after it for right-to-left content.
                        let mut inner_u = u;
                        loop {
                            if is_utf16_code_unit_bidi(inner_u) {
                                return Latin1Bidi::Bidi;
                            }
                            if let Some(&code_unit) = iter.next() {
                                inner_u = code_unit;
                            } else {
                                return Latin1Bidi::LeftToRight;
                            }
                        }
                    }
                } else {
                    return Latin1Bidi::Latin1;
                }
            }
        }
573    } else {
574        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
575        #[inline(always)]
576        fn check_utf16_for_latin1_and_bidi_impl(buffer: &[u16]) -> Latin1Bidi {
577            let mut offset = 0usize;
578            let len = buffer.len();
579            if len >= ALU_ALIGNMENT / 2 {
580                let src = buffer.as_ptr();
581                let mut until_alignment = ((ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) &
582                                           ALU_ALIGNMENT_MASK) / 2;
583                if until_alignment + ALU_ALIGNMENT / 2 <= len {
584                    while until_alignment != 0 {
585                        if buffer[offset] > 0xFF {
586                            if is_utf16_bidi_impl(&buffer[offset..]) {
587                                return Latin1Bidi::Bidi;
588                            }
589                            return Latin1Bidi::LeftToRight;
590                        }
591                        offset += 1;
592                        until_alignment -= 1;
593                    }
594                    let len_minus_stride = len - ALU_ALIGNMENT / 2;
595                    loop {
596                        if unsafe { *(src.add(offset) as *const usize) } & LATIN1_MASK != 0 {
597                            if is_utf16_bidi_impl(&buffer[offset..]) {
598                                return Latin1Bidi::Bidi;
599                            }
600                            return Latin1Bidi::LeftToRight;
601                        }
602                        offset += ALU_ALIGNMENT / 2;
603                        if offset > len_minus_stride {
604                            break;
605                        }
606                    }
607                }
608            }
609            let mut iter = (&buffer[offset..]).iter();
610            loop {
611                if let Some(&u) = iter.next() {
612                    if u > 0xFF {
613                        let mut inner_u = u;
614                        loop {
615                            if is_utf16_code_unit_bidi(inner_u) {
616                                return Latin1Bidi::Bidi;
617                            }
618                            if let Some(&code_unit) = iter.next() {
619                                inner_u = code_unit;
620                            } else {
621                                return Latin1Bidi::LeftToRight;
622                            }
623                        }
624                    }
625                } else {
626                    return Latin1Bidi::Latin1;
627                }
628            }
629        }
630    }
631}
632
/// Checks whether the buffer is all-ASCII.
///
/// Returns `true` for an empty buffer.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_ascii(buffer: &[u8]) -> bool {
    is_ascii_impl(buffer)
}
640
/// Checks whether the buffer is all-Basic Latin (i.e. UTF-16 representing
/// only ASCII characters).
///
/// Returns `true` for an empty buffer.
///
/// May read the entire buffer even if it isn't all-ASCII. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_basic_latin(buffer: &[u16]) -> bool {
    is_basic_latin_impl(buffer)
}
649
/// Checks whether the buffer is valid UTF-8 representing only code points
/// less than or equal to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if UTF-8
/// invalidity or code points above U+00FF are discovered.)
pub fn is_utf8_latin1(buffer: &[u8]) -> bool {
    // The implementation reports the position of the first problem, if any.
    is_utf8_latin1_impl(buffer).is_none()
}
658
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Fails fast. (I.e. returns before having read the whole buffer if code
/// points above U+00FF are discovered.)
pub fn is_str_latin1(buffer: &str) -> bool {
    // The implementation reports a position when non-Latin1 content is found.
    is_str_latin1_impl(buffer).is_none()
}
667
/// Checks whether the buffer represents only code points less than or equal
/// to U+00FF.
///
/// Returns `true` for an empty buffer.
///
/// May read the entire buffer even if it isn't all-Latin1. (I.e. the function
/// is not guaranteed to fail fast.)
pub fn is_utf16_latin1(buffer: &[u16]) -> bool {
    is_utf16_latin1_impl(buffer)
}
676
677/// Checks whether a potentially-invalid UTF-8 buffer contains code points
678/// that trigger right-to-left processing.
679///
680/// The check is done on a Unicode block basis without regard to assigned
681/// vs. unassigned code points in the block. Hebrew presentation forms in
682/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. it is treated as right-to-left). Additionally,
684/// the four RIGHT-TO-LEFT FOO controls in General Punctuation are checked
685/// for. Control characters that are technically bidi controls but do not
686/// cause right-to-left behavior without the presence of right-to-left
687/// characters or right-to-left controls are not checked for. As a special
688/// case, U+FEFF is excluded from Arabic Presentation Forms-B.
689///
690/// Returns `true` if the input is invalid UTF-8 or the input contains an
691/// RTL character. Returns `false` if the input is valid UTF-8 and contains
692/// no RTL characters.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if, cyclomatic_complexity))]
#[inline]
pub fn is_utf8_bidi(buffer: &[u8]) -> bool {
    // As of rustc 1.25.0-nightly (73ac5d6a8 2018-01-11), this is faster
    // than UTF-8 validation followed by `is_str_bidi()` for German,
    // Russian and Japanese. However, this is considerably slower for Thai.
    // Chances are that the compiler makes some branch predictions that are
    // unfortunate for Thai. Not spending the time to manually optimize
    // further at this time, since it's unclear if this variant even has
    // use cases. However, this is worth revisiting once Rust gets the
    // ability to annotate relative priorities of match arms.

    // Structure of this function:
    //  * `'outer` calls `validate_ascii` to skip over a run of ASCII.
    //  * `'inner` handles non-ASCII sequences for as long as at least four
    //    bytes remain, so that no arm needs its own length check.
    //  * The second `match` handles the last (fewer than four) bytes with
    //    explicit length checks.
    // Any invalid sequence and any right-to-left sequence returns `true`.
    //
    // The byte-level comparisons below rely on the UTF-8 encodings of the
    // code points on either side of each right-to-left range boundary:
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    let mut src = buffer;
    'outer: loop {
        // NOTE(review): `validate_ascii` is defined outside this view. From
        // its use here it appears to return `Some((byte, index))` for the
        // first non-ASCII byte and `None` when the whole slice is ASCII --
        // confirm against its definition.
        if let Some((mut byte, mut read)) = validate_ascii(src) {
            // Check for the longest sequence to avoid checking twice for the
            // multi-byte sequences.
            if read + 4 <= src.len() {
                // SAFETY (applies to every `src.get_unchecked(..)` in this
                // loop): `read + 4 <= src.len()` holds at the top of each
                // iteration. It is checked just above before entering and
                // re-checked at the bottom of the loop before continuing, so
                // indices `read + 1` through `read + 3` are in bounds.
                //
                // SAFETY (applies to every `UTF8_DATA.table.get_unchecked(..)`
                // in this function): `byte as usize + 0x80` is at most 0x17F.
                // NOTE(review): this assumes `UTF8_DATA.table` has at least
                // 0x180 entries; the table is defined outside this view.
                'inner: loop {
                    // At this point, `byte` is not included in `read`.
                    match byte {
                        0..=0x7F => {
                            // ASCII: go back to SIMD.
                            read += 1;
                            src = &src[read..];
                            continue 'outer;
                        }
                        0xC2..=0xD5 => {
                            // Two-byte
                            // These leads never start a right-to-left code
                            // point, so only the trail byte needs validating.
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            read += 2;
                        }
                        0xD6 => {
                            // Two-byte
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            if !in_inclusive_range8(second, 0x80, 0xBF) {
                                return true;
                            }
                            // XXX consider folding the above and below checks
                            // D6 90 is U+0590, the first right-to-left code
                            // point; D6 8F (U+058F) is the last one below it.
                            if second > 0x8F {
                                return true;
                            }
                            read += 2;
                        }
                        // two-byte starting with 0xD7 and above is bidi
                        0xE1 | 0xE3..=0xEC | 0xEE => {
                            // Three-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // `third >> 6` equals 2 exactly when `third` is
                            // in 0x80..=0xBF. Presumably the table term is
                            // zero exactly when `second` is an acceptable
                            // trail for this lead (table contents are defined
                            // elsewhere), so any other combined value means
                            // an invalid sequence.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xE2 => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // Same validity check as the "three-byte normal"
                            // arm above.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // The four right-to-left controls: U+200F,
                            // U+202B, U+202E (E2 80 xx) and U+2067 (E2 81 A7).
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xEF => {
                            // Three-byte normal, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // Same validity check as the "three-byte normal"
                            // arm above.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                // EF AC 9D (U+FB1D) through EF B7 BF (U+FDFF)
                                // is right-to-left; EF AC 9C and below is not.
                                if second == 0xAC {
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                // EF B9 B0 (U+FE70) through EF BB BE (U+FEFE)
                                // is right-to-left; EF BB BF (U+FEFF) is the
                                // one excluded code point at the top.
                                if second == 0xB9 {
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                            read += 3;
                        }
                        0xE0 => {
                            // Three-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // Same shape of validity check as the other
                            // three-byte arms; the table entry for this lead
                            // presumably also rejects overlong encodings.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            // XXX can this be folded into the above validity check
                            // E0 A4 80 is U+0900, so a smaller second byte
                            // means U+08FF or below, which is right-to-left.
                            if second < 0xA4 {
                                return true;
                            }
                            read += 3;
                        }
                        0xED => {
                            // Three-byte special upper bound
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            // Same shape of validity check as the other
                            // three-byte arms; no right-to-left ranges start
                            // with this lead.
                            if ((UTF8_DATA.table[usize::from(second)]
                                & unsafe {
                                    *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                })
                                | (third >> 6))
                                != 2
                            {
                                return true;
                            }
                            read += 3;
                        }
                        0xF1..=0xF4 => {
                            // Four-byte normal
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            // `third >> 6` contributes 0x002 and
                            // `(fourth & 0xC0) << 2` contributes 0x200 exactly
                            // when each byte is in 0x80..=0xBF. Any other
                            // combined value (including extra bits from the
                            // table term) means an invalid sequence.
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            read += 4;
                        }
                        0xF0 => {
                            // Four-byte special lower bound, potentially bidi
                            let second = unsafe { *(src.get_unchecked(read + 1)) };
                            let third = unsafe { *(src.get_unchecked(read + 2)) };
                            let fourth = unsafe { *(src.get_unchecked(read + 3)) };
                            // Same validity check as the "four-byte normal"
                            // arm above.
                            if (u16::from(
                                UTF8_DATA.table[usize::from(second)]
                                    & unsafe {
                                        *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80))
                                    },
                            ) | u16::from(third >> 6)
                                | (u16::from(fourth & 0xC0) << 2))
                                != 0x202
                            {
                                return true;
                            }
                            // F0 90 A0 80 is U+10800 and F0 9E A0 80 is
                            // U+1E800: with either second byte, a third byte
                            // of 0xA0 or above lands in an astral
                            // right-to-left range.
                            if unsafe { unlikely(second == 0x90 || second == 0x9E) } {
                                // Re-reads the same byte as `third` above,
                                // this time with a bounds-checked index.
                                let third = src[read + 2];
                                if third >= 0xA0 {
                                    return true;
                                }
                            }
                            read += 4;
                        }
                        _ => {
                            // Invalid lead or bidi-only lead
                            return true;
                        }
                    }
                    // Fewer than four bytes left: either the input is fully
                    // consumed, or the remainder goes to the length-checked
                    // `match` below.
                    if read + 4 > src.len() {
                        if read == src.len() {
                            return false;
                        }
                        byte = src[read];
                        break 'inner;
                    }
                    byte = src[read];
                    continue 'inner;
                }
            }
            // We can't have a complete 4-byte sequence, but we could still have
            // a complete shorter sequence.

            // At this point, `byte` is not included in `read`.
            //
            // SAFETY (applies to every `src.get_unchecked(..)` in this
            // `match`): each arm first returns if `new_read > src.len()`, so
            // the indices below `new_read` that it reads afterwards are in
            // bounds.
            match byte {
                0..=0x7F => {
                    // ASCII: go back to SIMD.
                    read += 1;
                    src = &src[read..];
                    continue 'outer;
                }
                0xC2..=0xD5 => {
                    // Two-byte
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                0xD6 => {
                    // Two-byte, potentially bidi
                    let new_read = read + 2;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    if !in_inclusive_range8(second, 0x80, 0xBF) {
                        return true;
                    }
                    // XXX consider folding the above and below checks
                    // D6 90 is U+0590, the first right-to-left code point.
                    if second > 0x8F {
                        return true;
                    }
                    read = new_read;
                    // We need to deal with the case where we came here with 3 bytes
                    // left, so we need to take a look at the last one.
                    src = &src[read..];
                    continue 'outer;
                }
                // two-byte starting with 0xD7 and above is bidi
                0xE1 | 0xE3..=0xEC | 0xEE => {
                    // Three-byte normal
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    // Same validity check as in the inner loop: the result is
                    // 2 only if `third` is in 0x80..=0xBF and the table term
                    // adds no other bits.
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                0xE2 => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // U+200F, U+202B, U+202E and U+2067.
                    if second == 0x80 {
                        if third == 0x8F || third == 0xAB || third == 0xAE {
                            return true;
                        }
                    } else if second == 0x81 {
                        if third == 0xA7 {
                            return true;
                        }
                    }
                }
                0xEF => {
                    // Three-byte normal, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // U+FB1D..=U+FDFF and U+FE70..=U+FEFE, as in the inner
                    // loop.
                    if in_inclusive_range8(second, 0xAC, 0xB7) {
                        if second == 0xAC {
                            if third > 0x9C {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                        if second == 0xB9 {
                            if third > 0xAF {
                                return true;
                            }
                        } else if second == 0xBB {
                            if third != 0xBF {
                                return true;
                            }
                        } else {
                            return true;
                        }
                    }
                }
                0xE0 => {
                    // Three-byte special lower bound, potentially bidi
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                    // XXX can this be folded into the above validity check
                    // Below E0 A4 80 (U+0900) means U+08FF or lower.
                    if second < 0xA4 {
                        return true;
                    }
                }
                0xED => {
                    // Three-byte special upper bound
                    let new_read = read + 3;
                    if new_read > src.len() {
                        return true;
                    }
                    let second = unsafe { *(src.get_unchecked(read + 1)) };
                    let third = unsafe { *(src.get_unchecked(read + 2)) };
                    if ((UTF8_DATA.table[usize::from(second)]
                        & unsafe { *(UTF8_DATA.table.get_unchecked(byte as usize + 0x80)) })
                        | (third >> 6))
                        != 2
                    {
                        return true;
                    }
                }
                _ => {
                    // Invalid lead, 4-byte lead or 2-byte bidi-only lead
                    return true;
                }
            }
            // Only the three-byte arms fall through to here. Fewer than four
            // bytes remained and the arm confirmed that three were available,
            // so the sequence just checked ended the input.
            return false;
        } else {
            // `validate_ascii` found nothing to report in the remainder.
            return false;
        }
    }
}
1110
/// Checks whether a valid UTF-8 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are checked for. Control characters that are technically bidi controls
/// but do not cause right-to-left behavior without the presence of
/// right-to-left characters or right-to-left controls are not checked for.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains a right-to-left code point as
/// described above and `false` otherwise.
#[cfg_attr(feature = "cargo-clippy", allow(collapsible_if))]
#[inline]
pub fn is_str_bidi(buffer: &str) -> bool {
    // The byte-level comparisons below rely on the UTF-8 encodings of the
    // code points on either side of each right-to-left range boundary:
    //
    // U+058F: D6 8F
    // U+0590: D6 90
    // U+08FF: E0 A3 BF
    // U+0900: E0 A4 80
    //
    // U+200F: E2 80 8F
    // U+202B: E2 80 AB
    // U+202E: E2 80 AE
    // U+2067: E2 81 A7
    //
    // U+FB1C: EF AC 9C
    // U+FB1D: EF AC 9D
    // U+FDFF: EF B7 BF
    // U+FE00: EF B8 80
    //
    // U+FE6F: EF B9 AF
    // U+FE70: EF B9 B0
    // U+FEFE: EF BB BE
    // U+FEFF: EF BB BF
    //
    // U+107FF: F0 90 9F BF
    // U+10800: F0 90 A0 80
    // U+10FFF: F0 90 BF BF
    // U+11000: F0 91 80 80
    //
    // U+1E7FF: F0 9E 9F BF
    // U+1E800: F0 9E A0 80
    // U+1EFFF: F0 9E BF BF
    // U+1F000: F0 9F 80 80
    //
    // Because `buffer` is a `&str`, the bytes are valid UTF-8: every lead
    // byte is followed by its trail bytes, so the `bytes[read + n]` reads
    // below stay within the slice and no validity checks are needed.
    let mut bytes = buffer.as_bytes();
    'outer: loop {
        // TODO: Instead of just validating ASCII using SIMD, use SIMD
        // to check for non-ASCII lead bytes, too, to quickly conclude
        // that the vector consists entirely of CJK and below-Hebrew
        // code points.
        // Unfortunately, scripts above Arabic but below CJK share
        // lead bytes with RTL.
        //
        // NOTE(review): `validate_ascii` is defined outside this view. From
        // its use here it appears to return `Some((byte, index))` for the
        // first non-ASCII byte and `None` when the whole slice is ASCII --
        // confirm against its definition.
        if let Some((mut byte, mut read)) = validate_ascii(bytes) {
            'inner: loop {
                // At this point, `byte` is not included in `read`.
                if byte < 0xE0 {
                    if byte >= 0x80 {
                        // Two-byte
                        // Adding `unlikely` here improved throughput on
                        // Russian plain text by 33%!
                        if unsafe { unlikely(byte >= 0xD6) } {
                            if byte == 0xD6 {
                                // D6 90 is U+0590, the first right-to-left
                                // code point.
                                let second = bytes[read + 1];
                                if second > 0x8F {
                                    return true;
                                }
                            } else {
                                // Every two-byte lead above 0xD6 encodes
                                // right-to-left code points only.
                                return true;
                            }
                        }
                        read += 2;
                    } else {
                        // ASCII: write and go back to SIMD.
                        read += 1;
                        // Intuitively, we should go back to the outer loop only
                        // if byte is 0x30 or above, so as to avoid trashing on
                        // ASCII space, comma and period in non-Latin context.
                        // However, the extra branch seems to cost more than it's
                        // worth.
                        bytes = &bytes[read..];
                        continue 'outer;
                    }
                } else if byte < 0xF0 {
                    // Three-byte
                    // Only the leads 0xE0, 0xE2 and 0xEF can start a
                    // right-to-left code point; every other three-byte lead
                    // skips straight to `read += 3`.
                    if unsafe { unlikely(!in_inclusive_range8(byte, 0xE3, 0xEE) && byte != 0xE1) } {
                        let second = bytes[read + 1];
                        if byte == 0xE0 {
                            // Below E0 A4 80 (U+0900) means U+08FF or lower.
                            if second < 0xA4 {
                                return true;
                            }
                        } else if byte == 0xE2 {
                            // U+200F, U+202B, U+202E and U+2067.
                            let third = bytes[read + 2];
                            if second == 0x80 {
                                if third == 0x8F || third == 0xAB || third == 0xAE {
                                    return true;
                                }
                            } else if second == 0x81 {
                                if third == 0xA7 {
                                    return true;
                                }
                            }
                        } else {
                            debug_assert_eq!(byte, 0xEF);
                            if in_inclusive_range8(second, 0xAC, 0xB7) {
                                // EF AC 9D (U+FB1D) through EF B7 BF (U+FDFF).
                                if second == 0xAC {
                                    let third = bytes[read + 2];
                                    if third > 0x9C {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            } else if in_inclusive_range8(second, 0xB9, 0xBB) {
                                // EF B9 B0 (U+FE70) through EF BB BE (U+FEFE);
                                // EF BB BF (U+FEFF) is excluded.
                                if second == 0xB9 {
                                    let third = bytes[read + 2];
                                    if third > 0xAF {
                                        return true;
                                    }
                                } else if second == 0xBB {
                                    let third = bytes[read + 2];
                                    if third != 0xBF {
                                        return true;
                                    }
                                } else {
                                    return true;
                                }
                            }
                        }
                    }
                    read += 3;
                } else {
                    // Four-byte
                    // F0 90 A0 80 is U+10800 and F0 9E A0 80 is U+1E800: with
                    // either second byte, a third byte of 0xA0 or above lands
                    // in an astral right-to-left range.
                    let second = bytes[read + 1];
                    if unsafe { unlikely(byte == 0xF0 && (second == 0x90 || second == 0x9E)) } {
                        let third = bytes[read + 2];
                        if third >= 0xA0 {
                            return true;
                        }
                    }
                    read += 4;
                }
                // The comparison is always < or == and never >, but including
                // > here to let the compiler assume that < is true if this
                // comparison is false.
                if read >= bytes.len() {
                    return false;
                }
                byte = bytes[read];
                continue 'inner;
            }
        } else {
            // `validate_ascii` found nothing to report in the remainder.
            return false;
        }
    }
}
1266
/// Checks whether a UTF-16 buffer contains code points that trigger
/// right-to-left processing.
///
/// The check is done on a Unicode block basis without regard to assigned
/// vs. unassigned code points in the block. Hebrew presentation forms in
/// the Alphabetic Presentation Forms block are treated as if they formed
/// a block on their own (i.e. they are treated as right-to-left).
/// Additionally, the four RIGHT-TO-LEFT FOO controls in General Punctuation
/// are checked for. Control characters that are technically bidi controls
/// but do not cause right-to-left behavior without the presence of
/// right-to-left characters or right-to-left controls are not checked for.
/// As a special case, U+FEFF is excluded from Arabic Presentation Forms-B.
///
/// Returns `true` if the input contains an RTL character or an unpaired
/// high surrogate that could be the high half of an RTL character.
/// Returns `false` if the input contains neither RTL characters nor
/// unpaired high surrogates that could be higher halves of RTL characters.
pub fn is_utf16_bidi(buffer: &[u16]) -> bool {
    // Public entry point; the scan itself lives in `is_utf16_bidi_impl`.
    is_utf16_bidi_impl(buffer)
}
1287
/// Reports whether a Unicode scalar value triggers right-to-left processing.
///
/// Classification is done per Unicode block, whether or not the individual
/// code point is assigned. Hebrew presentation forms in the Alphabetic
/// Presentation Forms block count as a right-to-left block of their own.
/// The four RIGHT-TO-LEFT FOO controls in General Punctuation are matched
/// individually. Controls that are technically bidi controls but cannot
/// cause right-to-left behavior without right-to-left characters or
/// right-to-left controls also being present are not matched. U+FEFF is
/// deliberately left out of Arabic Presentation Forms-B.
#[inline(always)]
pub fn is_char_bidi(c: char) -> bool {
    match c {
        // BMP right-to-left blocks, Hebrew through Arabic Extended-A.
        // https://www.unicode.org/roadmaps/bmp/
        '\u{0590}'..='\u{08FF}' => true,
        // Every control with RIGHT-TO-LEFT in its name in
        // https://www.unicode.org/charts/PDF/U2000.pdf
        // (RLM, RLE, RLO and RLI, in that order).
        '\u{200F}' | '\u{202B}' | '\u{202E}' | '\u{2067}' => true,
        // Hebrew presentation forms and Arabic Presentation Forms A.
        '\u{FB1D}'..='\u{FDFF}' => true,
        // Arabic Presentation Forms B, stopping short of the BOM (U+FEFF).
        '\u{FE70}'..='\u{FEFE}' => true,
        // Supplementary right-to-left ranges.
        // https://www.unicode.org/roadmaps/smp/
        // First range: lead surrogate U+D802 or U+D803.
        '\u{10800}'..='\u{10FFF}' => true,
        // Second range: lead surrogate U+D83A or U+D83B.
        '\u{1E800}'..='\u{1EFFF}' => true,
        // Everything else: below Hebrew, between the ranges above, or
        // beyond the second astral range (emoji is there).
        _ => false,
    }
}
1355
/// Reports whether a single UTF-16 code unit triggers right-to-left
/// processing.
///
/// Classification is done per Unicode block, whether or not the individual
/// code point is assigned. Hebrew presentation forms in the Alphabetic
/// Presentation Forms block count as a right-to-left block of their own.
/// The four RIGHT-TO-LEFT FOO controls in General Punctuation are matched
/// individually. Controls that are technically bidi controls but cannot
/// cause right-to-left behavior without right-to-left characters or
/// right-to-left controls also being present are not matched. U+FEFF is
/// deliberately left out of Arabic Presentation Forms-B.
///
/// The supplementary-plane right-to-left blocks can be recognized from the
/// high surrogate alone, so this function answers `true` for those high
/// surrogates. That makes it usable on supplementary-plane text without
/// first combining surrogate pairs into scalar values. The flip side is
/// that such a high surrogate is reported as right-to-left even when it is
/// actually unpaired.
#[inline(always)]
pub fn is_utf16_code_unit_bidi(u: u16) -> bool {
    match u {
        // Hebrew through Arabic Extended-A.
        0x0590..=0x08FF => true,
        // The right-to-left controls in General Punctuation.
        0x200F | 0x202B | 0x202E | 0x2067 => true,
        // High surrogates of the first astral right-to-left range.
        0xD802..=0xD803 => true,
        // High surrogates of the second astral right-to-left range.
        0xD83A..=0xD83B => true,
        // Hebrew presentation forms and Arabic Presentation Forms A.
        0xFB1D..=0xFDFF => true,
        // Arabic Presentation Forms B, stopping short of the BOM (U+FEFF).
        0xFE70..=0xFEFE => true,
        // Everything else: below Hebrew, between the ranges above (other
        // surrogates, including the ones used for emoji, are here), or
        // above Arabic Presentation Forms.
        _ => false,
    }
}
1407
1408/// Checks whether a potentially invalid UTF-8 buffer contains code points
1409/// that trigger right-to-left processing or is all-Latin1.
1410///
1411/// Possibly more efficient than performing the checks separately.
1412///
1413/// Returns `Latin1Bidi::Latin1` if `is_utf8_latin1()` would return `true`.
1414/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf8_bidi()` would return
1415/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1416pub fn check_utf8_for_latin1_and_bidi(buffer: &[u8]) -> Latin1Bidi {
1417    if let Some(offset) = is_utf8_latin1_impl(buffer) {
1418        if is_utf8_bidi(&buffer[offset..]) {
1419            Latin1Bidi::Bidi
1420        } else {
1421            Latin1Bidi::LeftToRight
1422        }
1423    } else {
1424        Latin1Bidi::Latin1
1425    }
1426}
1427
1428/// Checks whether a valid UTF-8 buffer contains code points
1429/// that trigger right-to-left processing or is all-Latin1.
1430///
1431/// Possibly more efficient than performing the checks separately.
1432///
1433/// Returns `Latin1Bidi::Latin1` if `is_str_latin1()` would return `true`.
1434/// Otherwise, returns `Latin1Bidi::Bidi` if `is_str_bidi()` would return
1435/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
1436pub fn check_str_for_latin1_and_bidi(buffer: &str) -> Latin1Bidi {
1437    // The transition from the latin1 check to the bidi check isn't
1438    // optimal but not tweaking it to perfection today.
1439    if let Some(offset) = is_str_latin1_impl(buffer) {
1440        if is_str_bidi(&buffer[offset..]) {
1441            Latin1Bidi::Bidi
1442        } else {
1443            Latin1Bidi::LeftToRight
1444        }
1445    } else {
1446        Latin1Bidi::Latin1
1447    }
1448}
1449
/// Checks whether a potentially invalid UTF-16 buffer contains code points
/// that trigger right-to-left processing or is all-Latin1.
///
/// Possibly more efficient than performing the checks separately.
///
/// Returns `Latin1Bidi::Latin1` if `is_utf16_latin1()` would return `true`.
/// Otherwise, returns `Latin1Bidi::Bidi` if `is_utf16_bidi()` would return
/// `true`. Otherwise, returns `Latin1Bidi::LeftToRight`.
pub fn check_utf16_for_latin1_and_bidi(buffer: &[u16]) -> Latin1Bidi {
    // Public entry point only; all of the work happens in the `_impl`
    // function, which is defined outside this part of the file.
    check_utf16_for_latin1_and_bidi_impl(buffer)
}
1461
1462/// Converts potentially-invalid UTF-8 to valid UTF-16 with errors replaced
1463/// with the REPLACEMENT CHARACTER.
1464///
1465/// The length of the destination buffer must be at least the length of the
1466/// source buffer _plus one_.
1467///
1468/// Returns the number of `u16`s written.
1469///
1470/// # Panics
1471///
1472/// Panics if the destination buffer is shorter than stated above.
1473pub fn convert_utf8_to_utf16(src: &[u8], dst: &mut [u16]) -> usize {
1474    // TODO: Can the requirement for dst to be at least one unit longer
1475    // be eliminated?
1476    assert!(dst.len() > src.len());
1477    let mut decoder = Utf8Decoder::new_inner();
1478    let mut total_read = 0usize;
1479    let mut total_written = 0usize;
1480    loop {
1481        let (result, read, written) =
1482            decoder.decode_to_utf16_raw(&src[total_read..], &mut dst[total_written..], true);
1483        total_read += read;
1484        total_written += written;
1485        match result {
1486            DecoderResult::InputEmpty => {
1487                return total_written;
1488            }
1489            DecoderResult::OutputFull => {
1490                unreachable!("The assert at the top of the function should have caught this.");
1491            }
1492            DecoderResult::Malformed(_, _) => {
1493                // There should always be space for the U+FFFD, because
1494                // otherwise we'd have gotten OutputFull already.
1495                dst[total_written] = 0xFFFD;
1496                total_written += 1;
1497            }
1498        }
1499    }
1500}
1501
/// Converts valid UTF-8 to valid UTF-16.
///
/// The length of the destination buffer must be at least the length of the
/// source buffer.
///
/// Returns the number of `u16`s written.
///
/// The conversion alternates between a bulk ASCII copy (outer loop) and a
/// scalar decoder for non-ASCII sequences (inner loop). Because `src` is a
/// `&str`, its bytes are valid UTF-8, which is what allows the inner loop to
/// read continuation bytes without bounds checks.
///
/// # Panics
///
/// Panics if the destination buffer is shorter than stated above.
pub fn convert_str_to_utf16(src: &str, dst: &mut [u16]) -> usize {
    assert!(
        dst.len() >= src.len(),
        "Destination must not be shorter than the source."
    );
    let bytes = src.as_bytes();
    // Number of source bytes consumed and number of `u16`s produced so far.
    // Every UTF-8 sequence of n bytes yields at most n code units, so
    // `written <= read` holds throughout.
    let mut read = 0;
    let mut written = 0;
    'outer: loop {
        // Bulk-copy the ASCII prefix of what is left. `byte` becomes the
        // first non-ASCII byte encountered, if any.
        let mut byte = {
            let src_remaining = &bytes[read..];
            let dst_remaining = &mut dst[written..];
            let length = src_remaining.len();
            // SAFETY: `dst_remaining` is at least `length` long, because
            // `dst.len() >= src.len()` was asserted and `written <= read`.
            match unsafe {
                ascii_to_basic_latin(src_remaining.as_ptr(), dst_remaining.as_mut_ptr(), length)
            } {
                None => {
                    // The rest of the input was ASCII and has been copied.
                    written += length;
                    return written;
                }
                Some((non_ascii, consumed)) => {
                    // `consumed` ASCII bytes were copied before `non_ascii`.
                    read += consumed;
                    written += consumed;
                    non_ascii
                }
            }
        };
        'inner: loop {
            // At this point, `byte` is not included in `read`.
            if byte < 0xE0 {
                if byte >= 0x80 {
                    // Two-byte
                    // SAFETY: valid UTF-8 guarantees that a two-byte lead is
                    // followed by one continuation byte, so `read + 1` is in
                    // bounds; `written < dst.len()` follows from
                    // `written <= read < src.len() <= dst.len()`.
                    let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                    let point = ((u16::from(byte) & 0x1F) << 6) | (u16::from(second) & 0x3F);
                    unsafe { *(dst.get_unchecked_mut(written)) = point };
                    read += 2;
                    written += 1;
                } else {
                    // ASCII: write and go back to SIMD.
                    // SAFETY: `written <= read < src.len() <= dst.len()`.
                    unsafe { *(dst.get_unchecked_mut(written)) = u16::from(byte) };
                    read += 1;
                    written += 1;
                    // Intuitively, we should go back to the outer loop only
                    // if byte is 0x30 or above, so as to avoid trashing on
                    // ASCII space, comma and period in non-Latin context.
                    // However, the extra branch seems to cost more than it's
                    // worth.
                    continue 'outer;
                }
            } else if byte < 0xF0 {
                // Three-byte
                // SAFETY: valid UTF-8 guarantees two continuation bytes after
                // a three-byte lead; the destination index is in bounds for
                // the same reason as in the two-byte case.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let point = ((u16::from(byte) & 0xF) << 12)
                    | ((u16::from(second) & 0x3F) << 6)
                    | (u16::from(third) & 0x3F);
                unsafe { *(dst.get_unchecked_mut(written)) = point };
                read += 3;
                written += 1;
            } else {
                // Four-byte
                // SAFETY: valid UTF-8 guarantees three continuation bytes
                // after a four-byte lead. Two code units are written for four
                // source bytes, so `written + 1 < dst.len()` still holds.
                let second = unsafe { *(bytes.get_unchecked(read + 1)) };
                let third = unsafe { *(bytes.get_unchecked(read + 2)) };
                let fourth = unsafe { *(bytes.get_unchecked(read + 3)) };
                let point = ((u32::from(byte) & 0x7) << 18)
                    | ((u32::from(second) & 0x3F) << 12)
                    | ((u32::from(third) & 0x3F) << 6)
                    | (u32::from(fourth) & 0x3F);
                // High surrogate: 0xD7C0 is 0xD800 - (0x10000 >> 10), which
                // folds the subtraction of 0x10000 into the constant.
                unsafe { *(dst.get_unchecked_mut(written)) = (0xD7C0 + (point >> 10)) as u16 };
                // Low surrogate: the low ten bits are unaffected by
                // subtracting 0x10000, so they can be taken directly.
                unsafe {
                    *(dst.get_unchecked_mut(written + 1)) = (0xDC00 + (point & 0x3FF)) as u16
                };
                read += 4;
                written += 2;
            }
            // The comparison is always < or == and never >, but including
            // > here to let the compiler assume that < is true if this
            // comparison is false.
            if read >= src.len() {
                return written;
            }
            // Stay in the scalar loop: the next byte is classified by the
            // branches above without first retrying the bulk ASCII copy.
            byte = bytes[read];
            continue 'inner;
        }
    }
}
1598
1599/// Converts potentially-invalid UTF-8 to valid UTF-16 signaling on error.
1600///
1601/// The length of the destination buffer must be at least the length of the
1602/// source buffer.
1603///
1604/// Returns the number of `u16`s written or `None` if the input was invalid.
1605///
1606/// When the input was invalid, some output may have been written.
1607///
1608/// # Panics
1609///
1610/// Panics if the destination buffer is shorter than stated above.
1611pub fn convert_utf8_to_utf16_without_replacement(src: &[u8], dst: &mut [u16]) -> Option<usize> {
1612    assert!(
1613        dst.len() >= src.len(),
1614        "Destination must not be shorter than the source."
1615    );
1616    let (read, written) = convert_utf8_to_utf16_up_to_invalid(src, dst);
1617    if read == src.len() {
1618        return Some(written);
1619    }
1620    None
1621}
1622
/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
/// with the REPLACEMENT CHARACTER with potentially insufficient output
/// space.
///
/// Returns the number of code units read and the number of bytes written.
///
/// Guarantees that the bytes in the destination beyond the number of
/// bytes claimed as written by the second item of the return tuple
/// are left unmodified.
///
/// Not all code units are read if there isn't enough output space.
///
/// Note that this method isn't designed for general streamability but for
/// not allocating memory for the worst case up front. Specifically,
/// if the input starts with or ends with an unpaired surrogate, those are
/// replaced with the REPLACEMENT CHARACTER.
///
/// Matches the semantics of `TextEncoder.encodeInto()` from the
/// Encoding Standard.
///
/// # Safety
///
/// If you want to convert into a `&mut str`, use
/// `convert_utf16_to_str_partial()` instead of using this function
/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
#[inline(always)]
pub fn convert_utf16_to_utf8_partial(src: &[u16], dst: &mut [u8]) -> (usize, usize) {
    // The two functions called below are marked `inline(never)` to make
    // transitions from the hot part (first function) into the cold part
    // (second function) go through a return and another call to discourage
    // the CPU from speculating from the hot code into the cold code.
    // Letting the transitions be mere intra-function jumps, even to
    // basic blocks out-of-lined to the end of the function would wipe
    // away a quarter of Arabic encode performance on Haswell!
    let (read, written) = convert_utf16_to_utf8_partial_inner(src, dst);
    // `likely` is only a branch hint: without the `simd-accel` feature it is
    // an identity function that is `unsafe` solely to match the intrinsic.
    if unsafe { likely(read == src.len()) } {
        return (read, written);
    }
    // The hot part stopped early; let the cold part continue from where it
    // left off and add its counts to the totals.
    let (tail_read, tail_written) =
        convert_utf16_to_utf8_partial_tail(&src[read..], &mut dst[written..]);
    (read + tail_read, written + tail_written)
}
1665
1666/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1667/// with the REPLACEMENT CHARACTER.
1668///
1669/// The length of the destination buffer must be at least the length of the
1670/// source buffer times three.
1671///
1672/// Returns the number of bytes written.
1673///
1674/// # Panics
1675///
1676/// Panics if the destination buffer is shorter than stated above.
1677///
1678/// # Safety
1679///
1680/// If you want to convert into a `&mut str`, use `convert_utf16_to_str()`
1681/// instead of using this function together with the `unsafe` method
1682/// `as_bytes_mut()` on `&mut str`.
1683#[inline(always)]
1684pub fn convert_utf16_to_utf8(src: &[u16], dst: &mut [u8]) -> usize {
1685    assert!(dst.len() >= src.len() * 3);
1686    let (read, written) = convert_utf16_to_utf8_partial(src, dst);
1687    debug_assert_eq!(read, src.len());
1688    written
1689}
1690
1691/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1692/// with the REPLACEMENT CHARACTER such that the validity of the output is
1693/// signaled using the Rust type system with potentially insufficient output
1694/// space.
1695///
1696/// Returns the number of code units read and the number of bytes written.
1697///
1698/// Not all code units are read if there isn't enough output space.
1699///
1700/// Note  that this method isn't designed for general streamability but for
1701/// not allocating memory for the worst case up front. Specifically,
1702/// if the input starts with or ends with an unpaired surrogate, those are
1703/// replaced with the REPLACEMENT CHARACTER.
1704pub fn convert_utf16_to_str_partial(src: &[u16], dst: &mut str) -> (usize, usize) {
1705    let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1706    let (read, written) = convert_utf16_to_utf8_partial(src, bytes);
1707    let len = bytes.len();
1708    let mut trail = written;
1709    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1710        bytes[trail] = 0;
1711        trail += 1;
1712    }
1713    (read, written)
1714}
1715
1716/// Converts potentially-invalid UTF-16 to valid UTF-8 with errors replaced
1717/// with the REPLACEMENT CHARACTER such that the validity of the output is
1718/// signaled using the Rust type system.
1719///
1720/// The length of the destination buffer must be at least the length of the
1721/// source buffer times three.
1722///
1723/// Returns the number of bytes written.
1724///
1725/// # Panics
1726///
1727/// Panics if the destination buffer is shorter than stated above.
1728#[inline(always)]
1729pub fn convert_utf16_to_str(src: &[u16], dst: &mut str) -> usize {
1730    assert!(dst.len() >= src.len() * 3);
1731    let (read, written) = convert_utf16_to_str_partial(src, dst);
1732    debug_assert_eq!(read, src.len());
1733    written
1734}
1735
1736/// Converts bytes whose unsigned value is interpreted as Unicode code point
1737/// (i.e. U+0000 to U+00FF, inclusive) to UTF-16.
1738///
1739/// The length of the destination buffer must be at least the length of the
1740/// source buffer.
1741///
1742/// The number of `u16`s written equals the length of the source buffer.
1743///
1744/// # Panics
1745///
1746/// Panics if the destination buffer is shorter than stated above.
1747pub fn convert_latin1_to_utf16(src: &[u8], dst: &mut [u16]) {
1748    assert!(
1749        dst.len() >= src.len(),
1750        "Destination must not be shorter than the source."
1751    );
1752    // TODO: On aarch64, the safe version autovectorizes to the same unpacking
1753    // instructions and this code, but, yet, the autovectorized version is
1754    // faster.
1755    unsafe {
1756        unpack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1757    }
1758}
1759
/// Converts bytes whose unsigned value is interpreted as Unicode code point
/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 with potentially insufficient
/// output space.
///
/// Returns the number of bytes read and the number of bytes written.
///
/// If the output isn't large enough, not all input is consumed.
///
/// # Safety
///
/// If you want to convert into a `&mut str`, use
/// `convert_latin1_to_str_partial()` instead of using this function
/// together with the `unsafe` method `as_bytes_mut()` on `&mut str`.
pub fn convert_latin1_to_utf8_partial(src: &[u8], dst: &mut [u8]) -> (usize, usize) {
    let src_len = src.len();
    let src_ptr = src.as_ptr();
    let dst_ptr = dst.as_mut_ptr();
    let dst_len = dst.len();
    let mut total_read = 0usize;
    let mut total_written = 0usize;
    loop {
        // src can't advance more than dst
        let src_left = src_len - total_read;
        let dst_left = dst_len - total_written;
        // Limit the ASCII copy to what both buffers can accommodate.
        let min_left = ::std::cmp::min(src_left, dst_left);
        // Copy ASCII until a non-ASCII byte is found or `min_left` bytes have
        // been handled. The offsets stay within their buffers because
        // `total_read <= src_len` and `total_written <= dst_len`.
        if let Some((non_ascii, consumed)) = unsafe {
            ascii_to_ascii(
                src_ptr.add(total_read),
                dst_ptr.add(total_written),
                min_left,
            )
        } {
            total_read += consumed;
            total_written += consumed;
            // A non-ASCII Latin1 byte needs two bytes of UTF-8. If they don't
            // fit, stop here without consuming `non_ascii`.
            if total_written.checked_add(2).unwrap() > dst_len {
                return (total_read, total_written);
            }

            total_read += 1; // consume `non_ascii`

            // Two-byte UTF-8: the lead byte carries the top two bits of the
            // code point and the trail byte carries the low six bits.
            dst[total_written] = (non_ascii >> 6) | 0xC0;
            total_written += 1;
            dst[total_written] = (non_ascii & 0x3F) | 0x80;
            total_written += 1;
            continue;
        }
        // Everything within `min_left` was ASCII and has been copied.
        return (total_read + min_left, total_written + min_left);
    }
}
1809
1810/// Converts bytes whose unsigned value is interpreted as Unicode code point
1811/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1812///
1813/// The length of the destination buffer must be at least the length of the
1814/// source buffer times two.
1815///
1816/// Returns the number of bytes written.
1817///
1818/// # Panics
1819///
1820/// Panics if the destination buffer is shorter than stated above.
1821///
1822/// # Safety
1823///
1824/// Note that this function may write garbage beyond the number of bytes
1825/// indicated by the return value, so using a `&mut str` interpreted as
1826/// `&mut [u8]` as the destination is not safe. If you want to convert into
1827/// a `&mut str`, use `convert_utf16_to_str()` instead of this function.
1828#[inline]
1829pub fn convert_latin1_to_utf8(src: &[u8], dst: &mut [u8]) -> usize {
1830    assert!(
1831        dst.len() >= src.len() * 2,
1832        "Destination must not be shorter than the source times two."
1833    );
1834    let (read, written) = convert_latin1_to_utf8_partial(src, dst);
1835    debug_assert_eq!(read, src.len());
1836    written
1837}
1838
1839/// Converts bytes whose unsigned value is interpreted as Unicode code point
1840/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1841/// output is signaled using the Rust type system with potentially insufficient
1842/// output space.
1843///
1844/// Returns the number of bytes read and the number of bytes written.
1845///
1846/// If the output isn't large enough, not all input is consumed.
1847#[inline]
1848pub fn convert_latin1_to_str_partial(src: &[u8], dst: &mut str) -> (usize, usize) {
1849    let bytes: &mut [u8] = unsafe { dst.as_bytes_mut() };
1850    let (read, written) = convert_latin1_to_utf8_partial(src, bytes);
1851    let len = bytes.len();
1852    let mut trail = written;
1853    let max = ::std::cmp::min(len, trail + MAX_STRIDE_SIZE);
1854    while trail < max {
1855        bytes[trail] = 0;
1856        trail += 1;
1857    }
1858    while trail < len && ((bytes[trail] & 0xC0) == 0x80) {
1859        bytes[trail] = 0;
1860        trail += 1;
1861    }
1862    (read, written)
1863}
1864
1865/// Converts bytes whose unsigned value is interpreted as Unicode code point
1866/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8 such that the validity of the
1867/// output is signaled using the Rust type system.
1868///
1869/// The length of the destination buffer must be at least the length of the
1870/// source buffer times two.
1871///
1872/// Returns the number of bytes written.
1873///
1874/// # Panics
1875///
1876/// Panics if the destination buffer is shorter than stated above.
1877#[inline]
1878pub fn convert_latin1_to_str(src: &[u8], dst: &mut str) -> usize {
1879    assert!(
1880        dst.len() >= src.len() * 2,
1881        "Destination must not be shorter than the source times two."
1882    );
1883    let (read, written) = convert_latin1_to_str_partial(src, dst);
1884    debug_assert_eq!(read, src.len());
1885    written
1886}
1887
1888/// If the input is valid UTF-8 representing only Unicode code points from
1889/// U+0000 to U+00FF, inclusive, converts the input into output that
1890/// represents the value of each code point as the unsigned byte value of
1891/// each output byte.
1892///
1893/// If the input does not fulfill the condition stated above, this function
1894/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
1895/// does something that is memory-safe without any promises about any
1896/// properties of the output. In particular, callers shouldn't assume the
1897/// output to be the same across crate versions or CPU architectures and
1898/// should not assume that non-ASCII input can't map to ASCII output.
1899///
1900/// The length of the destination buffer must be at least the length of the
1901/// source buffer.
1902///
1903/// Returns the number of bytes written.
1904///
1905/// # Panics
1906///
1907/// Panics if the destination buffer is shorter than stated above.
1908///
1909/// If debug assertions are enabled (and not fuzzing) and the input is
1910/// not in the range U+0000 to U+00FF, inclusive.
1911pub fn convert_utf8_to_latin1_lossy(src: &[u8], dst: &mut [u8]) -> usize {
1912    assert!(
1913        dst.len() >= src.len(),
1914        "Destination must not be shorter than the source."
1915    );
1916    non_fuzz_debug_assert!(is_utf8_latin1(src));
1917    let src_len = src.len();
1918    let src_ptr = src.as_ptr();
1919    let dst_ptr = dst.as_mut_ptr();
1920    let mut total_read = 0usize;
1921    let mut total_written = 0usize;
1922    loop {
1923        // dst can't advance more than src
1924        let src_left = src_len - total_read;
1925        if let Some((non_ascii, consumed)) = unsafe {
1926            ascii_to_ascii(
1927                src_ptr.add(total_read),
1928                dst_ptr.add(total_written),
1929                src_left,
1930            )
1931        } {
1932            total_read += consumed + 1;
1933            total_written += consumed;
1934
1935            if total_read == src_len {
1936                return total_written;
1937            }
1938
1939            let trail = src[total_read];
1940            total_read += 1;
1941
1942            dst[total_written] = ((non_ascii & 0x1F) << 6) | (trail & 0x3F);
1943            total_written += 1;
1944            continue;
1945        }
1946        return total_written + src_left;
1947    }
1948}
1949
1950/// If the input is valid UTF-16 representing only Unicode code points from
1951/// U+0000 to U+00FF, inclusive, converts the input into output that
1952/// represents the value of each code point as the unsigned byte value of
1953/// each output byte.
1954///
1955/// If the input does not fulfill the condition stated above, does something
1956/// that is memory-safe without any promises about any properties of the
1957/// output and will probably assert in debug builds in future versions.
1958/// In particular, callers shouldn't assume the output to be the same across
1959/// crate versions or CPU architectures and should not assume that non-ASCII
1960/// input can't map to ASCII output.
1961///
1962/// The length of the destination buffer must be at least the length of the
1963/// source buffer.
1964///
1965/// The number of bytes written equals the length of the source buffer.
1966///
1967/// # Panics
1968///
1969/// Panics if the destination buffer is shorter than stated above.
1970///
1971/// (Probably in future versions if debug assertions are enabled (and not
1972/// fuzzing) and the input is not in the range U+0000 to U+00FF, inclusive.)
1973pub fn convert_utf16_to_latin1_lossy(src: &[u16], dst: &mut [u8]) {
1974    assert!(
1975        dst.len() >= src.len(),
1976        "Destination must not be shorter than the source."
1977    );
1978    // non_fuzz_debug_assert!(is_utf16_latin1(src));
1979    unsafe {
1980        pack_latin1(src.as_ptr(), dst.as_mut_ptr(), src.len());
1981    }
1982}
1983
1984/// Converts bytes whose unsigned value is interpreted as Unicode code point
1985/// (i.e. U+0000 to U+00FF, inclusive) to UTF-8.
1986///
1987/// Borrows if input is ASCII-only. Performs a single heap allocation
1988/// otherwise.
1989pub fn decode_latin1<'a>(bytes: &'a [u8]) -> Cow<'a, str> {
1990    let up_to = ascii_valid_up_to(bytes);
1991    // >= makes later things optimize better than ==
1992    if up_to >= bytes.len() {
1993        debug_assert_eq!(up_to, bytes.len());
1994        let s: &str = unsafe { ::std::str::from_utf8_unchecked(bytes) };
1995        return Cow::Borrowed(s);
1996    }
1997    let (head, tail) = bytes.split_at(up_to);
1998    let capacity = head.len() + tail.len() * 2;
1999    let mut vec = Vec::with_capacity(capacity);
2000    unsafe {
2001        vec.set_len(capacity);
2002    }
2003    (&mut vec[..up_to]).copy_from_slice(head);
2004    let written = convert_latin1_to_utf8(tail, &mut vec[up_to..]);
2005    vec.truncate(up_to + written);
2006    Cow::Owned(unsafe { String::from_utf8_unchecked(vec) })
2007}
2008
2009/// If the input is valid UTF-8 representing only Unicode code points from
2010/// U+0000 to U+00FF, inclusive, converts the input into output that
2011/// represents the value of each code point as the unsigned byte value of
2012/// each output byte.
2013///
2014/// If the input does not fulfill the condition stated above, this function
2015/// panics if debug assertions are enabled (and fuzzing isn't) and otherwise
2016/// does something that is memory-safe without any promises about any
2017/// properties of the output. In particular, callers shouldn't assume the
2018/// output to be the same across crate versions or CPU architectures and
2019/// should not assume that non-ASCII input can't map to ASCII output.
2020///
2021/// Borrows if input is ASCII-only. Performs a single heap allocation
2022/// otherwise.
2023pub fn encode_latin1_lossy<'a>(string: &'a str) -> Cow<'a, [u8]> {
2024    let bytes = string.as_bytes();
2025    let up_to = ascii_valid_up_to(bytes);
2026    // >= makes later things optimize better than ==
2027    if up_to >= bytes.len() {
2028        debug_assert_eq!(up_to, bytes.len());
2029        return Cow::Borrowed(bytes);
2030    }
2031    let (head, tail) = bytes.split_at(up_to);
2032    let capacity = bytes.len();
2033    let mut vec = Vec::with_capacity(capacity);
2034    unsafe {
2035        vec.set_len(capacity);
2036    }
2037    (&mut vec[..up_to]).copy_from_slice(head);
2038    let written = convert_utf8_to_latin1_lossy(tail, &mut vec[up_to..]);
2039    vec.truncate(up_to + written);
2040    Cow::Owned(vec)
2041}
2042
/// Returns the index of the first unpaired surrogate or, if the input is
/// valid UTF-16 in its entirety, the length of the input.
pub fn utf16_valid_up_to(buffer: &[u16]) -> usize {
    // Public entry point only; all of the work happens in the `_impl`
    // function, which is defined outside this part of the file.
    utf16_valid_up_to_impl(buffer)
}
2048
2049/// Returns the index of first byte that starts an invalid byte
2050/// sequence or a non-Latin1 byte sequence, or the length of the
2051/// string if there are neither.
2052pub fn utf8_latin1_up_to(buffer: &[u8]) -> usize {
2053    is_utf8_latin1_impl(buffer).unwrap_or(buffer.len())
2054}
2055
2056/// Returns the index of first byte that starts a non-Latin1 byte
2057/// sequence, or the length of the string if there are none.
2058pub fn str_latin1_up_to(buffer: &str) -> usize {
2059    is_str_latin1_impl(buffer).unwrap_or(buffer.len())
2060}
2061
2062/// Replaces unpaired surrogates in the input with the REPLACEMENT CHARACTER.
2063#[inline]
2064pub fn ensure_utf16_validity(buffer: &mut [u16]) {
2065    let mut offset = 0;
2066    loop {
2067        offset += utf16_valid_up_to(&buffer[offset..]);
2068        if offset == buffer.len() {
2069            return;
2070        }
2071        buffer[offset] = 0xFFFD;
2072        offset += 1;
2073    }
2074}
2075
2076/// Copies ASCII from source to destination up to the first non-ASCII byte
2077/// (or the end of the input if it is ASCII in its entirety).
2078///
2079/// The length of the destination buffer must be at least the length of the
2080/// source buffer.
2081///
2082/// Returns the number of bytes written.
2083///
2084/// # Panics
2085///
2086/// Panics if the destination buffer is shorter than stated above.
2087pub fn copy_ascii_to_ascii(src: &[u8], dst: &mut [u8]) -> usize {
2088    assert!(
2089        dst.len() >= src.len(),
2090        "Destination must not be shorter than the source."
2091    );
2092    if let Some((_, consumed)) =
2093        unsafe { ascii_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2094    {
2095        consumed
2096    } else {
2097        src.len()
2098    }
2099}
2100
2101/// Copies ASCII from source to destination zero-extending it to UTF-16 up to
2102/// the first non-ASCII byte (or the end of the input if it is ASCII in its
2103/// entirety).
2104///
2105/// The length of the destination buffer must be at least the length of the
2106/// source buffer.
2107///
2108/// Returns the number of `u16`s written.
2109///
2110/// # Panics
2111///
2112/// Panics if the destination buffer is shorter than stated above.
2113pub fn copy_ascii_to_basic_latin(src: &[u8], dst: &mut [u16]) -> usize {
2114    assert!(
2115        dst.len() >= src.len(),
2116        "Destination must not be shorter than the source."
2117    );
2118    if let Some((_, consumed)) =
2119        unsafe { ascii_to_basic_latin(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2120    {
2121        consumed
2122    } else {
2123        src.len()
2124    }
2125}
2126
2127/// Copies Basic Latin from source to destination narrowing it to ASCII up to
2128/// the first non-Basic Latin code unit (or the end of the input if it is
2129/// Basic Latin in its entirety).
2130///
2131/// The length of the destination buffer must be at least the length of the
2132/// source buffer.
2133///
2134/// Returns the number of bytes written.
2135///
2136/// # Panics
2137///
2138/// Panics if the destination buffer is shorter than stated above.
2139pub fn copy_basic_latin_to_ascii(src: &[u16], dst: &mut [u8]) -> usize {
2140    assert!(
2141        dst.len() >= src.len(),
2142        "Destination must not be shorter than the source."
2143    );
2144    if let Some((_, consumed)) =
2145        unsafe { basic_latin_to_ascii(src.as_ptr(), dst.as_mut_ptr(), src.len()) }
2146    {
2147        consumed
2148    } else {
2149        src.len()
2150    }
2151}
2152
2153// Any copyright to the test code below this comment is dedicated to the
2154// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
2155
2156#[cfg(test)]
2157mod tests {
2158    use super::*;
2159
2160    #[test]
2161    fn test_is_ascii_success() {
2162        let mut src: Vec<u8> = Vec::with_capacity(128);
2163        src.resize(128, 0);
2164        for i in 0..src.len() {
2165            src[i] = i as u8;
2166        }
2167        for i in 0..src.len() {
2168            assert!(is_ascii(&src[i..]));
2169        }
2170    }
2171
2172    #[test]
2173    fn test_is_ascii_fail() {
2174        let mut src: Vec<u8> = Vec::with_capacity(128);
2175        src.resize(128, 0);
2176        for i in 0..src.len() {
2177            src[i] = i as u8;
2178        }
2179        for i in 0..src.len() {
2180            let tail = &mut src[i..];
2181            for j in 0..tail.len() {
2182                tail[j] = 0xA0;
2183                assert!(!is_ascii(tail));
2184            }
2185        }
2186    }
2187
2188    #[test]
2189    fn test_is_basic_latin_success() {
2190        let mut src: Vec<u16> = Vec::with_capacity(128);
2191        src.resize(128, 0);
2192        for i in 0..src.len() {
2193            src[i] = i as u16;
2194        }
2195        for i in 0..src.len() {
2196            assert!(is_basic_latin(&src[i..]));
2197        }
2198    }
2199
2200    #[test]
2201    fn test_is_basic_latin_fail() {
2202        let mut src: Vec<u16> = Vec::with_capacity(128);
2203        src.resize(128, 0);
2204        for i in 0..src.len() {
2205            src[i] = i as u16;
2206        }
2207        for i in 0..src.len() {
2208            let tail = &mut src[i..];
2209            for j in 0..tail.len() {
2210                tail[j] = 0xA0;
2211                assert!(!is_basic_latin(tail));
2212            }
2213        }
2214    }
2215
2216    #[test]
2217    fn test_is_utf16_latin1_success() {
2218        let mut src: Vec<u16> = Vec::with_capacity(256);
2219        src.resize(256, 0);
2220        for i in 0..src.len() {
2221            src[i] = i as u16;
2222        }
2223        for i in 0..src.len() {
2224            assert!(is_utf16_latin1(&src[i..]));
2225            assert_eq!(
2226                check_utf16_for_latin1_and_bidi(&src[i..]),
2227                Latin1Bidi::Latin1
2228            );
2229        }
2230    }
2231
2232    #[test]
2233    fn test_is_utf16_latin1_fail() {
2234        let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2235        let mut src: Vec<u16> = Vec::with_capacity(len);
2236        src.resize(len, 0);
2237        for i in 0..src.len() {
2238            src[i] = i as u16;
2239        }
2240        for i in 0..src.len() {
2241            let tail = &mut src[i..];
2242            for j in 0..tail.len() {
2243                tail[j] = 0x100 + j as u16;
2244                assert!(!is_utf16_latin1(tail));
2245                assert_ne!(check_utf16_for_latin1_and_bidi(tail), Latin1Bidi::Latin1);
2246            }
2247        }
2248    }
2249
2250    #[test]
2251    fn test_is_str_latin1_success() {
2252        let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2253        let mut src: Vec<u16> = Vec::with_capacity(len);
2254        src.resize(len, 0);
2255        for i in 0..src.len() {
2256            src[i] = i as u16;
2257        }
2258        for i in 0..src.len() {
2259            let s = String::from_utf16(&src[i..]).unwrap();
2260            assert!(is_str_latin1(&s[..]));
2261            assert_eq!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2262        }
2263    }
2264
2265    #[test]
2266    fn test_is_str_latin1_fail() {
2267        let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2268        let mut src: Vec<u16> = Vec::with_capacity(len);
2269        src.resize(len, 0);
2270        for i in 0..src.len() {
2271            src[i] = i as u16;
2272        }
2273        for i in 0..src.len() {
2274            let tail = &mut src[i..];
2275            for j in 0..tail.len() {
2276                tail[j] = 0x100 + j as u16;
2277                let s = String::from_utf16(tail).unwrap();
2278                assert!(!is_str_latin1(&s[..]));
2279                assert_ne!(check_str_for_latin1_and_bidi(&s[..]), Latin1Bidi::Latin1);
2280            }
2281        }
2282    }
2283
2284    #[test]
2285    fn test_is_utf8_latin1_success() {
2286        let len = if cfg!(miri) { 64 } else { 256 }; // Miri is too slow
2287        let mut src: Vec<u16> = Vec::with_capacity(len);
2288        src.resize(len, 0);
2289        for i in 0..src.len() {
2290            src[i] = i as u16;
2291        }
2292        for i in 0..src.len() {
2293            let s = String::from_utf16(&src[i..]).unwrap();
2294            assert!(is_utf8_latin1(s.as_bytes()));
2295            assert_eq!(
2296                check_utf8_for_latin1_and_bidi(s.as_bytes()),
2297                Latin1Bidi::Latin1
2298            );
2299        }
2300    }
2301
2302    #[test]
2303    fn test_is_utf8_latin1_fail() {
2304        let len = if cfg!(miri) { 32 } else { 256 }; // Miri is too slow
2305        let mut src: Vec<u16> = Vec::with_capacity(len);
2306        src.resize(len, 0);
2307        for i in 0..src.len() {
2308            src[i] = i as u16;
2309        }
2310        for i in 0..src.len() {
2311            let tail = &mut src[i..];
2312            for j in 0..tail.len() {
2313                tail[j] = 0x100 + j as u16;
2314                let s = String::from_utf16(tail).unwrap();
2315                assert!(!is_utf8_latin1(s.as_bytes()));
2316                assert_ne!(
2317                    check_utf8_for_latin1_and_bidi(s.as_bytes()),
2318                    Latin1Bidi::Latin1
2319                );
2320            }
2321        }
2322    }
2323
2324    #[test]
2325    fn test_is_utf8_latin1_invalid() {
2326        assert!(!is_utf8_latin1(b"\xC3"));
2327        assert!(!is_utf8_latin1(b"a\xC3"));
2328        assert!(!is_utf8_latin1(b"\xFF"));
2329        assert!(!is_utf8_latin1(b"a\xFF"));
2330        assert!(!is_utf8_latin1(b"\xC3\xFF"));
2331        assert!(!is_utf8_latin1(b"a\xC3\xFF"));
2332    }
2333
2334    #[test]
2335    fn test_convert_utf8_to_utf16() {
2336        let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2337        let mut dst: Vec<u16> = Vec::with_capacity(src.len() + 1);
2338        dst.resize(src.len() + 1, 0);
2339        let len = convert_utf8_to_utf16(src.as_bytes(), &mut dst[..]);
2340        dst.truncate(len);
2341        let reference: Vec<u16> = src.encode_utf16().collect();
2342        assert_eq!(dst, reference);
2343    }
2344
2345    #[test]
2346    fn test_convert_str_to_utf16() {
2347        let src = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2348        let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2349        dst.resize(src.len(), 0);
2350        let len = convert_str_to_utf16(src, &mut dst[..]);
2351        dst.truncate(len);
2352        let reference: Vec<u16> = src.encode_utf16().collect();
2353        assert_eq!(dst, reference);
2354    }
2355
2356    #[test]
2357    fn test_convert_utf16_to_utf8_partial() {
2358        let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2359        let src: Vec<u16> = reference.encode_utf16().collect();
2360        let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2361        dst.resize(src.len() * 3 + 1, 0);
2362        let (read, written) = convert_utf16_to_utf8_partial(&src[..], &mut dst[..24]);
2363        let len = written + convert_utf16_to_utf8(&src[read..], &mut dst[written..]);
2364        dst.truncate(len);
2365        assert_eq!(dst, reference.as_bytes());
2366    }
2367
2368    #[test]
2369    fn test_convert_utf16_to_utf8() {
2370        let reference = "abcdefghijklmnopqrstu\u{1F4A9}v\u{2603}w\u{00B6}xyzz";
2371        let src: Vec<u16> = reference.encode_utf16().collect();
2372        let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 3 + 1);
2373        dst.resize(src.len() * 3 + 1, 0);
2374        let len = convert_utf16_to_utf8(&src[..], &mut dst[..]);
2375        dst.truncate(len);
2376        assert_eq!(dst, reference.as_bytes());
2377    }
2378
2379    #[test]
2380    fn test_convert_latin1_to_utf16() {
2381        let mut src: Vec<u8> = Vec::with_capacity(256);
2382        src.resize(256, 0);
2383        let mut reference: Vec<u16> = Vec::with_capacity(256);
2384        reference.resize(256, 0);
2385        for i in 0..256 {
2386            src[i] = i as u8;
2387            reference[i] = i as u16;
2388        }
2389        let mut dst: Vec<u16> = Vec::with_capacity(src.len());
2390        dst.resize(src.len(), 0);
2391        convert_latin1_to_utf16(&src[..], &mut dst[..]);
2392        assert_eq!(dst, reference);
2393    }
2394
2395    #[test]
2396    fn test_convert_latin1_to_utf8_partial() {
2397        let mut dst = [0u8, 2];
2398        let (read, written) = convert_latin1_to_utf8_partial(b"a\xFF", &mut dst[..]);
2399        assert_eq!(read, 1);
2400        assert_eq!(written, 1);
2401    }
2402
2403    #[test]
2404    fn test_convert_latin1_to_utf8() {
2405        let mut src: Vec<u8> = Vec::with_capacity(256);
2406        src.resize(256, 0);
2407        let mut reference: Vec<u16> = Vec::with_capacity(256);
2408        reference.resize(256, 0);
2409        for i in 0..256 {
2410            src[i] = i as u8;
2411            reference[i] = i as u16;
2412        }
2413        let s = String::from_utf16(&reference[..]).unwrap();
2414        let mut dst: Vec<u8> = Vec::with_capacity(src.len() * 2);
2415        dst.resize(src.len() * 2, 0);
2416        let len = convert_latin1_to_utf8(&src[..], &mut dst[..]);
2417        dst.truncate(len);
2418        assert_eq!(&dst[..], s.as_bytes());
2419    }
2420
2421    #[test]
2422    fn test_convert_utf8_to_latin1_lossy() {
2423        let mut reference: Vec<u8> = Vec::with_capacity(256);
2424        reference.resize(256, 0);
2425        let mut src16: Vec<u16> = Vec::with_capacity(256);
2426        src16.resize(256, 0);
2427        for i in 0..256 {
2428            src16[i] = i as u16;
2429            reference[i] = i as u8;
2430        }
2431        let src = String::from_utf16(&src16[..]).unwrap();
2432        let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2433        dst.resize(src.len(), 0);
2434        let len = convert_utf8_to_latin1_lossy(src.as_bytes(), &mut dst[..]);
2435        dst.truncate(len);
2436        assert_eq!(dst, reference);
2437    }
2438
2439    #[cfg(all(debug_assertions, not(fuzzing)))]
2440    #[test]
2441    #[should_panic]
2442    fn test_convert_utf8_to_latin1_lossy_panics() {
2443        let mut dst = [0u8; 16];
2444        let _ = convert_utf8_to_latin1_lossy("\u{100}".as_bytes(), &mut dst[..]);
2445    }
2446
2447    #[test]
2448    fn test_convert_utf16_to_latin1_lossy() {
2449        let mut src: Vec<u16> = Vec::with_capacity(256);
2450        src.resize(256, 0);
2451        let mut reference: Vec<u8> = Vec::with_capacity(256);
2452        reference.resize(256, 0);
2453        for i in 0..256 {
2454            src[i] = i as u16;
2455            reference[i] = i as u8;
2456        }
2457        let mut dst: Vec<u8> = Vec::with_capacity(src.len());
2458        dst.resize(src.len(), 0);
2459        convert_utf16_to_latin1_lossy(&src[..], &mut dst[..]);
2460        assert_eq!(dst, reference);
2461    }
2462
2463    #[test]
2464    // #[should_panic]
2465    fn test_convert_utf16_to_latin1_lossy_panics() {
2466        let mut dst = [0u8; 16];
2467        let _ = convert_utf16_to_latin1_lossy(&[0x0100u16], &mut dst[..]);
2468    }
2469
2470    #[test]
2471    fn test_utf16_valid_up_to() {
2472        let valid = vec![
2473            0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0x2603u16,
2474            0xD83Du16, 0xDCA9u16, 0x00B6u16,
2475        ];
2476        assert_eq!(utf16_valid_up_to(&valid[..]), 16);
2477        let lone_high = vec![
2478            0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2479            0x2603u16, 0xD83Du16, 0x00B6u16,
2480        ];
2481        assert_eq!(utf16_valid_up_to(&lone_high[..]), 14);
2482        let lone_low = vec![
2483            0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2484            0x2603u16, 0xDCA9u16, 0x00B6u16,
2485        ];
2486        assert_eq!(utf16_valid_up_to(&lone_low[..]), 14);
2487        let lone_high_at_end = vec![
2488            0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2489            0x2603u16, 0x00B6u16, 0xD83Du16,
2490        ];
2491        assert_eq!(utf16_valid_up_to(&lone_high_at_end[..]), 15);
2492    }
2493
2494    #[test]
2495    fn test_ensure_utf16_validity() {
2496        let mut src = vec![
2497            0u16, 0xD83Du16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2498            0u16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2499            0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2500        ];
2501        let reference = vec![
2502            0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0xD83Du16, 0xDCA9u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2503            0u16, 0xFFFDu16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2504            0u16, 0u16, 0u16, 0u16, 0u16, 0u16,
2505        ];
2506        ensure_utf16_validity(&mut src[..]);
2507        assert_eq!(src, reference);
2508    }
2509
2510    #[test]
2511    fn test_is_char_bidi() {
2512        assert!(!is_char_bidi('a'));
2513        assert!(!is_char_bidi('\u{03B1}'));
2514        assert!(!is_char_bidi('\u{3041}'));
2515        assert!(!is_char_bidi('\u{1F4A9}'));
2516        assert!(!is_char_bidi('\u{FE00}'));
2517        assert!(!is_char_bidi('\u{202C}'));
2518        assert!(!is_char_bidi('\u{FEFF}'));
2519        assert!(is_char_bidi('\u{0590}'));
2520        assert!(is_char_bidi('\u{08FF}'));
2521        assert!(is_char_bidi('\u{061C}'));
2522        assert!(is_char_bidi('\u{FB50}'));
2523        assert!(is_char_bidi('\u{FDFF}'));
2524        assert!(is_char_bidi('\u{FE70}'));
2525        assert!(is_char_bidi('\u{FEFE}'));
2526        assert!(is_char_bidi('\u{200F}'));
2527        assert!(is_char_bidi('\u{202B}'));
2528        assert!(is_char_bidi('\u{202E}'));
2529        assert!(is_char_bidi('\u{2067}'));
2530        assert!(is_char_bidi('\u{10800}'));
2531        assert!(is_char_bidi('\u{10FFF}'));
2532        assert!(is_char_bidi('\u{1E800}'));
2533        assert!(is_char_bidi('\u{1EFFF}'));
2534    }
2535
2536    #[test]
2537    fn test_is_utf16_code_unit_bidi() {
2538        assert!(!is_utf16_code_unit_bidi(0x0062));
2539        assert!(!is_utf16_code_unit_bidi(0x03B1));
2540        assert!(!is_utf16_code_unit_bidi(0x3041));
2541        assert!(!is_utf16_code_unit_bidi(0xD801));
2542        assert!(!is_utf16_code_unit_bidi(0xFE00));
2543        assert!(!is_utf16_code_unit_bidi(0x202C));
2544        assert!(!is_utf16_code_unit_bidi(0xFEFF));
2545        assert!(is_utf16_code_unit_bidi(0x0590));
2546        assert!(is_utf16_code_unit_bidi(0x08FF));
2547        assert!(is_utf16_code_unit_bidi(0x061C));
2548        assert!(is_utf16_code_unit_bidi(0xFB1D));
2549        assert!(is_utf16_code_unit_bidi(0xFB50));
2550        assert!(is_utf16_code_unit_bidi(0xFDFF));
2551        assert!(is_utf16_code_unit_bidi(0xFE70));
2552        assert!(is_utf16_code_unit_bidi(0xFEFE));
2553        assert!(is_utf16_code_unit_bidi(0x200F));
2554        assert!(is_utf16_code_unit_bidi(0x202B));
2555        assert!(is_utf16_code_unit_bidi(0x202E));
2556        assert!(is_utf16_code_unit_bidi(0x2067));
2557        assert!(is_utf16_code_unit_bidi(0xD802));
2558        assert!(is_utf16_code_unit_bidi(0xD803));
2559        assert!(is_utf16_code_unit_bidi(0xD83A));
2560        assert!(is_utf16_code_unit_bidi(0xD83B));
2561    }
2562
2563    #[test]
2564    fn test_is_str_bidi() {
2565        assert!(!is_str_bidi("abcdefghijklmnopaabcdefghijklmnop"));
2566        assert!(!is_str_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"));
2567        assert!(!is_str_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"));
2568        assert!(!is_str_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"));
2569        assert!(!is_str_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"));
2570        assert!(!is_str_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"));
2571        assert!(!is_str_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"));
2572        assert!(is_str_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"));
2573        assert!(is_str_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"));
2574        assert!(is_str_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"));
2575        assert!(is_str_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"));
2576        assert!(is_str_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"));
2577        assert!(is_str_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"));
2578        assert!(is_str_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"));
2579        assert!(is_str_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"));
2580        assert!(is_str_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"));
2581        assert!(is_str_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"));
2582        assert!(is_str_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"));
2583        assert!(is_str_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"));
2584        assert!(is_str_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"));
2585        assert!(is_str_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"));
2586        assert!(is_str_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"));
2587    }
2588
2589    #[test]
2590    fn test_is_utf8_bidi() {
2591        assert!(!is_utf8_bidi(
2592            "abcdefghijklmnopaabcdefghijklmnop".as_bytes()
2593        ));
2594        assert!(!is_utf8_bidi(
2595            "abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()
2596        ));
2597        assert!(!is_utf8_bidi(
2598            "abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()
2599        ));
2600        assert!(!is_utf8_bidi(
2601            "abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()
2602        ));
2603        assert!(!is_utf8_bidi(
2604            "abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()
2605        ));
2606        assert!(!is_utf8_bidi(
2607            "abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()
2608        ));
2609        assert!(!is_utf8_bidi(
2610            "abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()
2611        ));
2612        assert!(is_utf8_bidi(
2613            "abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()
2614        ));
2615        assert!(is_utf8_bidi(
2616            "abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()
2617        ));
2618        assert!(is_utf8_bidi(
2619            "abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()
2620        ));
2621        assert!(is_utf8_bidi(
2622            "abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()
2623        ));
2624        assert!(is_utf8_bidi(
2625            "abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()
2626        ));
2627        assert!(is_utf8_bidi(
2628            "abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()
2629        ));
2630        assert!(is_utf8_bidi(
2631            "abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()
2632        ));
2633        assert!(is_utf8_bidi(
2634            "abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()
2635        ));
2636        assert!(is_utf8_bidi(
2637            "abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()
2638        ));
2639        assert!(is_utf8_bidi(
2640            "abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()
2641        ));
2642        assert!(is_utf8_bidi(
2643            "abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()
2644        ));
2645        assert!(is_utf8_bidi(
2646            "abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()
2647        ));
2648        assert!(is_utf8_bidi(
2649            "abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()
2650        ));
2651        assert!(is_utf8_bidi(
2652            "abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()
2653        ));
2654        assert!(is_utf8_bidi(
2655            "abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()
2656        ));
2657    }
2658
2659    #[test]
2660    fn test_is_utf16_bidi() {
2661        assert!(!is_utf16_bidi(&[
2662            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65, 0x66,
2663            0x67, 0x68, 0x69,
2664        ]));
2665        assert!(!is_utf16_bidi(&[
2666            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65, 0x66,
2667            0x67, 0x68, 0x69,
2668        ]));
2669        assert!(!is_utf16_bidi(&[
2670            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65, 0x66,
2671            0x67, 0x68, 0x69,
2672        ]));
2673        assert!(!is_utf16_bidi(&[
2674            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65, 0x66,
2675            0x67, 0x68, 0x69,
2676        ]));
2677        assert!(!is_utf16_bidi(&[
2678            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65, 0x66,
2679            0x67, 0x68, 0x69,
2680        ]));
2681        assert!(!is_utf16_bidi(&[
2682            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65, 0x66,
2683            0x67, 0x68, 0x69,
2684        ]));
2685        assert!(!is_utf16_bidi(&[
2686            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2687            0x67, 0x68, 0x69,
2688        ]));
2689        assert!(is_utf16_bidi(&[
2690            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65, 0x66,
2691            0x67, 0x68, 0x69,
2692        ]));
2693        assert!(is_utf16_bidi(&[
2694            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65, 0x66,
2695            0x67, 0x68, 0x69,
2696        ]));
2697        assert!(is_utf16_bidi(&[
2698            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65, 0x66,
2699            0x67, 0x68, 0x69,
2700        ]));
2701        assert!(is_utf16_bidi(&[
2702            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65, 0x66,
2703            0x67, 0x68, 0x69,
2704        ]));
2705        assert!(is_utf16_bidi(&[
2706            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65, 0x66,
2707            0x67, 0x68, 0x69,
2708        ]));
2709        assert!(is_utf16_bidi(&[
2710            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65, 0x66,
2711            0x67, 0x68, 0x69,
2712        ]));
2713        assert!(is_utf16_bidi(&[
2714            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65, 0x66,
2715            0x67, 0x68, 0x69,
2716        ]));
2717        assert!(is_utf16_bidi(&[
2718            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65, 0x66,
2719            0x67, 0x68, 0x69,
2720        ]));
2721        assert!(is_utf16_bidi(&[
2722            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65, 0x66,
2723            0x67, 0x68, 0x69,
2724        ]));
2725        assert!(is_utf16_bidi(&[
2726            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65, 0x66,
2727            0x67, 0x68, 0x69,
2728        ]));
2729        assert!(is_utf16_bidi(&[
2730            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65, 0x66,
2731            0x67, 0x68, 0x69,
2732        ]));
2733        assert!(is_utf16_bidi(&[
2734            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65, 0x66,
2735            0x67, 0x68, 0x69,
2736        ]));
2737        assert!(is_utf16_bidi(&[
2738            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65, 0x66,
2739            0x67, 0x68, 0x69,
2740        ]));
2741        assert!(is_utf16_bidi(&[
2742            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65, 0x66,
2743            0x67, 0x68, 0x69,
2744        ]));
2745        assert!(is_utf16_bidi(&[
2746            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65, 0x66,
2747            0x67, 0x68, 0x69,
2748        ]));
2749        assert!(is_utf16_bidi(&[
2750            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65, 0x66,
2751            0x67, 0x68, 0x69,
2752        ]));
2753
2754        assert!(is_utf16_bidi(&[
2755            0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64, 0x65,
2756            0x66, 0x67, 0x68, 0x69,
2757        ]));
2758    }
2759
2760    #[test]
2761    fn test_check_str_for_latin1_and_bidi() {
2762        assert_ne!(
2763            check_str_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop"),
2764            Latin1Bidi::Bidi
2765        );
2766        assert_ne!(
2767            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop"),
2768            Latin1Bidi::Bidi
2769        );
2770        assert_ne!(
2771            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop"),
2772            Latin1Bidi::Bidi
2773        );
2774        assert_ne!(
2775            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop"),
2776            Latin1Bidi::Bidi
2777        );
2778        assert_ne!(
2779            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop"),
2780            Latin1Bidi::Bidi
2781        );
2782        assert_ne!(
2783            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop"),
2784            Latin1Bidi::Bidi
2785        );
2786        assert_ne!(
2787            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop"),
2788            Latin1Bidi::Bidi
2789        );
2790        assert_eq!(
2791            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop"),
2792            Latin1Bidi::Bidi
2793        );
2794        assert_eq!(
2795            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop"),
2796            Latin1Bidi::Bidi
2797        );
2798        assert_eq!(
2799            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop"),
2800            Latin1Bidi::Bidi
2801        );
2802        assert_eq!(
2803            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop"),
2804            Latin1Bidi::Bidi
2805        );
2806        assert_eq!(
2807            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop"),
2808            Latin1Bidi::Bidi
2809        );
2810        assert_eq!(
2811            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop"),
2812            Latin1Bidi::Bidi
2813        );
2814        assert_eq!(
2815            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop"),
2816            Latin1Bidi::Bidi
2817        );
2818        assert_eq!(
2819            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop"),
2820            Latin1Bidi::Bidi
2821        );
2822        assert_eq!(
2823            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop"),
2824            Latin1Bidi::Bidi
2825        );
2826        assert_eq!(
2827            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop"),
2828            Latin1Bidi::Bidi
2829        );
2830        assert_eq!(
2831            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop"),
2832            Latin1Bidi::Bidi
2833        );
2834        assert_eq!(
2835            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop"),
2836            Latin1Bidi::Bidi
2837        );
2838        assert_eq!(
2839            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop"),
2840            Latin1Bidi::Bidi
2841        );
2842        assert_eq!(
2843            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop"),
2844            Latin1Bidi::Bidi
2845        );
2846        assert_eq!(
2847            check_str_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop"),
2848            Latin1Bidi::Bidi
2849        );
2850    }
2851
2852    #[test]
2853    fn test_check_utf8_for_latin1_and_bidi() {
2854        assert_ne!(
2855            check_utf8_for_latin1_and_bidi("abcdefghijklmnopaabcdefghijklmnop".as_bytes()),
2856            Latin1Bidi::Bidi
2857        );
2858        assert_ne!(
2859            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{03B1}abcdefghijklmnop".as_bytes()),
2860            Latin1Bidi::Bidi
2861        );
2862        assert_ne!(
2863            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{3041}abcdefghijklmnop".as_bytes()),
2864            Latin1Bidi::Bidi
2865        );
2866        assert_ne!(
2867            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1F4A9}abcdefghijklmnop".as_bytes()),
2868            Latin1Bidi::Bidi
2869        );
2870        assert_ne!(
2871            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE00}abcdefghijklmnop".as_bytes()),
2872            Latin1Bidi::Bidi
2873        );
2874        assert_ne!(
2875            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202C}abcdefghijklmnop".as_bytes()),
2876            Latin1Bidi::Bidi
2877        );
2878        assert_ne!(
2879            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFF}abcdefghijklmnop".as_bytes()),
2880            Latin1Bidi::Bidi
2881        );
2882        assert_eq!(
2883            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{0590}abcdefghijklmnop".as_bytes()),
2884            Latin1Bidi::Bidi
2885        );
2886        assert_eq!(
2887            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{08FF}abcdefghijklmnop".as_bytes()),
2888            Latin1Bidi::Bidi
2889        );
2890        assert_eq!(
2891            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{061C}abcdefghijklmnop".as_bytes()),
2892            Latin1Bidi::Bidi
2893        );
2894        assert_eq!(
2895            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FB50}abcdefghijklmnop".as_bytes()),
2896            Latin1Bidi::Bidi
2897        );
2898        assert_eq!(
2899            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FDFF}abcdefghijklmnop".as_bytes()),
2900            Latin1Bidi::Bidi
2901        );
2902        assert_eq!(
2903            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FE70}abcdefghijklmnop".as_bytes()),
2904            Latin1Bidi::Bidi
2905        );
2906        assert_eq!(
2907            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{FEFE}abcdefghijklmnop".as_bytes()),
2908            Latin1Bidi::Bidi
2909        );
2910        assert_eq!(
2911            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{200F}abcdefghijklmnop".as_bytes()),
2912            Latin1Bidi::Bidi
2913        );
2914        assert_eq!(
2915            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202B}abcdefghijklmnop".as_bytes()),
2916            Latin1Bidi::Bidi
2917        );
2918        assert_eq!(
2919            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{202E}abcdefghijklmnop".as_bytes()),
2920            Latin1Bidi::Bidi
2921        );
2922        assert_eq!(
2923            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{2067}abcdefghijklmnop".as_bytes()),
2924            Latin1Bidi::Bidi
2925        );
2926        assert_eq!(
2927            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10800}abcdefghijklmnop".as_bytes()),
2928            Latin1Bidi::Bidi
2929        );
2930        assert_eq!(
2931            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{10FFF}abcdefghijklmnop".as_bytes()),
2932            Latin1Bidi::Bidi
2933        );
2934        assert_eq!(
2935            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1E800}abcdefghijklmnop".as_bytes()),
2936            Latin1Bidi::Bidi
2937        );
2938        assert_eq!(
2939            check_utf8_for_latin1_and_bidi("abcdefghijklmnop\u{1EFFF}abcdefghijklmnop".as_bytes()),
2940            Latin1Bidi::Bidi
2941        );
2942    }
2943
2944    #[test]
2945    fn test_check_utf16_for_latin1_and_bidi() {
2946        assert_ne!(
2947            check_utf16_for_latin1_and_bidi(&[
2948                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0062, 0x62, 0x63, 0x64, 0x65,
2949                0x66, 0x67, 0x68, 0x69,
2950            ]),
2951            Latin1Bidi::Bidi
2952        );
2953        assert_ne!(
2954            check_utf16_for_latin1_and_bidi(&[
2955                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x03B1, 0x62, 0x63, 0x64, 0x65,
2956                0x66, 0x67, 0x68, 0x69,
2957            ]),
2958            Latin1Bidi::Bidi
2959        );
2960        assert_ne!(
2961            check_utf16_for_latin1_and_bidi(&[
2962                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x3041, 0x62, 0x63, 0x64, 0x65,
2963                0x66, 0x67, 0x68, 0x69,
2964            ]),
2965            Latin1Bidi::Bidi
2966        );
2967        assert_ne!(
2968            check_utf16_for_latin1_and_bidi(&[
2969                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD801, 0x62, 0x63, 0x64, 0x65,
2970                0x66, 0x67, 0x68, 0x69,
2971            ]),
2972            Latin1Bidi::Bidi
2973        );
2974        assert_ne!(
2975            check_utf16_for_latin1_and_bidi(&[
2976                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE00, 0x62, 0x63, 0x64, 0x65,
2977                0x66, 0x67, 0x68, 0x69,
2978            ]),
2979            Latin1Bidi::Bidi
2980        );
2981        assert_ne!(
2982            check_utf16_for_latin1_and_bidi(&[
2983                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202C, 0x62, 0x63, 0x64, 0x65,
2984                0x66, 0x67, 0x68, 0x69,
2985            ]),
2986            Latin1Bidi::Bidi
2987        );
2988        assert_ne!(
2989            check_utf16_for_latin1_and_bidi(&[
2990                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFF, 0x62, 0x63, 0x64, 0x65,
2991                0x66, 0x67, 0x68, 0x69,
2992            ]),
2993            Latin1Bidi::Bidi
2994        );
2995        assert_eq!(
2996            check_utf16_for_latin1_and_bidi(&[
2997                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x62, 0x63, 0x64, 0x65,
2998                0x66, 0x67, 0x68, 0x69,
2999            ]),
3000            Latin1Bidi::Bidi
3001        );
3002        assert_eq!(
3003            check_utf16_for_latin1_and_bidi(&[
3004                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x08FF, 0x62, 0x63, 0x64, 0x65,
3005                0x66, 0x67, 0x68, 0x69,
3006            ]),
3007            Latin1Bidi::Bidi
3008        );
3009        assert_eq!(
3010            check_utf16_for_latin1_and_bidi(&[
3011                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x061C, 0x62, 0x63, 0x64, 0x65,
3012                0x66, 0x67, 0x68, 0x69,
3013            ]),
3014            Latin1Bidi::Bidi
3015        );
3016        assert_eq!(
3017            check_utf16_for_latin1_and_bidi(&[
3018                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB1D, 0x62, 0x63, 0x64, 0x65,
3019                0x66, 0x67, 0x68, 0x69,
3020            ]),
3021            Latin1Bidi::Bidi
3022        );
3023        assert_eq!(
3024            check_utf16_for_latin1_and_bidi(&[
3025                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFB50, 0x62, 0x63, 0x64, 0x65,
3026                0x66, 0x67, 0x68, 0x69,
3027            ]),
3028            Latin1Bidi::Bidi
3029        );
3030        assert_eq!(
3031            check_utf16_for_latin1_and_bidi(&[
3032                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFDFF, 0x62, 0x63, 0x64, 0x65,
3033                0x66, 0x67, 0x68, 0x69,
3034            ]),
3035            Latin1Bidi::Bidi
3036        );
3037        assert_eq!(
3038            check_utf16_for_latin1_and_bidi(&[
3039                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFE70, 0x62, 0x63, 0x64, 0x65,
3040                0x66, 0x67, 0x68, 0x69,
3041            ]),
3042            Latin1Bidi::Bidi
3043        );
3044        assert_eq!(
3045            check_utf16_for_latin1_and_bidi(&[
3046                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xFEFE, 0x62, 0x63, 0x64, 0x65,
3047                0x66, 0x67, 0x68, 0x69,
3048            ]),
3049            Latin1Bidi::Bidi
3050        );
3051        assert_eq!(
3052            check_utf16_for_latin1_and_bidi(&[
3053                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x200F, 0x62, 0x63, 0x64, 0x65,
3054                0x66, 0x67, 0x68, 0x69,
3055            ]),
3056            Latin1Bidi::Bidi
3057        );
3058        assert_eq!(
3059            check_utf16_for_latin1_and_bidi(&[
3060                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202B, 0x62, 0x63, 0x64, 0x65,
3061                0x66, 0x67, 0x68, 0x69,
3062            ]),
3063            Latin1Bidi::Bidi
3064        );
3065        assert_eq!(
3066            check_utf16_for_latin1_and_bidi(&[
3067                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x202E, 0x62, 0x63, 0x64, 0x65,
3068                0x66, 0x67, 0x68, 0x69,
3069            ]),
3070            Latin1Bidi::Bidi
3071        );
3072        assert_eq!(
3073            check_utf16_for_latin1_and_bidi(&[
3074                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x2067, 0x62, 0x63, 0x64, 0x65,
3075                0x66, 0x67, 0x68, 0x69,
3076            ]),
3077            Latin1Bidi::Bidi
3078        );
3079        assert_eq!(
3080            check_utf16_for_latin1_and_bidi(&[
3081                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD802, 0x62, 0x63, 0x64, 0x65,
3082                0x66, 0x67, 0x68, 0x69,
3083            ]),
3084            Latin1Bidi::Bidi
3085        );
3086        assert_eq!(
3087            check_utf16_for_latin1_and_bidi(&[
3088                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD803, 0x62, 0x63, 0x64, 0x65,
3089                0x66, 0x67, 0x68, 0x69,
3090            ]),
3091            Latin1Bidi::Bidi
3092        );
3093        assert_eq!(
3094            check_utf16_for_latin1_and_bidi(&[
3095                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83A, 0x62, 0x63, 0x64, 0x65,
3096                0x66, 0x67, 0x68, 0x69,
3097            ]),
3098            Latin1Bidi::Bidi
3099        );
3100        assert_eq!(
3101            check_utf16_for_latin1_and_bidi(&[
3102                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0xD83B, 0x62, 0x63, 0x64, 0x65,
3103                0x66, 0x67, 0x68, 0x69,
3104            ]),
3105            Latin1Bidi::Bidi
3106        );
3107
3108        assert_eq!(
3109            check_utf16_for_latin1_and_bidi(&[
3110                0x62, 0x63, 0x64, 0x65, 0x66, 0x67, 0x68, 0x69, 0x0590, 0x3041, 0x62, 0x63, 0x64,
3111                0x65, 0x66, 0x67, 0x68, 0x69,
3112            ]),
3113            Latin1Bidi::Bidi
3114        );
3115    }
3116
3117    #[inline(always)]
3118    pub fn reference_is_char_bidi(c: char) -> bool {
3119        match c {
3120            '\u{0590}'..='\u{08FF}'
3121            | '\u{FB1D}'..='\u{FDFF}'
3122            | '\u{FE70}'..='\u{FEFE}'
3123            | '\u{10800}'..='\u{10FFF}'
3124            | '\u{1E800}'..='\u{1EFFF}'
3125            | '\u{200F}'
3126            | '\u{202B}'
3127            | '\u{202E}'
3128            | '\u{2067}' => true,
3129            _ => false,
3130        }
3131    }
3132
3133    #[inline(always)]
3134    pub fn reference_is_utf16_code_unit_bidi(u: u16) -> bool {
3135        match u {
3136            0x0590..=0x08FF
3137            | 0xFB1D..=0xFDFF
3138            | 0xFE70..=0xFEFE
3139            | 0xD802
3140            | 0xD803
3141            | 0xD83A
3142            | 0xD83B
3143            | 0x200F
3144            | 0x202B
3145            | 0x202E
3146            | 0x2067 => true,
3147            _ => false,
3148        }
3149    }
3150
3151    #[test]
3152    #[cfg_attr(miri, ignore)] // Miri is too slow
3153    fn test_is_char_bidi_thoroughly() {
3154        for i in 0..0xD800u32 {
3155            let c: char = ::std::char::from_u32(i).unwrap();
3156            assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3157        }
3158        for i in 0xE000..0x110000u32 {
3159            let c: char = ::std::char::from_u32(i).unwrap();
3160            assert_eq!(is_char_bidi(c), reference_is_char_bidi(c));
3161        }
3162    }
3163
3164    #[test]
3165    #[cfg_attr(miri, ignore)] // Miri is too slow
3166    fn test_is_utf16_code_unit_bidi_thoroughly() {
3167        for i in 0..0x10000u32 {
3168            let u = i as u16;
3169            assert_eq!(
3170                is_utf16_code_unit_bidi(u),
3171                reference_is_utf16_code_unit_bidi(u)
3172            );
3173        }
3174    }
3175
3176    #[test]
3177    #[cfg_attr(miri, ignore)] // Miri is too slow
3178    fn test_is_str_bidi_thoroughly() {
3179        let mut buf = [0; 4];
3180        for i in 0..0xD800u32 {
3181            let c: char = ::std::char::from_u32(i).unwrap();
3182            assert_eq!(
3183                is_str_bidi(c.encode_utf8(&mut buf[..])),
3184                reference_is_char_bidi(c)
3185            );
3186        }
3187        for i in 0xE000..0x110000u32 {
3188            let c: char = ::std::char::from_u32(i).unwrap();
3189            assert_eq!(
3190                is_str_bidi(c.encode_utf8(&mut buf[..])),
3191                reference_is_char_bidi(c)
3192            );
3193        }
3194    }
3195
3196    #[test]
3197    #[cfg_attr(miri, ignore)] // Miri is too slow
3198    fn test_is_utf8_bidi_thoroughly() {
3199        let mut buf = [0; 8];
3200        for i in 0..0xD800u32 {
3201            let c: char = ::std::char::from_u32(i).unwrap();
3202            let expect = reference_is_char_bidi(c);
3203            {
3204                let len = {
3205                    let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3206                    assert_eq!(is_utf8_bidi(bytes), expect);
3207                    bytes.len()
3208                };
3209                {
3210                    let tail = &mut buf[len..];
3211                    for b in tail.iter_mut() {
3212                        *b = 0;
3213                    }
3214                }
3215            }
3216            assert_eq!(is_utf8_bidi(&buf[..]), expect);
3217        }
3218        for i in 0xE000..0x110000u32 {
3219            let c: char = ::std::char::from_u32(i).unwrap();
3220            let expect = reference_is_char_bidi(c);
3221            {
3222                let len = {
3223                    let bytes = c.encode_utf8(&mut buf[..]).as_bytes();
3224                    assert_eq!(is_utf8_bidi(bytes), expect);
3225                    bytes.len()
3226                };
3227                {
3228                    let tail = &mut buf[len..];
3229                    for b in tail.iter_mut() {
3230                        *b = 0;
3231                    }
3232                }
3233            }
3234            assert_eq!(is_utf8_bidi(&buf[..]), expect);
3235        }
3236    }
3237
3238    #[test]
3239    #[cfg_attr(miri, ignore)] // Miri is too slow
3240    fn test_is_utf16_bidi_thoroughly() {
3241        let mut buf = [0; 32];
3242        for i in 0..0x10000u32 {
3243            let u = i as u16;
3244            buf[15] = u;
3245            assert_eq!(
3246                is_utf16_bidi(&buf[..]),
3247                reference_is_utf16_code_unit_bidi(u)
3248            );
3249        }
3250    }
3251
3252    #[test]
3253    fn test_is_utf8_bidi_edge_cases() {
3254        assert!(!is_utf8_bidi(b"\xD5\xBF\x61"));
3255        assert!(!is_utf8_bidi(b"\xD6\x80\x61"));
3256        assert!(!is_utf8_bidi(b"abc"));
3257        assert!(is_utf8_bidi(b"\xD5\xBF\xC2"));
3258        assert!(is_utf8_bidi(b"\xD6\x80\xC2"));
3259        assert!(is_utf8_bidi(b"ab\xC2"));
3260    }
3261
3262    #[test]
3263    fn test_decode_latin1() {
3264        match decode_latin1(b"ab") {
3265            Cow::Borrowed(s) => {
3266                assert_eq!(s, "ab");
3267            }
3268            Cow::Owned(_) => {
3269                unreachable!("Should have borrowed");
3270            }
3271        }
3272        assert_eq!(decode_latin1(b"a\xE4"), "a\u{E4}");
3273    }
3274
3275    #[test]
3276    fn test_encode_latin1_lossy() {
3277        match encode_latin1_lossy("ab") {
3278            Cow::Borrowed(s) => {
3279                assert_eq!(s, b"ab");
3280            }
3281            Cow::Owned(_) => {
3282                unreachable!("Should have borrowed");
3283            }
3284        }
3285        assert_eq!(encode_latin1_lossy("a\u{E4}"), &(b"a\xE4")[..]);
3286    }
3287
3288    #[test]
3289    fn test_convert_utf8_to_utf16_without_replacement() {
3290        let mut buf = [0u16; 5];
3291        assert_eq!(
3292            convert_utf8_to_utf16_without_replacement(b"ab", &mut buf[..2]),
3293            Some(2)
3294        );
3295        assert_eq!(buf[0], u16::from(b'a'));
3296        assert_eq!(buf[1], u16::from(b'b'));
3297        assert_eq!(buf[2], 0);
3298        assert_eq!(
3299            convert_utf8_to_utf16_without_replacement(b"\xC3\xA4c", &mut buf[..3]),
3300            Some(2)
3301        );
3302        assert_eq!(buf[0], 0xE4);
3303        assert_eq!(buf[1], u16::from(b'c'));
3304        assert_eq!(buf[2], 0);
3305        assert_eq!(
3306            convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83", &mut buf[..3]),
3307            Some(1)
3308        );
3309        assert_eq!(buf[0], 0x2603);
3310        assert_eq!(buf[1], u16::from(b'c'));
3311        assert_eq!(buf[2], 0);
3312        assert_eq!(
3313            convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83d", &mut buf[..4]),
3314            Some(2)
3315        );
3316        assert_eq!(buf[0], 0x2603);
3317        assert_eq!(buf[1], u16::from(b'd'));
3318        assert_eq!(buf[2], 0);
3319        assert_eq!(
3320            convert_utf8_to_utf16_without_replacement(b"\xE2\x98\x83\xC3\xA4", &mut buf[..5]),
3321            Some(2)
3322        );
3323        assert_eq!(buf[0], 0x2603);
3324        assert_eq!(buf[1], 0xE4);
3325        assert_eq!(buf[2], 0);
3326        assert_eq!(
3327            convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8E", &mut buf[..4]),
3328            Some(2)
3329        );
3330        assert_eq!(buf[0], 0xD83D);
3331        assert_eq!(buf[1], 0xDCCE);
3332        assert_eq!(buf[2], 0);
3333        assert_eq!(
3334            convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93\x8Ee", &mut buf[..5]),
3335            Some(3)
3336        );
3337        assert_eq!(buf[0], 0xD83D);
3338        assert_eq!(buf[1], 0xDCCE);
3339        assert_eq!(buf[2], u16::from(b'e'));
3340        assert_eq!(
3341            convert_utf8_to_utf16_without_replacement(b"\xF0\x9F\x93", &mut buf[..5]),
3342            None
3343        );
3344    }
3345}