// encoding_rs/ascii.rs

// Copyright Mozilla Foundation. See the COPYRIGHT
// file at the top-level directory of this distribution.
//
// Licensed under the Apache License, Version 2.0 <LICENSE-APACHE or
// https://www.apache.org/licenses/LICENSE-2.0> or the MIT license
// <LICENSE-MIT or https://opensource.org/licenses/MIT>, at your
// option. This file may not be copied, modified, or distributed
// except according to those terms.
9
// It's assumed that in due course Rust will have explicit SIMD but will not
// be good at run-time selection of SIMD vs. no-SIMD. In such a future,
// x86_64 will always use SSE2 and 32-bit x86 will use SSE2 when compiled with
// a Mozilla-shipped rustc. SIMD support and especially detection on ARM is a
// mess. Under the circumstances, it seems to make sense to optimize the ALU
// case for ARMv7 rather than x86. Annoyingly, I was unable to get useful
// numbers out of the actual ARMv7 CPU I have access to, because (thermal?)
// throttling kept interfering. Since Raspberry Pi 3 (ARMv8 core but running
// ARMv7 code) produced reproducible performance numbers, that's the ARM
// computer that this code ended up being optimized for in the ALU case.
// Less popular CPU architectures simply get the approach that was chosen based
// on Raspberry Pi 3 measurements. The UTF-16 and UTF-8 ALU cases take
// different approaches based on benchmarking on Raspberry Pi 3.
23
24#[cfg(all(
25    feature = "simd-accel",
26    any(
27        target_feature = "sse2",
28        all(target_endian = "little", target_arch = "aarch64"),
29        all(target_endian = "little", target_feature = "neon")
30    )
31))]
32use crate::simd_funcs::*;
33
34cfg_if! {
35    if #[cfg(feature = "simd-accel")] {
36        #[allow(unused_imports)]
37        use ::std::intrinsics::unlikely;
38        #[allow(unused_imports)]
39        use ::std::intrinsics::likely;
40    } else {
41        #[allow(dead_code)]
42        #[inline(always)]
43        // Unsafe to match the intrinsic, which is needlessly unsafe.
44        unsafe fn unlikely(b: bool) -> bool {
45            b
46        }
47        #[allow(dead_code)]
48        #[inline(always)]
49        // Unsafe to match the intrinsic, which is needlessly unsafe.
50        unsafe fn likely(b: bool) -> bool {
51            b
52        }
53    }
54}
55
56// `as` truncates, so works on 32-bit, too.
57#[allow(dead_code)]
58pub const ASCII_MASK: usize = 0x8080_8080_8080_8080u64 as usize;
59
60// `as` truncates, so works on 32-bit, too.
61#[allow(dead_code)]
62pub const BASIC_LATIN_MASK: usize = 0xFF80_FF80_FF80_FF80u64 as usize;
63
64#[allow(unused_macros)]
65macro_rules! ascii_naive {
66    ($name:ident, $src_unit:ty, $dst_unit:ty) => {
67        #[inline(always)]
68        pub unsafe fn $name(
69            src: *const $src_unit,
70            dst: *mut $dst_unit,
71            len: usize,
72        ) -> Option<($src_unit, usize)> {
73            // Yes, manually omitting the bound check here matters
74            // a lot for perf.
75            for i in 0..len {
76                let code_unit = *(src.add(i));
77                if code_unit > 127 {
78                    return Some((code_unit, i));
79                }
80                *(dst.add(i)) = code_unit as $dst_unit;
81            }
82            return None;
83        }
84    };
85}
86
87#[allow(unused_macros)]
88macro_rules! ascii_alu {
89    ($name:ident,
90     $src_unit:ty,
91     $dst_unit:ty,
92     $stride_fn:ident) => {
93        #[cfg_attr(feature = "cargo-clippy", allow(never_loop, cast_ptr_alignment))]
94        #[inline(always)]
95        pub unsafe fn $name(
96            src: *const $src_unit,
97            dst: *mut $dst_unit,
98            len: usize,
99        ) -> Option<($src_unit, usize)> {
100            let mut offset = 0usize;
101            // This loop is only broken out of as a `goto` forward
102            loop {
103                let mut until_alignment = {
104                    // Check if the other unit aligns if we move the narrower unit
105                    // to alignment.
106                    //               if ::std::mem::size_of::<$src_unit>() == ::std::mem::size_of::<$dst_unit>() {
107                    // ascii_to_ascii
108                    let src_alignment = (src as usize) & ALU_ALIGNMENT_MASK;
109                    let dst_alignment = (dst as usize) & ALU_ALIGNMENT_MASK;
110                    if src_alignment != dst_alignment {
111                        break;
112                    }
113                    (ALU_ALIGNMENT - src_alignment) & ALU_ALIGNMENT_MASK
114                    //               } else if ::std::mem::size_of::<$src_unit>() < ::std::mem::size_of::<$dst_unit>() {
115                    // ascii_to_basic_latin
116                    //                   let src_until_alignment = (ALIGNMENT - ((src as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
117                    //                   if (dst.add(src_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
118                    //                       break;
119                    //                   }
120                    //                   src_until_alignment
121                    //               } else {
122                    // basic_latin_to_ascii
123                    //                   let dst_until_alignment = (ALIGNMENT - ((dst as usize) & ALIGNMENT_MASK)) & ALIGNMENT_MASK;
124                    //                   if (src.add(dst_until_alignment) as usize) & ALIGNMENT_MASK != 0 {
125                    //                       break;
126                    //                   }
127                    //                   dst_until_alignment
128                    //               }
129                };
130                if until_alignment + ALU_STRIDE_SIZE <= len {
131                    // Moving pointers to alignment seems to be a pessimization on
132                    // x86_64 for operations that have UTF-16 as the internal
133                    // Unicode representation. However, since it seems to be a win
134                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
135                    // mixed results when encoding from UTF-16 and since x86 and
136                    // x86_64 should be using SSE2 in due course, keeping the move
137                    // to alignment here. It would be good to test on more ARM CPUs
138                    // and on real MIPS and POWER hardware.
139                    while until_alignment != 0 {
140                        let code_unit = *(src.add(offset));
141                        if code_unit > 127 {
142                            return Some((code_unit, offset));
143                        }
144                        *(dst.add(offset)) = code_unit as $dst_unit;
145                        offset += 1;
146                        until_alignment -= 1;
147                    }
148                    let len_minus_stride = len - ALU_STRIDE_SIZE;
149                    loop {
150                        if let Some(num_ascii) = $stride_fn(
151                            src.add(offset) as *const usize,
152                            dst.add(offset) as *mut usize,
153                        ) {
154                            offset += num_ascii;
155                            return Some((*(src.add(offset)), offset));
156                        }
157                        offset += ALU_STRIDE_SIZE;
158                        if offset > len_minus_stride {
159                            break;
160                        }
161                    }
162                }
163                break;
164            }
165            while offset < len {
166                let code_unit = *(src.add(offset));
167                if code_unit > 127 {
168                    return Some((code_unit, offset));
169                }
170                *(dst.add(offset)) = code_unit as $dst_unit;
171                offset += 1;
172            }
173            None
174        }
175    };
176}
177
178#[allow(unused_macros)]
179macro_rules! basic_latin_alu {
180    ($name:ident,
181     $src_unit:ty,
182     $dst_unit:ty,
183     $stride_fn:ident) => {
184        #[cfg_attr(
185            feature = "cargo-clippy",
186            allow(never_loop, cast_ptr_alignment, cast_lossless)
187        )]
188        #[inline(always)]
189        pub unsafe fn $name(
190            src: *const $src_unit,
191            dst: *mut $dst_unit,
192            len: usize,
193        ) -> Option<($src_unit, usize)> {
194            let mut offset = 0usize;
195            // This loop is only broken out of as a `goto` forward
196            loop {
197                let mut until_alignment = {
198                    // Check if the other unit aligns if we move the narrower unit
199                    // to alignment.
200                    //               if ::std::mem::size_of::<$src_unit>() == ::std::mem::size_of::<$dst_unit>() {
201                    // ascii_to_ascii
202                    //                   let src_alignment = (src as usize) & ALIGNMENT_MASK;
203                    //                   let dst_alignment = (dst as usize) & ALIGNMENT_MASK;
204                    //                   if src_alignment != dst_alignment {
205                    //                       break;
206                    //                   }
207                    //                   (ALIGNMENT - src_alignment) & ALIGNMENT_MASK
208                    //               } else
209                    if ::std::mem::size_of::<$src_unit>() < ::std::mem::size_of::<$dst_unit>() {
210                        // ascii_to_basic_latin
211                        let src_until_alignment = (ALU_ALIGNMENT
212                            - ((src as usize) & ALU_ALIGNMENT_MASK))
213                            & ALU_ALIGNMENT_MASK;
214                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
215                            != 0
216                        {
217                            break;
218                        }
219                        src_until_alignment
220                    } else {
221                        // basic_latin_to_ascii
222                        let dst_until_alignment = (ALU_ALIGNMENT
223                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
224                            & ALU_ALIGNMENT_MASK;
225                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
226                            != 0
227                        {
228                            break;
229                        }
230                        dst_until_alignment
231                    }
232                };
233                if until_alignment + ALU_STRIDE_SIZE <= len {
234                    // Moving pointers to alignment seems to be a pessimization on
235                    // x86_64 for operations that have UTF-16 as the internal
236                    // Unicode representation. However, since it seems to be a win
237                    // on ARM (tested ARMv7 code running on ARMv8 [rpi3]), except
238                    // mixed results when encoding from UTF-16 and since x86 and
239                    // x86_64 should be using SSE2 in due course, keeping the move
240                    // to alignment here. It would be good to test on more ARM CPUs
241                    // and on real MIPS and POWER hardware.
242                    while until_alignment != 0 {
243                        let code_unit = *(src.add(offset));
244                        if code_unit > 127 {
245                            return Some((code_unit, offset));
246                        }
247                        *(dst.add(offset)) = code_unit as $dst_unit;
248                        offset += 1;
249                        until_alignment -= 1;
250                    }
251                    let len_minus_stride = len - ALU_STRIDE_SIZE;
252                    loop {
253                        if !$stride_fn(
254                            src.add(offset) as *const usize,
255                            dst.add(offset) as *mut usize,
256                        ) {
257                            break;
258                        }
259                        offset += ALU_STRIDE_SIZE;
260                        if offset > len_minus_stride {
261                            break;
262                        }
263                    }
264                }
265                break;
266            }
267            while offset < len {
268                let code_unit = *(src.add(offset));
269                if code_unit > 127 {
270                    return Some((code_unit, offset));
271                }
272                *(dst.add(offset)) = code_unit as $dst_unit;
273                offset += 1;
274            }
275            None
276        }
277    };
278}
279
280#[allow(unused_macros)]
281macro_rules! latin1_alu {
282    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_fn:ident) => {
283        #[cfg_attr(
284            feature = "cargo-clippy",
285            allow(never_loop, cast_ptr_alignment, cast_lossless)
286        )]
287        #[inline(always)]
288        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
289            let mut offset = 0usize;
290            // This loop is only broken out of as a `goto` forward
291            loop {
292                let mut until_alignment = {
293                    if ::std::mem::size_of::<$src_unit>() < ::std::mem::size_of::<$dst_unit>() {
294                        // unpack
295                        let src_until_alignment = (ALU_ALIGNMENT
296                            - ((src as usize) & ALU_ALIGNMENT_MASK))
297                            & ALU_ALIGNMENT_MASK;
298                        if (dst.wrapping_add(src_until_alignment) as usize) & ALU_ALIGNMENT_MASK
299                            != 0
300                        {
301                            break;
302                        }
303                        src_until_alignment
304                    } else {
305                        // pack
306                        let dst_until_alignment = (ALU_ALIGNMENT
307                            - ((dst as usize) & ALU_ALIGNMENT_MASK))
308                            & ALU_ALIGNMENT_MASK;
309                        if (src.wrapping_add(dst_until_alignment) as usize) & ALU_ALIGNMENT_MASK
310                            != 0
311                        {
312                            break;
313                        }
314                        dst_until_alignment
315                    }
316                };
317                if until_alignment + ALU_STRIDE_SIZE <= len {
318                    while until_alignment != 0 {
319                        let code_unit = *(src.add(offset));
320                        *(dst.add(offset)) = code_unit as $dst_unit;
321                        offset += 1;
322                        until_alignment -= 1;
323                    }
324                    let len_minus_stride = len - ALU_STRIDE_SIZE;
325                    loop {
326                        $stride_fn(
327                            src.add(offset) as *const usize,
328                            dst.add(offset) as *mut usize,
329                        );
330                        offset += ALU_STRIDE_SIZE;
331                        if offset > len_minus_stride {
332                            break;
333                        }
334                    }
335                }
336                break;
337            }
338            while offset < len {
339                let code_unit = *(src.add(offset));
340                *(dst.add(offset)) = code_unit as $dst_unit;
341                offset += 1;
342            }
343        }
344    };
345}
346
347#[allow(unused_macros)]
348macro_rules! ascii_simd_check_align {
349    (
350        $name:ident,
351        $src_unit:ty,
352        $dst_unit:ty,
353        $stride_both_aligned:ident,
354        $stride_src_aligned:ident,
355        $stride_dst_aligned:ident,
356        $stride_neither_aligned:ident
357    ) => {
358        #[inline(always)]
359        pub unsafe fn $name(
360            src: *const $src_unit,
361            dst: *mut $dst_unit,
362            len: usize,
363        ) -> Option<($src_unit, usize)> {
364            let mut offset = 0usize;
365            if SIMD_STRIDE_SIZE <= len {
366                let len_minus_stride = len - SIMD_STRIDE_SIZE;
367                // XXX Should we first process one stride unconditionally as unaligned to
368                // avoid the cost of the branchiness below if the first stride fails anyway?
369                // XXX Should we just use unaligned SSE2 access unconditionally? It seems that
370                // on Haswell, it would make sense to just use unaligned and not bother
371                // checking. Need to benchmark older architectures before deciding.
372                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
373                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
374                    if dst_masked == 0 {
375                        loop {
376                            if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
377                                break;
378                            }
379                            offset += SIMD_STRIDE_SIZE;
380                            if offset > len_minus_stride {
381                                break;
382                            }
383                        }
384                    } else {
385                        loop {
386                            if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
387                                break;
388                            }
389                            offset += SIMD_STRIDE_SIZE;
390                            if offset > len_minus_stride {
391                                break;
392                            }
393                        }
394                    }
395                } else {
396                    if dst_masked == 0 {
397                        loop {
398                            if !$stride_dst_aligned(src.add(offset), dst.add(offset)) {
399                                break;
400                            }
401                            offset += SIMD_STRIDE_SIZE;
402                            if offset > len_minus_stride {
403                                break;
404                            }
405                        }
406                    } else {
407                        loop {
408                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
409                                break;
410                            }
411                            offset += SIMD_STRIDE_SIZE;
412                            if offset > len_minus_stride {
413                                break;
414                            }
415                        }
416                    }
417                }
418            }
419            while offset < len {
420                let code_unit = *(src.add(offset));
421                if code_unit > 127 {
422                    return Some((code_unit, offset));
423                }
424                *(dst.add(offset)) = code_unit as $dst_unit;
425                offset += 1;
426            }
427            None
428        }
429    };
430}
431
432#[allow(unused_macros)]
433macro_rules! ascii_simd_check_align_unrolled {
434    (
435        $name:ident,
436        $src_unit:ty,
437        $dst_unit:ty,
438        $stride_both_aligned:ident,
439        $stride_src_aligned:ident,
440        $stride_neither_aligned:ident,
441        $double_stride_both_aligned:ident,
442        $double_stride_src_aligned:ident
443    ) => {
444        #[inline(always)]
445        pub unsafe fn $name(
446            src: *const $src_unit,
447            dst: *mut $dst_unit,
448            len: usize,
449        ) -> Option<($src_unit, usize)> {
450            let unit_size = ::std::mem::size_of::<$src_unit>();
451            let mut offset = 0usize;
452            // This loop is only broken out of as a goto forward without
453            // actually looping
454            'outer: loop {
455                if SIMD_STRIDE_SIZE <= len {
456                    // First, process one unaligned
457                    if !$stride_neither_aligned(src, dst) {
458                        break 'outer;
459                    }
460                    offset = SIMD_STRIDE_SIZE;
461
462                    // We have now seen 16 ASCII bytes. Let's guess that
463                    // there will be enough more to justify more expense
464                    // in the case of non-ASCII.
465                    // Use aligned reads for the sake of old microachitectures.
466                    let until_alignment = ((SIMD_ALIGNMENT
467                        - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK))
468                        & SIMD_ALIGNMENT_MASK)
469                        / unit_size;
470                    // This addition won't overflow, because even in the 32-bit PAE case the
471                    // address space holds enough code that the slice length can't be that
472                    // close to address space size.
473                    // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
474                    if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
475                        if until_alignment != 0 {
476                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
477                                break;
478                            }
479                            offset += until_alignment;
480                        }
481                        let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
482                        let dst_masked = (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK;
483                        if dst_masked == 0 {
484                            loop {
485                                if let Some(advance) =
486                                    $double_stride_both_aligned(src.add(offset), dst.add(offset))
487                                {
488                                    offset += advance;
489                                    let code_unit = *(src.add(offset));
490                                    return Some((code_unit, offset));
491                                }
492                                offset += SIMD_STRIDE_SIZE * 2;
493                                if offset > len_minus_stride_times_two {
494                                    break;
495                                }
496                            }
497                            if offset + SIMD_STRIDE_SIZE <= len {
498                                if !$stride_both_aligned(src.add(offset), dst.add(offset)) {
499                                    break 'outer;
500                                }
501                                offset += SIMD_STRIDE_SIZE;
502                            }
503                        } else {
504                            loop {
505                                if let Some(advance) =
506                                    $double_stride_src_aligned(src.add(offset), dst.add(offset))
507                                {
508                                    offset += advance;
509                                    let code_unit = *(src.add(offset));
510                                    return Some((code_unit, offset));
511                                }
512                                offset += SIMD_STRIDE_SIZE * 2;
513                                if offset > len_minus_stride_times_two {
514                                    break;
515                                }
516                            }
517                            if offset + SIMD_STRIDE_SIZE <= len {
518                                if !$stride_src_aligned(src.add(offset), dst.add(offset)) {
519                                    break 'outer;
520                                }
521                                offset += SIMD_STRIDE_SIZE;
522                            }
523                        }
524                    } else {
525                        // At most two iterations, so unroll
526                        if offset + SIMD_STRIDE_SIZE <= len {
527                            if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
528                                break;
529                            }
530                            offset += SIMD_STRIDE_SIZE;
531                            if offset + SIMD_STRIDE_SIZE <= len {
532                                if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
533                                    break;
534                                }
535                                offset += SIMD_STRIDE_SIZE;
536                            }
537                        }
538                    }
539                }
540                break 'outer;
541            }
542            while offset < len {
543                let code_unit = *(src.add(offset));
544                if code_unit > 127 {
545                    return Some((code_unit, offset));
546                }
547                *(dst.add(offset)) = code_unit as $dst_unit;
548                offset += 1;
549            }
550            None
551        }
552    };
553}
554
555#[allow(unused_macros)]
556macro_rules! latin1_simd_check_align {
557    (
558        $name:ident,
559        $src_unit:ty,
560        $dst_unit:ty,
561        $stride_both_aligned:ident,
562        $stride_src_aligned:ident,
563        $stride_dst_aligned:ident,
564        $stride_neither_aligned:ident
565    ) => {
566        #[inline(always)]
567        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
568            let mut offset = 0usize;
569            if SIMD_STRIDE_SIZE <= len {
570                let len_minus_stride = len - SIMD_STRIDE_SIZE;
571                let dst_masked = (dst as usize) & SIMD_ALIGNMENT_MASK;
572                if ((src as usize) & SIMD_ALIGNMENT_MASK) == 0 {
573                    if dst_masked == 0 {
574                        loop {
575                            $stride_both_aligned(src.add(offset), dst.add(offset));
576                            offset += SIMD_STRIDE_SIZE;
577                            if offset > len_minus_stride {
578                                break;
579                            }
580                        }
581                    } else {
582                        loop {
583                            $stride_src_aligned(src.add(offset), dst.add(offset));
584                            offset += SIMD_STRIDE_SIZE;
585                            if offset > len_minus_stride {
586                                break;
587                            }
588                        }
589                    }
590                } else {
591                    if dst_masked == 0 {
592                        loop {
593                            $stride_dst_aligned(src.add(offset), dst.add(offset));
594                            offset += SIMD_STRIDE_SIZE;
595                            if offset > len_minus_stride {
596                                break;
597                            }
598                        }
599                    } else {
600                        loop {
601                            $stride_neither_aligned(src.add(offset), dst.add(offset));
602                            offset += SIMD_STRIDE_SIZE;
603                            if offset > len_minus_stride {
604                                break;
605                            }
606                        }
607                    }
608                }
609            }
610            while offset < len {
611                let code_unit = *(src.add(offset));
612                *(dst.add(offset)) = code_unit as $dst_unit;
613                offset += 1;
614            }
615        }
616    };
617}
618
619#[allow(unused_macros)]
620macro_rules! latin1_simd_check_align_unrolled {
621    (
622        $name:ident,
623        $src_unit:ty,
624        $dst_unit:ty,
625        $stride_both_aligned:ident,
626        $stride_src_aligned:ident,
627        $stride_dst_aligned:ident,
628        $stride_neither_aligned:ident
629    ) => {
630        #[inline(always)]
631        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
632            let unit_size = ::std::mem::size_of::<$src_unit>();
633            let mut offset = 0usize;
634            if SIMD_STRIDE_SIZE <= len {
635                let mut until_alignment = ((SIMD_STRIDE_SIZE
636                    - ((src as usize) & SIMD_ALIGNMENT_MASK))
637                    & SIMD_ALIGNMENT_MASK)
638                    / unit_size;
639                while until_alignment != 0 {
640                    *(dst.add(offset)) = *(src.add(offset)) as $dst_unit;
641                    offset += 1;
642                    until_alignment -= 1;
643                }
644                let len_minus_stride = len - SIMD_STRIDE_SIZE;
645                if offset + SIMD_STRIDE_SIZE * 2 <= len {
646                    let len_minus_stride_times_two = len_minus_stride - SIMD_STRIDE_SIZE;
647                    if (dst.add(offset) as usize) & SIMD_ALIGNMENT_MASK == 0 {
648                        loop {
649                            $stride_both_aligned(src.add(offset), dst.add(offset));
650                            offset += SIMD_STRIDE_SIZE;
651                            $stride_both_aligned(src.add(offset), dst.add(offset));
652                            offset += SIMD_STRIDE_SIZE;
653                            if offset > len_minus_stride_times_two {
654                                break;
655                            }
656                        }
657                    } else {
658                        loop {
659                            $stride_src_aligned(src.add(offset), dst.add(offset));
660                            offset += SIMD_STRIDE_SIZE;
661                            $stride_src_aligned(src.add(offset), dst.add(offset));
662                            offset += SIMD_STRIDE_SIZE;
663                            if offset > len_minus_stride_times_two {
664                                break;
665                            }
666                        }
667                    }
668                }
669                if offset < len_minus_stride {
670                    $stride_src_aligned(src.add(offset), dst.add(offset));
671                    offset += SIMD_STRIDE_SIZE;
672                }
673            }
674            while offset < len {
675                let code_unit = *(src.add(offset));
676                // On x86_64, this loop autovectorizes but in the pack
677                // case there are instructions whose purpose is to make sure
678                // each u16 in the vector is truncated before packing. However,
679                // since we don't care about saturating behavior of SSE2 packing
680                // when the input isn't Latin1, those instructions are useless.
681                // Unfortunately, using the `assume` intrinsic to lie to the
682                // optimizer doesn't make LLVM omit the trunctation that we
683                // don't need. Possibly this loop could be manually optimized
684                // to do the sort of thing that LLVM does but without the
685                // ANDing the read vectors of u16 with a constant that discards
686                // the high half of each u16. As far as I can tell, the
687                // optimization assumes that doing a SIMD read past the end of
688                // the array is OK.
689                *(dst.add(offset)) = code_unit as $dst_unit;
690                offset += 1;
691            }
692        }
693    };
694}
695
#[allow(unused_macros)]
// Generates a conversion entry point that copies ASCII code units from `src`
// to `dst` (casting `$src_unit` to `$dst_unit`), processing whole SIMD
// strides with `$stride_neither_aligned` (no alignment assumptions) and
// finishing with a scalar tail loop.
//
// The generated function returns `None` on full success, or
// `Some((code_unit, offset))` with the first code unit > 127 and its index.
//
// Safety contract of the generated fn: `src` must be valid for reading and
// `dst` for writing `len` units. NOTE(review): whether overlapping ranges
// are permitted depends on the stride/store helpers — confirm at call sites.
macro_rules! ascii_simd_unalign {
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        #[inline(always)]
        pub unsafe fn $name(
            src: *const $src_unit,
            dst: *mut $dst_unit,
            len: usize,
        ) -> Option<($src_unit, usize)> {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                // Cannot underflow: guarded by the check above.
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    // The stride fn returns false when it saw a non-ASCII
                    // unit; `offset` is then left at the start of that stride
                    // so the scalar loop below pinpoints the exact index.
                    if !$stride_neither_aligned(src.add(offset), dst.add(offset)) {
                        break;
                    }
                    offset += SIMD_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            // Scalar tail; also re-scans a failed stride to locate the first
            // non-ASCII code unit.
            while offset < len {
                let code_unit = *(src.add(offset));
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
            None
        }
    };
}
730
#[allow(unused_macros)]
// Generates an infallible Latin-1 conversion entry point: unlike
// `ascii_simd_unalign!` there is no ASCII check, so every code unit is
// copied (cast from `$src_unit` to `$dst_unit`). Whole strides use
// `$stride_neither_aligned` (no alignment assumptions); the remainder is
// handled by a scalar loop.
//
// Safety contract of the generated fn: `src` must be valid for reading and
// `dst` for writing `len` units.
macro_rules! latin1_simd_unalign {
    ($name:ident, $src_unit:ty, $dst_unit:ty, $stride_neither_aligned:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const $src_unit, dst: *mut $dst_unit, len: usize) {
            let mut offset = 0usize;
            if SIMD_STRIDE_SIZE <= len {
                // Cannot underflow: guarded by the check above.
                let len_minus_stride = len - SIMD_STRIDE_SIZE;
                loop {
                    $stride_neither_aligned(src.add(offset), dst.add(offset));
                    offset += SIMD_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            // Scalar tail for the last `len % SIMD_STRIDE_SIZE` units.
            while offset < len {
                let code_unit = *(src.add(offset));
                *(dst.add(offset)) = code_unit as $dst_unit;
                offset += 1;
            }
        }
    };
}
755
#[allow(unused_macros)]
// Generates a single-stride u8 -> u8 copy: load one SIMD vector of bytes
// with `$load`; if any byte is non-ASCII, return false *without storing*;
// otherwise store the vector with `$store` and return true.
// The `$load`/`$store` parameters select aligned vs. unaligned accesses.
macro_rules! ascii_to_ascii_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            $store(dst, simd);
            true
        }
    };
}
770
#[allow(unused_macros)]
// Generates a two-stride (2 * SIMD_STRIDE_SIZE bytes) u8 -> u8 copy with
// aligned loads. Note the ordering: the first vector is stored *before* the
// ASCII check, so on failure the destination has already been written
// through the vector containing the failing byte.
// Returns `None` when all bytes were ASCII, otherwise `Some(index)` of the
// first non-ASCII byte relative to `src`.
// Safety contract of the generated fn: `src` must satisfy the alignment
// `load16_aligned` requires and both pointers must be valid for
// 2 * SIMD_STRIDE_SIZE bytes.
macro_rules! ascii_to_ascii_simd_double_stride {
    ($name:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u8) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            $store(dst, first);
            // All-ASCII is the fast path; check both vectors with one OR.
            if unlikely(!simd_is_ascii(first | second)) {
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    return Some(mask_first.trailing_zeros() as usize);
                }
                // mask_first == 0: the offender is in `second`, which is
                // still stored before its index is reported.
                $store(dst.add(SIMD_STRIDE_SIZE), second);
                let mask_second = mask_ascii(second);
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            $store(dst.add(SIMD_STRIDE_SIZE), second);
            None
        }
    };
}
793
#[allow(unused_macros)]
// Generates a single-stride u8 -> u16 conversion: load one vector of bytes
// with `$load`; if any byte is non-ASCII, return false without storing;
// otherwise widen the bytes into two u16 vectors via `simd_unpack` and store
// both with `$store`, returning true. `dst` is in u16 units, hence the
// second store at offset 8.
macro_rules! ascii_to_basic_latin_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> bool {
            let simd = $load(src);
            if !simd_is_ascii(simd) {
                return false;
            }
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            $store(dst.add(8), second);
            true
        }
    };
}
810
#[allow(unused_macros)]
// Generates a two-stride u8 -> u16 conversion with aligned loads. As in
// `ascii_to_ascii_simd_double_stride!`, the first vector is unpacked and
// stored *before* the ASCII check; on failure the destination has been
// written through the vector containing the failing byte.
// `dst` is in u16 units: each 16-byte source vector expands to 16 u16s,
// stored as two half-vectors (hence the SIMD_STRIDE_SIZE / 2 offsets).
// Returns `None` on all-ASCII input, otherwise `Some(index)` of the first
// non-ASCII byte relative to `src`.
macro_rules! ascii_to_basic_latin_simd_double_stride {
    ($name:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) -> Option<usize> {
            let first = load16_aligned(src);
            let second = load16_aligned(src.add(SIMD_STRIDE_SIZE));
            let (a, b) = simd_unpack(first);
            $store(dst, a);
            $store(dst.add(SIMD_STRIDE_SIZE / 2), b);
            // All-ASCII is the fast path; check both vectors with one OR.
            if unlikely(!simd_is_ascii(first | second)) {
                let mask_first = mask_ascii(first);
                if mask_first != 0 {
                    return Some(mask_first.trailing_zeros() as usize);
                }
                // Offender is in `second`; it is still unpacked and stored
                // before its index is reported.
                let (c, d) = simd_unpack(second);
                $store(dst.add(SIMD_STRIDE_SIZE), c);
                $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
                let mask_second = mask_ascii(second);
                return Some(SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize);
            }
            let (c, d) = simd_unpack(second);
            $store(dst.add(SIMD_STRIDE_SIZE), c);
            $store(dst.add(SIMD_STRIDE_SIZE + (SIMD_STRIDE_SIZE / 2)), d);
            None
        }
    };
}
839
#[allow(unused_macros)]
// Generates a single-stride u8 -> u16 widening copy with *no* ASCII check
// (the Latin-1 case): load a byte vector with `$load`, widen it into two
// u16 vectors via `simd_unpack`, and store both with `$store`.
macro_rules! unpack_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u8, dst: *mut u16) {
            let simd = $load(src);
            let (first, second) = simd_unpack(simd);
            $store(dst, first);
            $store(dst.add(8), second);
        }
    };
}
852
#[allow(unused_macros)]
// Generates a single-stride u16 -> u8 conversion: load two u16 vectors with
// `$load`; if every code unit passes `simd_is_basic_latin` (checked with one
// OR over both vectors), pack them into one byte vector, store it and return
// true; otherwise return false without storing anything.
macro_rules! basic_latin_to_ascii_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) -> bool {
            let first = $load(src);
            let second = $load(src.add(8));
            if simd_is_basic_latin(first | second) {
                $store(dst, simd_pack(first, second));
                true
            } else {
                false
            }
        }
    };
}
869
#[allow(unused_macros)]
// Generates a single-stride u16 -> u8 narrowing copy with *no* range check
// (the Latin-1 case): load two u16 vectors with `$load`, pack them into one
// byte vector via `simd_pack`, and store it with `$store`.
macro_rules! pack_simd_stride {
    ($name:ident, $load:ident, $store:ident) => {
        #[inline(always)]
        pub unsafe fn $name(src: *const u16, dst: *mut u8) {
            let first = $load(src);
            let second = $load(src.add(8));
            $store(dst, simd_pack(first, second));
        }
    };
}
881
// Per-target selection of stride sizes, stride-function instantiations and
// ALU (non-SIMD) pack/unpack helpers. Exactly one branch is compiled in.
cfg_if! {
    if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "aarch64"))] {
        // SIMD with the same instructions for aligned and unaligned loads and
        // stores, so only the "neither aligned" stride variants are
        // instantiated and the entry points skip alignment checking entirely.

        pub const SIMD_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

//        pub const ALIGNMENT: usize = 8;

        pub const ALU_STRIDE_SIZE: usize = 16;

        pub const ALU_ALIGNMENT: usize = 8;

        pub const ALU_ALIGNMENT_MASK: usize = 7;

        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);

        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);
        unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);

        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);
        pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);

        ascii_simd_unalign!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_neither_aligned);
        ascii_simd_unalign!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_neither_aligned);
        ascii_simd_unalign!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_neither_aligned);
        latin1_simd_unalign!(unpack_latin1, u8, u16, unpack_stride_neither_aligned);
        latin1_simd_unalign!(pack_latin1, u16, u8, pack_stride_neither_aligned);
    } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
        // SIMD with different instructions for aligned and unaligned loads
        // and stores, so all four alignment combinations of each stride are
        // instantiated and the entry points dispatch on runtime alignment.
        //
        // NOTE(review): the remark below mentions SSE2 but this is the ARMv7
        // NEON branch; it appears copied from the SSE2 branch — confirm it
        // was meant to apply here too.
        // Newer microarchitectures are not supposed to have a performance difference between
        // aligned and unaligned loads and stores when the address is actually aligned,
        // but the benchmark results I see don't agree.

        pub const SIMD_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const SIMD_ALIGNMENT_MASK: usize = 15;

        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_dst_aligned, load16_unaligned, store16_aligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);

        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_dst_aligned, load16_unaligned, store8_aligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);

        unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
        unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);
        unpack_simd_stride!(unpack_stride_dst_aligned, load16_unaligned, store8_aligned);
        unpack_simd_stride!(unpack_stride_neither_aligned, load16_unaligned, store8_unaligned);

        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);

        pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
        pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);
        pack_simd_stride!(pack_stride_dst_aligned, load8_unaligned, store16_aligned);
        pack_simd_stride!(pack_stride_neither_aligned, load8_unaligned, store16_unaligned);

        ascii_simd_check_align!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_dst_aligned, ascii_to_ascii_stride_neither_aligned);
        ascii_simd_check_align!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_dst_aligned, ascii_to_basic_latin_stride_neither_aligned);
        ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
        latin1_simd_check_align!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
        latin1_simd_check_align!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
    } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
        // SIMD with different instructions for aligned and unaligned loads and stores.
        //
        // Newer microarchitectures are not supposed to have a performance difference between
        // aligned and unaligned SSE2 loads and stores when the address is actually aligned,
        // but the benchmark results I see don't agree.

        pub const SIMD_STRIDE_SIZE: usize = 16;

        pub const SIMD_ALIGNMENT: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const SIMD_ALIGNMENT_MASK: usize = 15;

        // On x86/x86_64 the unrolled (double-stride) entry points are used,
        // so double-stride variants are instantiated in addition to the
        // single-stride ones.
        ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_both_aligned, store16_aligned);
        ascii_to_ascii_simd_double_stride!(ascii_to_ascii_simd_double_stride_src_aligned, store16_unaligned);

        ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_both_aligned, store8_aligned);
        ascii_to_basic_latin_simd_double_stride!(ascii_to_basic_latin_simd_double_stride_src_aligned, store8_unaligned);

        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_both_aligned, load16_aligned, store16_aligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_src_aligned, load16_aligned, store16_unaligned);
        ascii_to_ascii_simd_stride!(ascii_to_ascii_stride_neither_aligned, load16_unaligned, store16_unaligned);

        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_both_aligned, load16_aligned, store8_aligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_src_aligned, load16_aligned, store8_unaligned);
        ascii_to_basic_latin_simd_stride!(ascii_to_basic_latin_stride_neither_aligned, load16_unaligned, store8_unaligned);

        unpack_simd_stride!(unpack_stride_both_aligned, load16_aligned, store8_aligned);
        unpack_simd_stride!(unpack_stride_src_aligned, load16_aligned, store8_unaligned);

        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_both_aligned, load8_aligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_src_aligned, load8_aligned, store16_unaligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_dst_aligned, load8_unaligned, store16_aligned);
        basic_latin_to_ascii_simd_stride!(basic_latin_to_ascii_stride_neither_aligned, load8_unaligned, store16_unaligned);

        pack_simd_stride!(pack_stride_both_aligned, load8_aligned, store16_aligned);
        pack_simd_stride!(pack_stride_src_aligned, load8_aligned, store16_unaligned);

        ascii_simd_check_align_unrolled!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride_both_aligned, ascii_to_ascii_stride_src_aligned, ascii_to_ascii_stride_neither_aligned, ascii_to_ascii_simd_double_stride_both_aligned, ascii_to_ascii_simd_double_stride_src_aligned);
        ascii_simd_check_align_unrolled!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_both_aligned, ascii_to_basic_latin_stride_src_aligned, ascii_to_basic_latin_stride_neither_aligned, ascii_to_basic_latin_simd_double_stride_both_aligned, ascii_to_basic_latin_simd_double_stride_src_aligned);

        ascii_simd_check_align!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_both_aligned, basic_latin_to_ascii_stride_src_aligned, basic_latin_to_ascii_stride_dst_aligned, basic_latin_to_ascii_stride_neither_aligned);
        latin1_simd_check_align_unrolled!(unpack_latin1, u8, u16, unpack_stride_both_aligned, unpack_stride_src_aligned, unpack_stride_dst_aligned, unpack_stride_neither_aligned);
        latin1_simd_check_align_unrolled!(pack_latin1, u16, u8, pack_stride_both_aligned, pack_stride_src_aligned, pack_stride_dst_aligned, pack_stride_neither_aligned);
    } else if #[cfg(all(target_endian = "little", target_pointer_width = "64"))] {
        // Aligned ALU word, little-endian, 64-bit

        pub const ALU_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const ALU_ALIGNMENT: usize = 8;

        pub const ALU_ALIGNMENT_MASK: usize = 7;

        // Zero-extends each of the 8 bytes in `word` and in `second_word`
        // into a 16-bit lane, writing the 16 resulting lanes as four
        // native-endian (little-endian) usize words to dst[0..4].
        // Safety: `dst` must be valid for writing 4 usize words.
        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0x0000_0000_FF00_0000usize & word) << 24) |
                        ((0x0000_0000_00FF_0000usize & word) << 16) |
                        ((0x0000_0000_0000_FF00usize & word) << 8) |
                        (0x0000_0000_0000_00FFusize & word);
            let second = ((0xFF00_0000_0000_0000usize & word) >> 8) |
                         ((0x00FF_0000_0000_0000usize & word) >> 16) |
                         ((0x0000_FF00_0000_0000usize & word) >> 24) |
                         ((0x0000_00FF_0000_0000usize & word) >> 32);
            let third = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
                        ((0x0000_0000_00FF_0000usize & second_word) << 16) |
                        ((0x0000_0000_0000_FF00usize & second_word) << 8) |
                        (0x0000_0000_0000_00FFusize & second_word);
            let fourth = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
                         ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
                         ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
                         ((0x0000_00FF_0000_0000usize & second_word) >> 32);
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        // Inverse of `unpack_alu`: the masks select the low byte of each
        // 16-bit lane in the four input words (high lane bytes are simply
        // discarded), and the 16 surviving bytes are written as two usize
        // words to dst[0..2].
        // Safety: `dst` must be valid for writing 2 usize words.
        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000_0000_0000usize & second) << 8) |
                       ((0x0000_00FF_0000_0000usize & second) << 16) |
                       ((0x0000_0000_00FF_0000usize & second) << 24) |
                       ((0x0000_0000_0000_00FFusize & second) << 32) |
                       ((0x00FF_0000_0000_0000usize & first) >> 24) |
                       ((0x0000_00FF_0000_0000usize & first) >> 16) |
                       ((0x0000_0000_00FF_0000usize & first) >> 8) |
                       (0x0000_0000_0000_00FFusize & first);
            let second_word = ((0x00FF_0000_0000_0000usize & fourth) << 8) |
                              ((0x0000_00FF_0000_0000usize & fourth) << 16) |
                              ((0x0000_0000_00FF_0000usize & fourth) << 24) |
                              ((0x0000_0000_0000_00FFusize & fourth) << 32) |
                              ((0x00FF_0000_0000_0000usize & third) >> 24) |
                              ((0x0000_00FF_0000_0000usize & third) >> 16) |
                              ((0x0000_0000_00FF_0000usize & third) >> 8) |
                              (0x0000_0000_0000_00FFusize & third);
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else if #[cfg(all(target_endian = "little", target_pointer_width = "32"))] {
        // Aligned ALU word, little-endian, 32-bit

        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const MAX_STRIDE_SIZE: usize = 8;

        pub const ALU_ALIGNMENT: usize = 4;

        pub const ALU_ALIGNMENT_MASK: usize = 3;

        // Zero-extends each of the 4 bytes in `word` and in `second_word`
        // into a 16-bit lane, writing the 8 resulting lanes as four
        // native-endian (little-endian) usize words to dst[0..4].
        // Safety: `dst` must be valid for writing 4 usize words.
        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0x0000_FF00usize & word) << 8) |
                        (0x0000_00FFusize & word);
            let second = ((0xFF00_0000usize & word) >> 8) |
                         ((0x00FF_0000usize & word) >> 16);
            let third = ((0x0000_FF00usize & second_word) << 8) |
                        (0x0000_00FFusize & second_word);
            let fourth = ((0xFF00_0000usize & second_word) >> 8) |
                         ((0x00FF_0000usize & second_word) >> 16);
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        // Inverse of `unpack_alu`: keeps the low byte of each 16-bit lane
        // of the four input words (high lane bytes are discarded) and writes
        // the 8 surviving bytes as two usize words to dst[0..2].
        // Safety: `dst` must be valid for writing 2 usize words.
        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000usize & second) << 8) |
                       ((0x0000_00FFusize & second) << 16) |
                       ((0x00FF_0000usize & first) >> 8) |
                       (0x0000_00FFusize & first);
            let second_word = ((0x00FF_0000usize & fourth) << 8) |
                              ((0x0000_00FFusize & fourth) << 16) |
                              ((0x00FF_0000usize & third) >> 8) |
                              (0x0000_00FFusize & third);
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else if #[cfg(all(target_endian = "big", target_pointer_width = "64"))] {
        // Aligned ALU word, big-endian, 64-bit

        pub const ALU_STRIDE_SIZE: usize = 16;

        pub const MAX_STRIDE_SIZE: usize = 16;

        pub const ALU_ALIGNMENT: usize = 8;

        pub const ALU_ALIGNMENT_MASK: usize = 7;

        // Big-endian analogue of the little-endian 64-bit `unpack_alu`:
        // zero-extends each byte into a 16-bit lane, but the in-memory byte
        // order is reversed, so the high half of `word` produces the first
        // output word. Writes four usize words to dst[0..4].
        // Safety: `dst` must be valid for writing 4 usize words.
        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0xFF00_0000_0000_0000usize & word) >> 8) |
                         ((0x00FF_0000_0000_0000usize & word) >> 16) |
                         ((0x0000_FF00_0000_0000usize & word) >> 24) |
                         ((0x0000_00FF_0000_0000usize & word) >> 32);
            let second = ((0x0000_0000_FF00_0000usize & word) << 24) |
                        ((0x0000_0000_00FF_0000usize & word) << 16) |
                        ((0x0000_0000_0000_FF00usize & word) << 8) |
                        (0x0000_0000_0000_00FFusize & word);
            let third = ((0xFF00_0000_0000_0000usize & second_word) >> 8) |
                         ((0x00FF_0000_0000_0000usize & second_word) >> 16) |
                         ((0x0000_FF00_0000_0000usize & second_word) >> 24) |
                         ((0x0000_00FF_0000_0000usize & second_word) >> 32);
            let fourth = ((0x0000_0000_FF00_0000usize & second_word) << 24) |
                        ((0x0000_0000_00FF_0000usize & second_word) << 16) |
                        ((0x0000_0000_0000_FF00usize & second_word) << 8) |
                        (0x0000_0000_0000_00FFusize & second_word);
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        // Inverse of `unpack_alu` for big-endian: keeps the low byte of each
        // 16-bit lane (high lane bytes are discarded) and writes the 16
        // surviving bytes as two usize words to dst[0..2].
        // Safety: `dst` must be valid for writing 2 usize words.
        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF0000_00000000usize & first) << 8) |
                       ((0x000000FF_00000000usize & first) << 16) |
                       ((0x00000000_00FF0000usize & first) << 24) |
                       ((0x00000000_000000FFusize & first) << 32) |
                       ((0x00FF0000_00000000usize & second) >> 24) |
                       ((0x000000FF_00000000usize & second) >> 16) |
                       ((0x00000000_00FF0000usize & second) >> 8) |
                       (0x00000000_000000FFusize & second);
            let second_word = ((0x00FF0000_00000000usize & third) << 8) |
                              ((0x000000FF_00000000usize & third) << 16) |
                              ((0x00000000_00FF0000usize & third) << 24) |
                              ((0x00000000_000000FFusize & third) << 32) |
                              ((0x00FF0000_00000000usize & fourth) >> 24) |
                              ((0x000000FF_00000000usize & fourth) >> 16) |
                              ((0x00000000_00FF0000usize & fourth) >> 8) |
                              (0x00000000_000000FFusize &  fourth);
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else if #[cfg(all(target_endian = "big", target_pointer_width = "32"))] {
        // Aligned ALU word, big-endian, 32-bit

        pub const ALU_STRIDE_SIZE: usize = 8;

        pub const MAX_STRIDE_SIZE: usize = 8;

        pub const ALU_ALIGNMENT: usize = 4;

        pub const ALU_ALIGNMENT_MASK: usize = 3;

        // Big-endian analogue of the little-endian 32-bit `unpack_alu`:
        // zero-extends each byte into a 16-bit lane, high half of each input
        // word first. Writes four usize words to dst[0..4].
        // Safety: `dst` must be valid for writing 4 usize words.
        #[inline(always)]
        unsafe fn unpack_alu(word: usize, second_word: usize, dst: *mut usize) {
            let first = ((0xFF00_0000usize & word) >> 8) |
                         ((0x00FF_0000usize & word) >> 16);
            let second = ((0x0000_FF00usize & word) << 8) |
                        (0x0000_00FFusize & word);
            let third = ((0xFF00_0000usize & second_word) >> 8) |
                         ((0x00FF_0000usize & second_word) >> 16);
            let fourth = ((0x0000_FF00usize & second_word) << 8) |
                        (0x0000_00FFusize & second_word);
            *dst = first;
            *(dst.add(1)) = second;
            *(dst.add(2)) = third;
            *(dst.add(3)) = fourth;
        }

        // Inverse of `unpack_alu` for big-endian: keeps the low byte of each
        // 16-bit lane (high lane bytes are discarded) and writes the 8
        // surviving bytes as two usize words to dst[0..2].
        // Safety: `dst` must be valid for writing 2 usize words.
        #[inline(always)]
        unsafe fn pack_alu(first: usize, second: usize, third: usize, fourth: usize, dst: *mut usize) {
            let word = ((0x00FF_0000usize & first) << 8) |
                       ((0x0000_00FFusize & first) << 16) |
                       ((0x00FF_0000usize & second) >> 8) |
                       (0x0000_00FFusize & second);
            let second_word = ((0x00FF_0000usize & third) << 8) |
                              ((0x0000_00FFusize & third) << 16) |
                              ((0x00FF_0000usize & fourth) >> 8) |
                              (0x0000_00FFusize & fourth);
            *dst = word;
            *(dst.add(1)) = second_word;
        }
    } else {
        // No SIMD and no recognized word size: plain byte-at-a-time loops.
        ascii_naive!(ascii_to_ascii, u8, u8);
        ascii_naive!(ascii_to_basic_latin, u8, u16);
        ascii_naive!(basic_latin_to_ascii, u16, u8);
    }
}
1198
// Counts the zero bits that precede the first set bit in *memory* order:
// on little-endian targets the lowest-addressed byte sits in the least
// significant bits of a word, so trailing zeros correspond to the earliest
// bytes; on big-endian targets, leading zeros do. Presumably used to locate
// the first interesting byte within an ALU word — confirm at call sites.
cfg_if! {
    if #[cfg(target_endian = "little")] {
        #[allow(dead_code)]
        #[inline(always)]
        fn count_zeros(word: usize) -> u32 {
            word.trailing_zeros()
        }
    } else {
        #[allow(dead_code)]
        #[inline(always)]
        fn count_zeros(word: usize) -> u32 {
            word.leading_zeros()
        }
    }
}
1214
1215cfg_if! {
1216    if #[cfg(all(feature = "simd-accel", target_endian = "little", target_arch = "disabled"))] {
1217        #[inline(always)]
1218        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1219            let src = slice.as_ptr();
1220            let len = slice.len();
1221            let mut offset = 0usize;
1222            if SIMD_STRIDE_SIZE <= len {
1223                let len_minus_stride = len - SIMD_STRIDE_SIZE;
1224                loop {
1225                    let simd = unsafe { load16_unaligned(src.add(offset)) };
1226                    if !simd_is_ascii(simd) {
1227                        break;
1228                    }
1229                    offset += SIMD_STRIDE_SIZE;
1230                    if offset > len_minus_stride {
1231                        break;
1232                    }
1233                }
1234            }
1235            while offset < len {
1236                let code_unit = slice[offset];
1237                if code_unit > 127 {
1238                    return Some((code_unit, offset));
1239                }
1240                offset += 1;
1241            }
1242            None
1243        }
1244    } else if #[cfg(all(feature = "simd-accel", target_feature = "sse2"))] {
1245        #[inline(always)]
1246        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
1247            let src = slice.as_ptr();
1248            let len = slice.len();
1249            let mut offset = 0usize;
1250            if SIMD_STRIDE_SIZE <= len {
1251                // First, process one unaligned vector
1252                let simd = unsafe { load16_unaligned(src) };
1253                let mask = mask_ascii(simd);
1254                if mask != 0 {
1255                    offset = mask.trailing_zeros() as usize;
1256                    let non_ascii = unsafe { *src.add(offset) };
1257                    return Some((non_ascii, offset));
1258                }
1259                offset = SIMD_STRIDE_SIZE;
1260
1261                // We have now seen 16 ASCII bytes. Let's guess that
1262                // there will be enough more to justify more expense
1263                // in the case of non-ASCII.
1264                // Use aligned reads for the sake of old microachitectures.
1265                let until_alignment = unsafe { (SIMD_ALIGNMENT - ((src.add(offset) as usize) & SIMD_ALIGNMENT_MASK)) & SIMD_ALIGNMENT_MASK };
1266                // This addition won't overflow, because even in the 32-bit PAE case the
1267                // address space holds enough code that the slice length can't be that
1268                // close to address space size.
1269                // offset now equals SIMD_STRIDE_SIZE, hence times 3 below.
1270                if until_alignment + (SIMD_STRIDE_SIZE * 3) <= len {
1271                    if until_alignment != 0 {
1272                        let simd = unsafe { load16_unaligned(src.add(offset)) };
1273                        let mask = mask_ascii(simd);
1274                        if mask != 0 {
1275                            offset += mask.trailing_zeros() as usize;
1276                            let non_ascii = unsafe { *src.add(offset) };
1277                            return Some((non_ascii, offset));
1278                        }
1279                        offset += until_alignment;
1280                    }
1281                    let len_minus_stride_times_two = len - (SIMD_STRIDE_SIZE * 2);
1282                    loop {
1283                        let first = unsafe { load16_aligned(src.add(offset)) };
1284                        let second = unsafe { load16_aligned(src.add(offset + SIMD_STRIDE_SIZE)) };
1285                        if !simd_is_ascii(first | second) {
1286                            let mask_first = mask_ascii(first);
1287                            if mask_first != 0 {
1288                                offset += mask_first.trailing_zeros() as usize;
1289                            } else {
1290                                let mask_second = mask_ascii(second);
1291                                offset += SIMD_STRIDE_SIZE + mask_second.trailing_zeros() as usize;
1292                            }
1293                            let non_ascii = unsafe { *src.add(offset) };
1294                            return Some((non_ascii, offset));
1295                        }
1296                        offset += SIMD_STRIDE_SIZE * 2;
1297                        if offset > len_minus_stride_times_two {
1298                            break;
1299                        }
1300                    }
1301                    if offset + SIMD_STRIDE_SIZE <= len {
1302                         let simd = unsafe { load16_aligned(src.add(offset)) };
1303                         let mask = mask_ascii(simd);
1304                        if mask != 0 {
1305                            offset += mask.trailing_zeros() as usize;
1306                            let non_ascii = unsafe { *src.add(offset) };
1307                            return Some((non_ascii, offset));
1308                        }
1309                        offset += SIMD_STRIDE_SIZE;
1310                    }
1311                } else {
1312                    // At most two iterations, so unroll
1313                    if offset + SIMD_STRIDE_SIZE <= len {
1314                        let simd = unsafe { load16_unaligned(src.add(offset)) };
1315                        let mask = mask_ascii(simd);
1316                        if mask != 0 {
1317                            offset += mask.trailing_zeros() as usize;
1318                            let non_ascii = unsafe { *src.add(offset) };
1319                            return Some((non_ascii, offset));
1320                        }
1321                        offset += SIMD_STRIDE_SIZE;
1322                        if offset + SIMD_STRIDE_SIZE <= len {
1323                             let simd = unsafe { load16_unaligned(src.add(offset)) };
1324                             let mask = mask_ascii(simd);
1325                            if mask != 0 {
1326                                offset += mask.trailing_zeros() as usize;
1327                                let non_ascii = unsafe { *src.add(offset) };
1328                                return Some((non_ascii, offset));
1329                            }
1330                            offset += SIMD_STRIDE_SIZE;
1331                        }
1332                    }
1333                }
1334            }
1335            while offset < len {
1336                let code_unit = unsafe { *(src.add(offset)) };
1337                if code_unit > 127 {
1338                    return Some((code_unit, offset));
1339                }
1340                offset += 1;
1341            }
1342            None
1343        }
1344    } else {
1345        #[inline(always)]
1346        fn find_non_ascii(word: usize, second_word: usize) -> Option<usize> {
1347            let word_masked = word & ASCII_MASK;
1348            let second_masked = second_word & ASCII_MASK;
1349            if (word_masked | second_masked) == 0 {
1350                return None;
1351            }
1352            if word_masked != 0 {
1353                let zeros = count_zeros(word_masked);
1354                // `zeros` now contains 7 (for the seven bits of non-ASCII)
1355                // plus 8 times the number of ASCII in text order before the
1356                // non-ASCII byte in the little-endian case or 8 times the number of ASCII in
1357                // text order before the non-ASCII byte in the big-endian case.
1358                let num_ascii = (zeros >> 3) as usize;
1359                return Some(num_ascii);
1360            }
1361            let zeros = count_zeros(second_masked);
1362            // `zeros` now contains 7 (for the seven bits of non-ASCII)
1363            // plus 8 times the number of ASCII in text order before the
1364            // non-ASCII byte in the little-endian case or 8 times the number of ASCII in
1365            // text order before the non-ASCII byte in the big-endian case.
1366            let num_ascii = (zeros >> 3) as usize;
1367            Some(ALU_ALIGNMENT + num_ascii)
1368        }
1369
1370        #[inline(always)]
1371        unsafe fn validate_ascii_stride(src: *const usize) -> Option<usize> {
1372            let word = *src;
1373            let second_word = *(src.add(1));
1374            find_non_ascii(word, second_word)
1375        }
1376
        /// Validates that `slice` contains only ASCII bytes.
        ///
        /// Returns `None` when the whole slice is ASCII, or
        /// `Some((byte, index))` for the first non-ASCII byte found.
        #[cfg_attr(feature = "cargo-clippy", allow(cast_ptr_alignment))]
        #[inline(always)]
        pub fn validate_ascii(slice: &[u8]) -> Option<(u8, usize)> {
            let src = slice.as_ptr();
            let len = slice.len();
            let mut offset = 0usize;
            // Bytes needed to reach an `ALU_ALIGNMENT`-aligned address
            // (zero when `src` is already aligned).
            let mut until_alignment = (ALU_ALIGNMENT - ((src as usize) & ALU_ALIGNMENT_MASK)) & ALU_ALIGNMENT_MASK;
            // Take the word-at-a-time path only if at least one full
            // stride remains after the alignment prefix.
            if until_alignment + ALU_STRIDE_SIZE <= len {
                // Check the unaligned prefix byte by byte.
                while until_alignment != 0 {
                    let code_unit = slice[offset];
                    if code_unit > 127 {
                        return Some((code_unit, offset));
                    }
                    offset += 1;
                    until_alignment -= 1;
                }
                let len_minus_stride = len - ALU_STRIDE_SIZE;
                loop {
                    // `offset` is word-aligned here and at least
                    // `ALU_STRIDE_SIZE` bytes remain, so the two aligned
                    // word reads in the stride helper stay in bounds.
                    let ptr = unsafe { src.add(offset) as *const usize };
                    if let Some(num_ascii) = unsafe { validate_ascii_stride(ptr) } {
                        // The stride helper returns the in-stride index of
                        // the first non-ASCII byte.
                        offset += num_ascii;
                        return Some((unsafe { *(src.add(offset)) }, offset));
                    }
                    offset += ALU_STRIDE_SIZE;
                    if offset > len_minus_stride {
                        break;
                    }
                }
            }
            // Byte-wise check for the tail (and for short slices that
            // never entered the stride loop).
            while offset < len {
                let code_unit = slice[offset];
                if code_unit > 127 {
                    return Some((code_unit, offset));
                }
                offset += 1;
           }
           None
        }
1415
1416    }
1417}
1418
1419cfg_if! {
1420    if #[cfg(all(feature = "simd-accel", any(target_feature = "sse2", all(target_endian = "little", target_arch = "aarch64"))))] {
1421
1422    } else if #[cfg(all(feature = "simd-accel", target_endian = "little", target_feature = "neon"))] {
1423        // Even with NEON enabled, we use the ALU path for ASCII validation, because testing
1424        // on Exynos 5 indicated that using NEON isn't worthwhile where there are only
1425        // vector reads without vector writes.
1426
        // One stride is two ALU words.
        pub const ALU_STRIDE_SIZE: usize = 8;

        // Word size in bytes (32-bit `usize` on this target).
        pub const ALU_ALIGNMENT: usize = 4;

        // `ALU_ALIGNMENT - 1`, for masking off the low address bits.
        pub const ALU_ALIGNMENT_MASK: usize = 3;
1432    } else {
1433        #[inline(always)]
1434        unsafe fn unpack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
1435            let word = *src;
1436            let second_word = *(src.add(1));
1437            unpack_alu(word, second_word, dst);
1438        }
1439
1440        #[inline(always)]
1441        unsafe fn pack_latin1_stride_alu(src: *const usize, dst: *mut usize) {
1442            let first = *src;
1443            let second = *(src.add(1));
1444            let third = *(src.add(2));
1445            let fourth = *(src.add(3));
1446            pack_alu(first, second, third, fourth, dst);
1447        }
1448
1449        #[inline(always)]
1450        unsafe fn ascii_to_basic_latin_stride_alu(src: *const usize, dst: *mut usize) -> bool {
1451            let word = *src;
1452            let second_word = *(src.add(1));
1453            // Check if the words contains non-ASCII
1454            if (word & ASCII_MASK) | (second_word & ASCII_MASK) != 0 {
1455                return false;
1456            }
1457            unpack_alu(word, second_word, dst);
1458            true
1459        }
1460
1461        #[inline(always)]
1462        unsafe fn basic_latin_to_ascii_stride_alu(src: *const usize, dst: *mut usize) -> bool {
1463            let first = *src;
1464            let second = *(src.add(1));
1465            let third = *(src.add(2));
1466            let fourth = *(src.add(3));
1467            if (first & BASIC_LATIN_MASK) | (second & BASIC_LATIN_MASK) | (third & BASIC_LATIN_MASK) | (fourth & BASIC_LATIN_MASK) != 0 {
1468                return false;
1469            }
1470            pack_alu(first, second, third, fourth, dst);
1471            true
1472        }
1473
1474        #[inline(always)]
1475        unsafe fn ascii_to_ascii_stride(src: *const usize, dst: *mut usize) -> Option<usize> {
1476            let word = *src;
1477            let second_word = *(src.add(1));
1478            *dst = word;
1479            *(dst.add(1)) = second_word;
1480            find_non_ascii(word, second_word)
1481        }
1482
        // Instantiate the public conversion entry points from the
        // per-stride ALU helpers above (the macros are defined elsewhere
        // in this crate).
        basic_latin_alu!(ascii_to_basic_latin, u8, u16, ascii_to_basic_latin_stride_alu);
        basic_latin_alu!(basic_latin_to_ascii, u16, u8, basic_latin_to_ascii_stride_alu);
        latin1_alu!(unpack_latin1, u8, u16, unpack_latin1_stride_alu);
        latin1_alu!(pack_latin1, u16, u8, pack_latin1_stride_alu);
        ascii_alu!(ascii_to_ascii, u8, u8, ascii_to_ascii_stride);
1488    }
1489}
1490
1491pub fn ascii_valid_up_to(bytes: &[u8]) -> usize {
1492    match validate_ascii(bytes) {
1493        None => bytes.len(),
1494        Some((_, num_valid)) => num_valid,
1495    }
1496}
1497
/// Returns the length of the longest prefix of `bytes` that is plain
/// ASCII and free of the ISO-2022-JP state-transition bytes
/// ESC (0x1B), SO (0x0E) and SI (0x0F).
pub fn iso_2022_jp_ascii_valid_up_to(bytes: &[u8]) -> usize {
    bytes
        .iter()
        .position(|&b| b >= 0x80 || b == 0x1B || b == 0x0E || b == 0x0F)
        .unwrap_or(bytes.len())
}
1507
1508// Any copyright to the test code below this comment is dedicated to the
1509// Public Domain. http://creativecommons.org/publicdomain/zero/1.0/
1510
#[cfg(test)]
mod tests {
    use super::*;

    // Generates a test that, for each position i in a 32-unit buffer,
    // fills the buffer with ASCII (0x40 + j) except for a single
    // non-ASCII unit (0xAA) at position i, and then checks that the
    // function under test reports exactly (0xAA, i) and has converted
    // the ASCII prefix into `dst` correctly.
    macro_rules! test_ascii {
        ($test_name:ident, $fn_tested:ident, $src_unit:ty, $dst_unit:ty) => {
            #[test]
            fn $test_name() {
                let mut src: Vec<$src_unit> = Vec::with_capacity(32);
                let mut dst: Vec<$dst_unit> = Vec::with_capacity(32);
                for i in 0..32 {
                    src.clear();
                    dst.clear();
                    dst.resize(32, 0);
                    // Build the input: one 0xAA at position i, ASCII
                    // everywhere else.
                    for j in 0..32 {
                        let c = if i == j { 0xAA } else { j + 0x40 };
                        src.push(c as $src_unit);
                    }
                    match unsafe { $fn_tested(src.as_ptr(), dst.as_mut_ptr(), 32) } {
                        None => unreachable!("Should always find non-ASCII"),
                        Some((non_ascii, num_ascii)) => {
                            assert_eq!(non_ascii, 0xAA);
                            assert_eq!(num_ascii, i);
                            // The ASCII prefix must have been written out.
                            for j in 0..i {
                                assert_eq!(dst[j], (j + 0x40) as $dst_unit);
                            }
                        }
                    }
                }
            }
        };
    }

    test_ascii!(test_ascii_to_ascii, ascii_to_ascii, u8, u8);
    test_ascii!(test_ascii_to_basic_latin, ascii_to_basic_latin, u8, u16);
    test_ascii!(test_basic_latin_to_ascii, basic_latin_to_ascii, u16, u8);
}