// vsimd/simd256.rs

use crate::isa::{AVX2, NEON, SSE2, WASM128};
use crate::vector::{V128, V256};
use crate::{unified, SIMD128};

#[cfg(any(
    any(target_arch = "x86", target_arch = "x86_64"),
    any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"),
    target_arch = "wasm32"
))]
use core::mem::transmute as t;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(all(feature = "unstable", target_arch = "arm"))]
use core::arch::arm::*;

#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;

#[cfg(target_arch = "wasm32")]
use core::arch::wasm32::*;

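/// Applies a 128-bit lane operation to both halves of a 256-bit vector.
///
/// `$s` is the ISA token and `$f` is a function taking `$s` plus one to three
/// `V128` arguments; each `V256` argument is split with `to_v128x2`, `$f` is
/// applied to the low and high halves separately, and the two results are
/// recombined with `V256::from_v128x2`.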
#[macro_export]
macro_rules! simd256_vop {
    ($s:expr, $f:expr, $a:expr) => {{
        let s = $s;
        let f = $f;
        let a = $a.to_v128x2();
        let b = (f(s, a.0), f(s, a.1));
        V256::from_v128x2(b)
    }};
    ($s:expr, $f:expr, $a:expr, $b:expr) => {{
        let s = $s;
        let f = $f;
        let a = $a.to_v128x2();
        let b = $b.to_v128x2();
        let c = (f(s, a.0, b.0), f(s, a.1, b.1));
        V256::from_v128x2(c)
    }};
    ($s:expr, $f:expr, $a:expr, $b:expr, $c:expr) => {{
        let s = $s;
        let f = $f;
        let a = $a.to_v128x2();
        let b = $b.to_v128x2();
        let c = $c.to_v128x2();
        let d = (f(s, a.0, b.0, c.0), f(s, a.1, b.1, c.1));
        V256::from_v128x2(d)
    }};
}

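/// 256-bit vector operations, layered on top of [`SIMD128`].
///
/// Each method dispatches to a native 256-bit implementation (AVX2, or paired
/// NEON registers) when `matches_isa!` reports support, and otherwise falls
/// back to applying the corresponding 128-bit operation to both halves.
///
/// # Safety
///
/// This trait is `unsafe`; implementors are presumably expected to guarantee
/// that the instruction sets matched via `matches_isa!` are actually available.
///
/// # Example
///
/// An illustrative sketch only (not part of the original API): a generic
/// helper written against methods of this trait. The name `xor_block` is
/// hypothetical, and obtaining a concrete ISA token is outside this module.
///
/// ```ignore
/// unsafe fn xor_block<S: SIMD256>(s: S, dst: *mut u8, src: *const u8, mask: V256) {
///     // Load 32 bytes, XOR them with the mask, store the result back.
///     let x = s.v256_load_unaligned(src);
///     s.v256_store_unaligned(dst, s.v256_xor(x, mask));
/// }
/// ```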
pub unsafe trait SIMD256: SIMD128 {
    #[inline(always)]
    unsafe fn v256_load(self, addr: *const u8) -> V256 {
        debug_assert_ptr_align!(addr, 32);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return t(_mm256_load_si256(addr.cast()));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return t(vld1q_u8_x2(addr.cast()));
        }
        {
            let x0 = self.v128_load(addr);
            let x1 = self.v128_load(addr.add(16));
            V256::from_v128x2((x0, x1))
        }
    }

    #[inline(always)]
    unsafe fn v256_load_unaligned(self, addr: *const u8) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return t(_mm256_loadu_si256(addr.cast()));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return t(vld1q_u8_x2(addr.cast()));
        }
        {
            let x0 = self.v128_load_unaligned(addr);
            let x1 = self.v128_load_unaligned(addr.add(16));
            V256::from_v128x2((x0, x1))
        }
    }

    #[inline(always)]
    unsafe fn v256_store(self, addr: *mut u8, a: V256) {
        debug_assert_ptr_align!(addr, 32);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return _mm256_store_si256(addr.cast(), t(a));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return vst1q_u8_x2(addr.cast(), t(a));
        }
        {
            let a = a.to_v128x2();
            self.v128_store(addr, a.0);
            self.v128_store(addr.add(16), a.1);
        }
    }

    #[inline(always)]
    unsafe fn v256_store_unaligned(self, addr: *mut u8, a: V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return _mm256_storeu_si256(addr.cast(), t(a));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return vst1q_u8_x2(addr.cast(), t(a));
        }
        {
            let a = a.to_v128x2();
            self.v128_store_unaligned(addr, a.0);
            self.v128_store_unaligned(addr.add(16), a.1);
        }
    }

    #[inline(always)]
    fn v256_create_zero(self) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_setzero_si256()) };
        }
        {
            self.v128_create_zero().x2()
        }
    }

    #[inline(always)]
    fn v256_not(self, a: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            return self.v256_xor(a, self.u8x32_eq(a, a));
        }
        {
            simd256_vop!(self, Self::v128_not, a)
        }
    }

    #[inline(always)]
    fn v256_and(self, a: V256, b: V256) -> V256 {
        unified::and(self, a, b)
    }

    #[inline(always)]
    fn v256_or(self, a: V256, b: V256) -> V256 {
        unified::or(self, a, b)
    }

    #[inline(always)]
    fn v256_xor(self, a: V256, b: V256) -> V256 {
        unified::xor(self, a, b)
    }

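    /// Bitwise and-not: `a & !b` for every bit.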
    #[inline(always)]
    fn v256_andnot(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_andnot_si256(t(b), t(a))) };
        }
        {
            simd256_vop!(self, Self::v128_andnot, a, b)
        }
    }

    #[inline(always)]
    fn v256_all_zero(self, a: V256) -> bool {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe {
                let a = t(a);
                _mm256_testz_si256(a, a) != 0
            };
        }
        {
            let a = a.to_v128x2();
            self.v128_all_zero(self.v128_or(a.0, a.1))
        }
    }

    #[inline(always)]
    fn u8x32_splat(self, x: u8) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn u16x16_splat(self, x: u16) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn u32x8_splat(self, x: u32) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn u64x4_splat(self, x: u64) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn i8x32_splat(self, x: i8) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn i16x16_splat(self, x: i16) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn i32x8_splat(self, x: i32) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn i64x4_splat(self, x: i64) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn u8x32_add(self, a: V256, b: V256) -> V256 {
        unified::add::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_add(self, a: V256, b: V256) -> V256 {
        unified::add::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_add(self, a: V256, b: V256) -> V256 {
        unified::add::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn u64x4_add(self, a: V256, b: V256) -> V256 {
        unified::add::<_, u64, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_sub(self, a: V256, b: V256) -> V256 {
        unified::sub::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_sub(self, a: V256, b: V256) -> V256 {
        unified::sub::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_sub(self, a: V256, b: V256) -> V256 {
        unified::sub::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn u64x4_sub(self, a: V256, b: V256) -> V256 {
        unified::sub::<_, u64, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_sub_sat(self, a: V256, b: V256) -> V256 {
        unified::sub_sat::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_sub_sat(self, a: V256, b: V256) -> V256 {
        unified::sub_sat::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn i8x32_sub_sat(self, a: V256, b: V256) -> V256 {
        unified::sub_sat::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn i16x16_sub_sat(self, a: V256, b: V256) -> V256 {
        unified::sub_sat::<_, i16, _>(self, a, b)
    }

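    /// Lane-wise 16-bit multiplication, keeping the low 16 bits of each
    /// 32-bit intermediate product (wrapping multiply).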
    #[inline(always)]
    fn i16x16_mul_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_mullo_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i16x8_mul_lo, a, b)
        }
    }

    #[inline(always)]
    fn i32x8_mul_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_mullo_epi32(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i32x4_mul_lo, a, b)
        }
    }

    #[inline(always)]
    fn u16x16_shl<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_slli_epi16::<IMM8>(t(a))) };
        }
        {
            simd256_vop!(self, Self::u16x8_shl::<IMM8>, a)
        }
    }

    #[inline(always)]
    fn u32x8_shl<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_slli_epi32::<IMM8>(t(a))) };
        }
        {
            simd256_vop!(self, Self::u32x4_shl::<IMM8>, a)
        }
    }

    #[inline(always)]
    fn u16x16_shr<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_srli_epi16::<IMM8>(t(a))) };
        }
        {
            simd256_vop!(self, Self::u16x8_shr::<IMM8>, a)
        }
    }

    #[inline(always)]
    fn u32x8_shr<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_srli_epi32::<IMM8>(t(a))) };
        }
        {
            simd256_vop!(self, Self::u32x4_shr::<IMM8>, a)
        }
    }

    #[inline(always)]
    fn u8x32_eq(self, a: V256, b: V256) -> V256 {
        unified::eq::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_eq(self, a: V256, b: V256) -> V256 {
        unified::eq::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_eq(self, a: V256, b: V256) -> V256 {
        unified::eq::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn i8x32_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn i16x16_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, i16, _>(self, a, b)
    }

    #[inline(always)]
    fn i32x8_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, i32, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn i8x32_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn i16x16_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, i16, _>(self, a, b)
    }

    #[inline(always)]
    fn i32x8_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, i32, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn i8x32_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn i16x16_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, i16, _>(self, a, b)
    }

    #[inline(always)]
    fn i32x8_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, i32, _>(self, a, b)
    }

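    /// Byte table lookup performed independently within each 128-bit half:
    /// each half of `b` indexes into the corresponding half of `a`
    /// (this is not a cross-lane shuffle).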
    #[inline(always)]
    fn u8x16x2_swizzle(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_shuffle_epi8(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u8x16_swizzle, a, b)
        }
    }

    #[inline(always)]
    fn u16x16_bswap(self, a: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            return self.u8x16x2_swizzle(a, crate::bswap::SHUFFLE_U16X16);
        }
        {
            simd256_vop!(self, Self::u16x8_bswap, a)
        }
    }

    #[inline(always)]
    fn u32x8_bswap(self, a: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            return self.u8x16x2_swizzle(a, crate::bswap::SHUFFLE_U32X8);
        }
        {
            simd256_vop!(self, Self::u32x4_bswap, a)
        }
    }

    #[inline(always)]
    fn u64x4_bswap(self, a: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            return self.u8x16x2_swizzle(a, crate::bswap::SHUFFLE_U64X4);
        }
        {
            simd256_vop!(self, Self::u64x2_bswap, a)
        }
    }

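    /// Byte lookup across the full 32-byte table `a`, indexed by `b`.
    /// Only the AArch64 NEON path (`vqtbl2q_u8`) is implemented here;
    /// the other backends panic.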
    #[inline(always)]
    fn u8x32_swizzle(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, SSE2 | WASM128) {
            let _ = (a, b);
            unimplemented!()
        }
        #[cfg(all(feature = "unstable", target_arch = "arm"))]
        if matches_isa!(Self, NEON) {
            let _ = (a, b);
            unimplemented!()
        }
        #[cfg(target_arch = "aarch64")]
        if matches_isa!(Self, NEON) {
            return unsafe {
                let (a, b): (uint8x16x2_t, uint8x16x2_t) = (t(a), t(b));
                let c = (vqtbl2q_u8(a, b.0), vqtbl2q_u8(a, b.1));
                t(uint8x16x2_t(c.0, c.1))
            };
        }
        {
            let _ = (a, b);
            unreachable!()
        }
    }

    #[inline(always)]
    fn u8x32_any_zero(self, a: V256) -> bool {
        if matches_isa!(Self, AVX2) {
            let is_zero = self.u8x32_eq(a, self.v256_create_zero());
            return self.u8x32_bitmask(is_zero) != 0;
        }
        {
            let a = a.to_v128x2();
            self.u8x16_any_zero(self.u8x16_min(a.0, a.1))
        }
    }

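    /// Collects the most significant bit of each byte into a 32-bit mask
    /// (bit `i` of the result comes from byte `i` of `a`).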
    #[inline(always)]
    fn u8x32_bitmask(self, a: V256) -> u32 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { _mm256_movemask_epi8(t(a)) as u32 };
        }
        {
            let a = a.to_v128x2();
            let m0 = self.u8x16_bitmask(a.0) as u32;
            let m1 = self.u8x16_bitmask(a.1) as u32;
            (m1 << 16) | m0
        }
    }

    #[inline(always)]
    fn u8x32_reduce_max(self, a: V256) -> u8 {
        let a = a.to_v128x2();
        self.u8x16_reduce_max(self.u8x16_max(a.0, a.1))
    }

    #[inline(always)]
    fn u8x32_reduce_min(self, a: V256) -> u8 {
        let a = a.to_v128x2();
        self.u8x16_reduce_min(self.u8x16_min(a.0, a.1))
    }

    /// for each bit: if a == 1 { b } else { c }
    ///
    /// ans = ((b ^ c) & a) ^ c
    #[inline(always)]
    fn v256_bsl(self, a: V256, b: V256, c: V256) -> V256 {
        if matches_isa!(Self, NEON) {
            return simd256_vop!(self, Self::v128_bsl, a, b, c);
        }
        {
            self.v256_xor(self.v256_and(self.v256_xor(b, c), a), c)
        }
    }

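    /// Zero-extends each of the 16 `u8` lanes of `a` to a `u16` lane.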
    #[inline(always)]
    fn u16x16_from_u8x16(self, a: V128) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_cvtepu8_epi16(t(a))) };
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, SSE2) {
            let zero = self.v128_create_zero();
            let lo = self.u8x16_zip_lo(a, zero);
            let hi = self.u8x16_zip_hi(a, zero);
            return V256::from_v128x2((lo, hi));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return unsafe {
                let a = t(a);
                let low = vmovl_u8(vget_low_u8(a));
                let high = vmovl_u8(vget_high_u8(a));
                t(uint16x8x2_t(low, high))
            };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(Self, WASM128) {
            return unsafe {
                let a = t(a);
                let low = t(u16x8_extend_low_u8x16(a));
                let high = t(u16x8_extend_high_u8x16(a));
                V256::from_v128x2((low, high))
            };
        }
        {
            let _ = a;
            unreachable!()
        }
    }

    #[inline(always)]
    fn u8x16x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpacklo_epi8(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u8x16_zip_lo, a, b)
        }
    }

    #[inline(always)]
    fn u8x16x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpackhi_epi8(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u8x16_zip_hi, a, b)
        }
    }

    #[inline(always)]
    fn u16x8x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpacklo_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u16x8_zip_lo, a, b)
        }
    }

    #[inline(always)]
    fn u16x8x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpackhi_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u16x8_zip_hi, a, b)
        }
    }

    #[inline(always)]
    fn u32x4x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpacklo_epi32(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u32x4_zip_lo, a, b)
        }
    }

    #[inline(always)]
    fn u32x4x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpackhi_epi32(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u32x4_zip_hi, a, b)
        }
    }

    #[inline(always)]
    fn u64x2x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpacklo_epi64(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u64x2_zip_lo, a, b)
        }
    }

    #[inline(always)]
    fn u64x2x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpackhi_epi64(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u64x2_zip_hi, a, b)
        }
    }

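    /// Concatenates the low 128-bit halves of `a` and `b` into one vector.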
    #[inline(always)]
    fn v128x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_permute2x128_si256::<0b0010_0000>(t(a), t(b))) };
        }
        if matches_isa!(Self, SSE2 | NEON | WASM128) {
            let ((a, _), (c, _)) = (a.to_v128x2(), b.to_v128x2());
            return V256::from_v128x2((a, c));
        }
        {
            unreachable!()
        }
    }

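    /// Concatenates the high 128-bit halves of `a` and `b` into one vector.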
    #[inline(always)]
    fn v128x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_permute2x128_si256::<0b0011_0001>(t(a), t(b))) };
        }
        if matches_isa!(Self, SSE2 | NEON | WASM128) {
            let ((_, b), (_, d)) = (a.to_v128x2(), b.to_v128x2());
            return V256::from_v128x2((b, d));
        }
        {
            unreachable!()
        }
    }

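    /// Permutes the four 64-bit lanes of `a`; each pair of bits in `IMM8`
    /// selects the source lane for the corresponding output lane.
    /// Implemented only for AVX2; other backends panic.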
    #[inline(always)]
    fn u64x4_permute<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_permute4x64_epi64::<IMM8>(t(a))) };
        }
        if matches_isa!(Self, SSE2 | NEON | WASM128) {
            let _ = a;
            unimplemented!()
        }
        {
            let _ = a;
            unreachable!()
        }
    }

    #[inline(always)]
    fn u8x32_unzip_even(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, SSE2) {
            unimplemented!()
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u8x16_unzip_even(a, b);
            let cd = self.u8x16_unzip_even(c, d);
            V256::from_v128x2((ab, cd))
        }
    }

    #[inline(always)]
    fn u8x32_unzip_odd(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, SSE2) {
            unimplemented!()
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u8x16_unzip_odd(a, b);
            let cd = self.u8x16_unzip_odd(c, d);
            V256::from_v128x2((ab, cd))
        }
    }

    #[inline(always)]
    fn u64x4_unzip_even(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            let acbd = self.u64x2x2_zip_lo(a, b);
            let abcd = self.u64x4_permute::<0b_1101_1000>(acbd); // 0213
            return abcd;
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u64x2_zip_lo(a, b);
            let cd = self.u64x2_zip_lo(c, d);
            V256::from_v128x2((ab, cd))
        }
    }

    #[inline(always)]
    fn u64x4_unzip_odd(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            let acbd = self.u64x2x2_zip_hi(a, b);
            let abcd = self.u64x4_permute::<0b_1101_1000>(acbd); // 0213
            return abcd;
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u64x2_zip_hi(a, b);
            let cd = self.u64x2_zip_hi(c, d);
            V256::from_v128x2((ab, cd))
        }
    }

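    /// Lane-wise unsigned 16-bit multiplication, keeping the high 16 bits of
    /// each 32-bit intermediate product.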
    #[inline(always)]
    fn u16x16_mul_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_mulhi_epu16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u16x8_mul_hi, a, b)
        }
    }

    #[inline(always)]
    fn i16x16_mul_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_mulhi_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i16x8_mul_hi, a, b)
        }
    }

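    /// Multiplies unsigned bytes of `a` by the corresponding signed bytes of
    /// `b`, then adds adjacent pairs of products into saturated `i16` lanes
    /// (the semantics of `_mm256_maddubs_epi16`).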
    #[inline(always)]
    fn i16x16_maddubs(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_maddubs_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i16x8_maddubs, a, b)
        }
    }

    #[inline(always)]
    fn u32x8_blend<const IMM8: i32>(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_blend_epi32::<IMM8>(t(a), t(b))) };
        }
        if matches_isa!(Self, NEON | WASM128) {
            unimplemented!()
        }
        {
            let _ = (a, b);
            unreachable!()
        }
    }

    /// if highbit(c) { b } else { a }
    #[inline(always)]
    fn u8x32_blendv(self, a: V256, b: V256, c: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_blendv_epi8(t(a), t(b), t(c))) };
        }
        if matches_isa!(Self, NEON | WASM128) {
            unimplemented!()
        }
        {
            simd256_vop!(self, Self::u8x16_blendv, a, b, c)
        }
    }

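    /// Multiplies corresponding signed 16-bit lanes and adds adjacent pairs
    /// of 32-bit products into `i32` lanes (the semantics of
    /// `_mm256_madd_epi16`).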
    #[inline(always)]
    fn i16x16_madd(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_madd_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i16x8_madd, a, b)
        }
    }

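    /// Rounding average of unsigned bytes: `(a + b + 1) >> 1` per lane.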
    #[inline(always)]
    fn u8x32_avgr(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_avg_epu8(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u8x16_avgr, a, b)
        }
    }

    #[inline(always)]
    fn i8x32_add_sat(self, a: V256, b: V256) -> V256 {
        unified::add_sat::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_add_sat(self, a: V256, b: V256) -> V256 {
        unified::add_sat::<_, u8, _>(self, a, b)
    }
}