use crate::isa::{AVX2, NEON, SSE2, WASM128};
use crate::vector::{V128, V256};
use crate::{unified, SIMD128};

#[cfg(any(
    any(target_arch = "x86", target_arch = "x86_64"),
    any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"),
    target_arch = "wasm32"
))]
use core::mem::transmute as t;

#[cfg(target_arch = "x86")]
use core::arch::x86::*;

#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::*;

#[cfg(all(feature = "unstable", target_arch = "arm"))]
use core::arch::arm::*;

#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::*;

#[cfg(target_arch = "wasm32")]
use core::arch::wasm32::*;

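/// Applies a 128-bit lane operation to both halves of one or more `V256`
/// values and recombines the per-half results into a new `V256`.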
#[macro_export]
macro_rules! simd256_vop {
    ($s:expr, $f:expr, $a:expr) => {{
        let s = $s;
        let f = $f;
        let a = $a.to_v128x2();
        let b = (f(s, a.0), f(s, a.1));
        V256::from_v128x2(b)
    }};
    ($s:expr, $f:expr, $a:expr, $b:expr) => {{
        let s = $s;
        let f = $f;
        let a = $a.to_v128x2();
        let b = $b.to_v128x2();
        let c = (f(s, a.0, b.0), f(s, a.1, b.1));
        V256::from_v128x2(c)
    }};
    ($s:expr, $f:expr, $a:expr, $b:expr, $c:expr) => {{
        let s = $s;
        let f = $f;
        let a = $a.to_v128x2();
        let b = $b.to_v128x2();
        let c = $c.to_v128x2();
        let d = (f(s, a.0, b.0, c.0), f(s, a.1, b.1, c.1));
        V256::from_v128x2(d)
    }};
}

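/// 256-bit SIMD operations.
///
/// Uses native AVX2 instructions where available; other backends emulate each
/// operation on the two 128-bit halves of a `V256`.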
pub unsafe trait SIMD256: SIMD128 {
    #[inline(always)]
    unsafe fn v256_load(self, addr: *const u8) -> V256 {
        debug_assert_ptr_align!(addr, 32);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return t(_mm256_load_si256(addr.cast()));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return t(vld1q_u8_x2(addr.cast()));
        }
        {
            let x0 = self.v128_load(addr);
            let x1 = self.v128_load(addr.add(16));
            V256::from_v128x2((x0, x1))
        }
    }

    #[inline(always)]
    unsafe fn v256_load_unaligned(self, addr: *const u8) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return t(_mm256_loadu_si256(addr.cast()));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return t(vld1q_u8_x2(addr.cast()));
        }
        {
            let x0 = self.v128_load_unaligned(addr);
            let x1 = self.v128_load_unaligned(addr.add(16));
            V256::from_v128x2((x0, x1))
        }
    }

    #[inline(always)]
    unsafe fn v256_store(self, addr: *mut u8, a: V256) {
        debug_assert_ptr_align!(addr, 32);

        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return _mm256_store_si256(addr.cast(), t(a));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return vst1q_u8_x2(addr.cast(), t(a));
        }
        {
            let a = a.to_v128x2();
            self.v128_store(addr, a.0);
            self.v128_store(addr.add(16), a.1);
        }
    }

    #[inline(always)]
    unsafe fn v256_store_unaligned(self, addr: *mut u8, a: V256) {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return _mm256_storeu_si256(addr.cast(), t(a));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return vst1q_u8_x2(addr.cast(), t(a));
        }
        {
            let a = a.to_v128x2();
            self.v128_store_unaligned(addr, a.0);
            self.v128_store_unaligned(addr.add(16), a.1);
        }
    }

    #[inline(always)]
    fn v256_create_zero(self) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_setzero_si256()) };
        }
        {
            self.v128_create_zero().x2()
        }
    }

    #[inline(always)]
    fn v256_not(self, a: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            return self.v256_xor(a, self.u8x32_eq(a, a));
        }
        {
            simd256_vop!(self, Self::v128_not, a)
        }
    }

    #[inline(always)]
    fn v256_and(self, a: V256, b: V256) -> V256 {
        unified::and(self, a, b)
    }

    #[inline(always)]
    fn v256_or(self, a: V256, b: V256) -> V256 {
        unified::or(self, a, b)
    }

    #[inline(always)]
    fn v256_xor(self, a: V256, b: V256) -> V256 {
        unified::xor(self, a, b)
    }

    #[inline(always)]
    fn v256_andnot(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_andnot_si256(t(b), t(a))) };
        }
        {
            simd256_vop!(self, Self::v128_andnot, a, b)
        }
    }

    #[inline(always)]
    fn v256_all_zero(self, a: V256) -> bool {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe {
                let a = t(a);
                _mm256_testz_si256(a, a) != 0
            };
        }
        {
            let a = a.to_v128x2();
            self.v128_all_zero(self.v128_or(a.0, a.1))
        }
    }

    #[inline(always)]
    fn u8x32_splat(self, x: u8) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn u16x16_splat(self, x: u16) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn u32x8_splat(self, x: u32) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn u64x4_splat(self, x: u64) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn i8x32_splat(self, x: i8) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn i16x16_splat(self, x: i16) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn i32x8_splat(self, x: i32) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn i64x4_splat(self, x: i64) -> V256 {
        unified::splat(self, x)
    }

    #[inline(always)]
    fn u8x32_add(self, a: V256, b: V256) -> V256 {
        unified::add::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_add(self, a: V256, b: V256) -> V256 {
        unified::add::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_add(self, a: V256, b: V256) -> V256 {
        unified::add::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn u64x4_add(self, a: V256, b: V256) -> V256 {
        unified::add::<_, u64, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_sub(self, a: V256, b: V256) -> V256 {
        unified::sub::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_sub(self, a: V256, b: V256) -> V256 {
        unified::sub::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_sub(self, a: V256, b: V256) -> V256 {
        unified::sub::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn u64x4_sub(self, a: V256, b: V256) -> V256 {
        unified::sub::<_, u64, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_sub_sat(self, a: V256, b: V256) -> V256 {
        unified::sub_sat::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_sub_sat(self, a: V256, b: V256) -> V256 {
        unified::sub_sat::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn i8x32_sub_sat(self, a: V256, b: V256) -> V256 {
        unified::sub_sat::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn i16x16_sub_sat(self, a: V256, b: V256) -> V256 {
        unified::sub_sat::<_, i16, _>(self, a, b)
    }

    #[inline(always)]
    fn i16x16_mul_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_mullo_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i16x8_mul_lo, a, b)
        }
    }

    #[inline(always)]
    fn i32x8_mul_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_mullo_epi32(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i32x4_mul_lo, a, b)
        }
    }

    #[inline(always)]
    fn u16x16_shl<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_slli_epi16::<IMM8>(t(a))) };
        }
        {
            simd256_vop!(self, Self::u16x8_shl::<IMM8>, a)
        }
    }

    #[inline(always)]
    fn u32x8_shl<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_slli_epi32::<IMM8>(t(a))) };
        }
        {
            simd256_vop!(self, Self::u32x4_shl::<IMM8>, a)
        }
    }

    #[inline(always)]
    fn u16x16_shr<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_srli_epi16::<IMM8>(t(a))) };
        }
        {
            simd256_vop!(self, Self::u16x8_shr::<IMM8>, a)
        }
    }

    #[inline(always)]
    fn u32x8_shr<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_srli_epi32::<IMM8>(t(a))) };
        }
        {
            simd256_vop!(self, Self::u32x4_shr::<IMM8>, a)
        }
    }

    #[inline(always)]
    fn u8x32_eq(self, a: V256, b: V256) -> V256 {
        unified::eq::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_eq(self, a: V256, b: V256) -> V256 {
        unified::eq::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_eq(self, a: V256, b: V256) -> V256 {
        unified::eq::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn i8x32_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn i16x16_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, i16, _>(self, a, b)
    }

    #[inline(always)]
    fn i32x8_lt(self, a: V256, b: V256) -> V256 {
        unified::lt::<_, i32, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn i8x32_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn i16x16_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, i16, _>(self, a, b)
    }

    #[inline(always)]
    fn i32x8_max(self, a: V256, b: V256) -> V256 {
        unified::max::<_, i32, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, u8, _>(self, a, b)
    }

    #[inline(always)]
    fn u16x16_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, u16, _>(self, a, b)
    }

    #[inline(always)]
    fn u32x8_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, u32, _>(self, a, b)
    }

    #[inline(always)]
    fn i8x32_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn i16x16_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, i16, _>(self, a, b)
    }

    #[inline(always)]
    fn i32x8_min(self, a: V256, b: V256) -> V256 {
        unified::min::<_, i32, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x16x2_swizzle(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_shuffle_epi8(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u8x16_swizzle, a, b)
        }
    }

    #[inline(always)]
    fn u16x16_bswap(self, a: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            return self.u8x16x2_swizzle(a, crate::bswap::SHUFFLE_U16X16);
        }
        {
            simd256_vop!(self, Self::u16x8_bswap, a)
        }
    }

    #[inline(always)]
    fn u32x8_bswap(self, a: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            return self.u8x16x2_swizzle(a, crate::bswap::SHUFFLE_U32X8);
        }
        {
            simd256_vop!(self, Self::u32x4_bswap, a)
        }
    }

    #[inline(always)]
    fn u64x4_bswap(self, a: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            return self.u8x16x2_swizzle(a, crate::bswap::SHUFFLE_U64X4);
        }
        {
            simd256_vop!(self, Self::u64x2_bswap, a)
        }
    }

    #[inline(always)]
    fn u8x32_swizzle(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, SSE2 | WASM128) {
            let _ = (a, b);
            unimplemented!()
        }
        #[cfg(all(feature = "unstable", target_arch = "arm"))]
        if matches_isa!(Self, NEON) {
            let _ = (a, b);
            unimplemented!()
        }
        #[cfg(target_arch = "aarch64")]
        if matches_isa!(Self, NEON) {
            return unsafe {
                let (a, b): (uint8x16x2_t, uint8x16x2_t) = (t(a), t(b));
                let c = (vqtbl2q_u8(a, b.0), vqtbl2q_u8(a, b.1));
                t(uint8x16x2_t(c.0, c.1))
            };
        }
        {
            let _ = (a, b);
            unreachable!()
        }
    }

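    /// Returns `true` if any byte lane of `a` is zero.
    ///
    /// The fallback takes the lane-wise minimum of the two halves, which has a
    /// zero lane iff either half does.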
    #[inline(always)]
    fn u8x32_any_zero(self, a: V256) -> bool {
        if matches_isa!(Self, AVX2) {
            let is_zero = self.u8x32_eq(a, self.v256_create_zero());
            return self.u8x32_bitmask(is_zero) != 0;
        }
        {
            let a = a.to_v128x2();
            self.u8x16_any_zero(self.u8x16_min(a.0, a.1))
        }
    }

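    /// Packs the most significant bit of each byte lane into a 32-bit mask,
    /// with lane 0 in bit 0.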
    #[inline(always)]
    fn u8x32_bitmask(self, a: V256) -> u32 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { _mm256_movemask_epi8(t(a)) as u32 };
        }
        {
            let a = a.to_v128x2();
            let m0 = self.u8x16_bitmask(a.0) as u32;
            let m1 = self.u8x16_bitmask(a.1) as u32;
            (m1 << 16) | m0
        }
    }

    #[inline(always)]
    fn u8x32_reduce_max(self, a: V256) -> u8 {
        let a = a.to_v128x2();
        self.u8x16_reduce_max(self.u8x16_max(a.0, a.1))
    }

    #[inline(always)]
    fn u8x32_reduce_min(self, a: V256) -> u8 {
        let a = a.to_v128x2();
        self.u8x16_reduce_min(self.u8x16_min(a.0, a.1))
    }

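    /// Bit select: for each bit, `if a == 1 { b } else { c }`.
    ///
    /// Equivalent to `((b ^ c) & a) ^ c`, which the non-NEON fallback computes directly.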
    #[inline(always)]
    fn v256_bsl(self, a: V256, b: V256, c: V256) -> V256 {
        if matches_isa!(Self, NEON) {
            return simd256_vop!(self, Self::v128_bsl, a, b, c);
        }
        {
            self.v256_xor(self.v256_and(self.v256_xor(b, c), a), c)
        }
    }

    #[inline(always)]
    fn u16x16_from_u8x16(self, a: V128) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_cvtepu8_epi16(t(a))) };
        }
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, SSE2) {
            let zero = self.v128_create_zero();
            let lo = self.u8x16_zip_lo(a, zero);
            let hi = self.u8x16_zip_hi(a, zero);
            return V256::from_v128x2((lo, hi));
        }
        #[cfg(any(all(feature = "unstable", target_arch = "arm"), target_arch = "aarch64"))]
        if matches_isa!(Self, NEON) {
            return unsafe {
                let a = t(a);
                let low = vmovl_u8(vget_low_u8(a));
                let high = vmovl_u8(vget_high_u8(a));
                t(uint16x8x2_t(low, high))
            };
        }
        #[cfg(target_arch = "wasm32")]
        if matches_isa!(Self, WASM128) {
            return unsafe {
                let a = t(a);
                let low = t(u16x8_extend_low_u8x16(a));
                let high = t(u16x8_extend_high_u8x16(a));
                V256::from_v128x2((low, high))
            };
        }
        {
            let _ = a;
            unreachable!()
        }
    }

    #[inline(always)]
    fn u8x16x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpacklo_epi8(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u8x16_zip_lo, a, b)
        }
    }

    #[inline(always)]
    fn u8x16x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpackhi_epi8(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u8x16_zip_hi, a, b)
        }
    }

    #[inline(always)]
    fn u16x8x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpacklo_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u16x8_zip_lo, a, b)
        }
    }

    #[inline(always)]
    fn u16x8x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpackhi_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u16x8_zip_hi, a, b)
        }
    }

    #[inline(always)]
    fn u32x4x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpacklo_epi32(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u32x4_zip_lo, a, b)
        }
    }

    #[inline(always)]
    fn u32x4x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpackhi_epi32(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u32x4_zip_hi, a, b)
        }
    }

    #[inline(always)]
    fn u64x2x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpacklo_epi64(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u64x2_zip_lo, a, b)
        }
    }

    #[inline(always)]
    fn u64x2x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_unpackhi_epi64(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u64x2_zip_hi, a, b)
        }
    }

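    /// Returns the low 128-bit halves of `a` and `b`, packed as `[a.lo, b.lo]`.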
    #[inline(always)]
    fn v128x2_zip_lo(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_permute2x128_si256::<0b0010_0000>(t(a), t(b))) };
        }
        if matches_isa!(Self, SSE2 | NEON | WASM128) {
            let ((a, _), (c, _)) = (a.to_v128x2(), b.to_v128x2());
            return V256::from_v128x2((a, c));
        }
        {
            unreachable!()
        }
    }

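    /// Returns the high 128-bit halves of `a` and `b`, packed as `[a.hi, b.hi]`.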
    #[inline(always)]
    fn v128x2_zip_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_permute2x128_si256::<0b0011_0001>(t(a), t(b))) };
        }
        if matches_isa!(Self, SSE2 | NEON | WASM128) {
            let ((_, b), (_, d)) = (a.to_v128x2(), b.to_v128x2());
            return V256::from_v128x2((b, d));
        }
        {
            unreachable!()
        }
    }

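    /// Permutes the four 64-bit lanes of `a`; each 2-bit field of `IMM8` selects
    /// the source lane for the corresponding destination lane (AVX2 only).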
    #[inline(always)]
    fn u64x4_permute<const IMM8: i32>(self, a: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_permute4x64_epi64::<IMM8>(t(a))) };
        }
        if matches_isa!(Self, SSE2 | NEON | WASM128) {
            let _ = a;
            unimplemented!()
        }
        {
            let _ = a;
            unreachable!()
        }
    }

    #[inline(always)]
    fn u8x32_unzip_even(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, SSE2) {
            unimplemented!()
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u8x16_unzip_even(a, b);
            let cd = self.u8x16_unzip_even(c, d);
            V256::from_v128x2((ab, cd))
        }
    }

    #[inline(always)]
    fn u8x32_unzip_odd(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, SSE2) {
            unimplemented!()
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u8x16_unzip_odd(a, b);
            let cd = self.u8x16_unzip_odd(c, d);
            V256::from_v128x2((ab, cd))
        }
    }

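    /// Gathers the even-indexed 64-bit lanes of `a` and `b`: `[a0, a2, b0, b2]`.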
    #[inline(always)]
    fn u64x4_unzip_even(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            let acbd = self.u64x2x2_zip_lo(a, b);
            let abcd = self.u64x4_permute::<0b_1101_1000>(acbd);
            return abcd;
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u64x2_zip_lo(a, b);
            let cd = self.u64x2_zip_lo(c, d);
            V256::from_v128x2((ab, cd))
        }
    }

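    /// Gathers the odd-indexed 64-bit lanes of `a` and `b`: `[a1, a3, b1, b3]`.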
    #[inline(always)]
    fn u64x4_unzip_odd(self, a: V256, b: V256) -> V256 {
        if matches_isa!(Self, AVX2) {
            let acbd = self.u64x2x2_zip_hi(a, b);
            let abcd = self.u64x4_permute::<0b_1101_1000>(acbd);
            return abcd;
        }
        {
            let ((a, b), (c, d)) = (a.to_v128x2(), b.to_v128x2());
            let ab = self.u64x2_zip_hi(a, b);
            let cd = self.u64x2_zip_hi(c, d);
            V256::from_v128x2((ab, cd))
        }
    }

    #[inline(always)]
    fn u16x16_mul_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_mulhi_epu16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u16x8_mul_hi, a, b)
        }
    }

    #[inline(always)]
    fn i16x16_mul_hi(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_mulhi_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i16x8_mul_hi, a, b)
        }
    }

    #[inline(always)]
    fn i16x16_maddubs(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_maddubs_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i16x8_maddubs, a, b)
        }
    }

    #[inline(always)]
    fn u32x8_blend<const IMM8: i32>(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_blend_epi32::<IMM8>(t(a), t(b))) };
        }
        if matches_isa!(Self, NEON | WASM128) {
            unimplemented!()
        }
        {
            let _ = (a, b);
            unreachable!()
        }
    }

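    /// Byte blend: for each lane, selects `b` if the high bit of the
    /// corresponding lane of `c` is set, otherwise `a`.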
    #[inline(always)]
    fn u8x32_blendv(self, a: V256, b: V256, c: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_blendv_epi8(t(a), t(b), t(c))) };
        }
        if matches_isa!(Self, NEON | WASM128) {
            unimplemented!()
        }
        {
            simd256_vop!(self, Self::u8x16_blendv, a, b, c)
        }
    }

    #[inline(always)]
    fn i16x16_madd(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_madd_epi16(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::i16x8_madd, a, b)
        }
    }

    #[inline(always)]
    fn u8x32_avgr(self, a: V256, b: V256) -> V256 {
        #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
        if matches_isa!(Self, AVX2) {
            return unsafe { t(_mm256_avg_epu8(t(a), t(b))) };
        }
        {
            simd256_vop!(self, Self::u8x16_avgr, a, b)
        }
    }

    #[inline(always)]
    fn i8x32_add_sat(self, a: V256, b: V256) -> V256 {
        unified::add_sat::<_, i8, _>(self, a, b)
    }

    #[inline(always)]
    fn u8x32_add_sat(self, a: V256, b: V256) -> V256 {
        unified::add_sat::<_, u8, _>(self, a, b)
    }
}