From 284299f0a30ab829d2a6ed0e45f634f8b540d4fa Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 16 Jun 2026 21:48:51 +0100 Subject: [PATCH 1/5] Add limited const generic support to kernel! macro This is a lot less complex and scary than I feared it would be. --- fearless_simd/src/kernel_macros.rs | 99 +++++++++++++++++++++++++++--- 1 file changed, 91 insertions(+), 8 deletions(-) diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs index a2ff2c21..676c9d63 100644 --- a/fearless_simd/src/kernel_macros.rs +++ b/fearless_simd/src/kernel_macros.rs @@ -50,7 +50,8 @@ /// /// ## Limitations /// -/// The macro only accepts a single plain, safe, non-generic function item with simple named parameters. +/// The macro only accepts a single plain, safe function item with simple named parameters. +/// The function may optionally have one const generic parameter written as ``. /// However, the body of the function can be as complex as you like. /// /// The SIMD token type must be written as a bare supported name: @@ -66,7 +67,7 @@ macro_rules! kernel { ( $(#[$meta:meta])* - $vis:vis fn $name:ident( + $vis:vis fn $name:ident $()?( $token:ident : $token_ty:ident $(, $arg:ident : $arg_ty:ty)* $(,)? ) $(-> $ret:ty)? { $($kernel_body:tt)* @@ -75,7 +76,7 @@ macro_rules! kernel { $crate::__fearless_simd_kernel_dispatch! { $token_ty, $(#[$meta])* - $vis fn $name( + $vis fn $name $()?( $token $(, $arg: $arg_ty)* ) $(-> $ret)? { $($kernel_body)* @@ -85,7 +86,7 @@ macro_rules! kernel { ( $(#[$meta:meta])* - $vis:vis fn $name:ident( + $vis:vis fn $name:ident $()?( $token:ident : $token_ty:ty $(, $arg:ident : $arg_ty:ty)* $(,)? ) $(-> $ret:ty)? { $($kernel_body:tt)* @@ -174,7 +175,7 @@ macro_rules! __fearless_simd_kernel_impl { @token_ty $token_ty:ty; @kernel_attrs $(#[$kernel_attr:meta])*; $(#[$meta:meta])* - $vis:vis fn $name:ident( + $vis:vis fn $name:ident $()?( $token:ident $(, $arg:ident : $arg_ty:ty)* $(,)? ) $(-> $ret:ty)? { $($kernel_body:tt)* @@ -182,12 +183,12 @@ macro_rules! __fearless_simd_kernel_impl { ) => { #[cfg($cfg)] $(#[$meta])* - $vis fn $name( + $vis fn $name $()?( $token: $token_ty $(, $arg: $arg_ty)* ) $(-> $ret)? { #[inline] // can't use `#[inline(always)]` with target features $(#[$kernel_attr])* - fn __fearless_simd_kernel( + fn __fearless_simd_kernel $()?( $token: $token_ty $(, $arg: $arg_ty)* ) $(-> $ret)? { let _ = $token; @@ -196,7 +197,7 @@ macro_rules! __fearless_simd_kernel_impl { // SAFETY: the SIMD token proves that the required target features are available. #[allow(unused_unsafe, reason = "for WASM which has no target feature requirements and is safe to call")] - unsafe { __fearless_simd_kernel($token $(, $arg)*) } + unsafe { __fearless_simd_kernel $(::<$const_param>)?($token $(, $arg)*) } } }; } @@ -226,18 +227,43 @@ mod tests { } ); + crate::kernel!( + fn add_f32x4_neon_const( + neon: Neon, + a: float32x4_t, + b: float32x4_t, + ) -> float32x4_t { + let _ = LANES; + vaddq_f32(a, b) + } + ); + crate::kernel!( fn add_f32x4_wasm(wasm: WasmSimd128, a: v128, b: v128) -> v128 { f32x4_add(a, b) } ); + crate::kernel!( + fn add_f32x4_wasm_const(wasm: WasmSimd128, a: v128, b: v128) -> v128 { + let _ = LANES; + f32x4_add(a, b) + } + ); + crate::kernel!( fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i { _mm256_add_epi32(a, b) } ); + crate::kernel!( + fn add_i32x8_avx2_const(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i { + let _ = LANES; + _mm256_add_epi32(a, b) + } + ); + #[cfg(target_arch = "aarch64")] #[test] fn kernel_instantiates_for_neon() { @@ -256,6 +282,25 @@ mod tests { ); } + #[cfg(target_arch = "aarch64")] + #[test] + fn kernel_instantiates_const_generic_for_neon() { + let Some(neon) = crate::Level::new().as_neon() else { + return; + }; + + let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon); + let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon); + let sum: crate::f32x4<_> = + add_f32x4_neon_const::<4>(neon, a.into(), b.into()).simd_into(neon); + + assert_eq!( + <[f32; 4]>::from(sum), + [11.0, 22.0, 33.0, 44.0], + "`kernel!` should instantiate a const-generic NEON kernel" + ); + } + #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] #[test] fn kernel_instantiates_for_wasm_simd128() { @@ -274,6 +319,25 @@ mod tests { ); } + #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] + #[test] + fn kernel_instantiates_const_generic_for_wasm_simd128() { + let wasm = crate::Level::new() + .as_wasm_simd128() + .expect("WASM SIMD128 should be available when +simd128 is enabled"); + + let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm); + let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm); + let sum: crate::f32x4<_> = + add_f32x4_wasm_const::<4>(wasm, a.into(), b.into()).simd_into(wasm); + + assert_eq!( + <[f32; 4]>::from(sum), + [11.0, 22.0, 33.0, 44.0], + "`kernel!` should instantiate a const-generic WASM SIMD128 kernel" + ); + } + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[test] fn kernel_instantiates_for_avx2() { @@ -291,4 +355,23 @@ mod tests { "`kernel!` should instantiate a working AVX2 kernel" ); } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[test] + fn kernel_instantiates_const_generic_for_avx2() { + let Some(avx2) = crate::Level::new().as_avx2() else { + return; + }; + + let a: crate::i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2); + let b: crate::i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2); + let sum: crate::i32x8<_> = + add_i32x8_avx2_const::<8>(avx2, a.into(), b.into()).simd_into(avx2); + + assert_eq!( + <[i32; 8]>::from(sum), + [11, 22, 33, 44, 55, 66, 77, 88], + "`kernel!` should instantiate a const-generic AVX2 kernel" + ); + } } From 63df22ed88422093f876440d1253f4d0a5983e91 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 16 Jun 2026 22:01:25 +0100 Subject: [PATCH 2/5] Drop unnecessary unsafe from WASM helpers, they never needed it --- fearless_simd/src/generated/wasm.rs | 72 +++++++++++------------------ fearless_simd_gen/src/mk_wasm.rs | 14 +++--- 2 files changed, 33 insertions(+), 53 deletions(-) diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 2963ca6b..7c54c052 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -8156,48 +8156,30 @@ impl From> for v128 { #[doc = r" The shift is still expected to be constant in practice, so the match statement will be optimized out."] #[doc = r" This exists because Rust doesn't currently let you do math on const generics."] #[inline(always)] -unsafe fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 { - unsafe { - match shift { - 0 => i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>(a, b), - 1 => i8x16_shuffle::<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16>(a, b), - 2 => i8x16_shuffle::<2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17>(a, b), - 3 => i8x16_shuffle::<3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18>(a, b), - 4 => i8x16_shuffle::<4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19>(a, b), - 5 => i8x16_shuffle::<5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20>(a, b), - 6 => i8x16_shuffle::<6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21>(a, b), - 7 => i8x16_shuffle::<7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22>(a, b), - 8 => { - i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23>(a, b) - } - 9 => { - i8x16_shuffle::<9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24>(a, b) - } - 10 => i8x16_shuffle::<10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25>( - a, b, - ), - 11 => i8x16_shuffle::<11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26>( - a, b, - ), - 12 => i8x16_shuffle::<12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27>( - a, b, - ), - 13 => i8x16_shuffle::<13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28>( - a, b, - ), - 14 => i8x16_shuffle::<14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29>( - a, b, - ), - 15 => i8x16_shuffle::<15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30>( - a, b, - ), - _ => unreachable!(), - } +fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 { + match shift { + 0 => i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>(a, b), + 1 => i8x16_shuffle::<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16>(a, b), + 2 => i8x16_shuffle::<2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17>(a, b), + 3 => i8x16_shuffle::<3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18>(a, b), + 4 => i8x16_shuffle::<4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19>(a, b), + 5 => i8x16_shuffle::<5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20>(a, b), + 6 => i8x16_shuffle::<6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21>(a, b), + 7 => i8x16_shuffle::<7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22>(a, b), + 8 => i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23>(a, b), + 9 => i8x16_shuffle::<9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24>(a, b), + 10 => i8x16_shuffle::<10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25>(a, b), + 11 => i8x16_shuffle::<11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26>(a, b), + 12 => i8x16_shuffle::<12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27>(a, b), + 13 => i8x16_shuffle::<13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28>(a, b), + 14 => i8x16_shuffle::<14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29>(a, b), + 15 => i8x16_shuffle::<15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30>(a, b), + _ => unreachable!(), } } #[doc = r" Concatenates `a` and `b` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] #[inline(always)] -unsafe fn cross_block_slide_128x2( +fn cross_block_slide_128x2( a: [v128; 2usize], b: [v128; 2usize], shift_bytes: usize, @@ -8205,17 +8187,17 @@ unsafe fn cross_block_slide_128x2( [ { let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 0usize, shift_bytes); - unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + dyn_slide_128(lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 1usize, shift_bytes); - unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + dyn_slide_128(lo, hi, shift_bytes % 16) }, ] } #[doc = r" Concatenates `a` and `b` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] #[inline(always)] -unsafe fn cross_block_slide_128x4( +fn cross_block_slide_128x4( a: [v128; 4usize], b: [v128; 4usize], shift_bytes: usize, @@ -8223,19 +8205,19 @@ unsafe fn cross_block_slide_128x4( [ { let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 0usize, shift_bytes); - unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + dyn_slide_128(lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 1usize, shift_bytes); - unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + dyn_slide_128(lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 2usize, shift_bytes); - unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + dyn_slide_128(lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 3usize, shift_bytes); - unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + dyn_slide_128(lo, hi, shift_bytes % 16) }, ] } diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 1a9d35bf..88ef9e44 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -798,12 +798,10 @@ fn mk_slide_helpers() -> TokenStream { /// The shift is still expected to be constant in practice, so the match statement will be optimized out. /// This exists because Rust doesn't currently let you do math on const generics. #[inline(always)] - unsafe fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 { - unsafe { - match shift { - #(#shifts,)* - _ => unreachable!() - } + fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 { + match shift { + #(#shifts,)* + _ => unreachable!() } } }); @@ -818,7 +816,7 @@ fn mk_slide_helpers() -> TokenStream { quote! { { let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, #i, shift_bytes); - unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) } + dyn_slide_128(lo, hi, shift_bytes % 16) } } }) @@ -827,7 +825,7 @@ fn mk_slide_helpers() -> TokenStream { fns.push(quote! { /// Concatenates `a` and `b` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`. #[inline(always)] - unsafe fn #helper_name(a: [v128; #num_blocks], b: [v128; #num_blocks], shift_bytes: usize) -> [v128; #num_blocks] { + fn #helper_name(a: [v128; #num_blocks], b: [v128; #num_blocks], shift_bytes: usize) -> [v128; #num_blocks] { // Explicitly unrolled to help LLVM optimize [#(#block_calls),*] } From c684fec6e67b0ed943773e148e9acb171fbc2661 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 16 Jun 2026 22:12:57 +0100 Subject: [PATCH 3/5] Use kernel! to define helper functions; remove unsafe from their callers --- fearless_simd/src/generated/avx2.rs | 929 +++++++++++++------------- fearless_simd/src/generated/neon.rs | 246 ++++--- fearless_simd/src/generated/sse4_2.rs | 718 ++++++++++---------- fearless_simd_gen/src/mk_neon.rs | 22 +- fearless_simd_gen/src/mk_x86.rs | 137 ++-- 5 files changed, 1006 insertions(+), 1046 deletions(-) diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs index 216c6562..d0aa9f6d 100644 --- a/fearless_simd/src/generated/avx2.rs +++ b/fearless_simd/src/generated/avx2.rs @@ -152,20 +152,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_f32x4(b).val.0, - self.cvt_to_bytes_f32x4(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x4(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_f32x4(b).val.0, + self.cvt_to_bytes_f32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f32x4( @@ -675,20 +674,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_i8x16(b).val.0, - self.cvt_to_bytes_i8x16(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x16(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_i8x16(b).val.0, + self.cvt_to_bytes_i8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i8x16( @@ -1044,20 +1042,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_u8x16(b).val.0, - self.cvt_to_bytes_u8x16(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x16(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_u8x16(b).val.0, + self.cvt_to_bytes_u8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u8x16( @@ -1575,20 +1572,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_i16x8(b).val.0, - self.cvt_to_bytes_i16x8(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x8(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_i16x8(b).val.0, + self.cvt_to_bytes_i16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i16x8( @@ -1925,20 +1921,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_u16x8(b).val.0, - self.cvt_to_bytes_u16x8(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x8(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_u16x8(b).val.0, + self.cvt_to_bytes_u16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u16x8( @@ -2435,20 +2430,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_i32x4(b).val.0, - self.cvt_to_bytes_i32x4(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x4(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_i32x4(b).val.0, + self.cvt_to_bytes_i32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i32x4( @@ -2805,20 +2799,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_u32x4(b).val.0, - self.cvt_to_bytes_u32x4(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x4(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_u32x4(b).val.0, + self.cvt_to_bytes_u32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u32x4( @@ -3331,20 +3324,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - if SHIFT >= 2usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_f64x2(b).val.0, - self.cvt_to_bytes_f64x2(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x2(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 2usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_f64x2(b).val.0, + self.cvt_to_bytes_f64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f64x2( @@ -3902,20 +3894,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_256x1( - self.cvt_to_bytes_f32x8(b).val.0, - self.cvt_to_bytes_f32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f32x8( @@ -3923,20 +3914,19 @@ impl Simd for Avx2 { a: f32x8, b: f32x8, ) -> f32x8 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_256( - self.cvt_to_bytes_f32x8(b).val.0, - self.cvt_to_bytes_f32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn abs_f32x8(self, a: f32x8) -> f32x8 { @@ -4489,20 +4479,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let result = cross_block_alignr_256x1( - self.cvt_to_bytes_i8x32(b).val.0, - self.cvt_to_bytes_i8x32(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 32usize { + return b; } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i8x32( @@ -4510,20 +4499,19 @@ impl Simd for Avx2 { a: i8x32, b: i8x32, ) -> i8x32 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = dyn_alignr_256( - self.cvt_to_bytes_i8x32(b).val.0, - self.cvt_to_bytes_i8x32(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn add_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { @@ -4947,20 +4935,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let result = cross_block_alignr_256x1( - self.cvt_to_bytes_u8x32(b).val.0, - self.cvt_to_bytes_u8x32(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 32usize { + return b; } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u8x32( @@ -4968,20 +4955,19 @@ impl Simd for Avx2 { a: u8x32, b: u8x32, ) -> u8x32 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = dyn_alignr_256( - self.cvt_to_bytes_u8x32(b).val.0, - self.cvt_to_bytes_u8x32(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn add_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { @@ -5581,20 +5567,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_256x1( - self.cvt_to_bytes_i16x16(b).val.0, - self.cvt_to_bytes_i16x16(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i16x16( @@ -5602,20 +5587,19 @@ impl Simd for Avx2 { a: i16x16, b: i16x16, ) -> i16x16 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = dyn_alignr_256( - self.cvt_to_bytes_i16x16(b).val.0, - self.cvt_to_bytes_i16x16(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn add_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { @@ -6022,20 +6006,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_256x1( - self.cvt_to_bytes_u16x16(b).val.0, - self.cvt_to_bytes_u16x16(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u16x16( @@ -6043,20 +6026,19 @@ impl Simd for Avx2 { a: u16x16, b: u16x16, ) -> u16x16 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = dyn_alignr_256( - self.cvt_to_bytes_u16x16(b).val.0, - self.cvt_to_bytes_u16x16(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn add_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { @@ -6654,20 +6636,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_256x1( - self.cvt_to_bytes_i32x8(b).val.0, - self.cvt_to_bytes_i32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i32x8( @@ -6675,20 +6656,19 @@ impl Simd for Avx2 { a: i32x8, b: i32x8, ) -> i32x8 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_256( - self.cvt_to_bytes_i32x8(b).val.0, - self.cvt_to_bytes_i32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn add_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { @@ -7091,20 +7071,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_256x1( - self.cvt_to_bytes_u32x8(b).val.0, - self.cvt_to_bytes_u32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u32x8( @@ -7112,20 +7091,19 @@ impl Simd for Avx2 { a: u32x8, b: u32x8, ) -> u32x8 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_256( - self.cvt_to_bytes_u32x8(b).val.0, - self.cvt_to_bytes_u32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn add_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { @@ -7696,20 +7674,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = cross_block_alignr_256x1( - self.cvt_to_bytes_f64x4(b).val.0, - self.cvt_to_bytes_f64x4(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = cross_block_alignr_256x1( + self, + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f64x4( @@ -7717,20 +7694,19 @@ impl Simd for Avx2 { a: f64x4, b: f64x4, ) -> f64x4 { - unsafe { - if SHIFT >= 2usize { - return b; - } - let result = dyn_alignr_256( - self.cvt_to_bytes_f64x4(b).val.0, - self.cvt_to_bytes_f64x4(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 2usize { + return b; } + let result = dyn_alignr_256( + self, + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn abs_f64x4(self, a: f64x4) -> f64x4 { @@ -8328,20 +8304,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_256x2( - self.cvt_to_bytes_f32x16(b).val.0, - self.cvt_to_bytes_f32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_f32x16(b).val.0, + self.cvt_to_bytes_f32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f32x16( @@ -8745,20 +8720,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - unsafe { - if SHIFT >= 64usize { - return b; - } - let result = cross_block_alignr_256x2( - self.cvt_to_bytes_i8x64(b).val.0, - self.cvt_to_bytes_i8x64(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x64(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 64usize { + return b; } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_i8x64(b).val.0, + self.cvt_to_bytes_i8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i8x64( @@ -9017,20 +8991,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - unsafe { - if SHIFT >= 64usize { - return b; - } - let result = cross_block_alignr_256x2( - self.cvt_to_bytes_u8x64(b).val.0, - self.cvt_to_bytes_u8x64(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x64(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 64usize { + return b; } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_u8x64(b).val.0, + self.cvt_to_bytes_u8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u8x64( @@ -9482,20 +9455,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let result = cross_block_alignr_256x2( - self.cvt_to_bytes_i16x32(b).val.0, - self.cvt_to_bytes_i16x32(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x32(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 32usize { + return b; } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_i16x32(b).val.0, + self.cvt_to_bytes_i16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i16x32( @@ -9763,20 +9735,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let result = cross_block_alignr_256x2( - self.cvt_to_bytes_u16x32(b).val.0, - self.cvt_to_bytes_u16x32(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x32(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 32usize { + return b; } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_u16x32(b).val.0, + self.cvt_to_bytes_u16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u16x32( @@ -10236,20 +10207,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_256x2( - self.cvt_to_bytes_i32x16(b).val.0, - self.cvt_to_bytes_i32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_i32x16(b).val.0, + self.cvt_to_bytes_i32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i32x16( @@ -10513,20 +10483,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_256x2( - self.cvt_to_bytes_u32x16(b).val.0, - self.cvt_to_bytes_u32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_u32x16(b).val.0, + self.cvt_to_bytes_u32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u32x16( @@ -10956,20 +10925,19 @@ impl Simd for Avx2 { } #[inline(always)] fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_256x2( - self.cvt_to_bytes_f64x8(b).val.0, - self.cvt_to_bytes_f64x8(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x8(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = cross_block_alignr_256x2( + self, + self.cvt_to_bytes_f64x8(b).val.0, + self.cvt_to_bytes_f64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f64x8( @@ -11529,12 +11497,12 @@ impl From> for __m256i { crate::transmute::checked_transmute_copy(&value.val) } } -#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] -#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] -#[doc = r" Rust doesn't currently let you do math on const generics."] -#[inline(always)] -unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i { - unsafe { +crate::kernel!( + #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] + #[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] + #[doc = r" Rust doesn't currently let you do math on const generics."] + #[inline(always)] + fn dyn_alignr_128(token: Avx2, a: __m128i, b: __m128i, shift: usize) -> __m128i { match shift { 0usize => _mm_alignr_epi8::<0i32>(a, b), 1usize => _mm_alignr_epi8::<1i32>(a, b), @@ -11555,13 +11523,13 @@ unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i { _ => unreachable!(), } } -} -#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] -#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] -#[doc = r" Rust doesn't currently let you do math on const generics."] -#[inline(always)] -unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i { - unsafe { +); +crate::kernel!( + #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] + #[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] + #[doc = r" Rust doesn't currently let you do math on const generics."] + #[inline(always)] + fn dyn_alignr_256(token: Avx2, a: __m256i, b: __m256i, shift: usize) -> __m256i { match shift { 0usize => _mm256_alignr_epi8::<0i32>(a, b), 1usize => _mm256_alignr_epi8::<1i32>(a, b), @@ -11582,52 +11550,63 @@ unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i { _ => unreachable!(), } } -} -#[doc = r" Computes one output __m256i for `cross_block_alignr_*` operations."] -#[doc = r""] -#[doc = r" Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and"] -#[doc = r" `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`."] -#[inline(always)] -unsafe fn cross_block_alignr_one( - regs: &[__m256i], - block_idx: usize, - shift_bytes: usize, -) -> __m256i { - let lo_idx = block_idx + (shift_bytes / 16); - let intra_shift = shift_bytes % 16; - let lo_blocks = if lo_idx & 1 == 0 { - regs[lo_idx / 2] - } else { - unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) } - }; - let hi_idx = lo_idx + 1; - let hi_blocks = if hi_idx & 1 == 0 { - regs[hi_idx / 2] - } else { - unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) } - }; - unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) } -} -#[doc = r" Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset"] -#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."] -#[inline(always)] -unsafe fn cross_block_alignr_256x2( - a: [__m256i; 2], - b: [__m256i; 2], - shift_bytes: usize, -) -> [__m256i; 2] { - let regs = [b[0], b[1], a[0], a[1]]; - unsafe { +); +crate::kernel!( + #[doc = r" Computes one output __m256i for `cross_block_alignr_*` operations."] + #[doc = r""] + #[doc = r" Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and"] + #[doc = r" `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`."] + #[inline(always)] + fn cross_block_alignr_one( + token: Avx2, + regs: &[__m256i], + block_idx: usize, + shift_bytes: usize, + ) -> __m256i { + let lo_idx = block_idx + (shift_bytes / 16); + let intra_shift = shift_bytes % 16; + let lo_blocks = if lo_idx & 1 == 0 { + regs[lo_idx / 2] + } else { + _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) + }; + let hi_idx = lo_idx + 1; + let hi_blocks = if hi_idx & 1 == 0 { + regs[hi_idx / 2] + } else { + _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) + }; + dyn_alignr_256(token, hi_blocks, lo_blocks, intra_shift) + } +); +crate::kernel!( + #[doc = r" Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset"] + #[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."] + #[inline(always)] + fn cross_block_alignr_256x2( + token: Avx2, + a: [__m256i; 2], + b: [__m256i; 2], + shift_bytes: usize, + ) -> [__m256i; 2] { + let regs = [b[0], b[1], a[0], a[1]]; [ - cross_block_alignr_one(®s, 0, shift_bytes), - cross_block_alignr_one(®s, 2, shift_bytes), + cross_block_alignr_one(token, ®s, 0, shift_bytes), + cross_block_alignr_one(token, ®s, 2, shift_bytes), ] } -} -#[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"] -#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."] -#[inline(always)] -unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i { - let regs = [b, a]; - unsafe { cross_block_alignr_one(®s, 0, shift_bytes) } -} +); +crate::kernel!( + #[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"] + #[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."] + #[inline(always)] + fn cross_block_alignr_256x1( + token: Avx2, + a: __m256i, + b: __m256i, + shift_bytes: usize, + ) -> __m256i { + let regs = [b, a]; + cross_block_alignr_one(token, ®s, 0, shift_bytes) + } +); diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs index 954b9c3b..fa092c74 100644 --- a/fearless_simd/src/generated/neon.rs +++ b/fearless_simd/src/generated/neon.rs @@ -145,13 +145,12 @@ impl Simd for Neon { if SHIFT >= 4usize { return b; } - let result = unsafe { - dyn_vext_128( - self.cvt_to_bytes_f32x4(a).val.0, - self.cvt_to_bytes_f32x4(b).val.0, - SHIFT * 4usize, - ) - }; + let result = dyn_vext_128( + self, + self.cvt_to_bytes_f32x4(a).val.0, + self.cvt_to_bytes_f32x4(b).val.0, + SHIFT * 4usize, + ); self.cvt_from_bytes_f32x4(u8x16 { val: crate::support::Aligned128(result), simd: self, @@ -623,13 +622,12 @@ impl Simd for Neon { if SHIFT >= 16usize { return b; } - let result = unsafe { - dyn_vext_128( - self.cvt_to_bytes_i8x16(a).val.0, - self.cvt_to_bytes_i8x16(b).val.0, - SHIFT, - ) - }; + let result = dyn_vext_128( + self, + self.cvt_to_bytes_i8x16(a).val.0, + self.cvt_to_bytes_i8x16(b).val.0, + SHIFT, + ); self.cvt_from_bytes_i8x16(u8x16 { val: crate::support::Aligned128(result), simd: self, @@ -990,13 +988,12 @@ impl Simd for Neon { if SHIFT >= 16usize { return b; } - let result = unsafe { - dyn_vext_128( - self.cvt_to_bytes_u8x16(a).val.0, - self.cvt_to_bytes_u8x16(b).val.0, - SHIFT, - ) - }; + let result = dyn_vext_128( + self, + self.cvt_to_bytes_u8x16(a).val.0, + self.cvt_to_bytes_u8x16(b).val.0, + SHIFT, + ); self.cvt_from_bytes_u8x16(u8x16 { val: crate::support::Aligned128(result), simd: self, @@ -1525,13 +1522,12 @@ impl Simd for Neon { if SHIFT >= 8usize { return b; } - let result = unsafe { - dyn_vext_128( - self.cvt_to_bytes_i16x8(a).val.0, - self.cvt_to_bytes_i16x8(b).val.0, - SHIFT * 2usize, - ) - }; + let result = dyn_vext_128( + self, + self.cvt_to_bytes_i16x8(a).val.0, + self.cvt_to_bytes_i16x8(b).val.0, + SHIFT * 2usize, + ); self.cvt_from_bytes_i16x8(u8x16 { val: crate::support::Aligned128(result), simd: self, @@ -1892,13 +1888,12 @@ impl Simd for Neon { if SHIFT >= 8usize { return b; } - let result = unsafe { - dyn_vext_128( - self.cvt_to_bytes_u16x8(a).val.0, - self.cvt_to_bytes_u16x8(b).val.0, - SHIFT * 2usize, - ) - }; + let result = dyn_vext_128( + self, + self.cvt_to_bytes_u16x8(a).val.0, + self.cvt_to_bytes_u16x8(b).val.0, + SHIFT * 2usize, + ); self.cvt_from_bytes_u16x8(u8x16 { val: crate::support::Aligned128(result), simd: self, @@ -2417,13 +2412,12 @@ impl Simd for Neon { if SHIFT >= 4usize { return b; } - let result = unsafe { - dyn_vext_128( - self.cvt_to_bytes_i32x4(a).val.0, - self.cvt_to_bytes_i32x4(b).val.0, - SHIFT * 4usize, - ) - }; + let result = dyn_vext_128( + self, + self.cvt_to_bytes_i32x4(a).val.0, + self.cvt_to_bytes_i32x4(b).val.0, + SHIFT * 4usize, + ); self.cvt_from_bytes_i32x4(u8x16 { val: crate::support::Aligned128(result), simd: self, @@ -2794,13 +2788,12 @@ impl Simd for Neon { if SHIFT >= 4usize { return b; } - let result = unsafe { - dyn_vext_128( - self.cvt_to_bytes_u32x4(a).val.0, - self.cvt_to_bytes_u32x4(b).val.0, - SHIFT * 4usize, - ) - }; + let result = dyn_vext_128( + self, + self.cvt_to_bytes_u32x4(a).val.0, + self.cvt_to_bytes_u32x4(b).val.0, + SHIFT * 4usize, + ); self.cvt_from_bytes_u32x4(u8x16 { val: crate::support::Aligned128(result), simd: self, @@ -3318,13 +3311,12 @@ impl Simd for Neon { if SHIFT >= 2usize { return b; } - let result = unsafe { - dyn_vext_128( - self.cvt_to_bytes_f64x2(a).val.0, - self.cvt_to_bytes_f64x2(b).val.0, - SHIFT * 8usize, - ) - }; + let result = dyn_vext_128( + self, + self.cvt_to_bytes_f64x2(a).val.0, + self.cvt_to_bytes_f64x2(b).val.0, + SHIFT * 8usize, + ); self.cvt_from_bytes_f64x2(u8x16 { val: crate::support::Aligned128(result), simd: self, @@ -3899,7 +3891,7 @@ impl Simd for Neon { if SHIFT >= 8usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_f32x8(a).val.0; let b_bytes = self.cvt_to_bytes_f32x8(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; @@ -3913,7 +3905,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -3922,7 +3914,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -4293,7 +4285,7 @@ impl Simd for Neon { if SHIFT >= 32usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_i8x32(a).val.0; let b_bytes = self.cvt_to_bytes_i8x32(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; @@ -4307,7 +4299,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -4316,7 +4308,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -4594,7 +4586,7 @@ impl Simd for Neon { if SHIFT >= 32usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_u8x32(a).val.0; let b_bytes = self.cvt_to_bytes_u8x32(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; @@ -4608,7 +4600,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -4617,7 +4609,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -5005,7 +4997,7 @@ impl Simd for Neon { if SHIFT >= 16usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_i16x16(a).val.0; let b_bytes = self.cvt_to_bytes_i16x16(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; @@ -5019,7 +5011,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -5028,7 +5020,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -5306,7 +5298,7 @@ impl Simd for Neon { if SHIFT >= 16usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_u16x16(a).val.0; let b_bytes = self.cvt_to_bytes_u16x16(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; @@ -5320,7 +5312,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -5329,7 +5321,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -5730,7 +5722,7 @@ impl Simd for Neon { if SHIFT >= 8usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_i32x8(a).val.0; let b_bytes = self.cvt_to_bytes_i32x8(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; @@ -5744,7 +5736,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -5753,7 +5745,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -6036,7 +6028,7 @@ impl Simd for Neon { if SHIFT >= 8usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_u32x8(a).val.0; let b_bytes = self.cvt_to_bytes_u32x8(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; @@ -6050,7 +6042,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -6059,7 +6051,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -6444,7 +6436,7 @@ impl Simd for Neon { if SHIFT >= 4usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_f64x4(a).val.0; let b_bytes = self.cvt_to_bytes_f64x4(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1]; @@ -6458,7 +6450,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -6467,7 +6459,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -6906,7 +6898,7 @@ impl Simd for Neon { if SHIFT >= 16usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_f32x16(a).val.0; let b_bytes = self.cvt_to_bytes_f32x16(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; @@ -6920,7 +6912,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -6929,7 +6921,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -6938,7 +6930,7 @@ impl Simd for Neon { 2, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -6947,7 +6939,7 @@ impl Simd for Neon { 3, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -7317,7 +7309,7 @@ impl Simd for Neon { if SHIFT >= 64usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_i8x64(a).val.0; let b_bytes = self.cvt_to_bytes_i8x64(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; @@ -7331,7 +7323,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -7340,7 +7332,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -7349,7 +7341,7 @@ impl Simd for Neon { 2, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -7358,7 +7350,7 @@ impl Simd for Neon { 3, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -7627,7 +7619,7 @@ impl Simd for Neon { if SHIFT >= 64usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_u8x64(a).val.0; let b_bytes = self.cvt_to_bytes_u8x64(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; @@ -7641,7 +7633,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -7650,7 +7642,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -7659,7 +7651,7 @@ impl Simd for Neon { 2, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -7668,7 +7660,7 @@ impl Simd for Neon { 3, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -8041,7 +8033,7 @@ impl Simd for Neon { if SHIFT >= 32usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_i16x32(a).val.0; let b_bytes = self.cvt_to_bytes_i16x32(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; @@ -8055,7 +8047,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8064,7 +8056,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8073,7 +8065,7 @@ impl Simd for Neon { 2, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8082,7 +8074,7 @@ impl Simd for Neon { 3, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -8360,7 +8352,7 @@ impl Simd for Neon { if SHIFT >= 32usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_u16x32(a).val.0; let b_bytes = self.cvt_to_bytes_u16x32(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; @@ -8374,7 +8366,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8383,7 +8375,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8392,7 +8384,7 @@ impl Simd for Neon { 2, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8401,7 +8393,7 @@ impl Simd for Neon { 3, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -8796,7 +8788,7 @@ impl Simd for Neon { if SHIFT >= 16usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_i32x16(a).val.0; let b_bytes = self.cvt_to_bytes_i32x16(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; @@ -8810,7 +8802,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8819,7 +8811,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8828,7 +8820,7 @@ impl Simd for Neon { 2, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -8837,7 +8829,7 @@ impl Simd for Neon { 3, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -9111,7 +9103,7 @@ impl Simd for Neon { if SHIFT >= 16usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_u32x16(a).val.0; let b_bytes = self.cvt_to_bytes_u32x16(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; @@ -9125,7 +9117,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -9134,7 +9126,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -9143,7 +9135,7 @@ impl Simd for Neon { 2, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -9152,7 +9144,7 @@ impl Simd for Neon { 3, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -9527,7 +9519,7 @@ impl Simd for Neon { if SHIFT >= 8usize { return b; } - let result = unsafe { + let result = { let a_bytes = self.cvt_to_bytes_f64x8(a).val.0; let b_bytes = self.cvt_to_bytes_f64x8(b).val.0; let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3]; @@ -9541,7 +9533,7 @@ impl Simd for Neon { 0, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -9550,7 +9542,7 @@ impl Simd for Neon { 1, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -9559,7 +9551,7 @@ impl Simd for Neon { 2, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, { let [lo, hi] = crate::support::cross_block_slide_blocks_at( @@ -9568,7 +9560,7 @@ impl Simd for Neon { 3, shift_bytes, ); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }, ) }; @@ -10476,12 +10468,12 @@ impl From> for int64x2x4_t { crate::transmute::checked_transmute_copy(&value.val) } } -#[doc = r" This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still"] -#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] -#[doc = r" Rust doesn't currently let you do math on const generics."] -#[inline(always)] -unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t { - unsafe { +crate::kernel!( + #[doc = r" This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still"] + #[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] + #[doc = r" Rust doesn't currently let you do math on const generics."] + #[inline(always)] + fn dyn_vext_128(neon: Neon, a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t { match shift { 0usize => vextq_u8::<0i32>(a, b), 1usize => vextq_u8::<1i32>(a, b), @@ -10502,4 +10494,4 @@ unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t _ => unreachable!(), } } -} +); diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs index 47b81a4d..5f36c1cc 100644 --- a/fearless_simd/src/generated/sse4_2.rs +++ b/fearless_simd/src/generated/sse4_2.rs @@ -178,20 +178,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_f32x4(self, a: f32x4, b: f32x4) -> f32x4 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_f32x4(b).val.0, - self.cvt_to_bytes_f32x4(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x4(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_f32x4(b).val.0, + self.cvt_to_bytes_f32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f32x4( @@ -686,20 +685,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_i8x16(self, a: i8x16, b: i8x16) -> i8x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_i8x16(b).val.0, - self.cvt_to_bytes_i8x16(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x16(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_i8x16(b).val.0, + self.cvt_to_bytes_i8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i8x16( @@ -1052,20 +1050,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_u8x16(self, a: u8x16, b: u8x16) -> u8x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_u8x16(b).val.0, - self.cvt_to_bytes_u8x16(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x16(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_u8x16(b).val.0, + self.cvt_to_bytes_u8x16(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u8x16( @@ -1580,20 +1577,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_i16x8(self, a: i16x8, b: i16x8) -> i16x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_i16x8(b).val.0, - self.cvt_to_bytes_i16x8(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x8(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_i16x8(b).val.0, + self.cvt_to_bytes_i16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i16x8( @@ -1927,20 +1923,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_u16x8(self, a: u16x8, b: u16x8) -> u16x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_u16x8(b).val.0, - self.cvt_to_bytes_u16x8(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x8(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_u16x8(b).val.0, + self.cvt_to_bytes_u16x8(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u16x8( @@ -2431,20 +2426,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_i32x4(self, a: i32x4, b: i32x4) -> i32x4 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_i32x4(b).val.0, - self.cvt_to_bytes_i32x4(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x4(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_i32x4(b).val.0, + self.cvt_to_bytes_i32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i32x4( @@ -2786,20 +2780,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_u32x4(self, a: u32x4, b: u32x4) -> u32x4 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_u32x4(b).val.0, - self.cvt_to_bytes_u32x4(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x4(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_u32x4(b).val.0, + self.cvt_to_bytes_u32x4(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u32x4( @@ -3294,20 +3287,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_f64x2(self, a: f64x2, b: f64x2) -> f64x2 { - unsafe { - if SHIFT >= 2usize { - return b; - } - let result = dyn_alignr_128( - self.cvt_to_bytes_f64x2(b).val.0, - self.cvt_to_bytes_f64x2(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x2(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) + if SHIFT >= 2usize { + return b; } + let result = dyn_alignr_128( + self, + self.cvt_to_bytes_f64x2(b).val.0, + self.cvt_to_bytes_f64x2(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f64x2( @@ -3842,20 +3834,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_f32x8(self, a: f32x8, b: f32x8) -> f32x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_128x2( - self.cvt_to_bytes_f32x8(b).val.0, - self.cvt_to_bytes_f32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_f32x8(b).val.0, + self.cvt_to_bytes_f32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f32x8( @@ -4214,20 +4205,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_i8x32(self, a: i8x32, b: i8x32) -> i8x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let result = cross_block_alignr_128x2( - self.cvt_to_bytes_i8x32(b).val.0, - self.cvt_to_bytes_i8x32(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 32usize { + return b; } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_i8x32(b).val.0, + self.cvt_to_bytes_i8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i8x32( @@ -4493,20 +4483,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_u8x32(self, a: u8x32, b: u8x32) -> u8x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let result = cross_block_alignr_128x2( - self.cvt_to_bytes_u8x32(b).val.0, - self.cvt_to_bytes_u8x32(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 32usize { + return b; } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_u8x32(b).val.0, + self.cvt_to_bytes_u8x32(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u8x32( @@ -4880,20 +4869,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_i16x16(self, a: i16x16, b: i16x16) -> i16x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_128x2( - self.cvt_to_bytes_i16x16(b).val.0, - self.cvt_to_bytes_i16x16(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_i16x16(b).val.0, + self.cvt_to_bytes_i16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i16x16( @@ -5159,20 +5147,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_u16x16(self, a: u16x16, b: u16x16) -> u16x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_128x2( - self.cvt_to_bytes_u16x16(b).val.0, - self.cvt_to_bytes_u16x16(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_u16x16(b).val.0, + self.cvt_to_bytes_u16x16(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u16x16( @@ -5567,20 +5554,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_i32x8(self, a: i32x8, b: i32x8) -> i32x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_128x2( - self.cvt_to_bytes_i32x8(b).val.0, - self.cvt_to_bytes_i32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_i32x8(b).val.0, + self.cvt_to_bytes_i32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i32x8( @@ -5851,20 +5837,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_u32x8(self, a: u32x8, b: u32x8) -> u32x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_128x2( - self.cvt_to_bytes_u32x8(b).val.0, - self.cvt_to_bytes_u32x8(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_u32x8(b).val.0, + self.cvt_to_bytes_u32x8(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u32x8( @@ -6235,20 +6220,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_f64x4(self, a: f64x4, b: f64x4) -> f64x4 { - unsafe { - if SHIFT >= 4usize { - return b; - } - let result = cross_block_alignr_128x2( - self.cvt_to_bytes_f64x4(b).val.0, - self.cvt_to_bytes_f64x4(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) + if SHIFT >= 4usize { + return b; } + let result = cross_block_alignr_128x2( + self, + self.cvt_to_bytes_f64x4(b).val.0, + self.cvt_to_bytes_f64x4(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f64x4( @@ -6673,20 +6657,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_f32x16(self, a: f32x16, b: f32x16) -> f32x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_128x4( - self.cvt_to_bytes_f32x16(b).val.0, - self.cvt_to_bytes_f32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_f32x16(b).val.0, + self.cvt_to_bytes_f32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f32x16( @@ -7090,20 +7073,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_i8x64(self, a: i8x64, b: i8x64) -> i8x64 { - unsafe { - if SHIFT >= 64usize { - return b; - } - let result = cross_block_alignr_128x4( - self.cvt_to_bytes_i8x64(b).val.0, - self.cvt_to_bytes_i8x64(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x64(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 64usize { + return b; } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_i8x64(b).val.0, + self.cvt_to_bytes_i8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i8x64( @@ -7362,20 +7344,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_u8x64(self, a: u8x64, b: u8x64) -> u8x64 { - unsafe { - if SHIFT >= 64usize { - return b; - } - let result = cross_block_alignr_128x4( - self.cvt_to_bytes_u8x64(b).val.0, - self.cvt_to_bytes_u8x64(a).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x64(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 64usize { + return b; } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_u8x64(b).val.0, + self.cvt_to_bytes_u8x64(a).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u8x64( @@ -7833,20 +7814,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_i16x32(self, a: i16x32, b: i16x32) -> i16x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let result = cross_block_alignr_128x4( - self.cvt_to_bytes_i16x32(b).val.0, - self.cvt_to_bytes_i16x32(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x32(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 32usize { + return b; } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_i16x32(b).val.0, + self.cvt_to_bytes_i16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i16x32( @@ -8114,20 +8094,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_u16x32(self, a: u16x32, b: u16x32) -> u16x32 { - unsafe { - if SHIFT >= 32usize { - return b; - } - let result = cross_block_alignr_128x4( - self.cvt_to_bytes_u16x32(b).val.0, - self.cvt_to_bytes_u16x32(a).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x32(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 32usize { + return b; } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_u16x32(b).val.0, + self.cvt_to_bytes_u16x32(a).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u16x32( @@ -8575,20 +8554,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_i32x16(self, a: i32x16, b: i32x16) -> i32x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_128x4( - self.cvt_to_bytes_i32x16(b).val.0, - self.cvt_to_bytes_i32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_i32x16(b).val.0, + self.cvt_to_bytes_i32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i32x16( @@ -8852,20 +8830,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_u32x16(self, a: u32x16, b: u32x16) -> u32x16 { - unsafe { - if SHIFT >= 16usize { - return b; - } - let result = cross_block_alignr_128x4( - self.cvt_to_bytes_u32x16(b).val.0, - self.cvt_to_bytes_u32x16(a).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 16usize { + return b; } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_u32x16(b).val.0, + self.cvt_to_bytes_u32x16(a).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u32x16( @@ -9274,20 +9251,19 @@ impl Simd for Sse4_2 { } #[inline(always)] fn slide_f64x8(self, a: f64x8, b: f64x8) -> f64x8 { - unsafe { - if SHIFT >= 8usize { - return b; - } - let result = cross_block_alignr_128x4( - self.cvt_to_bytes_f64x8(b).val.0, - self.cvt_to_bytes_f64x8(a).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x8(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) + if SHIFT >= 8usize { + return b; } + let result = cross_block_alignr_128x4( + self, + self.cvt_to_bytes_f64x8(b).val.0, + self.cvt_to_bytes_f64x8(a).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f64x8( @@ -9828,12 +9804,12 @@ impl From> for __m128i { crate::transmute::checked_transmute_copy(&value.val) } } -#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] -#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] -#[doc = r" Rust doesn't currently let you do math on const generics."] -#[inline(always)] -unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i { - unsafe { +crate::kernel!( + #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"] + #[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"] + #[doc = r" Rust doesn't currently let you do math on const generics."] + #[inline(always)] + fn dyn_alignr_128(token: Sse4_2, a: __m128i, b: __m128i, shift: usize) -> __m128i { match shift { 0usize => _mm_alignr_epi8::<0i32>(a, b), 1usize => _mm_alignr_epi8::<1i32>(a, b), @@ -9854,50 +9830,62 @@ unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i { _ => unreachable!(), } } -} -#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] -#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."] -#[inline(always)] -unsafe fn cross_block_alignr_128x2( - a: [__m128i; 2usize], - b: [__m128i; 2usize], - shift_bytes: usize, -) -> [__m128i; 2usize] { - [ - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes); - unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } - }, - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes); - unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } - }, - ] -} -#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] -#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."] -#[inline(always)] -unsafe fn cross_block_alignr_128x4( - a: [__m128i; 4usize], - b: [__m128i; 4usize], - shift_bytes: usize, -) -> [__m128i; 4usize] { - [ - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes); - unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } - }, - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes); - unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } - }, - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 2usize, shift_bytes); - unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } - }, - { - let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 3usize, shift_bytes); - unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } - }, - ] -} +); +crate::kernel!( + #[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] + #[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."] + #[inline(always)] + fn cross_block_alignr_128x2( + token: Sse4_2, + a: [__m128i; 2usize], + b: [__m128i; 2usize], + shift_bytes: usize, + ) -> [__m128i; 2usize] { + [ + { + let [lo, hi] = + crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes); + dyn_alignr_128(token, hi, lo, shift_bytes % 16) + }, + { + let [lo, hi] = + crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes); + dyn_alignr_128(token, hi, lo, shift_bytes % 16) + }, + ] + } +); +crate::kernel!( + #[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."] + #[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."] + #[inline(always)] + fn cross_block_alignr_128x4( + token: Sse4_2, + a: [__m128i; 4usize], + b: [__m128i; 4usize], + shift_bytes: usize, + ) -> [__m128i; 4usize] { + [ + { + let [lo, hi] = + crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes); + dyn_alignr_128(token, hi, lo, shift_bytes % 16) + }, + { + let [lo, hi] = + crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes); + dyn_alignr_128(token, hi, lo, shift_bytes % 16) + }, + { + let [lo, hi] = + crate::support::cross_block_slide_blocks_at(&b, &a, 2usize, shift_bytes); + dyn_alignr_128(token, hi, lo, shift_bytes % 16) + }, + { + let [lo, hi] = + crate::support::cross_block_slide_blocks_at(&b, &a, 3usize, shift_bytes); + dyn_alignr_128(token, hi, lo, shift_bytes % 16) + }, + ] + } +); diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs index 4dfafd2c..401cf164 100644 --- a/fearless_simd_gen/src/mk_neon.rs +++ b/fearless_simd_gen/src/mk_neon.rs @@ -383,9 +383,7 @@ impl Level for Neon { } (WithinBlocks, _) | (_, 128) => { quote! { - unsafe { - dyn_vext_128(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift) - } + dyn_vext_128(self, self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift) } } (AcrossBlocks, 256 | 512) => { @@ -398,7 +396,7 @@ impl Level for Neon { let bytes_arch_ty = self.arch_ty(&bytes_ty); quote! { - unsafe { + { let a_bytes = self.#to_bytes(a).val.0; let b_bytes = self.#to_bytes(b).val.0; let a_blocks = [#( a_bytes.#blocks ),*]; @@ -407,7 +405,7 @@ impl Level for Neon { let shift_bytes = #byte_shift; #bytes_arch_ty(#({ let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a_blocks, &b_blocks, #blocks3, shift_bytes); - dyn_vext_128(lo, hi, shift_bytes % 16) + dyn_vext_128(self, lo, hi, shift_bytes % 16) }),*) } } @@ -640,17 +638,17 @@ fn mk_slide_helpers() -> TokenStream { }); quote! { - /// This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still - /// expected to be constant in practice, so the match statement will be optimized out. This exists because - /// Rust doesn't currently let you do math on const generics. - #[inline(always)] - unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t { - unsafe { + crate::kernel!( + /// This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still + /// expected to be constant in practice, so the match statement will be optimized out. This exists because + /// Rust doesn't currently let you do math on const generics. + #[inline(always)] + fn dyn_vext_128(neon: Neon, a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t { match shift { #(#shifts,)* _ => unreachable!() } } - } + ); } } diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs index ddc07ab5..9a910055 100644 --- a/fearless_simd_gen/src/mk_x86.rs +++ b/fearless_simd_gen/src/mk_x86.rs @@ -1502,17 +1502,15 @@ impl X86 { quote! { #method_sig { - unsafe { - if SHIFT >= #max_shift { - return b; - } - - // b and a are swapped here to match ARM's vext semantics. For vext, we can think of `a` as the "left", - // and we concatenate `b` to its "right". This makes sense, since `a` is the left-hand side and `b` is - // the right-hand side. x86's `alignr` is backwards, and treats `b` as the high/left block. - let result = #alignr_op(self.#to_bytes(b).val.0, self.#to_bytes(a).val.0, #byte_shift); - self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) + if SHIFT >= #max_shift { + return b; } + + // b and a are swapped here to match ARM's vext semantics. For vext, we can think of `a` as the "left", + // and we concatenate `b` to its "right". This makes sense, since `a` is the left-hand side and `b` is + // the right-hand side. x86's `alignr` is backwards, and treats `b` as the high/left block. + let result = #alignr_op(self, self.#to_bytes(b).val.0, self.#to_bytes(a).val.0, #byte_shift); + self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) } } } @@ -2057,6 +2055,7 @@ impl X86 { /// `vext` and our `slide` operation, and the 256-bit AVX2 version still operates *within* 128-bit lanes. fn dyn_alignr_helpers(&self) -> TokenStream { let mut fns = vec![]; + let token_ty = self.token(); let vec_widths: &[usize] = match self { Self::Sse4_2 => &[128], @@ -2077,18 +2076,18 @@ impl X86 { }); fns.push(quote! { - /// This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still - /// expected to be constant in practice, so the match statement will be optimized out. This exists because - /// Rust doesn't currently let you do math on const generics. - #[inline(always)] - unsafe fn #helper_name(a: #arch_ty, b: #arch_ty, shift: usize) -> #arch_ty { - unsafe { + crate::kernel!( + /// This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still + /// expected to be constant in practice, so the match statement will be optimized out. This exists because + /// Rust doesn't currently let you do math on const generics. + #[inline(always)] + fn #helper_name(token: #token_ty, a: #arch_ty, b: #arch_ty, shift: usize) -> #arch_ty { match shift { #(#shifts,)* _ => unreachable!() } } - } + ); }); } @@ -2105,15 +2104,17 @@ impl X86 { // Unroll the construction of the blocks. I tried using `array::from_fn`, but the compiler thought the // closure was too big and didn't inline it. fns.push(quote! { - /// Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`. - /// Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics. - #[inline(always)] - unsafe fn #helper_name(a: [__m128i; #num_blocks], b: [__m128i; #num_blocks], shift_bytes: usize) -> [__m128i; #num_blocks] { - [#({ - let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, #blocks_idx, shift_bytes); - unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) } - }),*] - } + crate::kernel!( + /// Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`. + /// Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics. + #[inline(always)] + fn #helper_name(token: Sse4_2, a: [__m128i; #num_blocks], b: [__m128i; #num_blocks], shift_bytes: usize) -> [__m128i; #num_blocks] { + [#({ + let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, #blocks_idx, shift_bytes); + dyn_alignr_128(token, hi, lo, shift_bytes % 16) + }),*] + } + ); }); } @@ -2124,57 +2125,59 @@ impl X86 { fn avx2_slide_helpers() -> TokenStream { quote! { - /// Computes one output __m256i for `cross_block_alignr_*` operations. - /// - /// Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and - /// `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`. - #[inline(always)] - unsafe fn cross_block_alignr_one(regs: &[__m256i], block_idx: usize, shift_bytes: usize) -> __m256i { - let lo_idx = block_idx + (shift_bytes / 16); - let intra_shift = shift_bytes % 16; - let lo_blocks = if lo_idx & 1 == 0 { - regs[lo_idx / 2] - } else { - unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) } - }; + crate::kernel!( + /// Computes one output __m256i for `cross_block_alignr_*` operations. + /// + /// Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and + /// `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`. + #[inline(always)] + fn cross_block_alignr_one(token: Avx2, regs: &[__m256i], block_idx: usize, shift_bytes: usize) -> __m256i { + let lo_idx = block_idx + (shift_bytes / 16); + let intra_shift = shift_bytes % 16; + let lo_blocks = if lo_idx & 1 == 0 { + regs[lo_idx / 2] + } else { + _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) + }; - // For hi_blocks, we need blocks (`lo_idx + 1`) and (`lo_idx + 2`) - let hi_idx = lo_idx + 1; - let hi_blocks = if hi_idx & 1 == 0 { - regs[hi_idx / 2] - } else { - unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) } - }; + // For hi_blocks, we need blocks (`lo_idx + 1`) and (`lo_idx + 2`) + let hi_idx = lo_idx + 1; + let hi_blocks = if hi_idx & 1 == 0 { + regs[hi_idx / 2] + } else { + _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) + }; - unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) } - } + dyn_alignr_256(token, hi_blocks, lo_blocks, intra_shift) + } + ); - /// Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset - /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics. - #[inline(always)] - unsafe fn cross_block_alignr_256x2(a: [__m256i; 2], b: [__m256i; 2], shift_bytes: usize) -> [__m256i; 2] { - // Concatenation is [b : a], so b blocks come first - let regs = [b[0], b[1], a[0], a[1]]; + crate::kernel!( + /// Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset + /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics. + #[inline(always)] + fn cross_block_alignr_256x2(token: Avx2, a: [__m256i; 2], b: [__m256i; 2], shift_bytes: usize) -> [__m256i; 2] { + // Concatenation is [b : a], so b blocks come first + let regs = [b[0], b[1], a[0], a[1]]; - unsafe { [ - cross_block_alignr_one(®s, 0, shift_bytes), - cross_block_alignr_one(®s, 2, shift_bytes), + cross_block_alignr_one(token, ®s, 0, shift_bytes), + cross_block_alignr_one(token, ®s, 2, shift_bytes), ] } - } + ); - /// Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset - /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics. - #[inline(always)] - unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i { - // Concatenation is [b : a], so b comes first - let regs = [b, a]; + crate::kernel!( + /// Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset + /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics. + #[inline(always)] + fn cross_block_alignr_256x1(token: Avx2, a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i { + // Concatenation is [b : a], so b comes first + let regs = [b, a]; - unsafe { - cross_block_alignr_one(®s, 0, shift_bytes) + cross_block_alignr_one(token, ®s, 0, shift_bytes) } - } + ); } } } From 928ea6862e9f95c4007b697be3cbb3ae439e5cf6 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 16 Jun 2026 22:16:31 +0100 Subject: [PATCH 4/5] Revert "Add limited const generic support to kernel! macro" because it's not needed after all This reverts commit 284299f0a30ab829d2a6ed0e45f634f8b540d4fa. --- fearless_simd/src/kernel_macros.rs | 99 +++--------------------------- 1 file changed, 8 insertions(+), 91 deletions(-) diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs index 676c9d63..a2ff2c21 100644 --- a/fearless_simd/src/kernel_macros.rs +++ b/fearless_simd/src/kernel_macros.rs @@ -50,8 +50,7 @@ /// /// ## Limitations /// -/// The macro only accepts a single plain, safe function item with simple named parameters. -/// The function may optionally have one const generic parameter written as ``. +/// The macro only accepts a single plain, safe, non-generic function item with simple named parameters. /// However, the body of the function can be as complex as you like. /// /// The SIMD token type must be written as a bare supported name: @@ -67,7 +66,7 @@ macro_rules! kernel { ( $(#[$meta:meta])* - $vis:vis fn $name:ident $()?( + $vis:vis fn $name:ident( $token:ident : $token_ty:ident $(, $arg:ident : $arg_ty:ty)* $(,)? ) $(-> $ret:ty)? { $($kernel_body:tt)* @@ -76,7 +75,7 @@ macro_rules! kernel { $crate::__fearless_simd_kernel_dispatch! { $token_ty, $(#[$meta])* - $vis fn $name $()?( + $vis fn $name( $token $(, $arg: $arg_ty)* ) $(-> $ret)? { $($kernel_body)* @@ -86,7 +85,7 @@ macro_rules! kernel { ( $(#[$meta:meta])* - $vis:vis fn $name:ident $()?( + $vis:vis fn $name:ident( $token:ident : $token_ty:ty $(, $arg:ident : $arg_ty:ty)* $(,)? ) $(-> $ret:ty)? { $($kernel_body:tt)* @@ -175,7 +174,7 @@ macro_rules! __fearless_simd_kernel_impl { @token_ty $token_ty:ty; @kernel_attrs $(#[$kernel_attr:meta])*; $(#[$meta:meta])* - $vis:vis fn $name:ident $()?( + $vis:vis fn $name:ident( $token:ident $(, $arg:ident : $arg_ty:ty)* $(,)? ) $(-> $ret:ty)? { $($kernel_body:tt)* @@ -183,12 +182,12 @@ macro_rules! __fearless_simd_kernel_impl { ) => { #[cfg($cfg)] $(#[$meta])* - $vis fn $name $()?( + $vis fn $name( $token: $token_ty $(, $arg: $arg_ty)* ) $(-> $ret)? { #[inline] // can't use `#[inline(always)]` with target features $(#[$kernel_attr])* - fn __fearless_simd_kernel $()?( + fn __fearless_simd_kernel( $token: $token_ty $(, $arg: $arg_ty)* ) $(-> $ret)? { let _ = $token; @@ -197,7 +196,7 @@ macro_rules! __fearless_simd_kernel_impl { // SAFETY: the SIMD token proves that the required target features are available. #[allow(unused_unsafe, reason = "for WASM which has no target feature requirements and is safe to call")] - unsafe { __fearless_simd_kernel $(::<$const_param>)?($token $(, $arg)*) } + unsafe { __fearless_simd_kernel($token $(, $arg)*) } } }; } @@ -227,43 +226,18 @@ mod tests { } ); - crate::kernel!( - fn add_f32x4_neon_const( - neon: Neon, - a: float32x4_t, - b: float32x4_t, - ) -> float32x4_t { - let _ = LANES; - vaddq_f32(a, b) - } - ); - crate::kernel!( fn add_f32x4_wasm(wasm: WasmSimd128, a: v128, b: v128) -> v128 { f32x4_add(a, b) } ); - crate::kernel!( - fn add_f32x4_wasm_const(wasm: WasmSimd128, a: v128, b: v128) -> v128 { - let _ = LANES; - f32x4_add(a, b) - } - ); - crate::kernel!( fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i { _mm256_add_epi32(a, b) } ); - crate::kernel!( - fn add_i32x8_avx2_const(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i { - let _ = LANES; - _mm256_add_epi32(a, b) - } - ); - #[cfg(target_arch = "aarch64")] #[test] fn kernel_instantiates_for_neon() { @@ -282,25 +256,6 @@ mod tests { ); } - #[cfg(target_arch = "aarch64")] - #[test] - fn kernel_instantiates_const_generic_for_neon() { - let Some(neon) = crate::Level::new().as_neon() else { - return; - }; - - let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon); - let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon); - let sum: crate::f32x4<_> = - add_f32x4_neon_const::<4>(neon, a.into(), b.into()).simd_into(neon); - - assert_eq!( - <[f32; 4]>::from(sum), - [11.0, 22.0, 33.0, 44.0], - "`kernel!` should instantiate a const-generic NEON kernel" - ); - } - #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] #[test] fn kernel_instantiates_for_wasm_simd128() { @@ -319,25 +274,6 @@ mod tests { ); } - #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] - #[test] - fn kernel_instantiates_const_generic_for_wasm_simd128() { - let wasm = crate::Level::new() - .as_wasm_simd128() - .expect("WASM SIMD128 should be available when +simd128 is enabled"); - - let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm); - let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm); - let sum: crate::f32x4<_> = - add_f32x4_wasm_const::<4>(wasm, a.into(), b.into()).simd_into(wasm); - - assert_eq!( - <[f32; 4]>::from(sum), - [11.0, 22.0, 33.0, 44.0], - "`kernel!` should instantiate a const-generic WASM SIMD128 kernel" - ); - } - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] #[test] fn kernel_instantiates_for_avx2() { @@ -355,23 +291,4 @@ mod tests { "`kernel!` should instantiate a working AVX2 kernel" ); } - - #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] - #[test] - fn kernel_instantiates_const_generic_for_avx2() { - let Some(avx2) = crate::Level::new().as_avx2() else { - return; - }; - - let a: crate::i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2); - let b: crate::i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2); - let sum: crate::i32x8<_> = - add_i32x8_avx2_const::<8>(avx2, a.into(), b.into()).simd_into(avx2); - - assert_eq!( - <[i32; 8]>::from(sum), - [11, 22, 33, 44, 55, 66, 77, 88], - "`kernel!` should instantiate a const-generic AVX2 kernel" - ); - } } From 05daa040a41781349e177e5d25b5e22ba6e268d1 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Tue, 16 Jun 2026 22:21:34 +0100 Subject: [PATCH 5/5] Remove unnecessary `unsafe` from WASM --- fearless_simd/src/generated/wasm.rs | 480 +++++++++++++--------------- fearless_simd_gen/src/mk_wasm.rs | 6 +- 2 files changed, 218 insertions(+), 268 deletions(-) diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs index 7c54c052..8762cf03 100644 --- a/fearless_simd/src/generated/wasm.rs +++ b/fearless_simd/src/generated/wasm.rs @@ -138,17 +138,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 4usize { return b; } - unsafe { - let result = dyn_slide_128( - self.cvt_to_bytes_f32x4(a).val.0, - self.cvt_to_bytes_f32x4(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x4(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) - } + let result = dyn_slide_128( + self.cvt_to_bytes_f32x4(a).val.0, + self.cvt_to_bytes_f32x4(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f32x4( @@ -434,17 +432,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 16usize { return b; } - unsafe { - let result = dyn_slide_128( - self.cvt_to_bytes_i8x16(a).val.0, - self.cvt_to_bytes_i8x16(b).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x16(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) - } + let result = dyn_slide_128( + self.cvt_to_bytes_i8x16(a).val.0, + self.cvt_to_bytes_i8x16(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i8x16( @@ -649,17 +645,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 16usize { return b; } - unsafe { - let result = dyn_slide_128( - self.cvt_to_bytes_u8x16(a).val.0, - self.cvt_to_bytes_u8x16(b).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x16(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) - } + let result = dyn_slide_128( + self.cvt_to_bytes_u8x16(a).val.0, + self.cvt_to_bytes_u8x16(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x16(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u8x16( @@ -950,17 +944,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 8usize { return b; } - unsafe { - let result = dyn_slide_128( - self.cvt_to_bytes_i16x8(a).val.0, - self.cvt_to_bytes_i16x8(b).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x8(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) - } + let result = dyn_slide_128( + self.cvt_to_bytes_i16x8(a).val.0, + self.cvt_to_bytes_i16x8(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i16x8( @@ -1149,17 +1141,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 8usize { return b; } - unsafe { - let result = dyn_slide_128( - self.cvt_to_bytes_u16x8(a).val.0, - self.cvt_to_bytes_u16x8(b).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x8(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) - } + let result = dyn_slide_128( + self.cvt_to_bytes_u16x8(a).val.0, + self.cvt_to_bytes_u16x8(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x8(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u16x8( @@ -1430,17 +1420,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 4usize { return b; } - unsafe { - let result = dyn_slide_128( - self.cvt_to_bytes_i32x4(a).val.0, - self.cvt_to_bytes_i32x4(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x4(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) - } + let result = dyn_slide_128( + self.cvt_to_bytes_i32x4(a).val.0, + self.cvt_to_bytes_i32x4(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i32x4( @@ -1633,17 +1621,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 4usize { return b; } - unsafe { - let result = dyn_slide_128( - self.cvt_to_bytes_u32x4(a).val.0, - self.cvt_to_bytes_u32x4(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x4(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) - } + let result = dyn_slide_128( + self.cvt_to_bytes_u32x4(a).val.0, + self.cvt_to_bytes_u32x4(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x4(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u32x4( @@ -1914,17 +1900,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 2usize { return b; } - unsafe { - let result = dyn_slide_128( - self.cvt_to_bytes_f64x2(a).val.0, - self.cvt_to_bytes_f64x2(b).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x2(u8x16 { - val: crate::support::Aligned128(result), - simd: self, - }) - } + let result = dyn_slide_128( + self.cvt_to_bytes_f64x2(a).val.0, + self.cvt_to_bytes_f64x2(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x2(u8x16 { + val: crate::support::Aligned128(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f64x2( @@ -2255,17 +2239,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 8usize { return b; } - unsafe { - let result = cross_block_slide_128x2( - self.cvt_to_bytes_f32x8(a).val.0, - self.cvt_to_bytes_f32x8(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_f32x8(a).val.0, + self.cvt_to_bytes_f32x8(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f32x8( @@ -2627,17 +2609,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 32usize { return b; } - unsafe { - let result = cross_block_slide_128x2( - self.cvt_to_bytes_i8x32(a).val.0, - self.cvt_to_bytes_i8x32(b).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_i8x32(a).val.0, + self.cvt_to_bytes_i8x32(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i8x32( @@ -2906,17 +2886,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 32usize { return b; } - unsafe { - let result = cross_block_slide_128x2( - self.cvt_to_bytes_u8x32(a).val.0, - self.cvt_to_bytes_u8x32(b).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x32(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u8x32(a).val.0, + self.cvt_to_bytes_u8x32(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x32(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u8x32( @@ -3293,17 +3271,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 16usize { return b; } - unsafe { - let result = cross_block_slide_128x2( - self.cvt_to_bytes_i16x16(a).val.0, - self.cvt_to_bytes_i16x16(b).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_i16x16(a).val.0, + self.cvt_to_bytes_i16x16(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i16x16( @@ -3572,17 +3548,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 16usize { return b; } - unsafe { - let result = cross_block_slide_128x2( - self.cvt_to_bytes_u16x16(a).val.0, - self.cvt_to_bytes_u16x16(b).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x16(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u16x16(a).val.0, + self.cvt_to_bytes_u16x16(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x16(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u16x16( @@ -3968,17 +3942,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 8usize { return b; } - unsafe { - let result = cross_block_slide_128x2( - self.cvt_to_bytes_i32x8(a).val.0, - self.cvt_to_bytes_i32x8(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_i32x8(a).val.0, + self.cvt_to_bytes_i32x8(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i32x8( @@ -4252,17 +4224,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 8usize { return b; } - unsafe { - let result = cross_block_slide_128x2( - self.cvt_to_bytes_u32x8(a).val.0, - self.cvt_to_bytes_u32x8(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x8(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_u32x8(a).val.0, + self.cvt_to_bytes_u32x8(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x8(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u32x8( @@ -4636,17 +4606,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 4usize { return b; } - unsafe { - let result = cross_block_slide_128x2( - self.cvt_to_bytes_f64x4(a).val.0, - self.cvt_to_bytes_f64x4(b).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x4(u8x32 { - val: crate::support::Aligned256(result), - simd: self, - }) - } + let result = cross_block_slide_128x2( + self.cvt_to_bytes_f64x4(a).val.0, + self.cvt_to_bytes_f64x4(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x4(u8x32 { + val: crate::support::Aligned256(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f64x4( @@ -5074,17 +5042,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 16usize { return b; } - unsafe { - let result = cross_block_slide_128x4( - self.cvt_to_bytes_f32x16(a).val.0, - self.cvt_to_bytes_f32x16(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_f32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_f32x16(a).val.0, + self.cvt_to_bytes_f32x16(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_f32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f32x16( @@ -5484,17 +5450,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 64usize { return b; } - unsafe { - let result = cross_block_slide_128x4( - self.cvt_to_bytes_i8x64(a).val.0, - self.cvt_to_bytes_i8x64(b).val.0, - SHIFT, - ); - self.cvt_from_bytes_i8x64(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_i8x64(a).val.0, + self.cvt_to_bytes_i8x64(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_i8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i8x64( @@ -5756,17 +5720,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 64usize { return b; } - unsafe { - let result = cross_block_slide_128x4( - self.cvt_to_bytes_u8x64(a).val.0, - self.cvt_to_bytes_u8x64(b).val.0, - SHIFT, - ); - self.cvt_from_bytes_u8x64(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_u8x64(a).val.0, + self.cvt_to_bytes_u8x64(b).val.0, + SHIFT, + ); + self.cvt_from_bytes_u8x64(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u8x64( @@ -6193,17 +6155,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 32usize { return b; } - unsafe { - let result = cross_block_slide_128x4( - self.cvt_to_bytes_i16x32(a).val.0, - self.cvt_to_bytes_i16x32(b).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_i16x32(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_i16x32(a).val.0, + self.cvt_to_bytes_i16x32(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_i16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i16x32( @@ -6474,17 +6434,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 32usize { return b; } - unsafe { - let result = cross_block_slide_128x4( - self.cvt_to_bytes_u16x32(a).val.0, - self.cvt_to_bytes_u16x32(b).val.0, - SHIFT * 2usize, - ); - self.cvt_from_bytes_u16x32(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_u16x32(a).val.0, + self.cvt_to_bytes_u16x32(b).val.0, + SHIFT * 2usize, + ); + self.cvt_from_bytes_u16x32(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u16x32( @@ -6909,17 +6867,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 16usize { return b; } - unsafe { - let result = cross_block_slide_128x4( - self.cvt_to_bytes_i32x16(a).val.0, - self.cvt_to_bytes_i32x16(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_i32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_i32x16(a).val.0, + self.cvt_to_bytes_i32x16(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_i32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_i32x16( @@ -7186,17 +7142,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 16usize { return b; } - unsafe { - let result = cross_block_slide_128x4( - self.cvt_to_bytes_u32x16(a).val.0, - self.cvt_to_bytes_u32x16(b).val.0, - SHIFT * 4usize, - ); - self.cvt_from_bytes_u32x16(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_u32x16(a).val.0, + self.cvt_to_bytes_u32x16(b).val.0, + SHIFT * 4usize, + ); + self.cvt_from_bytes_u32x16(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_u32x16( @@ -7601,17 +7555,15 @@ impl Simd for WasmSimd128 { if SHIFT >= 8usize { return b; } - unsafe { - let result = cross_block_slide_128x4( - self.cvt_to_bytes_f64x8(a).val.0, - self.cvt_to_bytes_f64x8(b).val.0, - SHIFT * 8usize, - ); - self.cvt_from_bytes_f64x8(u8x64 { - val: crate::support::Aligned512(result), - simd: self, - }) - } + let result = cross_block_slide_128x4( + self.cvt_to_bytes_f64x8(a).val.0, + self.cvt_to_bytes_f64x8(b).val.0, + SHIFT * 8usize, + ); + self.cvt_from_bytes_f64x8(u8x64 { + val: crate::support::Aligned512(result), + simd: self, + }) } #[inline(always)] fn slide_within_blocks_f64x8( diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs index 88ef9e44..eb1b4915 100644 --- a/fearless_simd_gen/src/mk_wasm.rs +++ b/fearless_simd_gen/src/mk_wasm.rs @@ -478,10 +478,8 @@ impl Level for WasmSimd128 { return b; } - unsafe { - let result = #slide_op(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift); - self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) - } + let result = #slide_op(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift); + self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self }) } } }