From 284299f0a30ab829d2a6ed0e45f634f8b540d4fa Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 16 Jun 2026 21:48:51 +0100
Subject: [PATCH 1/5] Add limited const generic support to kernel! macro

This is a lot less complex and scary than I feared it would be.
---
 fearless_simd/src/kernel_macros.rs | 99 +++++++++++++++++++++++++++---
 1 file changed, 91 insertions(+), 8 deletions(-)
diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs
index a2ff2c21..676c9d63 100644
--- a/fearless_simd/src/kernel_macros.rs
+++ b/fearless_simd/src/kernel_macros.rs
@@ -50,7 +50,8 @@
 ///
 /// ## Limitations
 ///
-/// The macro only accepts a single plain, safe, non-generic function item with simple named parameters.
+/// The macro only accepts a single plain, safe function item with simple named parameters.
+/// The function may optionally have one const generic parameter written as `<const NAME: usize>`.
 /// However, the body of the function can be as complex as you like.
 ///
 /// The SIMD token type must be written as a bare supported name:
@@ -66,7 +67,7 @@
 macro_rules! kernel {
     (
         $(#[$meta:meta])*
-        $vis:vis fn $name:ident(
+        $vis:vis fn $name:ident $(<const $const_param:ident : usize>)?(
             $token:ident : $token_ty:ident $(, $arg:ident : $arg_ty:ty)* $(,)?
         ) $(-> $ret:ty)? {
             $($kernel_body:tt)*
@@ -75,7 +76,7 @@ macro_rules! kernel {
         $crate::__fearless_simd_kernel_dispatch! {
             $token_ty,
             $(#[$meta])*
-            $vis fn $name(
+            $vis fn $name $(<const $const_param: usize>)?(
                 $token $(, $arg: $arg_ty)*
             ) $(-> $ret)? {
                 $($kernel_body)*
@@ -85,7 +86,7 @@ macro_rules! kernel {
 
     (
         $(#[$meta:meta])*
-        $vis:vis fn $name:ident(
+        $vis:vis fn $name:ident $(<const $const_param:ident : usize>)?(
             $token:ident : $token_ty:ty $(, $arg:ident : $arg_ty:ty)* $(,)?
         ) $(-> $ret:ty)? {
             $($kernel_body:tt)*
@@ -174,7 +175,7 @@ macro_rules! __fearless_simd_kernel_impl {
         @token_ty $token_ty:ty;
         @kernel_attrs $(#[$kernel_attr:meta])*;
         $(#[$meta:meta])*
-        $vis:vis fn $name:ident(
+        $vis:vis fn $name:ident $(<const $const_param:ident : usize>)?(
             $token:ident $(, $arg:ident : $arg_ty:ty)* $(,)?
         ) $(-> $ret:ty)? {
             $($kernel_body:tt)*
@@ -182,12 +183,12 @@ macro_rules! __fearless_simd_kernel_impl {
     ) => {
         #[cfg($cfg)]
         $(#[$meta])*
-        $vis fn $name(
+        $vis fn $name $(<const $const_param: usize>)?(
             $token: $token_ty $(, $arg: $arg_ty)*
         ) $(-> $ret)? {
             #[inline] // can't use `#[inline(always)]` with target features
             $(#[$kernel_attr])*
-            fn __fearless_simd_kernel(
+            fn __fearless_simd_kernel $(<const $const_param: usize>)?(
                 $token: $token_ty $(, $arg: $arg_ty)*
             ) $(-> $ret)? {
                 let _ = $token;
@@ -196,7 +197,7 @@ macro_rules! __fearless_simd_kernel_impl {
 
             // SAFETY: the SIMD token proves that the required target features are available.
             #[allow(unused_unsafe, reason = "for WASM which has no target feature requirements and is safe to call")]
-            unsafe { __fearless_simd_kernel($token $(, $arg)*) }
+            unsafe { __fearless_simd_kernel $(::<$const_param>)?($token $(, $arg)*) }
         }
     };
 }
@@ -226,18 +227,43 @@ mod tests {
         }
     );
 
+    crate::kernel!(
+        fn add_f32x4_neon_const<const LANES: usize>(
+            neon: Neon,
+            a: float32x4_t,
+            b: float32x4_t,
+        ) -> float32x4_t {
+            let _ = LANES;
+            vaddq_f32(a, b)
+        }
+    );
+
     crate::kernel!(
         fn add_f32x4_wasm(wasm: WasmSimd128, a: v128, b: v128) -> v128 {
             f32x4_add(a, b)
         }
     );
 
+    crate::kernel!(
+        fn add_f32x4_wasm_const<const LANES: usize>(wasm: WasmSimd128, a: v128, b: v128) -> v128 {
+            let _ = LANES;
+            f32x4_add(a, b)
+        }
+    );
+
     crate::kernel!(
         fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
             _mm256_add_epi32(a, b)
         }
     );
 
+    crate::kernel!(
+        fn add_i32x8_avx2_const<const LANES: usize>(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
+            let _ = LANES;
+            _mm256_add_epi32(a, b)
+        }
+    );
+
     #[cfg(target_arch = "aarch64")]
     #[test]
     fn kernel_instantiates_for_neon() {
@@ -256,6 +282,25 @@ mod tests {
         );
     }
 
+    #[cfg(target_arch = "aarch64")]
+    #[test]
+    fn kernel_instantiates_const_generic_for_neon() {
+        let Some(neon) = crate::Level::new().as_neon() else {
+            return;
+        };
+
+        let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon);
+        let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon);
+        let sum: crate::f32x4<_> =
+            add_f32x4_neon_const::<4>(neon, a.into(), b.into()).simd_into(neon);
+
+        assert_eq!(
+            <[f32; 4]>::from(sum),
+            [11.0, 22.0, 33.0, 44.0],
+            "`kernel!` should instantiate a const-generic NEON kernel"
+        );
+    }
+
     #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
     #[test]
     fn kernel_instantiates_for_wasm_simd128() {
@@ -274,6 +319,25 @@ mod tests {
         );
     }
 
+    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+    #[test]
+    fn kernel_instantiates_const_generic_for_wasm_simd128() {
+        let wasm = crate::Level::new()
+            .as_wasm_simd128()
+            .expect("WASM SIMD128 should be available when +simd128 is enabled");
+
+        let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm);
+        let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm);
+        let sum: crate::f32x4<_> =
+            add_f32x4_wasm_const::<4>(wasm, a.into(), b.into()).simd_into(wasm);
+
+        assert_eq!(
+            <[f32; 4]>::from(sum),
+            [11.0, 22.0, 33.0, 44.0],
+            "`kernel!` should instantiate a const-generic WASM SIMD128 kernel"
+        );
+    }
+
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     #[test]
     fn kernel_instantiates_for_avx2() {
@@ -291,4 +355,23 @@ mod tests {
             "`kernel!` should instantiate a working AVX2 kernel"
         );
     }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[test]
+    fn kernel_instantiates_const_generic_for_avx2() {
+        let Some(avx2) = crate::Level::new().as_avx2() else {
+            return;
+        };
+
+        let a: crate::i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);
+        let b: crate::i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);
+        let sum: crate::i32x8<_> =
+            add_i32x8_avx2_const::<8>(avx2, a.into(), b.into()).simd_into(avx2);
+
+        assert_eq!(
+            <[i32; 8]>::from(sum),
+            [11, 22, 33, 44, 55, 66, 77, 88],
+            "`kernel!` should instantiate a const-generic AVX2 kernel"
+        );
+    }
 }

From 63df22ed88422093f876440d1253f4d0a5983e91 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 16 Jun 2026 22:01:25 +0100
Subject: [PATCH 2/5] Drop unnecessary unsafe from WASM helpers, they never
 needed it

---
 fearless_simd/src/generated/wasm.rs | 72 +++++++++++------------------
 fearless_simd_gen/src/mk_wasm.rs    | 14 +++---
 2 files changed, 33 insertions(+), 53 deletions(-)

diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index 2963ca6b..7c54c052 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -8156,48 +8156,30 @@ impl<S: Simd> From<mask64x2<S>> for v128 {
 #[doc = r" The shift is still expected to be constant in practice, so the match statement will be optimized out."]
 #[doc = r" This exists because Rust doesn't currently let you do math on const generics."]
 #[inline(always)]
-unsafe fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 {
-    unsafe {
-        match shift {
-            0 => i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>(a, b),
-            1 => i8x16_shuffle::<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16>(a, b),
-            2 => i8x16_shuffle::<2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17>(a, b),
-            3 => i8x16_shuffle::<3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18>(a, b),
-            4 => i8x16_shuffle::<4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19>(a, b),
-            5 => i8x16_shuffle::<5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20>(a, b),
-            6 => i8x16_shuffle::<6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21>(a, b),
-            7 => i8x16_shuffle::<7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22>(a, b),
-            8 => {
-                i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23>(a, b)
-            }
-            9 => {
-                i8x16_shuffle::<9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24>(a, b)
-            }
-            10 => i8x16_shuffle::<10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25>(
-                a, b,
-            ),
-            11 => i8x16_shuffle::<11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26>(
-                a, b,
-            ),
-            12 => i8x16_shuffle::<12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27>(
-                a, b,
-            ),
-            13 => i8x16_shuffle::<13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28>(
-                a, b,
-            ),
-            14 => i8x16_shuffle::<14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29>(
-                a, b,
-            ),
-            15 => i8x16_shuffle::<15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30>(
-                a, b,
-            ),
-            _ => unreachable!(),
-        }
+fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 {
+    match shift {
+        0 => i8x16_shuffle::<0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15>(a, b),
+        1 => i8x16_shuffle::<1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16>(a, b),
+        2 => i8x16_shuffle::<2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17>(a, b),
+        3 => i8x16_shuffle::<3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18>(a, b),
+        4 => i8x16_shuffle::<4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19>(a, b),
+        5 => i8x16_shuffle::<5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20>(a, b),
+        6 => i8x16_shuffle::<6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21>(a, b),
+        7 => i8x16_shuffle::<7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22>(a, b),
+        8 => i8x16_shuffle::<8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23>(a, b),
+        9 => i8x16_shuffle::<9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24>(a, b),
+        10 => i8x16_shuffle::<10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25>(a, b),
+        11 => i8x16_shuffle::<11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26>(a, b),
+        12 => i8x16_shuffle::<12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27>(a, b),
+        13 => i8x16_shuffle::<13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28>(a, b),
+        14 => i8x16_shuffle::<14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29>(a, b),
+        15 => i8x16_shuffle::<15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30>(a, b),
+        _ => unreachable!(),
     }
 }
 #[doc = r" Concatenates `a` and `b` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
 #[inline(always)]
-unsafe fn cross_block_slide_128x2(
+fn cross_block_slide_128x2(
     a: [v128; 2usize],
     b: [v128; 2usize],
     shift_bytes: usize,
@@ -8205,17 +8187,17 @@ unsafe fn cross_block_slide_128x2(
     [
         {
             let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 0usize, shift_bytes);
-            unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) }
+            dyn_slide_128(lo, hi, shift_bytes % 16)
         },
         {
             let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 1usize, shift_bytes);
-            unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) }
+            dyn_slide_128(lo, hi, shift_bytes % 16)
         },
     ]
 }
 #[doc = r" Concatenates `a` and `b` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
 #[inline(always)]
-unsafe fn cross_block_slide_128x4(
+fn cross_block_slide_128x4(
     a: [v128; 4usize],
     b: [v128; 4usize],
     shift_bytes: usize,
@@ -8223,19 +8205,19 @@ unsafe fn cross_block_slide_128x4(
     [
         {
             let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 0usize, shift_bytes);
-            unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) }
+            dyn_slide_128(lo, hi, shift_bytes % 16)
         },
         {
             let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 1usize, shift_bytes);
-            unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) }
+            dyn_slide_128(lo, hi, shift_bytes % 16)
         },
         {
             let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 2usize, shift_bytes);
-            unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) }
+            dyn_slide_128(lo, hi, shift_bytes % 16)
         },
         {
             let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, 3usize, shift_bytes);
-            unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) }
+            dyn_slide_128(lo, hi, shift_bytes % 16)
         },
     ]
 }
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index 1a9d35bf..88ef9e44 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -798,12 +798,10 @@ fn mk_slide_helpers() -> TokenStream {
         /// The shift is still expected to be constant in practice, so the match statement will be optimized out.
         /// This exists because Rust doesn't currently let you do math on const generics.
         #[inline(always)]
-        unsafe fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 {
-            unsafe {
-                match shift {
-                    #(#shifts,)*
-                    _ => unreachable!()
-                }
+        fn dyn_slide_128(a: v128, b: v128, shift: usize) -> v128 {
+            match shift {
+                #(#shifts,)*
+                _ => unreachable!()
             }
         }
     });
@@ -818,7 +816,7 @@ fn mk_slide_helpers() -> TokenStream {
                 quote! {
                     {
                         let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a, &b, #i, shift_bytes);
-                        unsafe { dyn_slide_128(lo, hi, shift_bytes % 16) }
+                        dyn_slide_128(lo, hi, shift_bytes % 16)
                     }
                 }
             })
@@ -827,7 +825,7 @@ fn mk_slide_helpers() -> TokenStream {
         fns.push(quote! {
             /// Concatenates `a` and `b` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`.
             #[inline(always)]
-            unsafe fn #helper_name(a: [v128; #num_blocks], b: [v128; #num_blocks], shift_bytes: usize) -> [v128; #num_blocks] {
+            fn #helper_name(a: [v128; #num_blocks], b: [v128; #num_blocks], shift_bytes: usize) -> [v128; #num_blocks] {
                 // Explicitly unrolled to help LLVM optimize
                 [#(#block_calls),*]
             }

From c684fec6e67b0ed943773e148e9acb171fbc2661 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 16 Jun 2026 22:12:57 +0100
Subject: [PATCH 3/5] Use kernel! to define helper functions; remove unsafe
 from their callers

---
 fearless_simd/src/generated/avx2.rs   | 929 +++++++++++++-------------
 fearless_simd/src/generated/neon.rs   | 246 ++++---
 fearless_simd/src/generated/sse4_2.rs | 718 ++++++++++----------
 fearless_simd_gen/src/mk_neon.rs      |  22 +-
 fearless_simd_gen/src/mk_x86.rs       | 137 ++--
 5 files changed, 1006 insertions(+), 1046 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 216c6562..d0aa9f6d 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -152,20 +152,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_f32x4<const SHIFT: usize>(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_f32x4(b).val.0,
-                self.cvt_to_bytes_f32x4(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x4(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_f32x4(b).val.0,
+            self.cvt_to_bytes_f32x4(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x4(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f32x4<const SHIFT: usize>(
@@ -675,20 +674,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_i8x16<const SHIFT: usize>(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_i8x16(b).val.0,
-                self.cvt_to_bytes_i8x16(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x16(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_i8x16(b).val.0,
+            self.cvt_to_bytes_i8x16(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x16(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i8x16<const SHIFT: usize>(
@@ -1044,20 +1042,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_u8x16<const SHIFT: usize>(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_u8x16(b).val.0,
-                self.cvt_to_bytes_u8x16(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x16(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_u8x16(b).val.0,
+            self.cvt_to_bytes_u8x16(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x16(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u8x16<const SHIFT: usize>(
@@ -1575,20 +1572,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_i16x8<const SHIFT: usize>(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_i16x8(b).val.0,
-                self.cvt_to_bytes_i16x8(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x8(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_i16x8(b).val.0,
+            self.cvt_to_bytes_i16x8(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x8(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i16x8<const SHIFT: usize>(
@@ -1925,20 +1921,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_u16x8<const SHIFT: usize>(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_u16x8(b).val.0,
-                self.cvt_to_bytes_u16x8(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x8(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_u16x8(b).val.0,
+            self.cvt_to_bytes_u16x8(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x8(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u16x8<const SHIFT: usize>(
@@ -2435,20 +2430,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_i32x4<const SHIFT: usize>(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_i32x4(b).val.0,
-                self.cvt_to_bytes_i32x4(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x4(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_i32x4(b).val.0,
+            self.cvt_to_bytes_i32x4(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x4(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i32x4<const SHIFT: usize>(
@@ -2805,20 +2799,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_u32x4<const SHIFT: usize>(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_u32x4(b).val.0,
-                self.cvt_to_bytes_u32x4(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x4(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_u32x4(b).val.0,
+            self.cvt_to_bytes_u32x4(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x4(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u32x4<const SHIFT: usize>(
@@ -3331,20 +3324,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_f64x2<const SHIFT: usize>(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
-        unsafe {
-            if SHIFT >= 2usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_f64x2(b).val.0,
-                self.cvt_to_bytes_f64x2(a).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x2(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 2usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_f64x2(b).val.0,
+            self.cvt_to_bytes_f64x2(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x2(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f64x2<const SHIFT: usize>(
@@ -3902,20 +3894,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x1(
-                self.cvt_to_bytes_f32x8(b).val.0,
-                self.cvt_to_bytes_f32x8(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_f32x8(b).val.0,
+            self.cvt_to_bytes_f32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f32x8<const SHIFT: usize>(
@@ -3923,20 +3914,19 @@ impl Simd for Avx2 {
         a: f32x8<Self>,
         b: f32x8<Self>,
     ) -> f32x8<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = dyn_alignr_256(
-                self.cvt_to_bytes_f32x8(b).val.0,
-                self.cvt_to_bytes_f32x8(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_f32x8(b).val.0,
+            self.cvt_to_bytes_f32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn abs_f32x8(self, a: f32x8<Self>) -> f32x8<Self> {
@@ -4489,20 +4479,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x1(
-                self.cvt_to_bytes_i8x32(b).val.0,
-                self.cvt_to_bytes_i8x32(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 32usize {
+            return b;
         }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_i8x32(b).val.0,
+            self.cvt_to_bytes_i8x32(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i8x32<const SHIFT: usize>(
@@ -4510,20 +4499,19 @@ impl Simd for Avx2 {
         a: i8x32<Self>,
         b: i8x32<Self>,
     ) -> i8x32<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = dyn_alignr_256(
-                self.cvt_to_bytes_i8x32(b).val.0,
-                self.cvt_to_bytes_i8x32(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_i8x32(b).val.0,
+            self.cvt_to_bytes_i8x32(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn add_i8x32(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
@@ -4947,20 +4935,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x1(
-                self.cvt_to_bytes_u8x32(b).val.0,
-                self.cvt_to_bytes_u8x32(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 32usize {
+            return b;
         }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_u8x32(b).val.0,
+            self.cvt_to_bytes_u8x32(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u8x32<const SHIFT: usize>(
@@ -4968,20 +4955,19 @@ impl Simd for Avx2 {
         a: u8x32<Self>,
         b: u8x32<Self>,
     ) -> u8x32<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = dyn_alignr_256(
-                self.cvt_to_bytes_u8x32(b).val.0,
-                self.cvt_to_bytes_u8x32(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_u8x32(b).val.0,
+            self.cvt_to_bytes_u8x32(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn add_u8x32(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
@@ -5581,20 +5567,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x1(
-                self.cvt_to_bytes_i16x16(b).val.0,
-                self.cvt_to_bytes_i16x16(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_i16x16(b).val.0,
+            self.cvt_to_bytes_i16x16(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i16x16<const SHIFT: usize>(
@@ -5602,20 +5587,19 @@ impl Simd for Avx2 {
         a: i16x16<Self>,
         b: i16x16<Self>,
     ) -> i16x16<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = dyn_alignr_256(
-                self.cvt_to_bytes_i16x16(b).val.0,
-                self.cvt_to_bytes_i16x16(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_i16x16(b).val.0,
+            self.cvt_to_bytes_i16x16(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn add_i16x16(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
@@ -6022,20 +6006,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x1(
-                self.cvt_to_bytes_u16x16(b).val.0,
-                self.cvt_to_bytes_u16x16(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_u16x16(b).val.0,
+            self.cvt_to_bytes_u16x16(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u16x16<const SHIFT: usize>(
@@ -6043,20 +6026,19 @@ impl Simd for Avx2 {
         a: u16x16<Self>,
         b: u16x16<Self>,
     ) -> u16x16<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = dyn_alignr_256(
-                self.cvt_to_bytes_u16x16(b).val.0,
-                self.cvt_to_bytes_u16x16(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_u16x16(b).val.0,
+            self.cvt_to_bytes_u16x16(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn add_u16x16(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
@@ -6654,20 +6636,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x1(
-                self.cvt_to_bytes_i32x8(b).val.0,
-                self.cvt_to_bytes_i32x8(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_i32x8(b).val.0,
+            self.cvt_to_bytes_i32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i32x8<const SHIFT: usize>(
@@ -6675,20 +6656,19 @@ impl Simd for Avx2 {
         a: i32x8<Self>,
         b: i32x8<Self>,
     ) -> i32x8<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = dyn_alignr_256(
-                self.cvt_to_bytes_i32x8(b).val.0,
-                self.cvt_to_bytes_i32x8(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_i32x8(b).val.0,
+            self.cvt_to_bytes_i32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn add_i32x8(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
@@ -7091,20 +7071,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x1(
-                self.cvt_to_bytes_u32x8(b).val.0,
-                self.cvt_to_bytes_u32x8(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_u32x8(b).val.0,
+            self.cvt_to_bytes_u32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u32x8<const SHIFT: usize>(
@@ -7112,20 +7091,19 @@ impl Simd for Avx2 {
         a: u32x8<Self>,
         b: u32x8<Self>,
     ) -> u32x8<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = dyn_alignr_256(
-                self.cvt_to_bytes_u32x8(b).val.0,
-                self.cvt_to_bytes_u32x8(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_u32x8(b).val.0,
+            self.cvt_to_bytes_u32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn add_u32x8(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
@@ -7696,20 +7674,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x1(
-                self.cvt_to_bytes_f64x4(b).val.0,
-                self.cvt_to_bytes_f64x4(a).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x4(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = cross_block_alignr_256x1(
+            self,
+            self.cvt_to_bytes_f64x4(b).val.0,
+            self.cvt_to_bytes_f64x4(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f64x4<const SHIFT: usize>(
@@ -7717,20 +7694,19 @@ impl Simd for Avx2 {
         a: f64x4<Self>,
         b: f64x4<Self>,
     ) -> f64x4<Self> {
-        unsafe {
-            if SHIFT >= 2usize {
-                return b;
-            }
-            let result = dyn_alignr_256(
-                self.cvt_to_bytes_f64x4(b).val.0,
-                self.cvt_to_bytes_f64x4(a).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x4(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 2usize {
+            return b;
         }
+        let result = dyn_alignr_256(
+            self,
+            self.cvt_to_bytes_f64x4(b).val.0,
+            self.cvt_to_bytes_f64x4(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn abs_f64x4(self, a: f64x4<Self>) -> f64x4<Self> {
@@ -8328,20 +8304,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x2(
-                self.cvt_to_bytes_f32x16(b).val.0,
-                self.cvt_to_bytes_f32x16(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_f32x16(b).val.0,
+            self.cvt_to_bytes_f32x16(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f32x16<const SHIFT: usize>(
@@ -8745,20 +8720,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            if SHIFT >= 64usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x2(
-                self.cvt_to_bytes_i8x64(b).val.0,
-                self.cvt_to_bytes_i8x64(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x64(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 64usize {
+            return b;
         }
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_i8x64(b).val.0,
+            self.cvt_to_bytes_i8x64(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x64(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i8x64<const SHIFT: usize>(
@@ -9017,20 +8991,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            if SHIFT >= 64usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x2(
-                self.cvt_to_bytes_u8x64(b).val.0,
-                self.cvt_to_bytes_u8x64(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x64(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 64usize {
+            return b;
         }
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_u8x64(b).val.0,
+            self.cvt_to_bytes_u8x64(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x64(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u8x64<const SHIFT: usize>(
@@ -9482,20 +9455,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x2(
-                self.cvt_to_bytes_i16x32(b).val.0,
-                self.cvt_to_bytes_i16x32(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x32(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 32usize {
+            return b;
         }
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_i16x32(b).val.0,
+            self.cvt_to_bytes_i16x32(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x32(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i16x32<const SHIFT: usize>(
@@ -9763,20 +9735,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x2(
-                self.cvt_to_bytes_u16x32(b).val.0,
-                self.cvt_to_bytes_u16x32(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x32(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 32usize {
+            return b;
         }
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_u16x32(b).val.0,
+            self.cvt_to_bytes_u16x32(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x32(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u16x32<const SHIFT: usize>(
@@ -10236,20 +10207,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x2(
-                self.cvt_to_bytes_i32x16(b).val.0,
-                self.cvt_to_bytes_i32x16(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_i32x16(b).val.0,
+            self.cvt_to_bytes_i32x16(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i32x16<const SHIFT: usize>(
@@ -10513,20 +10483,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x2(
-                self.cvt_to_bytes_u32x16(b).val.0,
-                self.cvt_to_bytes_u32x16(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_u32x16(b).val.0,
+            self.cvt_to_bytes_u32x16(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u32x16<const SHIFT: usize>(
@@ -10956,20 +10925,19 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = cross_block_alignr_256x2(
-                self.cvt_to_bytes_f64x8(b).val.0,
-                self.cvt_to_bytes_f64x8(a).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x8(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = cross_block_alignr_256x2(
+            self,
+            self.cvt_to_bytes_f64x8(b).val.0,
+            self.cvt_to_bytes_f64x8(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x8(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f64x8<const SHIFT: usize>(
@@ -11529,12 +11497,12 @@ impl<S: Simd> From<mask64x4<S>> for __m256i {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
-#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
-#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
-#[doc = r" Rust doesn't currently let you do math on const generics."]
-#[inline(always)]
-unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
-    unsafe {
+crate::kernel!(
+    #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+    #[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+    #[doc = r" Rust doesn't currently let you do math on const generics."]
+    #[inline(always)]
+    fn dyn_alignr_128(token: Avx2, a: __m128i, b: __m128i, shift: usize) -> __m128i {
         match shift {
             0usize => _mm_alignr_epi8::<0i32>(a, b),
             1usize => _mm_alignr_epi8::<1i32>(a, b),
@@ -11555,13 +11523,13 @@ unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
             _ => unreachable!(),
         }
     }
-}
-#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
-#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
-#[doc = r" Rust doesn't currently let you do math on const generics."]
-#[inline(always)]
-unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i {
-    unsafe {
+);
+crate::kernel!(
+    #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+    #[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+    #[doc = r" Rust doesn't currently let you do math on const generics."]
+    #[inline(always)]
+    fn dyn_alignr_256(token: Avx2, a: __m256i, b: __m256i, shift: usize) -> __m256i {
         match shift {
             0usize => _mm256_alignr_epi8::<0i32>(a, b),
             1usize => _mm256_alignr_epi8::<1i32>(a, b),
@@ -11582,52 +11550,63 @@ unsafe fn dyn_alignr_256(a: __m256i, b: __m256i, shift: usize) -> __m256i {
             _ => unreachable!(),
         }
     }
-}
-#[doc = r" Computes one output __m256i for `cross_block_alignr_*` operations."]
-#[doc = r""]
-#[doc = r" Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and"]
-#[doc = r" `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`."]
-#[inline(always)]
-unsafe fn cross_block_alignr_one(
-    regs: &[__m256i],
-    block_idx: usize,
-    shift_bytes: usize,
-) -> __m256i {
-    let lo_idx = block_idx + (shift_bytes / 16);
-    let intra_shift = shift_bytes % 16;
-    let lo_blocks = if lo_idx & 1 == 0 {
-        regs[lo_idx / 2]
-    } else {
-        unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) }
-    };
-    let hi_idx = lo_idx + 1;
-    let hi_blocks = if hi_idx & 1 == 0 {
-        regs[hi_idx / 2]
-    } else {
-        unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) }
-    };
-    unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) }
-}
-#[doc = r" Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset"]
-#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
-#[inline(always)]
-unsafe fn cross_block_alignr_256x2(
-    a: [__m256i; 2],
-    b: [__m256i; 2],
-    shift_bytes: usize,
-) -> [__m256i; 2] {
-    let regs = [b[0], b[1], a[0], a[1]];
-    unsafe {
+);
+crate::kernel!(
+    #[doc = r" Computes one output __m256i for `cross_block_alignr_*` operations."]
+    #[doc = r""]
+    #[doc = r" Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and"]
+    #[doc = r" `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`."]
+    #[inline(always)]
+    fn cross_block_alignr_one(
+        token: Avx2,
+        regs: &[__m256i],
+        block_idx: usize,
+        shift_bytes: usize,
+    ) -> __m256i {
+        let lo_idx = block_idx + (shift_bytes / 16);
+        let intra_shift = shift_bytes % 16;
+        let lo_blocks = if lo_idx & 1 == 0 {
+            regs[lo_idx / 2]
+        } else {
+            _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1])
+        };
+        let hi_idx = lo_idx + 1;
+        let hi_blocks = if hi_idx & 1 == 0 {
+            regs[hi_idx / 2]
+        } else {
+            _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1])
+        };
+        dyn_alignr_256(token, hi_blocks, lo_blocks, intra_shift)
+    }
+);
+crate::kernel!(
+    #[doc = r" Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset"]
+    #[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
+    #[inline(always)]
+    fn cross_block_alignr_256x2(
+        token: Avx2,
+        a: [__m256i; 2],
+        b: [__m256i; 2],
+        shift_bytes: usize,
+    ) -> [__m256i; 2] {
+        let regs = [b[0], b[1], a[0], a[1]];
         [
-            cross_block_alignr_one(&regs, 0, shift_bytes),
-            cross_block_alignr_one(&regs, 2, shift_bytes),
+            cross_block_alignr_one(token, &regs, 0, shift_bytes),
+            cross_block_alignr_one(token, &regs, 2, shift_bytes),
         ]
     }
-}
-#[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"]
-#[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
-#[inline(always)]
-unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i {
-    let regs = [b, a];
-    unsafe { cross_block_alignr_one(&regs, 0, shift_bytes) }
-}
+);
+crate::kernel!(
+    #[doc = r" Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset"]
+    #[doc = r" `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics."]
+    #[inline(always)]
+    fn cross_block_alignr_256x1(
+        token: Avx2,
+        a: __m256i,
+        b: __m256i,
+        shift_bytes: usize,
+    ) -> __m256i {
+        let regs = [b, a];
+        cross_block_alignr_one(token, &regs, 0, shift_bytes)
+    }
+);
diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index 954b9c3b..fa092c74 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -145,13 +145,12 @@ impl Simd for Neon {
         if SHIFT >= 4usize {
             return b;
         }
-        let result = unsafe {
-            dyn_vext_128(
-                self.cvt_to_bytes_f32x4(a).val.0,
-                self.cvt_to_bytes_f32x4(b).val.0,
-                SHIFT * 4usize,
-            )
-        };
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_f32x4(a).val.0,
+            self.cvt_to_bytes_f32x4(b).val.0,
+            SHIFT * 4usize,
+        );
         self.cvt_from_bytes_f32x4(u8x16 {
             val: crate::support::Aligned128(result),
             simd: self,
@@ -623,13 +622,12 @@ impl Simd for Neon {
         if SHIFT >= 16usize {
             return b;
         }
-        let result = unsafe {
-            dyn_vext_128(
-                self.cvt_to_bytes_i8x16(a).val.0,
-                self.cvt_to_bytes_i8x16(b).val.0,
-                SHIFT,
-            )
-        };
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_i8x16(a).val.0,
+            self.cvt_to_bytes_i8x16(b).val.0,
+            SHIFT,
+        );
         self.cvt_from_bytes_i8x16(u8x16 {
             val: crate::support::Aligned128(result),
             simd: self,
@@ -990,13 +988,12 @@ impl Simd for Neon {
         if SHIFT >= 16usize {
             return b;
         }
-        let result = unsafe {
-            dyn_vext_128(
-                self.cvt_to_bytes_u8x16(a).val.0,
-                self.cvt_to_bytes_u8x16(b).val.0,
-                SHIFT,
-            )
-        };
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_u8x16(a).val.0,
+            self.cvt_to_bytes_u8x16(b).val.0,
+            SHIFT,
+        );
         self.cvt_from_bytes_u8x16(u8x16 {
             val: crate::support::Aligned128(result),
             simd: self,
@@ -1525,13 +1522,12 @@ impl Simd for Neon {
         if SHIFT >= 8usize {
             return b;
         }
-        let result = unsafe {
-            dyn_vext_128(
-                self.cvt_to_bytes_i16x8(a).val.0,
-                self.cvt_to_bytes_i16x8(b).val.0,
-                SHIFT * 2usize,
-            )
-        };
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_i16x8(a).val.0,
+            self.cvt_to_bytes_i16x8(b).val.0,
+            SHIFT * 2usize,
+        );
         self.cvt_from_bytes_i16x8(u8x16 {
             val: crate::support::Aligned128(result),
             simd: self,
@@ -1892,13 +1888,12 @@ impl Simd for Neon {
         if SHIFT >= 8usize {
             return b;
         }
-        let result = unsafe {
-            dyn_vext_128(
-                self.cvt_to_bytes_u16x8(a).val.0,
-                self.cvt_to_bytes_u16x8(b).val.0,
-                SHIFT * 2usize,
-            )
-        };
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_u16x8(a).val.0,
+            self.cvt_to_bytes_u16x8(b).val.0,
+            SHIFT * 2usize,
+        );
         self.cvt_from_bytes_u16x8(u8x16 {
             val: crate::support::Aligned128(result),
             simd: self,
@@ -2417,13 +2412,12 @@ impl Simd for Neon {
         if SHIFT >= 4usize {
             return b;
         }
-        let result = unsafe {
-            dyn_vext_128(
-                self.cvt_to_bytes_i32x4(a).val.0,
-                self.cvt_to_bytes_i32x4(b).val.0,
-                SHIFT * 4usize,
-            )
-        };
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_i32x4(a).val.0,
+            self.cvt_to_bytes_i32x4(b).val.0,
+            SHIFT * 4usize,
+        );
         self.cvt_from_bytes_i32x4(u8x16 {
             val: crate::support::Aligned128(result),
             simd: self,
@@ -2794,13 +2788,12 @@ impl Simd for Neon {
         if SHIFT >= 4usize {
             return b;
         }
-        let result = unsafe {
-            dyn_vext_128(
-                self.cvt_to_bytes_u32x4(a).val.0,
-                self.cvt_to_bytes_u32x4(b).val.0,
-                SHIFT * 4usize,
-            )
-        };
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_u32x4(a).val.0,
+            self.cvt_to_bytes_u32x4(b).val.0,
+            SHIFT * 4usize,
+        );
         self.cvt_from_bytes_u32x4(u8x16 {
             val: crate::support::Aligned128(result),
             simd: self,
@@ -3318,13 +3311,12 @@ impl Simd for Neon {
         if SHIFT >= 2usize {
             return b;
         }
-        let result = unsafe {
-            dyn_vext_128(
-                self.cvt_to_bytes_f64x2(a).val.0,
-                self.cvt_to_bytes_f64x2(b).val.0,
-                SHIFT * 8usize,
-            )
-        };
+        let result = dyn_vext_128(
+            self,
+            self.cvt_to_bytes_f64x2(a).val.0,
+            self.cvt_to_bytes_f64x2(b).val.0,
+            SHIFT * 8usize,
+        );
         self.cvt_from_bytes_f64x2(u8x16 {
             val: crate::support::Aligned128(result),
             simd: self,
@@ -3899,7 +3891,7 @@ impl Simd for Neon {
         if SHIFT >= 8usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_f32x8(a).val.0;
             let b_bytes = self.cvt_to_bytes_f32x8(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
@@ -3913,7 +3905,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -3922,7 +3914,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -4293,7 +4285,7 @@ impl Simd for Neon {
         if SHIFT >= 32usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_i8x32(a).val.0;
             let b_bytes = self.cvt_to_bytes_i8x32(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
@@ -4307,7 +4299,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -4316,7 +4308,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -4594,7 +4586,7 @@ impl Simd for Neon {
         if SHIFT >= 32usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_u8x32(a).val.0;
             let b_bytes = self.cvt_to_bytes_u8x32(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
@@ -4608,7 +4600,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -4617,7 +4609,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -5005,7 +4997,7 @@ impl Simd for Neon {
         if SHIFT >= 16usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_i16x16(a).val.0;
             let b_bytes = self.cvt_to_bytes_i16x16(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
@@ -5019,7 +5011,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -5028,7 +5020,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -5306,7 +5298,7 @@ impl Simd for Neon {
         if SHIFT >= 16usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_u16x16(a).val.0;
             let b_bytes = self.cvt_to_bytes_u16x16(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
@@ -5320,7 +5312,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -5329,7 +5321,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -5730,7 +5722,7 @@ impl Simd for Neon {
         if SHIFT >= 8usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_i32x8(a).val.0;
             let b_bytes = self.cvt_to_bytes_i32x8(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
@@ -5744,7 +5736,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -5753,7 +5745,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -6036,7 +6028,7 @@ impl Simd for Neon {
         if SHIFT >= 8usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_u32x8(a).val.0;
             let b_bytes = self.cvt_to_bytes_u32x8(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
@@ -6050,7 +6042,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -6059,7 +6051,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -6444,7 +6436,7 @@ impl Simd for Neon {
         if SHIFT >= 4usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_f64x4(a).val.0;
             let b_bytes = self.cvt_to_bytes_f64x4(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1];
@@ -6458,7 +6450,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -6467,7 +6459,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -6906,7 +6898,7 @@ impl Simd for Neon {
         if SHIFT >= 16usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_f32x16(a).val.0;
             let b_bytes = self.cvt_to_bytes_f32x16(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
@@ -6920,7 +6912,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -6929,7 +6921,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -6938,7 +6930,7 @@ impl Simd for Neon {
                         2,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -6947,7 +6939,7 @@ impl Simd for Neon {
                         3,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -7317,7 +7309,7 @@ impl Simd for Neon {
         if SHIFT >= 64usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_i8x64(a).val.0;
             let b_bytes = self.cvt_to_bytes_i8x64(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
@@ -7331,7 +7323,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -7340,7 +7332,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -7349,7 +7341,7 @@ impl Simd for Neon {
                         2,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -7358,7 +7350,7 @@ impl Simd for Neon {
                         3,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -7627,7 +7619,7 @@ impl Simd for Neon {
         if SHIFT >= 64usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_u8x64(a).val.0;
             let b_bytes = self.cvt_to_bytes_u8x64(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
@@ -7641,7 +7633,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -7650,7 +7642,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -7659,7 +7651,7 @@ impl Simd for Neon {
                         2,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -7668,7 +7660,7 @@ impl Simd for Neon {
                         3,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -8041,7 +8033,7 @@ impl Simd for Neon {
         if SHIFT >= 32usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_i16x32(a).val.0;
             let b_bytes = self.cvt_to_bytes_i16x32(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
@@ -8055,7 +8047,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8064,7 +8056,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8073,7 +8065,7 @@ impl Simd for Neon {
                         2,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8082,7 +8074,7 @@ impl Simd for Neon {
                         3,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -8360,7 +8352,7 @@ impl Simd for Neon {
         if SHIFT >= 32usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_u16x32(a).val.0;
             let b_bytes = self.cvt_to_bytes_u16x32(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
@@ -8374,7 +8366,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8383,7 +8375,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8392,7 +8384,7 @@ impl Simd for Neon {
                         2,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8401,7 +8393,7 @@ impl Simd for Neon {
                         3,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -8796,7 +8788,7 @@ impl Simd for Neon {
         if SHIFT >= 16usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_i32x16(a).val.0;
             let b_bytes = self.cvt_to_bytes_i32x16(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
@@ -8810,7 +8802,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8819,7 +8811,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8828,7 +8820,7 @@ impl Simd for Neon {
                         2,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -8837,7 +8829,7 @@ impl Simd for Neon {
                         3,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -9111,7 +9103,7 @@ impl Simd for Neon {
         if SHIFT >= 16usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_u32x16(a).val.0;
             let b_bytes = self.cvt_to_bytes_u32x16(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
@@ -9125,7 +9117,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -9134,7 +9126,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -9143,7 +9135,7 @@ impl Simd for Neon {
                         2,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -9152,7 +9144,7 @@ impl Simd for Neon {
                         3,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -9527,7 +9519,7 @@ impl Simd for Neon {
         if SHIFT >= 8usize {
             return b;
         }
-        let result = unsafe {
+        let result = {
             let a_bytes = self.cvt_to_bytes_f64x8(a).val.0;
             let b_bytes = self.cvt_to_bytes_f64x8(b).val.0;
             let a_blocks = [a_bytes.0, a_bytes.1, a_bytes.2, a_bytes.3];
@@ -9541,7 +9533,7 @@ impl Simd for Neon {
                         0,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -9550,7 +9542,7 @@ impl Simd for Neon {
                         1,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -9559,7 +9551,7 @@ impl Simd for Neon {
                         2,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
                 {
                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(
@@ -9568,7 +9560,7 @@ impl Simd for Neon {
                         3,
                         shift_bytes,
                     );
-                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                 },
             )
         };
@@ -10476,12 +10468,12 @@ impl<S: Simd> From<mask64x8<S>> for int64x2x4_t {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
-#[doc = r" This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still"]
-#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
-#[doc = r" Rust doesn't currently let you do math on const generics."]
-#[inline(always)]
-unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t {
-    unsafe {
+crate::kernel!(
+    #[doc = r" This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still"]
+    #[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+    #[doc = r" Rust doesn't currently let you do math on const generics."]
+    #[inline(always)]
+    fn dyn_vext_128(neon: Neon, a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t {
         match shift {
             0usize => vextq_u8::<0i32>(a, b),
             1usize => vextq_u8::<1i32>(a, b),
@@ -10502,4 +10494,4 @@ unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t
             _ => unreachable!(),
         }
     }
-}
+);
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index 47b81a4d..5f36c1cc 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -178,20 +178,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_f32x4<const SHIFT: usize>(self, a: f32x4<Self>, b: f32x4<Self>) -> f32x4<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_f32x4(b).val.0,
-                self.cvt_to_bytes_f32x4(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x4(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_f32x4(b).val.0,
+            self.cvt_to_bytes_f32x4(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x4(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f32x4<const SHIFT: usize>(
@@ -686,20 +685,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_i8x16<const SHIFT: usize>(self, a: i8x16<Self>, b: i8x16<Self>) -> i8x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_i8x16(b).val.0,
-                self.cvt_to_bytes_i8x16(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x16(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_i8x16(b).val.0,
+            self.cvt_to_bytes_i8x16(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x16(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i8x16<const SHIFT: usize>(
@@ -1052,20 +1050,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_u8x16<const SHIFT: usize>(self, a: u8x16<Self>, b: u8x16<Self>) -> u8x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_u8x16(b).val.0,
-                self.cvt_to_bytes_u8x16(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x16(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_u8x16(b).val.0,
+            self.cvt_to_bytes_u8x16(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x16(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u8x16<const SHIFT: usize>(
@@ -1580,20 +1577,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_i16x8<const SHIFT: usize>(self, a: i16x8<Self>, b: i16x8<Self>) -> i16x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_i16x8(b).val.0,
-                self.cvt_to_bytes_i16x8(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x8(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_i16x8(b).val.0,
+            self.cvt_to_bytes_i16x8(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x8(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i16x8<const SHIFT: usize>(
@@ -1927,20 +1923,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_u16x8<const SHIFT: usize>(self, a: u16x8<Self>, b: u16x8<Self>) -> u16x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_u16x8(b).val.0,
-                self.cvt_to_bytes_u16x8(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x8(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_u16x8(b).val.0,
+            self.cvt_to_bytes_u16x8(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x8(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u16x8<const SHIFT: usize>(
@@ -2431,20 +2426,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_i32x4<const SHIFT: usize>(self, a: i32x4<Self>, b: i32x4<Self>) -> i32x4<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_i32x4(b).val.0,
-                self.cvt_to_bytes_i32x4(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x4(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_i32x4(b).val.0,
+            self.cvt_to_bytes_i32x4(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x4(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i32x4<const SHIFT: usize>(
@@ -2786,20 +2780,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_u32x4<const SHIFT: usize>(self, a: u32x4<Self>, b: u32x4<Self>) -> u32x4<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_u32x4(b).val.0,
-                self.cvt_to_bytes_u32x4(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x4(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_u32x4(b).val.0,
+            self.cvt_to_bytes_u32x4(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x4(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u32x4<const SHIFT: usize>(
@@ -3294,20 +3287,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_f64x2<const SHIFT: usize>(self, a: f64x2<Self>, b: f64x2<Self>) -> f64x2<Self> {
-        unsafe {
-            if SHIFT >= 2usize {
-                return b;
-            }
-            let result = dyn_alignr_128(
-                self.cvt_to_bytes_f64x2(b).val.0,
-                self.cvt_to_bytes_f64x2(a).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x2(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
+        if SHIFT >= 2usize {
+            return b;
         }
+        let result = dyn_alignr_128(
+            self,
+            self.cvt_to_bytes_f64x2(b).val.0,
+            self.cvt_to_bytes_f64x2(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x2(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f64x2<const SHIFT: usize>(
@@ -3842,20 +3834,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_f32x8<const SHIFT: usize>(self, a: f32x8<Self>, b: f32x8<Self>) -> f32x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x2(
-                self.cvt_to_bytes_f32x8(b).val.0,
-                self.cvt_to_bytes_f32x8(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_f32x8(b).val.0,
+            self.cvt_to_bytes_f32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f32x8<const SHIFT: usize>(
@@ -4214,20 +4205,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_i8x32<const SHIFT: usize>(self, a: i8x32<Self>, b: i8x32<Self>) -> i8x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x2(
-                self.cvt_to_bytes_i8x32(b).val.0,
-                self.cvt_to_bytes_i8x32(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 32usize {
+            return b;
         }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_i8x32(b).val.0,
+            self.cvt_to_bytes_i8x32(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i8x32<const SHIFT: usize>(
@@ -4493,20 +4483,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_u8x32<const SHIFT: usize>(self, a: u8x32<Self>, b: u8x32<Self>) -> u8x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x2(
-                self.cvt_to_bytes_u8x32(b).val.0,
-                self.cvt_to_bytes_u8x32(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 32usize {
+            return b;
         }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_u8x32(b).val.0,
+            self.cvt_to_bytes_u8x32(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u8x32<const SHIFT: usize>(
@@ -4880,20 +4869,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_i16x16<const SHIFT: usize>(self, a: i16x16<Self>, b: i16x16<Self>) -> i16x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x2(
-                self.cvt_to_bytes_i16x16(b).val.0,
-                self.cvt_to_bytes_i16x16(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_i16x16(b).val.0,
+            self.cvt_to_bytes_i16x16(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i16x16<const SHIFT: usize>(
@@ -5159,20 +5147,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_u16x16<const SHIFT: usize>(self, a: u16x16<Self>, b: u16x16<Self>) -> u16x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x2(
-                self.cvt_to_bytes_u16x16(b).val.0,
-                self.cvt_to_bytes_u16x16(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_u16x16(b).val.0,
+            self.cvt_to_bytes_u16x16(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u16x16<const SHIFT: usize>(
@@ -5567,20 +5554,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_i32x8<const SHIFT: usize>(self, a: i32x8<Self>, b: i32x8<Self>) -> i32x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x2(
-                self.cvt_to_bytes_i32x8(b).val.0,
-                self.cvt_to_bytes_i32x8(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_i32x8(b).val.0,
+            self.cvt_to_bytes_i32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i32x8<const SHIFT: usize>(
@@ -5851,20 +5837,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_u32x8<const SHIFT: usize>(self, a: u32x8<Self>, b: u32x8<Self>) -> u32x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x2(
-                self.cvt_to_bytes_u32x8(b).val.0,
-                self.cvt_to_bytes_u32x8(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_u32x8(b).val.0,
+            self.cvt_to_bytes_u32x8(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u32x8<const SHIFT: usize>(
@@ -6235,20 +6220,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_f64x4<const SHIFT: usize>(self, a: f64x4<Self>, b: f64x4<Self>) -> f64x4<Self> {
-        unsafe {
-            if SHIFT >= 4usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x2(
-                self.cvt_to_bytes_f64x4(b).val.0,
-                self.cvt_to_bytes_f64x4(a).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x4(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
+        if SHIFT >= 4usize {
+            return b;
         }
+        let result = cross_block_alignr_128x2(
+            self,
+            self.cvt_to_bytes_f64x4(b).val.0,
+            self.cvt_to_bytes_f64x4(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f64x4<const SHIFT: usize>(
@@ -6673,20 +6657,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_f32x16<const SHIFT: usize>(self, a: f32x16<Self>, b: f32x16<Self>) -> f32x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x4(
-                self.cvt_to_bytes_f32x16(b).val.0,
-                self.cvt_to_bytes_f32x16(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_f32x16(b).val.0,
+            self.cvt_to_bytes_f32x16(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f32x16<const SHIFT: usize>(
@@ -7090,20 +7073,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_i8x64<const SHIFT: usize>(self, a: i8x64<Self>, b: i8x64<Self>) -> i8x64<Self> {
-        unsafe {
-            if SHIFT >= 64usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x4(
-                self.cvt_to_bytes_i8x64(b).val.0,
-                self.cvt_to_bytes_i8x64(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x64(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 64usize {
+            return b;
         }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_i8x64(b).val.0,
+            self.cvt_to_bytes_i8x64(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x64(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i8x64<const SHIFT: usize>(
@@ -7362,20 +7344,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_u8x64<const SHIFT: usize>(self, a: u8x64<Self>, b: u8x64<Self>) -> u8x64<Self> {
-        unsafe {
-            if SHIFT >= 64usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x4(
-                self.cvt_to_bytes_u8x64(b).val.0,
-                self.cvt_to_bytes_u8x64(a).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x64(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 64usize {
+            return b;
         }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_u8x64(b).val.0,
+            self.cvt_to_bytes_u8x64(a).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x64(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u8x64<const SHIFT: usize>(
@@ -7833,20 +7814,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_i16x32<const SHIFT: usize>(self, a: i16x32<Self>, b: i16x32<Self>) -> i16x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x4(
-                self.cvt_to_bytes_i16x32(b).val.0,
-                self.cvt_to_bytes_i16x32(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x32(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 32usize {
+            return b;
         }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_i16x32(b).val.0,
+            self.cvt_to_bytes_i16x32(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x32(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i16x32<const SHIFT: usize>(
@@ -8114,20 +8094,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_u16x32<const SHIFT: usize>(self, a: u16x32<Self>, b: u16x32<Self>) -> u16x32<Self> {
-        unsafe {
-            if SHIFT >= 32usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x4(
-                self.cvt_to_bytes_u16x32(b).val.0,
-                self.cvt_to_bytes_u16x32(a).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x32(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 32usize {
+            return b;
         }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_u16x32(b).val.0,
+            self.cvt_to_bytes_u16x32(a).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x32(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u16x32<const SHIFT: usize>(
@@ -8575,20 +8554,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_i32x16<const SHIFT: usize>(self, a: i32x16<Self>, b: i32x16<Self>) -> i32x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x4(
-                self.cvt_to_bytes_i32x16(b).val.0,
-                self.cvt_to_bytes_i32x16(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_i32x16(b).val.0,
+            self.cvt_to_bytes_i32x16(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i32x16<const SHIFT: usize>(
@@ -8852,20 +8830,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_u32x16<const SHIFT: usize>(self, a: u32x16<Self>, b: u32x16<Self>) -> u32x16<Self> {
-        unsafe {
-            if SHIFT >= 16usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x4(
-                self.cvt_to_bytes_u32x16(b).val.0,
-                self.cvt_to_bytes_u32x16(a).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 16usize {
+            return b;
         }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_u32x16(b).val.0,
+            self.cvt_to_bytes_u32x16(a).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u32x16<const SHIFT: usize>(
@@ -9274,20 +9251,19 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn slide_f64x8<const SHIFT: usize>(self, a: f64x8<Self>, b: f64x8<Self>) -> f64x8<Self> {
-        unsafe {
-            if SHIFT >= 8usize {
-                return b;
-            }
-            let result = cross_block_alignr_128x4(
-                self.cvt_to_bytes_f64x8(b).val.0,
-                self.cvt_to_bytes_f64x8(a).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x8(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
+        if SHIFT >= 8usize {
+            return b;
         }
+        let result = cross_block_alignr_128x4(
+            self,
+            self.cvt_to_bytes_f64x8(b).val.0,
+            self.cvt_to_bytes_f64x8(a).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x8(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f64x8<const SHIFT: usize>(
@@ -9828,12 +9804,12 @@ impl<S: Simd> From<mask64x2<S>> for __m128i {
         crate::transmute::checked_transmute_copy(&value.val)
     }
 }
-#[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
-#[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
-#[doc = r" Rust doesn't currently let you do math on const generics."]
-#[inline(always)]
-unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
-    unsafe {
+crate::kernel!(
+    #[doc = r" This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still"]
+    #[doc = r" expected to be constant in practice, so the match statement will be optimized out. This exists because"]
+    #[doc = r" Rust doesn't currently let you do math on const generics."]
+    #[inline(always)]
+    fn dyn_alignr_128(token: Sse4_2, a: __m128i, b: __m128i, shift: usize) -> __m128i {
         match shift {
             0usize => _mm_alignr_epi8::<0i32>(a, b),
             1usize => _mm_alignr_epi8::<1i32>(a, b),
@@ -9854,50 +9830,62 @@ unsafe fn dyn_alignr_128(a: __m128i, b: __m128i, shift: usize) -> __m128i {
             _ => unreachable!(),
         }
     }
-}
-#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
-#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
-#[inline(always)]
-unsafe fn cross_block_alignr_128x2(
-    a: [__m128i; 2usize],
-    b: [__m128i; 2usize],
-    shift_bytes: usize,
-) -> [__m128i; 2usize] {
-    [
-        {
-            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
-            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
-        },
-        {
-            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
-            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
-        },
-    ]
-}
-#[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
-#[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
-#[inline(always)]
-unsafe fn cross_block_alignr_128x4(
-    a: [__m128i; 4usize],
-    b: [__m128i; 4usize],
-    shift_bytes: usize,
-) -> [__m128i; 4usize] {
-    [
-        {
-            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
-            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
-        },
-        {
-            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
-            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
-        },
-        {
-            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 2usize, shift_bytes);
-            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
-        },
-        {
-            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, 3usize, shift_bytes);
-            unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
-        },
-    ]
-}
+);
+crate::kernel!(
+    #[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
+    #[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
+    #[inline(always)]
+    fn cross_block_alignr_128x2(
+        token: Sse4_2,
+        a: [__m128i; 2usize],
+        b: [__m128i; 2usize],
+        shift_bytes: usize,
+    ) -> [__m128i; 2usize] {
+        [
+            {
+                let [lo, hi] =
+                    crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
+                dyn_alignr_128(token, hi, lo, shift_bytes % 16)
+            },
+            {
+                let [lo, hi] =
+                    crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
+                dyn_alignr_128(token, hi, lo, shift_bytes % 16)
+            },
+        ]
+    }
+);
+crate::kernel!(
+    #[doc = r" Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`."]
+    #[doc = r" Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics."]
+    #[inline(always)]
+    fn cross_block_alignr_128x4(
+        token: Sse4_2,
+        a: [__m128i; 4usize],
+        b: [__m128i; 4usize],
+        shift_bytes: usize,
+    ) -> [__m128i; 4usize] {
+        [
+            {
+                let [lo, hi] =
+                    crate::support::cross_block_slide_blocks_at(&b, &a, 0usize, shift_bytes);
+                dyn_alignr_128(token, hi, lo, shift_bytes % 16)
+            },
+            {
+                let [lo, hi] =
+                    crate::support::cross_block_slide_blocks_at(&b, &a, 1usize, shift_bytes);
+                dyn_alignr_128(token, hi, lo, shift_bytes % 16)
+            },
+            {
+                let [lo, hi] =
+                    crate::support::cross_block_slide_blocks_at(&b, &a, 2usize, shift_bytes);
+                dyn_alignr_128(token, hi, lo, shift_bytes % 16)
+            },
+            {
+                let [lo, hi] =
+                    crate::support::cross_block_slide_blocks_at(&b, &a, 3usize, shift_bytes);
+                dyn_alignr_128(token, hi, lo, shift_bytes % 16)
+            },
+        ]
+    }
+);
diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
index 4dfafd2c..401cf164 100644
--- a/fearless_simd_gen/src/mk_neon.rs
+++ b/fearless_simd_gen/src/mk_neon.rs
@@ -383,9 +383,7 @@ impl Level for Neon {
                     }
                     (WithinBlocks, _) | (_, 128) => {
                         quote! {
-                            unsafe {
-                                dyn_vext_128(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift)
-                            }
+                            dyn_vext_128(self, self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift)
                         }
                     }
                     (AcrossBlocks, 256 | 512) => {
@@ -398,7 +396,7 @@ impl Level for Neon {
                         let bytes_arch_ty = self.arch_ty(&bytes_ty);
 
                         quote! {
-                            unsafe {
+                            {
                                 let a_bytes = self.#to_bytes(a).val.0;
                                 let b_bytes = self.#to_bytes(b).val.0;
                                 let a_blocks = [#( a_bytes.#blocks ),*];
@@ -407,7 +405,7 @@ impl Level for Neon {
                                 let shift_bytes = #byte_shift;
                                 #bytes_arch_ty(#({
                                     let [lo, hi] = crate::support::cross_block_slide_blocks_at(&a_blocks, &b_blocks, #blocks3, shift_bytes);
-                                    dyn_vext_128(lo, hi, shift_bytes % 16)
+                                    dyn_vext_128(self, lo, hi, shift_bytes % 16)
                                 }),*)
                             }
                         }
@@ -640,17 +638,17 @@ fn mk_slide_helpers() -> TokenStream {
     });
 
     quote! {
-        /// This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still
-        /// expected to be constant in practice, so the match statement will be optimized out. This exists because
-        /// Rust doesn't currently let you do math on const generics.
-        #[inline(always)]
-        unsafe fn dyn_vext_128(a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t {
-            unsafe {
+        crate::kernel!(
+            /// This is a version of the `vext` intrinsic that takes a non-const shift argument. The shift is still
+            /// expected to be constant in practice, so the match statement will be optimized out. This exists because
+            /// Rust doesn't currently let you do math on const generics.
+            #[inline(always)]
+            fn dyn_vext_128(neon: Neon, a: uint8x16_t, b: uint8x16_t, shift: usize) -> uint8x16_t {
                 match shift {
                     #(#shifts,)*
                     _ => unreachable!()
                 }
             }
-        }
+        );
     }
 }
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index ddc07ab5..9a910055 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -1502,17 +1502,15 @@ impl X86 {
 
         quote! {
             #method_sig {
-                unsafe {
-                    if SHIFT >= #max_shift {
-                        return b;
-                    }
-
-                    // b and a are swapped here to match ARM's vext semantics. For vext, we can think of `a` as the "left",
-                    // and we concatenate `b` to its "right". This makes sense, since `a` is the left-hand side and `b` is
-                    // the right-hand side. x86's `alignr` is backwards, and treats `b` as the high/left block.
-                    let result = #alignr_op(self.#to_bytes(b).val.0, self.#to_bytes(a).val.0, #byte_shift);
-                    self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
+                if SHIFT >= #max_shift {
+                    return b;
                 }
+
+                // b and a are swapped here to match ARM's vext semantics. For vext, we can think of `a` as the "left",
+                // and we concatenate `b` to its "right". This makes sense, since `a` is the left-hand side and `b` is
+                // the right-hand side. x86's `alignr` is backwards, and treats `b` as the high/left block.
+                let result = #alignr_op(self, self.#to_bytes(b).val.0, self.#to_bytes(a).val.0, #byte_shift);
+                self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
             }
         }
     }
@@ -2057,6 +2055,7 @@ impl X86 {
     /// `vext` and our `slide` operation, and the 256-bit AVX2 version still operates *within* 128-bit lanes.
     fn dyn_alignr_helpers(&self) -> TokenStream {
         let mut fns = vec![];
+        let token_ty = self.token();
 
         let vec_widths: &[usize] = match self {
             Self::Sse4_2 => &[128],
@@ -2077,18 +2076,18 @@ impl X86 {
             });
 
             fns.push(quote! {
-                /// This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still
-                /// expected to be constant in practice, so the match statement will be optimized out. This exists because
-                /// Rust doesn't currently let you do math on const generics.
-                #[inline(always)]
-                unsafe fn #helper_name(a: #arch_ty, b: #arch_ty, shift: usize) -> #arch_ty {
-                    unsafe {
+                crate::kernel!(
+                    /// This is a version of the `alignr` intrinsic that takes a non-const shift argument. The shift is still
+                    /// expected to be constant in practice, so the match statement will be optimized out. This exists because
+                    /// Rust doesn't currently let you do math on const generics.
+                    #[inline(always)]
+                    fn #helper_name(token: #token_ty, a: #arch_ty, b: #arch_ty, shift: usize) -> #arch_ty {
                         match shift {
                             #(#shifts,)*
                             _ => unreachable!()
                         }
                     }
-                }
+                );
             });
         }
 
@@ -2105,15 +2104,17 @@ impl X86 {
             // Unroll the construction of the blocks. I tried using `array::from_fn`, but the compiler thought the
             // closure was too big and didn't inline it.
             fns.push(quote! {
-                /// Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`.
-                /// Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics.
-                #[inline(always)]
-                unsafe fn #helper_name(a: [__m128i; #num_blocks], b: [__m128i; #num_blocks], shift_bytes: usize) -> [__m128i; #num_blocks] {
-                    [#({
-                        let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, #blocks_idx, shift_bytes);
-                        unsafe { dyn_alignr_128(hi, lo, shift_bytes % 16) }
-                    }),*]
-                }
+                crate::kernel!(
+                    /// Concatenates `b` and `a` (each N blocks) and extracts N blocks starting at byte offset `shift_bytes`.
+                    /// Extracts from [b : a] (b in low bytes, a in high bytes), matching `alignr` semantics.
+                    #[inline(always)]
+                    fn #helper_name(token: Sse4_2, a: [__m128i; #num_blocks], b: [__m128i; #num_blocks], shift_bytes: usize) -> [__m128i; #num_blocks] {
+                        [#({
+                            let [lo, hi] = crate::support::cross_block_slide_blocks_at(&b, &a, #blocks_idx, shift_bytes);
+                            dyn_alignr_128(token, hi, lo, shift_bytes % 16)
+                        }),*]
+                    }
+                );
             });
         }
 
@@ -2124,57 +2125,59 @@ impl X86 {
 
     fn avx2_slide_helpers() -> TokenStream {
         quote! {
-            /// Computes one output __m256i for `cross_block_alignr_*` operations.
-            ///
-            /// Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and
-            /// `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`.
-            #[inline(always)]
-            unsafe fn cross_block_alignr_one(regs: &[__m256i], block_idx: usize, shift_bytes: usize) -> __m256i {
-                let lo_idx = block_idx + (shift_bytes / 16);
-                let intra_shift = shift_bytes % 16;
-                let lo_blocks = if lo_idx & 1 == 0 {
-                    regs[lo_idx / 2]
-                } else {
-                    unsafe { _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1]) }
-                };
+            crate::kernel!(
+                /// Computes one output __m256i for `cross_block_alignr_*` operations.
+                ///
+                /// Given an array of registers, each containing two 128-bit blocks, extracts two adjacent blocks (`lo_idx` and
+                /// `hi_idx` = `lo_idx + 1`) and performs `alignr` with `intra_shift`.
+                #[inline(always)]
+                fn cross_block_alignr_one(token: Avx2, regs: &[__m256i], block_idx: usize, shift_bytes: usize) -> __m256i {
+                    let lo_idx = block_idx + (shift_bytes / 16);
+                    let intra_shift = shift_bytes % 16;
+                    let lo_blocks = if lo_idx & 1 == 0 {
+                        regs[lo_idx / 2]
+                    } else {
+                        _mm256_permute2x128_si256::<0x21>(regs[lo_idx / 2], regs[(lo_idx / 2) + 1])
+                    };
 
-                // For hi_blocks, we need blocks (`lo_idx + 1`) and (`lo_idx + 2`)
-                let hi_idx = lo_idx + 1;
-                let hi_blocks = if hi_idx & 1 == 0 {
-                    regs[hi_idx / 2]
-                } else {
-                    unsafe { _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1]) }
-                };
+                    // For hi_blocks, we need blocks (`lo_idx + 1`) and (`lo_idx + 2`)
+                    let hi_idx = lo_idx + 1;
+                    let hi_blocks = if hi_idx & 1 == 0 {
+                        regs[hi_idx / 2]
+                    } else {
+                        _mm256_permute2x128_si256::<0x21>(regs[hi_idx / 2], regs[(hi_idx / 2) + 1])
+                    };
 
-                unsafe { dyn_alignr_256(hi_blocks, lo_blocks, intra_shift) }
-            }
+                    dyn_alignr_256(token, hi_blocks, lo_blocks, intra_shift)
+                }
+            );
 
-            /// Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset
-            /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics.
-            #[inline(always)]
-            unsafe fn cross_block_alignr_256x2(a: [__m256i; 2], b: [__m256i; 2], shift_bytes: usize) -> [__m256i; 2] {
-                // Concatenation is [b : a], so b blocks come first
-                let regs = [b[0], b[1], a[0], a[1]];
+            crate::kernel!(
+                /// Concatenates `b` and `a` (each 2 x __m256i = 4 blocks) and extracts 4 blocks starting at byte offset
+                /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics.
+                #[inline(always)]
+                fn cross_block_alignr_256x2(token: Avx2, a: [__m256i; 2], b: [__m256i; 2], shift_bytes: usize) -> [__m256i; 2] {
+                    // Concatenation is [b : a], so b blocks come first
+                    let regs = [b[0], b[1], a[0], a[1]];
 
-                unsafe {
                     [
-                        cross_block_alignr_one(&regs, 0, shift_bytes),
-                        cross_block_alignr_one(&regs, 2, shift_bytes),
+                        cross_block_alignr_one(token, &regs, 0, shift_bytes),
+                        cross_block_alignr_one(token, &regs, 2, shift_bytes),
                     ]
                 }
-            }
+            );
 
-            /// Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset
-            /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics.
-            #[inline(always)]
-            unsafe fn cross_block_alignr_256x1(a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i {
-                // Concatenation is [b : a], so b comes first
-                let regs = [b, a];
+            crate::kernel!(
+                /// Concatenates `b` and `a` (each 1 x __m256i = 2 blocks) and extracts 2 blocks starting at byte offset
+                /// `shift_bytes`. Extracts from [b : a] (b in low bytes, a in high bytes), matching alignr semantics.
+                #[inline(always)]
+                fn cross_block_alignr_256x1(token: Avx2, a: __m256i, b: __m256i, shift_bytes: usize) -> __m256i {
+                    // Concatenation is [b : a], so b comes first
+                    let regs = [b, a];
 
-                unsafe {
-                    cross_block_alignr_one(&regs, 0, shift_bytes)
+                    cross_block_alignr_one(token, &regs, 0, shift_bytes)
                 }
-            }
+            );
         }
     }
 }

From 928ea6862e9f95c4007b697be3cbb3ae439e5cf6 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 16 Jun 2026 22:16:31 +0100
Subject: [PATCH 4/5] Revert "Add limited const generic support to kernel!
 macro" because it's not needed after all

This reverts commit 284299f0a30ab829d2a6ed0e45f634f8b540d4fa.
---
 fearless_simd/src/kernel_macros.rs | 99 +++---------------------------
 1 file changed, 8 insertions(+), 91 deletions(-)

diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs
index 676c9d63..a2ff2c21 100644
--- a/fearless_simd/src/kernel_macros.rs
+++ b/fearless_simd/src/kernel_macros.rs
@@ -50,8 +50,7 @@
 ///
 /// ## Limitations
 ///
-/// The macro only accepts a single plain, safe function item with simple named parameters.
-/// The function may optionally have one const generic parameter written as `<const NAME: usize>`.
+/// The macro only accepts a single plain, safe, non-generic function item with simple named parameters.
 /// However, the body of the function can be as complex as you like.
 ///
 /// The SIMD token type must be written as a bare supported name:
@@ -67,7 +66,7 @@
 macro_rules! kernel {
     (
         $(#[$meta:meta])*
-        $vis:vis fn $name:ident $(<const $const_param:ident : usize>)?(
+        $vis:vis fn $name:ident(
             $token:ident : $token_ty:ident $(, $arg:ident : $arg_ty:ty)* $(,)?
         ) $(-> $ret:ty)? {
             $($kernel_body:tt)*
@@ -76,7 +75,7 @@ macro_rules! kernel {
         $crate::__fearless_simd_kernel_dispatch! {
             $token_ty,
             $(#[$meta])*
-            $vis fn $name $(<const $const_param: usize>)?(
+            $vis fn $name(
                 $token $(, $arg: $arg_ty)*
             ) $(-> $ret)? {
                 $($kernel_body)*
@@ -86,7 +85,7 @@ macro_rules! kernel {
 
     (
         $(#[$meta:meta])*
-        $vis:vis fn $name:ident $(<const $const_param:ident : usize>)?(
+        $vis:vis fn $name:ident(
             $token:ident : $token_ty:ty $(, $arg:ident : $arg_ty:ty)* $(,)?
         ) $(-> $ret:ty)? {
             $($kernel_body:tt)*
@@ -175,7 +174,7 @@ macro_rules! __fearless_simd_kernel_impl {
         @token_ty $token_ty:ty;
         @kernel_attrs $(#[$kernel_attr:meta])*;
         $(#[$meta:meta])*
-        $vis:vis fn $name:ident $(<const $const_param:ident : usize>)?(
+        $vis:vis fn $name:ident(
             $token:ident $(, $arg:ident : $arg_ty:ty)* $(,)?
         ) $(-> $ret:ty)? {
             $($kernel_body:tt)*
@@ -183,12 +182,12 @@ macro_rules! __fearless_simd_kernel_impl {
     ) => {
         #[cfg($cfg)]
         $(#[$meta])*
-        $vis fn $name $(<const $const_param: usize>)?(
+        $vis fn $name(
             $token: $token_ty $(, $arg: $arg_ty)*
         ) $(-> $ret)? {
             #[inline] // can't use `#[inline(always)]` with target features
             $(#[$kernel_attr])*
-            fn __fearless_simd_kernel $(<const $const_param: usize>)?(
+            fn __fearless_simd_kernel(
                 $token: $token_ty $(, $arg: $arg_ty)*
             ) $(-> $ret)? {
                 let _ = $token;
@@ -197,7 +196,7 @@ macro_rules! __fearless_simd_kernel_impl {
 
             // SAFETY: the SIMD token proves that the required target features are available.
             #[allow(unused_unsafe, reason = "for WASM which has no target feature requirements and is safe to call")]
-            unsafe { __fearless_simd_kernel $(::<$const_param>)?($token $(, $arg)*) }
+            unsafe { __fearless_simd_kernel($token $(, $arg)*) }
         }
     };
 }
@@ -227,43 +226,18 @@ mod tests {
         }
     );
 
-    crate::kernel!(
-        fn add_f32x4_neon_const<const LANES: usize>(
-            neon: Neon,
-            a: float32x4_t,
-            b: float32x4_t,
-        ) -> float32x4_t {
-            let _ = LANES;
-            vaddq_f32(a, b)
-        }
-    );
-
     crate::kernel!(
         fn add_f32x4_wasm(wasm: WasmSimd128, a: v128, b: v128) -> v128 {
             f32x4_add(a, b)
         }
     );
 
-    crate::kernel!(
-        fn add_f32x4_wasm_const<const LANES: usize>(wasm: WasmSimd128, a: v128, b: v128) -> v128 {
-            let _ = LANES;
-            f32x4_add(a, b)
-        }
-    );
-
     crate::kernel!(
         fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
             _mm256_add_epi32(a, b)
         }
     );
 
-    crate::kernel!(
-        fn add_i32x8_avx2_const<const LANES: usize>(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
-            let _ = LANES;
-            _mm256_add_epi32(a, b)
-        }
-    );
-
     #[cfg(target_arch = "aarch64")]
     #[test]
     fn kernel_instantiates_for_neon() {
@@ -282,25 +256,6 @@ mod tests {
         );
     }
 
-    #[cfg(target_arch = "aarch64")]
-    #[test]
-    fn kernel_instantiates_const_generic_for_neon() {
-        let Some(neon) = crate::Level::new().as_neon() else {
-            return;
-        };
-
-        let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon);
-        let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon);
-        let sum: crate::f32x4<_> =
-            add_f32x4_neon_const::<4>(neon, a.into(), b.into()).simd_into(neon);
-
-        assert_eq!(
-            <[f32; 4]>::from(sum),
-            [11.0, 22.0, 33.0, 44.0],
-            "`kernel!` should instantiate a const-generic NEON kernel"
-        );
-    }
-
     #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
     #[test]
     fn kernel_instantiates_for_wasm_simd128() {
@@ -319,25 +274,6 @@ mod tests {
         );
     }
 
-    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
-    #[test]
-    fn kernel_instantiates_const_generic_for_wasm_simd128() {
-        let wasm = crate::Level::new()
-            .as_wasm_simd128()
-            .expect("WASM SIMD128 should be available when +simd128 is enabled");
-
-        let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm);
-        let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm);
-        let sum: crate::f32x4<_> =
-            add_f32x4_wasm_const::<4>(wasm, a.into(), b.into()).simd_into(wasm);
-
-        assert_eq!(
-            <[f32; 4]>::from(sum),
-            [11.0, 22.0, 33.0, 44.0],
-            "`kernel!` should instantiate a const-generic WASM SIMD128 kernel"
-        );
-    }
-
     #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
     #[test]
     fn kernel_instantiates_for_avx2() {
@@ -355,23 +291,4 @@ mod tests {
             "`kernel!` should instantiate a working AVX2 kernel"
         );
     }
-
-    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
-    #[test]
-    fn kernel_instantiates_const_generic_for_avx2() {
-        let Some(avx2) = crate::Level::new().as_avx2() else {
-            return;
-        };
-
-        let a: crate::i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);
-        let b: crate::i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);
-        let sum: crate::i32x8<_> =
-            add_i32x8_avx2_const::<8>(avx2, a.into(), b.into()).simd_into(avx2);
-
-        assert_eq!(
-            <[i32; 8]>::from(sum),
-            [11, 22, 33, 44, 55, 66, 77, 88],
-            "`kernel!` should instantiate a const-generic AVX2 kernel"
-        );
-    }
 }

From 05daa040a41781349e177e5d25b5e22ba6e268d1 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Tue, 16 Jun 2026 22:21:34 +0100
Subject: [PATCH 5/5] Remove unnecessary `unsafe` from WASM

---
 fearless_simd/src/generated/wasm.rs | 480 +++++++++++++---------------
 fearless_simd_gen/src/mk_wasm.rs    |   6 +-
 2 files changed, 218 insertions(+), 268 deletions(-)

diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index 7c54c052..8762cf03 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -138,17 +138,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 4usize {
             return b;
         }
-        unsafe {
-            let result = dyn_slide_128(
-                self.cvt_to_bytes_f32x4(a).val.0,
-                self.cvt_to_bytes_f32x4(b).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x4(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
-        }
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_f32x4(a).val.0,
+            self.cvt_to_bytes_f32x4(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x4(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f32x4<const SHIFT: usize>(
@@ -434,17 +432,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 16usize {
             return b;
         }
-        unsafe {
-            let result = dyn_slide_128(
-                self.cvt_to_bytes_i8x16(a).val.0,
-                self.cvt_to_bytes_i8x16(b).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x16(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
-        }
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_i8x16(a).val.0,
+            self.cvt_to_bytes_i8x16(b).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x16(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i8x16<const SHIFT: usize>(
@@ -649,17 +645,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 16usize {
             return b;
         }
-        unsafe {
-            let result = dyn_slide_128(
-                self.cvt_to_bytes_u8x16(a).val.0,
-                self.cvt_to_bytes_u8x16(b).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x16(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
-        }
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_u8x16(a).val.0,
+            self.cvt_to_bytes_u8x16(b).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x16(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u8x16<const SHIFT: usize>(
@@ -950,17 +944,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 8usize {
             return b;
         }
-        unsafe {
-            let result = dyn_slide_128(
-                self.cvt_to_bytes_i16x8(a).val.0,
-                self.cvt_to_bytes_i16x8(b).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x8(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
-        }
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_i16x8(a).val.0,
+            self.cvt_to_bytes_i16x8(b).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x8(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i16x8<const SHIFT: usize>(
@@ -1149,17 +1141,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 8usize {
             return b;
         }
-        unsafe {
-            let result = dyn_slide_128(
-                self.cvt_to_bytes_u16x8(a).val.0,
-                self.cvt_to_bytes_u16x8(b).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x8(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
-        }
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_u16x8(a).val.0,
+            self.cvt_to_bytes_u16x8(b).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x8(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u16x8<const SHIFT: usize>(
@@ -1430,17 +1420,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 4usize {
             return b;
         }
-        unsafe {
-            let result = dyn_slide_128(
-                self.cvt_to_bytes_i32x4(a).val.0,
-                self.cvt_to_bytes_i32x4(b).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x4(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
-        }
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_i32x4(a).val.0,
+            self.cvt_to_bytes_i32x4(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x4(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i32x4<const SHIFT: usize>(
@@ -1633,17 +1621,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 4usize {
             return b;
         }
-        unsafe {
-            let result = dyn_slide_128(
-                self.cvt_to_bytes_u32x4(a).val.0,
-                self.cvt_to_bytes_u32x4(b).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x4(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
-        }
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_u32x4(a).val.0,
+            self.cvt_to_bytes_u32x4(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x4(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u32x4<const SHIFT: usize>(
@@ -1914,17 +1900,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 2usize {
             return b;
         }
-        unsafe {
-            let result = dyn_slide_128(
-                self.cvt_to_bytes_f64x2(a).val.0,
-                self.cvt_to_bytes_f64x2(b).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x2(u8x16 {
-                val: crate::support::Aligned128(result),
-                simd: self,
-            })
-        }
+        let result = dyn_slide_128(
+            self.cvt_to_bytes_f64x2(a).val.0,
+            self.cvt_to_bytes_f64x2(b).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x2(u8x16 {
+            val: crate::support::Aligned128(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f64x2<const SHIFT: usize>(
@@ -2255,17 +2239,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 8usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x2(
-                self.cvt_to_bytes_f32x8(a).val.0,
-                self.cvt_to_bytes_f32x8(b).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_f32x8(a).val.0,
+            self.cvt_to_bytes_f32x8(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f32x8<const SHIFT: usize>(
@@ -2627,17 +2609,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 32usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x2(
-                self.cvt_to_bytes_i8x32(a).val.0,
-                self.cvt_to_bytes_i8x32(b).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_i8x32(a).val.0,
+            self.cvt_to_bytes_i8x32(b).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i8x32<const SHIFT: usize>(
@@ -2906,17 +2886,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 32usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x2(
-                self.cvt_to_bytes_u8x32(a).val.0,
-                self.cvt_to_bytes_u8x32(b).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x32(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_u8x32(a).val.0,
+            self.cvt_to_bytes_u8x32(b).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x32(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u8x32<const SHIFT: usize>(
@@ -3293,17 +3271,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 16usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x2(
-                self.cvt_to_bytes_i16x16(a).val.0,
-                self.cvt_to_bytes_i16x16(b).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_i16x16(a).val.0,
+            self.cvt_to_bytes_i16x16(b).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i16x16<const SHIFT: usize>(
@@ -3572,17 +3548,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 16usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x2(
-                self.cvt_to_bytes_u16x16(a).val.0,
-                self.cvt_to_bytes_u16x16(b).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x16(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_u16x16(a).val.0,
+            self.cvt_to_bytes_u16x16(b).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x16(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u16x16<const SHIFT: usize>(
@@ -3968,17 +3942,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 8usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x2(
-                self.cvt_to_bytes_i32x8(a).val.0,
-                self.cvt_to_bytes_i32x8(b).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_i32x8(a).val.0,
+            self.cvt_to_bytes_i32x8(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i32x8<const SHIFT: usize>(
@@ -4252,17 +4224,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 8usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x2(
-                self.cvt_to_bytes_u32x8(a).val.0,
-                self.cvt_to_bytes_u32x8(b).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x8(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_u32x8(a).val.0,
+            self.cvt_to_bytes_u32x8(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x8(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u32x8<const SHIFT: usize>(
@@ -4636,17 +4606,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 4usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x2(
-                self.cvt_to_bytes_f64x4(a).val.0,
-                self.cvt_to_bytes_f64x4(b).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x4(u8x32 {
-                val: crate::support::Aligned256(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x2(
+            self.cvt_to_bytes_f64x4(a).val.0,
+            self.cvt_to_bytes_f64x4(b).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x4(u8x32 {
+            val: crate::support::Aligned256(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f64x4<const SHIFT: usize>(
@@ -5074,17 +5042,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 16usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x4(
-                self.cvt_to_bytes_f32x16(a).val.0,
-                self.cvt_to_bytes_f32x16(b).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_f32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_f32x16(a).val.0,
+            self.cvt_to_bytes_f32x16(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_f32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f32x16<const SHIFT: usize>(
@@ -5484,17 +5450,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 64usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x4(
-                self.cvt_to_bytes_i8x64(a).val.0,
-                self.cvt_to_bytes_i8x64(b).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_i8x64(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_i8x64(a).val.0,
+            self.cvt_to_bytes_i8x64(b).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_i8x64(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i8x64<const SHIFT: usize>(
@@ -5756,17 +5720,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 64usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x4(
-                self.cvt_to_bytes_u8x64(a).val.0,
-                self.cvt_to_bytes_u8x64(b).val.0,
-                SHIFT,
-            );
-            self.cvt_from_bytes_u8x64(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_u8x64(a).val.0,
+            self.cvt_to_bytes_u8x64(b).val.0,
+            SHIFT,
+        );
+        self.cvt_from_bytes_u8x64(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u8x64<const SHIFT: usize>(
@@ -6193,17 +6155,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 32usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x4(
-                self.cvt_to_bytes_i16x32(a).val.0,
-                self.cvt_to_bytes_i16x32(b).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_i16x32(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_i16x32(a).val.0,
+            self.cvt_to_bytes_i16x32(b).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_i16x32(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i16x32<const SHIFT: usize>(
@@ -6474,17 +6434,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 32usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x4(
-                self.cvt_to_bytes_u16x32(a).val.0,
-                self.cvt_to_bytes_u16x32(b).val.0,
-                SHIFT * 2usize,
-            );
-            self.cvt_from_bytes_u16x32(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_u16x32(a).val.0,
+            self.cvt_to_bytes_u16x32(b).val.0,
+            SHIFT * 2usize,
+        );
+        self.cvt_from_bytes_u16x32(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u16x32<const SHIFT: usize>(
@@ -6909,17 +6867,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 16usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x4(
-                self.cvt_to_bytes_i32x16(a).val.0,
-                self.cvt_to_bytes_i32x16(b).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_i32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_i32x16(a).val.0,
+            self.cvt_to_bytes_i32x16(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_i32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_i32x16<const SHIFT: usize>(
@@ -7186,17 +7142,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 16usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x4(
-                self.cvt_to_bytes_u32x16(a).val.0,
-                self.cvt_to_bytes_u32x16(b).val.0,
-                SHIFT * 4usize,
-            );
-            self.cvt_from_bytes_u32x16(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_u32x16(a).val.0,
+            self.cvt_to_bytes_u32x16(b).val.0,
+            SHIFT * 4usize,
+        );
+        self.cvt_from_bytes_u32x16(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_u32x16<const SHIFT: usize>(
@@ -7601,17 +7555,15 @@ impl Simd for WasmSimd128 {
         if SHIFT >= 8usize {
             return b;
         }
-        unsafe {
-            let result = cross_block_slide_128x4(
-                self.cvt_to_bytes_f64x8(a).val.0,
-                self.cvt_to_bytes_f64x8(b).val.0,
-                SHIFT * 8usize,
-            );
-            self.cvt_from_bytes_f64x8(u8x64 {
-                val: crate::support::Aligned512(result),
-                simd: self,
-            })
-        }
+        let result = cross_block_slide_128x4(
+            self.cvt_to_bytes_f64x8(a).val.0,
+            self.cvt_to_bytes_f64x8(b).val.0,
+            SHIFT * 8usize,
+        );
+        self.cvt_from_bytes_f64x8(u8x64 {
+            val: crate::support::Aligned512(result),
+            simd: self,
+        })
     }
     #[inline(always)]
     fn slide_within_blocks_f64x8<const SHIFT: usize>(
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index 88ef9e44..eb1b4915 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -478,10 +478,8 @@ impl Level for WasmSimd128 {
                             return b;
                         }
 
-                        unsafe {
-                            let result = #slide_op(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift);
-                            self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
-                        }
+                        let result = #slide_op(self.#to_bytes(a).val.0, self.#to_bytes(b).val.0, #byte_shift);
+                        self.#from_bytes(#combined_bytes { val: #block_wrapper(result), simd: self })
                     }
                 }
             }