linebender · Shnatsel · May 19, 2026 · May 18, 2026 · May 18, 2026 · May 18, 2026
diff --git a/fearless_simd/examples/srgb.rs b/fearless_simd/examples/srgb.rs
@@ -8,41 +8,39 @@
 
 use fearless_simd::{Level, dispatch, f32x4, prelude::*};
 
-// This block shows how to use safe wrappers for compile-time enforcement
-// of using valid SIMD intrinsics.
-#[cfg(feature = "safe_wrappers")]
-#[inline(always)]
-fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
-    // #[cfg(target_arch = "x86_64")]
-    // if let Some(avx2) = a.simd.level().as_avx2() {
-    //     return avx2
-    //         .sse4_1
-    //         ._mm_blend_ps::<8>(a.into(), b.into())
-    //         .simd_into(a.simd);
-    // }
-    #[cfg(target_arch = "aarch64")]
-    if let Some(neon) = a.simd.level().as_neon() {
-        return neon
-            .neon
-            .vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
-            .simd_into(a.simd);
+#[cfg(target_arch = "aarch64")]
+use core::arch::aarch64::{float32x4_t, vcopyq_laneq_f32};
+#[cfg(target_arch = "x86")]
+use core::arch::x86::{__m128, _mm_blend_ps};
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::{__m128, _mm_blend_ps};
+
+fearless_simd::kernel! {
+    #[inline]
+    fn copy_alpha_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t {
+        vcopyq_laneq_f32::<3, 3>(a, b)
+    }
+}
+
+fearless_simd::kernel! {
+    #[inline]
+    fn copy_alpha_sse4_2(sse4_2: Sse4_2, a: __m128, b: __m128) -> __m128 {
+        _mm_blend_ps::<8>(a, b)
     }
-    let mut result = a;
-    result[3] = b[3];
-    result
 }
 
-// This block lets the example compile without safe wrappers.
-#[cfg(not(feature = "safe_wrappers"))]
 #[inline(always)]
 fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    if let Some(sse4_2) = a.simd.level().as_sse4_2() {
+        return copy_alpha_sse4_2(sse4_2, a.into(), b.into()).simd_into(a.simd);
+    }
+
     #[cfg(target_arch = "aarch64")]
-    if let Some(_neon) = a.simd.level().as_neon() {
-        unsafe {
-            return core::arch::aarch64::vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
-                .simd_into(a.simd);
-        }
+    if let Some(neon) = a.simd.level().as_neon() {
+        return copy_alpha_neon(neon, a.into(), b.into()).simd_into(a.simd);
     }
+
     let mut result = a;
     result[3] = b[3];
     result

diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs
@@ -0,0 +1,296 @@
+// Copyright 2025 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+/// Creates a context where you can safely call intrinsics
+/// available at the SIMD level named by the function's first argument.
+///
+/// This is useful if the portable abstractions are not enough, and you need to
+/// use platform-specific intrinsics for parts of the computation.
+///
+/// The first argument must be a SIMD token written as `token: Neon`,
+/// `token: WasmSimd128`, `token: Sse4_2`, or `token: Avx2`.
+/// The generated wrapper uses the corresponding `$crate::` token type in its
+/// actual signature.
+///
+/// For levels with runtime-detected target features, the macro runs your body
+/// inside an inner function annotated with the appropriate `#[target_feature]`
+/// attributes. That makes platform-specific intrinsics from `core::arch` or
+/// `std::arch` safe to call in the body, as long as they do not have safety
+/// requirements beyond those target features.
+///
+/// ## Example
+///
+/// ```rust
+/// # #[allow(unused_imports)]
+/// use fearless_simd::{i32x8, prelude::*};
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::{__m256i, _mm256_add_epi32};
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::{__m256i, _mm256_add_epi32};
+///
+/// fearless_simd::kernel! {
+///     fn add_i32x8(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
+///         _mm256_add_epi32(a, b)
+///     }
+/// }
+///
+/// # fn main() {
+/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+/// if let Some(avx2) = fearless_simd::Level::new().as_avx2() {
+///     let a: i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);
+///     let b: i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);
+///     let sum: i32x8<_> = add_i32x8(avx2, a.into(), b.into()).simd_into(avx2);
+///
+///     assert_eq!(<[i32; 8]>::from(sum), [11, 22, 33, 44, 55, 66, 77, 88]);
+/// }
+/// # }
+/// ```
+///
+/// See the [sRGB example] for an end-to-end use of kernel macros.
+///
+/// [sRGB example]: https://github.com/linebender/fearless_simd/blob/main/fearless_simd/examples/srgb.rs
+///
+/// ## Limitations
+///
+/// The macro only accepts a single plain, safe, non-generic function item with simple named parameters.
+/// However, the body of the function can be as complex as you like.
+///
+/// The SIMD token type must be written as a bare supported name:
+/// literally `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`. No paths or aliases.
+///
+/// For soundness, this macro only accepts safe functions.
+///
+/// ```compile_fail
+/// fearless_simd::kernel! {
+///     unsafe fn should_not_compile(avx2: Avx2) {}
+/// }
+#[macro_export]
+macro_rules! kernel {
+    (
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $token:ident : $token_ty:ident $(, $arg:ident : $arg_ty:ty)* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        $crate::__fearless_simd_kernel_dispatch! {
+            $token_ty,
+            $(#[$meta])*
+            $vis fn $name(
+                $token $(, $arg: $arg_ty)*
+            ) $(-> $ret)? {
+                $($kernel_body)*
+            }
+        }
+    };
+
+    (
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $token:ident : $token_ty:ty $(, $arg:ident : $arg_ty:ty)* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        compile_error!(concat!(
+            "fearless_simd::kernel! expects its SIMD token argument type to be written as ",
+            "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
+            stringify!($token_ty),
+            "`",
+        ));
+    };
+}
+
+#[doc(hidden)]
+#[macro_export]
+macro_rules! __fearless_simd_kernel_dispatch {
+    (
+        Neon,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg target_arch = "aarch64";
+            @token_ty $crate::Neon;
+            @kernel_attrs #[target_feature(enable = "neon")];
+            $($body)*
+        }
+    };
+
+    (
+        WasmSimd128,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg all(target_arch = "wasm32", target_feature = "simd128");
+            @token_ty $crate::WasmSimd128;
+            @kernel_attrs;
+            $($body)*
+        }
+    };
+
+    (
+        Sse4_2,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg any(target_arch = "x86", target_arch = "x86_64");
+            @token_ty $crate::Sse4_2;
+            @kernel_attrs #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")];
+            $($body)*
+        }
+    };
+
+    (
+        Avx2,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg any(target_arch = "x86", target_arch = "x86_64");
+            @token_ty $crate::Avx2;
+            @kernel_attrs #[target_feature(
+                enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave"
+            )];
+            $($body)*
+        }
+    };
+
+    (
+        $token_ty:ident,
+        $($body:tt)*
+    ) => {
+        compile_error!(concat!(
+            "fearless_simd::kernel! expects its SIMD token argument type to be written as ",
+            "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
+            stringify!($token_ty),
+            "`",
+        ));
+    };
+}
+
+#[doc(hidden)]
+#[macro_export]
+macro_rules! __fearless_simd_kernel_impl {
+    (
+        @cfg $cfg:meta;
+        @token_ty $token_ty:ty;
+        @kernel_attrs $(#[$kernel_attr:meta])*;
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $token:ident $(, $arg:ident : $arg_ty:ty)* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        #[cfg($cfg)]
+        $(#[$meta])*
+        $vis fn $name(
+            $token: $token_ty $(, $arg: $arg_ty)*
+        ) $(-> $ret)? {
+            #[inline] // can't use `#[inline(always)]` with target features
+            $(#[$kernel_attr])*
+            fn __fearless_simd_kernel(
+                $token: $token_ty $(, $arg: $arg_ty)*
+            ) $(-> $ret)? {
+                let _ = $token;
+                $($kernel_body)*
+            }
+
+            // SAFETY: the SIMD token proves that the required target features are available.
+            #[allow(unused_unsafe, reason = "for WASM which has no target feature requirements and is safe to call")]
+            unsafe { __fearless_simd_kernel($token $(, $arg)*) }
+        }
+    };
+}
+
+#[cfg(test)]
+mod tests {
+    #[cfg(any(
+        target_arch = "aarch64",
+        target_arch = "x86",
+        target_arch = "x86_64",
+        all(target_arch = "wasm32", target_feature = "simd128")
+    ))]
+    use crate::prelude::*;
+
+    #[cfg(target_arch = "aarch64")]
+    use core::arch::aarch64::{float32x4_t, vaddq_f32};
+    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+    use core::arch::wasm32::{f32x4_add, v128};
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::{__m256i, _mm256_add_epi32};
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::{__m256i, _mm256_add_epi32};
+
+    crate::kernel! {
+        fn add_f32x4_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t {
+            vaddq_f32(a, b)
+        }
+    }
+
+    crate::kernel! {
+        fn add_f32x4_wasm(wasm: WasmSimd128, a: v128, b: v128) -> v128 {
+            f32x4_add(a, b)
+        }
+    }
+
+    crate::kernel! {
+        fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
+            _mm256_add_epi32(a, b)
+        }
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    #[test]
+    fn kernel_instantiates_for_neon() {
+        let Some(neon) = crate::Level::new().as_neon() else {
+            return;
+        };
+
+        let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon);
+        let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon);
+        let sum: crate::f32x4<_> = add_f32x4_neon(neon, a.into(), b.into()).simd_into(neon);
+
+        assert_eq!(
+            <[f32; 4]>::from(sum),
+            [11.0, 22.0, 33.0, 44.0],
+            "`kernel!` should instantiate a working NEON kernel"
+        );
+    }
+
+    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+    #[test]
+    fn kernel_instantiates_for_wasm_simd128() {
+        let wasm = crate::Level::new()
+            .as_wasm_simd128()
+            .expect("WASM SIMD128 should be available when +simd128 is enabled");
+
+        let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm);
+        let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm);
+        let sum: crate::f32x4<_> = add_f32x4_wasm(wasm, a.into(), b.into()).simd_into(wasm);
+
+        assert_eq!(
+            <[f32; 4]>::from(sum),
+            [11.0, 22.0, 33.0, 44.0],
+            "`kernel!` should instantiate a working WASM SIMD128 kernel"
+        );
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[test]
+    fn kernel_instantiates_for_avx2() {
+        let Some(avx2) = crate::Level::new().as_avx2() else {
+            return;
+        };
+
+        let a: crate::i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);
+        let b: crate::i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);
+        let sum: crate::i32x8<_> = add_i32x8_avx2(avx2, a.into(), b.into()).simd_into(avx2);
+
+        assert_eq!(
+            <[i32; 8]>::from(sum),
+            [11, 22, 33, 44, 55, 66, 77, 88],
+            "`kernel!` should instantiate a working AVX2 kernel"
+        );
+    }
+}
diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs
@@ -148,6 +148,7 @@ pub mod core_arch;
 mod impl_macros;
 
 mod generated;
+mod kernel_macros;
 mod macros;
 mod support;
 mod traits;