diff --git a/fearless_simd/examples/play.rs b/fearless_simd/examples/play.rs
index d2d5372b5..4d4209ae5 100644
--- a/fearless_simd/examples/play.rs
+++ b/fearless_simd/examples/play.rs
@@ -30,26 +30,10 @@ fn foo<S: Simd>(simd: S, x: f32) -> f32 {
     simd.splat_f32x4(x).sqrt()[0]
 }
 
-// currently requires `safe_wrappers` feature
-fn do_something_on_neon(_level: Level) -> f32 {
-    #[cfg(all(feature = "safe_wrappers", target_arch = "aarch64"))]
-    if let Some(neon) = _level.as_neon() {
-        return neon.vectorize(
-            #[inline(always)]
-            || {
-                let v = neon.neon.vdupq_n_f32(42.0);
-                neon.neon.vgetq_lane_f32::<0>(v)
-            },
-        );
-    }
-    0.0
-}
-
 fn main() {
     let level = Level::new();
     let x = level.dispatch(Foo);
     let y = dispatch!(level, simd => foo(simd, 42.0));
-    let z = do_something_on_neon(level);
 
-    println!("level = {level:?}, x = {x}, y = {y}, z = {z}");
+    println!("level = {level:?}, x = {x}, y = {y}");
 }
diff --git a/fearless_simd/examples/srgb.rs b/fearless_simd/examples/srgb.rs
index ca40533e1..2eb91c025 100644
--- a/fearless_simd/examples/srgb.rs
+++ b/fearless_simd/examples/srgb.rs
@@ -1,53 +1,68 @@
 // Copyright 2024 the Fearless_SIMD Authors
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
-#![expect(
-    missing_docs,
-    reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
-)]
+//! Converts a single RGBA pixel from linear RGB to sRGB.
+//!
+//! This example demonstrates the usual Fearless SIMD structure:
+//!
+//! - write the main computation as an `#[inline(always)]` function generic over
+//!   [`Simd`];
+//! - use [`dispatch!`] at the non-SIMD boundary to run it with the best
+//!   available target features;
+//! - drop down to [`kernel!`](fearless_simd::kernel) when a small part of the
+//!   computation needs a target-specific intrinsic.
+//!
+//! The RGB channels are converted with portable SIMD operations. The alpha
+//! channel is copied unchanged, using an architecture-specific lane-copy
+//! intrinsic if one is available and a scalar fallback otherwise.
 
 use fearless_simd::{Level, dispatch, f32x4, prelude::*};
 
-// This block shows how to use safe wrappers for compile-time enforcement
-// of using valid SIMD intrinsics.
-#[cfg(feature = "safe_wrappers")]
-#[inline(always)]
-fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
-    // #[cfg(target_arch = "x86_64")]
-    // if let Some(avx2) = a.simd.level().as_avx2() {
-    //     return avx2
-    //         .sse4_1
-    //         ._mm_blend_ps::<8>(a.into(), b.into())
-    //         .simd_into(a.simd);
-    // }
-    #[cfg(target_arch = "aarch64")]
-    if let Some(neon) = a.simd.level().as_neon() {
-        return neon
-            .neon
-            .vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
-            .simd_into(a.simd);
+#[cfg(target_arch = "aarch64")]
+use core::arch::aarch64::{float32x4_t, vcopyq_laneq_f32};
+#[cfg(target_arch = "x86")]
+use core::arch::x86::{__m128, _mm_blend_ps};
+#[cfg(target_arch = "x86_64")]
+use core::arch::x86_64::{__m128, _mm_blend_ps};
+
+fearless_simd::kernel! {
+    /// Copy the alpha lane on AArch64 using a NEON lane-copy intrinsic.
+    #[inline]
+    fn copy_alpha_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t {
+        vcopyq_laneq_f32::<3, 3>(a, b)
     }
-    let mut result = a;
-    result[3] = b[3];
-    result
 }
 
-// This block lets the example compile without safe wrappers.
-#[cfg(not(feature = "safe_wrappers"))]
+fearless_simd::kernel! {
+    /// Copy the alpha lane on x86 using the SSE4.2 token to enable SSE4.1 blend instructions.
+    #[inline]
+    fn copy_alpha_sse4_2(sse4_2: Sse4_2, a: __m128, b: __m128) -> __m128 {
+        _mm_blend_ps::<8>(a, b)
+    }
+}
+
+/// Return `a` with its alpha channel replaced by `b`'s alpha channel.
+///
+/// This helper shows how portable SIMD code can opportunistically call
+/// target-specific kernels while still providing a fallback for every backend.
 #[inline(always)]
 fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    if let Some(sse4_2) = a.simd.level().as_sse4_2() {
+        return copy_alpha_sse4_2(sse4_2, a.into(), b.into()).simd_into(a.simd);
+    }
+
     #[cfg(target_arch = "aarch64")]
-    if let Some(_neon) = a.simd.level().as_neon() {
-        unsafe {
-            return core::arch::aarch64::vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
-                .simd_into(a.simd);
-        }
+    if let Some(neon) = a.simd.level().as_neon() {
+        return copy_alpha_neon(neon, a.into(), b.into()).simd_into(a.simd);
     }
+
     let mut result = a;
     result[3] = b[3];
     result
 }
 
+/// Approximate the linear-RGB to sRGB transfer curve for RGB, preserving alpha.
 #[inline(always)]
 fn to_srgb<S: Simd>(simd: S, rgba: [f32; 4]) -> [f32; 4] {
     let v: f32x4<S> = rgba.simd_into(simd);
diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs
new file mode 100644
index 000000000..58c794cf3
--- /dev/null
+++ b/fearless_simd/src/kernel_macros.rs
@@ -0,0 +1,296 @@
+// Copyright 2025 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+/// Creates a context where you can safely call intrinsics
+/// available at the SIMD level named by the function's first argument.
+///
+/// This is useful if the portable abstractions are not enough, and you need to
+/// use platform-specific intrinsics for parts of the computation.
+///
+/// The first argument must be a SIMD token written as `token: Neon`,
+/// `token: WasmSimd128`, `token: Sse4_2`, or `token: Avx2`.
+/// The generated wrapper uses the corresponding `$crate::` token type in its
+/// actual signature.
+///
+/// For levels with runtime-detected target features, the macro runs your body
+/// inside an inner function annotated with the appropriate `#[target_feature]`
+/// attributes. That makes platform-specific intrinsics from `core::arch` or
+/// `std::arch` safe to call in the body, as long as they do not have safety
+/// requirements beyond those target features.
+///
+/// ## Example
+///
+/// ```rust
+/// # #[allow(unused_imports)]
+/// use fearless_simd::{i32x8, prelude::*};
+/// #[cfg(target_arch = "x86")]
+/// use std::arch::x86::{__m256i, _mm256_add_epi32};
+/// #[cfg(target_arch = "x86_64")]
+/// use std::arch::x86_64::{__m256i, _mm256_add_epi32};
+///
+/// fearless_simd::kernel! {
+///     fn add_i32x8(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
+///         _mm256_add_epi32(a, b)
+///     }
+/// }
+///
+/// # fn main() {
+/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+/// if let Some(avx2) = fearless_simd::Level::new().as_avx2() {
+///     let a: i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);
+///     let b: i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);
+///     let sum: i32x8<_> = add_i32x8(avx2, a.into(), b.into()).simd_into(avx2);
+///
+///     assert_eq!(<[i32; 8]>::from(sum), [11, 22, 33, 44, 55, 66, 77, 88]);
+/// }
+/// # }
+/// ```
+///
+/// See the [sRGB example] for an end-to-end use of kernel macros.
+///
+/// [sRGB example]: https://github.com/linebender/fearless_simd/blob/main/fearless_simd/examples/srgb.rs
+///
+/// ## Limitations
+///
+/// The macro only accepts a single plain, safe, non-generic function item with simple named parameters.
+/// However, the body of the function can be as complex as you like.
+///
+/// The SIMD token type must be written as a bare supported name:
+/// literally `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`. No paths or aliases.
+///
+/// For soundness, this macro only accepts safe functions.
+///
+/// ```compile_fail
+/// fearless_simd::kernel! {
+///     unsafe fn should_not_compile(avx2: Avx2) {}
+/// }
+#[macro_export]
+macro_rules! kernel {
+    (
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $token:ident : $token_ty:ident $(, $arg:ident : $arg_ty:ty)* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        $crate::__fearless_simd_kernel_dispatch! {
+            $token_ty,
+            $(#[$meta])*
+            $vis fn $name(
+                $token $(, $arg: $arg_ty)*
+            ) $(-> $ret)? {
+                $($kernel_body)*
+            }
+        }
+    };
+
+    (
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $token:ident : $token_ty:ty $(, $arg:ident : $arg_ty:ty)* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        compile_error!(concat!(
+            "fearless_simd::kernel! expects its SIMD token argument type to be written as ",
+            "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
+            stringify!($token_ty),
+            "`",
+        ));
+    };
+}
+
+#[doc(hidden)]
+#[macro_export]
+macro_rules! __fearless_simd_kernel_dispatch {
+    (
+        Neon,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg target_arch = "aarch64";
+            @token_ty $crate::Neon;
+            @kernel_attrs #[target_feature(enable = "neon")];
+            $($body)*
+        }
+    };
+
+    (
+        WasmSimd128,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg all(target_arch = "wasm32", target_feature = "simd128");
+            @token_ty $crate::WasmSimd128;
+            @kernel_attrs;
+            $($body)*
+        }
+    };
+
+    (
+        Sse4_2,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg any(target_arch = "x86", target_arch = "x86_64");
+            @token_ty $crate::Sse4_2;
+            @kernel_attrs #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")];
+            $($body)*
+        }
+    };
+
+    (
+        Avx2,
+        $($body:tt)*
+    ) => {
+        $crate::__fearless_simd_kernel_impl! {
+            @cfg any(target_arch = "x86", target_arch = "x86_64");
+            @token_ty $crate::Avx2;
+            @kernel_attrs #[target_feature(
+                enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave"
+            )];
+            $($body)*
+        }
+    };
+
+    (
+        $token_ty:ident,
+        $($body:tt)*
+    ) => {
+        compile_error!(concat!(
+            "fearless_simd::kernel! expects its SIMD token argument type to be written as ",
+            "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
+            stringify!($token_ty),
+            "`",
+        ));
+    };
+}
+
+#[doc(hidden)]
+#[macro_export]
+macro_rules! __fearless_simd_kernel_impl {
+    (
+        @cfg $cfg:meta;
+        @token_ty $token_ty:ty;
+        @kernel_attrs $(#[$kernel_attr:meta])*;
+        $(#[$meta:meta])*
+        $vis:vis fn $name:ident(
+            $token:ident $(, $arg:ident : $arg_ty:ty)* $(,)?
+        ) $(-> $ret:ty)? {
+            $($kernel_body:tt)*
+        }
+    ) => {
+        #[cfg($cfg)]
+        $(#[$meta])*
+        $vis fn $name(
+            $token: $token_ty $(, $arg: $arg_ty)*
+        ) $(-> $ret)? {
+            #[inline] // can't use `#[inline(always)]` with target features
+            $(#[$kernel_attr])*
+            fn __fearless_simd_kernel(
+                $token: $token_ty $(, $arg: $arg_ty)*
+            ) $(-> $ret)? {
+                let _ = $token;
+                $($kernel_body)*
+            }
+
+            // SAFETY: the SIMD token proves that the required target features are available.
+            #[allow(unused_unsafe, reason = "for WASM which has no target feature requirements and is safe to call")]
+            unsafe { __fearless_simd_kernel($token $(, $arg)*) }
+        }
+    };
+}
+
+#[cfg(test)]
+mod tests {
+    #[cfg(any(
+        target_arch = "aarch64",
+        target_arch = "x86",
+        target_arch = "x86_64",
+        all(target_arch = "wasm32", target_feature = "simd128")
+    ))]
+    use crate::prelude::*;
+
+    #[cfg(target_arch = "aarch64")]
+    use core::arch::aarch64::{float32x4_t, vaddq_f32};
+    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+    use core::arch::wasm32::{f32x4_add, v128};
+    #[cfg(target_arch = "x86")]
+    use core::arch::x86::{__m256i, _mm256_add_epi32};
+    #[cfg(target_arch = "x86_64")]
+    use core::arch::x86_64::{__m256i, _mm256_add_epi32};
+
+    crate::kernel! {
+        fn add_f32x4_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t {
+            vaddq_f32(a, b)
+        }
+    }
+
+    crate::kernel! {
+        fn add_f32x4_wasm(wasm: WasmSimd128, a: v128, b: v128) -> v128 {
+            f32x4_add(a, b)
+        }
+    }
+
+    crate::kernel! {
+        fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
+            _mm256_add_epi32(a, b)
+        }
+    }
+
+    #[cfg(target_arch = "aarch64")]
+    #[test]
+    fn kernel_instantiates_for_neon() {
+        let Some(neon) = crate::Level::new().as_neon() else {
+            return;
+        };
+
+        let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon);
+        let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon);
+        let sum: crate::f32x4<_> = add_f32x4_neon(neon, a.into(), b.into()).simd_into(neon);
+
+        assert_eq!(
+            <[f32; 4]>::from(sum),
+            [11.0, 22.0, 33.0, 44.0],
+            "`kernel!` should instantiate a working NEON kernel"
+        );
+    }
+
+    #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
+    #[test]
+    fn kernel_instantiates_for_wasm_simd128() {
+        let wasm = crate::Level::new()
+            .as_wasm_simd128()
+            .expect("WASM SIMD128 should be available when +simd128 is enabled");
+
+        let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm);
+        let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm);
+        let sum: crate::f32x4<_> = add_f32x4_wasm(wasm, a.into(), b.into()).simd_into(wasm);
+
+        assert_eq!(
+            <[f32; 4]>::from(sum),
+            [11.0, 22.0, 33.0, 44.0],
+            "`kernel!` should instantiate a working WASM SIMD128 kernel"
+        );
+    }
+
+    #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
+    #[test]
+    fn kernel_instantiates_for_avx2() {
+        let Some(avx2) = crate::Level::new().as_avx2() else {
+            return;
+        };
+
+        let a: crate::i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);
+        let b: crate::i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);
+        let sum: crate::i32x8<_> = add_i32x8_avx2(avx2, a.into(), b.into()).simd_into(avx2);
+
+        assert_eq!(
+            <[i32; 8]>::from(sum),
+            [11, 22, 33, 44, 55, 66, 77, 88],
+            "`kernel!` should instantiate a working AVX2 kernel"
+        );
+    }
+}
diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs
index 10041217a..145df9f19 100644
--- a/fearless_simd/src/lib.rs
+++ b/fearless_simd/src/lib.rs
@@ -148,6 +148,7 @@ pub mod core_arch;
 mod impl_macros;
 
 mod generated;
+mod kernel_macros;
 mod macros;
 mod support;
 mod traits;