diff --git a/fearless_simd/examples/play.rs b/fearless_simd/examples/play.rs index d2d5372b5..4d4209ae5 100644 --- a/fearless_simd/examples/play.rs +++ b/fearless_simd/examples/play.rs @@ -30,26 +30,10 @@ fn foo(simd: S, x: f32) -> f32 { simd.splat_f32x4(x).sqrt()[0] } -// currently requires `safe_wrappers` feature -fn do_something_on_neon(_level: Level) -> f32 { - #[cfg(all(feature = "safe_wrappers", target_arch = "aarch64"))] - if let Some(neon) = _level.as_neon() { - return neon.vectorize( - #[inline(always)] - || { - let v = neon.neon.vdupq_n_f32(42.0); - neon.neon.vgetq_lane_f32::<0>(v) - }, - ); - } - 0.0 -} - fn main() { let level = Level::new(); let x = level.dispatch(Foo); let y = dispatch!(level, simd => foo(simd, 42.0)); - let z = do_something_on_neon(level); - println!("level = {level:?}, x = {x}, y = {y}, z = {z}"); + println!("level = {level:?}, x = {x}, y = {y}"); } diff --git a/fearless_simd/examples/srgb.rs b/fearless_simd/examples/srgb.rs index ca40533e1..2eb91c025 100644 --- a/fearless_simd/examples/srgb.rs +++ b/fearless_simd/examples/srgb.rs @@ -1,53 +1,68 @@ // Copyright 2024 the Fearless_SIMD Authors // SPDX-License-Identifier: Apache-2.0 OR MIT -#![expect( - missing_docs, - reason = "TODO: https://github.com/linebender/fearless_simd/issues/40" -)] +//! Converts a single RGBA pixel from linear RGB to sRGB. +//! +//! This example demonstrates the usual Fearless SIMD structure: +//! +//! - write the main computation as an `#[inline(always)]` function generic over +//! [`Simd`]; +//! - use [`dispatch!`] at the non-SIMD boundary to run it with the best +//! available target features; +//! - drop down to [`kernel!`](fearless_simd::kernel) when a small part of the +//! computation needs a target-specific intrinsic. +//! +//! The RGB channels are converted with portable SIMD operations. The alpha +//! channel is copied unchanged, using an architecture-specific lane-copy +//! intrinsic if one is available and a scalar fallback otherwise. use fearless_simd::{Level, dispatch, f32x4, prelude::*}; -// This block shows how to use safe wrappers for compile-time enforcement -// of using valid SIMD intrinsics. -#[cfg(feature = "safe_wrappers")] -#[inline(always)] -fn copy_alpha(a: f32x4, b: f32x4) -> f32x4 { - // #[cfg(target_arch = "x86_64")] - // if let Some(avx2) = a.simd.level().as_avx2() { - // return avx2 - // .sse4_1 - // ._mm_blend_ps::<8>(a.into(), b.into()) - // .simd_into(a.simd); - // } - #[cfg(target_arch = "aarch64")] - if let Some(neon) = a.simd.level().as_neon() { - return neon - .neon - .vcopyq_laneq_f32::<3, 3>(a.into(), b.into()) - .simd_into(a.simd); +#[cfg(target_arch = "aarch64")] +use core::arch::aarch64::{float32x4_t, vcopyq_laneq_f32}; +#[cfg(target_arch = "x86")] +use core::arch::x86::{__m128, _mm_blend_ps}; +#[cfg(target_arch = "x86_64")] +use core::arch::x86_64::{__m128, _mm_blend_ps}; + +fearless_simd::kernel! { + /// Copy the alpha lane on AArch64 using a NEON lane-copy intrinsic. + #[inline] + fn copy_alpha_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t { + vcopyq_laneq_f32::<3, 3>(a, b) } - let mut result = a; - result[3] = b[3]; - result } -// This block lets the example compile without safe wrappers. -#[cfg(not(feature = "safe_wrappers"))] +fearless_simd::kernel! { + /// Copy the alpha lane on x86 using the SSE4.2 token to enable SSE4.1 blend instructions. + #[inline] + fn copy_alpha_sse4_2(sse4_2: Sse4_2, a: __m128, b: __m128) -> __m128 { + _mm_blend_ps::<8>(a, b) + } +} + +/// Return `a` with its alpha channel replaced by `b`'s alpha channel. +/// +/// This helper shows how portable SIMD code can opportunistically call +/// target-specific kernels while still providing a fallback for every backend. #[inline(always)] fn copy_alpha(a: f32x4, b: f32x4) -> f32x4 { + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + if let Some(sse4_2) = a.simd.level().as_sse4_2() { + return copy_alpha_sse4_2(sse4_2, a.into(), b.into()).simd_into(a.simd); + } + #[cfg(target_arch = "aarch64")] - if let Some(_neon) = a.simd.level().as_neon() { - unsafe { - return core::arch::aarch64::vcopyq_laneq_f32::<3, 3>(a.into(), b.into()) - .simd_into(a.simd); - } + if let Some(neon) = a.simd.level().as_neon() { + return copy_alpha_neon(neon, a.into(), b.into()).simd_into(a.simd); } + let mut result = a; result[3] = b[3]; result } +/// Approximate the linear-RGB to sRGB transfer curve for RGB, preserving alpha. #[inline(always)] fn to_srgb(simd: S, rgba: [f32; 4]) -> [f32; 4] { let v: f32x4 = rgba.simd_into(simd); diff --git a/fearless_simd/src/kernel_macros.rs b/fearless_simd/src/kernel_macros.rs new file mode 100644 index 000000000..58c794cf3 --- /dev/null +++ b/fearless_simd/src/kernel_macros.rs @@ -0,0 +1,296 @@ +// Copyright 2025 the Fearless_SIMD Authors +// SPDX-License-Identifier: Apache-2.0 OR MIT + +/// Creates a context where you can safely call intrinsics +/// available at the SIMD level named by the function's first argument. +/// +/// This is useful if the portable abstractions are not enough, and you need to +/// use platform-specific intrinsics for parts of the computation. +/// +/// The first argument must be a SIMD token written as `token: Neon`, +/// `token: WasmSimd128`, `token: Sse4_2`, or `token: Avx2`. +/// The generated wrapper uses the corresponding `$crate::` token type in its +/// actual signature. +/// +/// For levels with runtime-detected target features, the macro runs your body +/// inside an inner function annotated with the appropriate `#[target_feature]` +/// attributes. That makes platform-specific intrinsics from `core::arch` or +/// `std::arch` safe to call in the body, as long as they do not have safety +/// requirements beyond those target features. +/// +/// ## Example +/// +/// ```rust +/// # #[allow(unused_imports)] +/// use fearless_simd::{i32x8, prelude::*}; +/// #[cfg(target_arch = "x86")] +/// use std::arch::x86::{__m256i, _mm256_add_epi32}; +/// #[cfg(target_arch = "x86_64")] +/// use std::arch::x86_64::{__m256i, _mm256_add_epi32}; +/// +/// fearless_simd::kernel! { +/// fn add_i32x8(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i { +/// _mm256_add_epi32(a, b) +/// } +/// } +/// +/// # fn main() { +/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] +/// if let Some(avx2) = fearless_simd::Level::new().as_avx2() { +/// let a: i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2); +/// let b: i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2); +/// let sum: i32x8<_> = add_i32x8(avx2, a.into(), b.into()).simd_into(avx2); +/// +/// assert_eq!(<[i32; 8]>::from(sum), [11, 22, 33, 44, 55, 66, 77, 88]); +/// } +/// # } +/// ``` +/// +/// See the [sRGB example] for an end-to-end use of kernel macros. +/// +/// [sRGB example]: https://github.com/linebender/fearless_simd/blob/main/fearless_simd/examples/srgb.rs +/// +/// ## Limitations +/// +/// The macro only accepts a single plain, safe, non-generic function item with simple named parameters. +/// However, the body of the function can be as complex as you like. +/// +/// The SIMD token type must be written as a bare supported name: +/// literally `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`. No paths or aliases. +/// +/// For soundness, this macro only accepts safe functions. +/// +/// ```compile_fail +/// fearless_simd::kernel! { +/// unsafe fn should_not_compile(avx2: Avx2) {} +/// } +#[macro_export] +macro_rules! kernel { + ( + $(#[$meta:meta])* + $vis:vis fn $name:ident( + $token:ident : $token_ty:ident $(, $arg:ident : $arg_ty:ty)* $(,)? + ) $(-> $ret:ty)? { + $($kernel_body:tt)* + } + ) => { + $crate::__fearless_simd_kernel_dispatch! { + $token_ty, + $(#[$meta])* + $vis fn $name( + $token $(, $arg: $arg_ty)* + ) $(-> $ret)? { + $($kernel_body)* + } + } + }; + + ( + $(#[$meta:meta])* + $vis:vis fn $name:ident( + $token:ident : $token_ty:ty $(, $arg:ident : $arg_ty:ty)* $(,)? + ) $(-> $ret:ty)? { + $($kernel_body:tt)* + } + ) => { + compile_error!(concat!( + "fearless_simd::kernel! expects its SIMD token argument type to be written as ", + "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `", + stringify!($token_ty), + "`", + )); + }; +} + +#[doc(hidden)] +#[macro_export] +macro_rules! __fearless_simd_kernel_dispatch { + ( + Neon, + $($body:tt)* + ) => { + $crate::__fearless_simd_kernel_impl! { + @cfg target_arch = "aarch64"; + @token_ty $crate::Neon; + @kernel_attrs #[target_feature(enable = "neon")]; + $($body)* + } + }; + + ( + WasmSimd128, + $($body:tt)* + ) => { + $crate::__fearless_simd_kernel_impl! { + @cfg all(target_arch = "wasm32", target_feature = "simd128"); + @token_ty $crate::WasmSimd128; + @kernel_attrs; + $($body)* + } + }; + + ( + Sse4_2, + $($body:tt)* + ) => { + $crate::__fearless_simd_kernel_impl! { + @cfg any(target_arch = "x86", target_arch = "x86_64"); + @token_ty $crate::Sse4_2; + @kernel_attrs #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")]; + $($body)* + } + }; + + ( + Avx2, + $($body:tt)* + ) => { + $crate::__fearless_simd_kernel_impl! { + @cfg any(target_arch = "x86", target_arch = "x86_64"); + @token_ty $crate::Avx2; + @kernel_attrs #[target_feature( + enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave" + )]; + $($body)* + } + }; + + ( + $token_ty:ident, + $($body:tt)* + ) => { + compile_error!(concat!( + "fearless_simd::kernel! expects its SIMD token argument type to be written as ", + "one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `", + stringify!($token_ty), + "`", + )); + }; +} + +#[doc(hidden)] +#[macro_export] +macro_rules! __fearless_simd_kernel_impl { + ( + @cfg $cfg:meta; + @token_ty $token_ty:ty; + @kernel_attrs $(#[$kernel_attr:meta])*; + $(#[$meta:meta])* + $vis:vis fn $name:ident( + $token:ident $(, $arg:ident : $arg_ty:ty)* $(,)? + ) $(-> $ret:ty)? { + $($kernel_body:tt)* + } + ) => { + #[cfg($cfg)] + $(#[$meta])* + $vis fn $name( + $token: $token_ty $(, $arg: $arg_ty)* + ) $(-> $ret)? { + #[inline] // can't use `#[inline(always)]` with target features + $(#[$kernel_attr])* + fn __fearless_simd_kernel( + $token: $token_ty $(, $arg: $arg_ty)* + ) $(-> $ret)? { + let _ = $token; + $($kernel_body)* + } + + // SAFETY: the SIMD token proves that the required target features are available. + #[allow(unused_unsafe, reason = "for WASM which has no target feature requirements and is safe to call")] + unsafe { __fearless_simd_kernel($token $(, $arg)*) } + } + }; +} + +#[cfg(test)] +mod tests { + #[cfg(any( + target_arch = "aarch64", + target_arch = "x86", + target_arch = "x86_64", + all(target_arch = "wasm32", target_feature = "simd128") + ))] + use crate::prelude::*; + + #[cfg(target_arch = "aarch64")] + use core::arch::aarch64::{float32x4_t, vaddq_f32}; + #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] + use core::arch::wasm32::{f32x4_add, v128}; + #[cfg(target_arch = "x86")] + use core::arch::x86::{__m256i, _mm256_add_epi32}; + #[cfg(target_arch = "x86_64")] + use core::arch::x86_64::{__m256i, _mm256_add_epi32}; + + crate::kernel! { + fn add_f32x4_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t { + vaddq_f32(a, b) + } + } + + crate::kernel! { + fn add_f32x4_wasm(wasm: WasmSimd128, a: v128, b: v128) -> v128 { + f32x4_add(a, b) + } + } + + crate::kernel! { + fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i { + _mm256_add_epi32(a, b) + } + } + + #[cfg(target_arch = "aarch64")] + #[test] + fn kernel_instantiates_for_neon() { + let Some(neon) = crate::Level::new().as_neon() else { + return; + }; + + let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon); + let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon); + let sum: crate::f32x4<_> = add_f32x4_neon(neon, a.into(), b.into()).simd_into(neon); + + assert_eq!( + <[f32; 4]>::from(sum), + [11.0, 22.0, 33.0, 44.0], + "`kernel!` should instantiate a working NEON kernel" + ); + } + + #[cfg(all(target_arch = "wasm32", target_feature = "simd128"))] + #[test] + fn kernel_instantiates_for_wasm_simd128() { + let wasm = crate::Level::new() + .as_wasm_simd128() + .expect("WASM SIMD128 should be available when +simd128 is enabled"); + + let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm); + let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm); + let sum: crate::f32x4<_> = add_f32x4_wasm(wasm, a.into(), b.into()).simd_into(wasm); + + assert_eq!( + <[f32; 4]>::from(sum), + [11.0, 22.0, 33.0, 44.0], + "`kernel!` should instantiate a working WASM SIMD128 kernel" + ); + } + + #[cfg(any(target_arch = "x86", target_arch = "x86_64"))] + #[test] + fn kernel_instantiates_for_avx2() { + let Some(avx2) = crate::Level::new().as_avx2() else { + return; + }; + + let a: crate::i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2); + let b: crate::i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2); + let sum: crate::i32x8<_> = add_i32x8_avx2(avx2, a.into(), b.into()).simd_into(avx2); + + assert_eq!( + <[i32; 8]>::from(sum), + [11, 22, 33, 44, 55, 66, 77, 88], + "`kernel!` should instantiate a working AVX2 kernel" + ); + } +} diff --git a/fearless_simd/src/lib.rs b/fearless_simd/src/lib.rs index 10041217a..145df9f19 100644 --- a/fearless_simd/src/lib.rs +++ b/fearless_simd/src/lib.rs @@ -148,6 +148,7 @@ pub mod core_arch; mod impl_macros; mod generated; +mod kernel_macros; mod macros; mod support; mod traits;