Skip to content
Merged
Show file tree
Hide file tree
Changes from 6 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
54 changes: 26 additions & 28 deletions fearless_simd/examples/srgb.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,41 +8,39 @@

use fearless_simd::{Level, dispatch, f32x4, prelude::*};

// This block shows how to use safe wrappers for compile-time enforcement
// of using valid SIMD intrinsics.
#[cfg(feature = "safe_wrappers")]
#[inline(always)]
fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
// #[cfg(target_arch = "x86_64")]
// if let Some(avx2) = a.simd.level().as_avx2() {
// return avx2
// .sse4_1
// ._mm_blend_ps::<8>(a.into(), b.into())
// .simd_into(a.simd);
// }
#[cfg(target_arch = "aarch64")]
if let Some(neon) = a.simd.level().as_neon() {
return neon
.neon
.vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
.simd_into(a.simd);
#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::{float32x4_t, vcopyq_laneq_f32};
#[cfg(target_arch = "x86")]
use core::arch::x86::{__m128, _mm_blend_ps};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{__m128, _mm_blend_ps};

fearless_simd::kernel! {
#[inline]
fn copy_alpha_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t {
vcopyq_laneq_f32::<3, 3>(a, b)
}
}

fearless_simd::kernel! {
#[inline]
fn copy_alpha_sse4_2(sse4_2: Sse4_2, a: __m128, b: __m128) -> __m128 {
_mm_blend_ps::<8>(a, b)
}
let mut result = a;
result[3] = b[3];
result
}

// This block lets the example compile without safe wrappers.
#[cfg(not(feature = "safe_wrappers"))]
#[inline(always)]
fn copy_alpha<S: Simd>(a: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
if let Some(sse4_2) = a.simd.level().as_sse4_2() {
return copy_alpha_sse4_2(sse4_2, a.into(), b.into()).simd_into(a.simd);
}

#[cfg(target_arch = "aarch64")]
if let Some(_neon) = a.simd.level().as_neon() {
unsafe {
return core::arch::aarch64::vcopyq_laneq_f32::<3, 3>(a.into(), b.into())
.simd_into(a.simd);
}
if let Some(neon) = a.simd.level().as_neon() {
return copy_alpha_neon(neon, a.into(), b.into()).simd_into(a.simd);
}

let mut result = a;
result[3] = b[3];
result
Expand Down
296 changes: 296 additions & 0 deletions fearless_simd/src/kernel_macros.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,296 @@
// Copyright 2025 the Fearless_SIMD Authors

@ajakubowicz-canva ajakubowicz-canva May 21, 2026

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Extreme nit drive by. Should this be 2026? (No action required)

// SPDX-License-Identifier: Apache-2.0 OR MIT

/// Creates a context where you can safely call intrinsics
/// available at the SIMD level named by the function's first argument.
///
/// This is useful if the portable abstractions are not enough, and you need to
/// use platform-specific intrinsics for parts of the computation.
///
/// The first argument must be a SIMD token written as `token: Neon`,
/// `token: WasmSimd128`, `token: Sse4_2`, or `token: Avx2`.
/// The generated wrapper uses the corresponding `$crate::` token type in its
/// actual signature.
///
/// For levels with runtime-detected target features, the macro runs your body
/// inside an inner function annotated with the appropriate `#[target_feature]`
/// attributes. That makes platform-specific intrinsics from `core::arch` or
/// `std::arch` safe to call in the body, as long as they do not have safety
/// requirements beyond those target features.
///
/// ## Example
///
/// ```rust
/// # #[allow(unused_imports)]
/// use fearless_simd::{i32x8, prelude::*};
/// #[cfg(target_arch = "x86")]
/// use std::arch::x86::{__m256i, _mm256_add_epi32};
/// #[cfg(target_arch = "x86_64")]
/// use std::arch::x86_64::{__m256i, _mm256_add_epi32};
///
/// fearless_simd::kernel! {
/// fn add_i32x8(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
/// _mm256_add_epi32(a, b)
/// }
/// }
///
/// # fn main() {
/// #[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
/// if let Some(avx2) = fearless_simd::Level::new().as_avx2() {
/// let a: i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);
/// let b: i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);
/// let sum: i32x8<_> = add_i32x8(avx2, a.into(), b.into()).simd_into(avx2);
///
/// assert_eq!(<[i32; 8]>::from(sum), [11, 22, 33, 44, 55, 66, 77, 88]);
/// }
/// # }
/// ```
///
/// See the [sRGB example] for an end-to-end use of kernel macros.
///
/// [sRGB example]: https://github.com/linebender/fearless_simd/blob/main/fearless_simd/examples/srgb.rs
///
/// ## Limitations
///
/// The macro only accepts a single plain, safe, non-generic function item with simple named parameters.
/// However, the body of the function can be as complex as you like.
///
/// The SIMD token type must be written as a bare supported name:
/// literally `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`. No paths or aliases.
///
/// For soundness, this macro only accepts safe functions.
///
/// ```compile_fail
/// fearless_simd::kernel! {
/// unsafe fn should_not_compile(avx2: Avx2) {}
/// }
#[macro_export]
macro_rules! kernel {
(
$(#[$meta:meta])*
$vis:vis fn $name:ident(
$token:ident : $token_ty:ident $(, $arg:ident : $arg_ty:ty)* $(,)?

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What do you think about optionally accepting a fearless_simd:: prefix before the type here?

This isn't blocking.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It's not that hard, just a lot of boilerplate: Shnatsel@897f83e

This is a cosmetic issue and I don't really see the point of supporting spelling out the levels like that, so this is not where I'd prefer to spend the complexity budget. I'm happy to reconsider if we have actual user feedback to this effect.

) $(-> $ret:ty)? {
$($kernel_body:tt)*
}
) => {
$crate::__fearless_simd_kernel_dispatch! {
$token_ty,
$(#[$meta])*
$vis fn $name(
$token $(, $arg: $arg_ty)*
) $(-> $ret)? {
$($kernel_body)*
}
}
};

(
$(#[$meta:meta])*
$vis:vis fn $name:ident(
$token:ident : $token_ty:ty $(, $arg:ident : $arg_ty:ty)* $(,)?
) $(-> $ret:ty)? {
$($kernel_body:tt)*
}
) => {
compile_error!(concat!(
"fearless_simd::kernel! expects its SIMD token argument type to be written as ",
"one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
stringify!($token_ty),
"`",
));
};
}

#[doc(hidden)]
#[macro_export]
macro_rules! __fearless_simd_kernel_dispatch {
(
Neon,
$($body:tt)*
) => {
$crate::__fearless_simd_kernel_impl! {
@cfg target_arch = "aarch64";
@token_ty $crate::Neon;
@kernel_attrs #[target_feature(enable = "neon")];
$($body)*
}
};

(
WasmSimd128,
$($body:tt)*
) => {
$crate::__fearless_simd_kernel_impl! {
@cfg all(target_arch = "wasm32", target_feature = "simd128");
@token_ty $crate::WasmSimd128;
@kernel_attrs;
$($body)*
}
};

(
Sse4_2,
$($body:tt)*
) => {
$crate::__fearless_simd_kernel_impl! {
@cfg any(target_arch = "x86", target_arch = "x86_64");
@token_ty $crate::Sse4_2;
@kernel_attrs #[target_feature(enable = "sse4.2,cmpxchg16b,popcnt")];
$($body)*
}
};

(
Avx2,
$($body:tt)*
) => {
$crate::__fearless_simd_kernel_impl! {
@cfg any(target_arch = "x86", target_arch = "x86_64");
@token_ty $crate::Avx2;
@kernel_attrs #[target_feature(
enable = "avx2,bmi1,bmi2,cmpxchg16b,f16c,fma,lzcnt,movbe,popcnt,xsave"
)];
$($body)*
}
};

(
$token_ty:ident,
$($body:tt)*
) => {
compile_error!(concat!(
"fearless_simd::kernel! expects its SIMD token argument type to be written as ",
"one of `Neon`, `WasmSimd128`, `Sse4_2`, or `Avx2`; got `",
stringify!($token_ty),
"`",
));
};
}

#[doc(hidden)]
#[macro_export]
macro_rules! __fearless_simd_kernel_impl {
(
@cfg $cfg:meta;
@token_ty $token_ty:ty;
@kernel_attrs $(#[$kernel_attr:meta])*;
$(#[$meta:meta])*
$vis:vis fn $name:ident(
$token:ident $(, $arg:ident : $arg_ty:ty)* $(,)?
) $(-> $ret:ty)? {
$($kernel_body:tt)*
}
) => {
#[cfg($cfg)]
$(#[$meta])*
$vis fn $name(

Copy link
Copy Markdown
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I believe that this outer function should be #[inline(always)] in most cases. But I don't know how that works with the inner function being marked #[inline].

I think this is something which can be tuned. Plus, users can (currently) add their own #[inline(always)], so this isn't blocking.

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

I erred on the side of letting users tune the inlining.

I've already hit this in practice when using fearless_simd for PhastFT, and made a PR to reduce the overly aggressive inlining in an unrelated part of fearless_simd: #183

$token: $token_ty $(, $arg: $arg_ty)*
) $(-> $ret)? {
#[inline] // can't use `#[inline(always)]` with target features
$(#[$kernel_attr])*
fn __fearless_simd_kernel(
$token: $token_ty $(, $arg: $arg_ty)*
) $(-> $ret)? {
let _ = $token;
$($kernel_body)*
}

// SAFETY: the SIMD token proves that the required target features are available.
#[allow(unused_unsafe, reason = "for WASM which has no target feature requirements and is safe to call")]
unsafe { __fearless_simd_kernel($token $(, $arg)*) }
}
};
}

#[cfg(test)]
mod tests {
Comment thread
Shnatsel marked this conversation as resolved.
#[cfg(any(
target_arch = "aarch64",
target_arch = "x86",
target_arch = "x86_64",
all(target_arch = "wasm32", target_feature = "simd128")
))]
use crate::prelude::*;

#[cfg(target_arch = "aarch64")]
use core::arch::aarch64::{float32x4_t, vaddq_f32};
#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
use core::arch::wasm32::{f32x4_add, v128};
#[cfg(target_arch = "x86")]
use core::arch::x86::{__m256i, _mm256_add_epi32};
#[cfg(target_arch = "x86_64")]
use core::arch::x86_64::{__m256i, _mm256_add_epi32};

crate::kernel! {
fn add_f32x4_neon(neon: Neon, a: float32x4_t, b: float32x4_t) -> float32x4_t {
vaddq_f32(a, b)
}
}

crate::kernel! {
fn add_f32x4_wasm(wasm: WasmSimd128, a: v128, b: v128) -> v128 {
f32x4_add(a, b)
}
}

crate::kernel! {
fn add_i32x8_avx2(avx2: Avx2, a: __m256i, b: __m256i) -> __m256i {
_mm256_add_epi32(a, b)
}
}

#[cfg(target_arch = "aarch64")]
#[test]
fn kernel_instantiates_for_neon() {
let Some(neon) = crate::Level::new().as_neon() else {
return;
};

let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(neon);
let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(neon);
let sum: crate::f32x4<_> = add_f32x4_neon(neon, a.into(), b.into()).simd_into(neon);

assert_eq!(
<[f32; 4]>::from(sum),
[11.0, 22.0, 33.0, 44.0],
"`kernel!` should instantiate a working NEON kernel"
);
}

#[cfg(all(target_arch = "wasm32", target_feature = "simd128"))]
#[test]
fn kernel_instantiates_for_wasm_simd128() {
let wasm = crate::Level::new()
.as_wasm_simd128()
.expect("WASM SIMD128 should be available when +simd128 is enabled");

let a: crate::f32x4<_> = [1.0, 2.0, 3.0, 4.0].simd_into(wasm);
let b: crate::f32x4<_> = [10.0, 20.0, 30.0, 40.0].simd_into(wasm);
let sum: crate::f32x4<_> = add_f32x4_wasm(wasm, a.into(), b.into()).simd_into(wasm);

assert_eq!(
<[f32; 4]>::from(sum),
[11.0, 22.0, 33.0, 44.0],
"`kernel!` should instantiate a working WASM SIMD128 kernel"
);
}

#[cfg(any(target_arch = "x86", target_arch = "x86_64"))]
#[test]
fn kernel_instantiates_for_avx2() {
let Some(avx2) = crate::Level::new().as_avx2() else {
return;
};

let a: crate::i32x8<_> = [1, 2, 3, 4, 5, 6, 7, 8].simd_into(avx2);
let b: crate::i32x8<_> = [10, 20, 30, 40, 50, 60, 70, 80].simd_into(avx2);
let sum: crate::i32x8<_> = add_i32x8_avx2(avx2, a.into(), b.into()).simd_into(avx2);

assert_eq!(
<[i32; 8]>::from(sum),
[11, 22, 33, 44, 55, 66, 77, 88],
"`kernel!` should instantiate a working AVX2 kernel"
);
}
}
1 change: 1 addition & 0 deletions fearless_simd/src/lib.rs
Original file line number Diff line number Diff line change
Expand Up @@ -148,6 +148,7 @@ pub mod core_arch;
mod impl_macros;

mod generated;
mod kernel_macros;
mod macros;
mod support;
mod traits;
Expand Down
Loading