From d31ed07dec38086a15603a9f7dfac90acf782c14 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 14:27:05 +0100
Subject: [PATCH 01/16] Add optimized to/from_bitmask on masks, and an initial
 version of test/set, along with tests for them

---
 fearless_simd/src/generated/avx2.rs       | 366 ++++++++++++++++++++++
 fearless_simd/src/generated/fallback.rs   | 180 +++++++++++
 fearless_simd/src/generated/neon.rs       | 180 +++++++++++
 fearless_simd/src/generated/simd_trait.rs |  79 +++++
 fearless_simd/src/generated/simd_types.rs | 120 ++++++-
 fearless_simd/src/generated/sse4_2.rs     | 247 +++++++++++++++
 fearless_simd/src/generated/wasm.rs       | 180 +++++++++++
 fearless_simd_gen/src/generic.rs          |  54 ++++
 fearless_simd_gen/src/mk_fallback.rs      |   5 +-
 fearless_simd_gen/src/mk_neon.rs          |   7 +-
 fearless_simd_gen/src/mk_simd_trait.rs    |  35 +++
 fearless_simd_gen/src/mk_simd_types.rs    |  12 +
 fearless_simd_gen/src/mk_wasm.rs          |   6 +-
 fearless_simd_gen/src/mk_x86.rs           | 187 +++++++++++
 fearless_simd_gen/src/ops.rs              |  31 ++
 fearless_simd_gen/src/types.rs            |   2 +-
 fearless_simd_tests/tests/mask_methods.rs |  62 ++++
 fearless_simd_tests/tests/mod.rs          |   1 +
 fearless_simd_tests/tests/soundness.rs    |  47 +++
 19 files changed, 1783 insertions(+), 18 deletions(-)
 create mode 100644 fearless_simd_tests/tests/mask_methods.rs
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 27e3b8df1..44cb960f2 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -895,6 +895,46 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
+        unsafe {
+            {
+                let bit_bytes = _mm_cvtsi32_si128(bits as i32);
+                let bit_bytes = _mm_shuffle_epi8(
+                    bit_bytes,
+                    _mm_setr_epi8(
+                        0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
+                        0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                        1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                    ),
+                );
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            }
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
+        unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 }
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1386,6 +1426,45 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
+        unsafe {
+            _mm_cvtepi8_epi16({
+                let bit_bytes = _mm_set1_epi8(bits as i8);
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            })
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
+        unsafe {
+            let mut bits = _mm_movemask_epi8(a.into()) as u32 as u64;
+            bits &= 21845u64;
+            bits = (bits | (bits >> 1)) & 13107u64;
+            bits = (bits | (bits >> 2)) & 3855u64;
+            bits = (bits | (bits >> 4)) & 255u64;
+            bits
+        }
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1887,6 +1966,38 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
+        unsafe {
+            _mm_cvtepi8_epi32({
+                let bit_bytes = _mm_set1_epi8(bits as i8);
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            })
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
+        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 }
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -2189,6 +2300,38 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        unsafe {
+            _mm_cvtepi8_epi64({
+                let bit_bytes = _mm_set1_epi8(bits as i8);
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            })
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 }
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -3270,6 +3413,65 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<__m256i, [i8; 32usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        unsafe {
+            {
+                let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32));
+                let bit_bytes = _mm256_shuffle_epi8(
+                    bit_bytes,
+                    _mm256_setr_epi8(
+                        0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
+                        0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                        1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 2u8 as i8, 2u8 as i8,
+                        2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8,
+                        3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8,
+                        3u8 as i8, 3u8 as i8,
+                    ),
+                );
+                let bit_mask = _mm256_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
+            }
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        unsafe { _mm256_movemask_epi8(a.into()) as u32 as u64 }
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -3963,6 +4165,54 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<__m256i, [i16; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        unsafe {
+            _mm256_cvtepi8_epi16({
+                let bit_bytes = _mm_cvtsi32_si128(bits as i32);
+                let bit_bytes = _mm_shuffle_epi8(
+                    bit_bytes,
+                    _mm_setr_epi8(
+                        0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
+                        0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                        1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                    ),
+                );
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            })
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        unsafe {
+            let mut bits = _mm256_movemask_epi8(a.into()) as u32 as u64;
+            bits &= 1431655765u64;
+            bits = (bits | (bits >> 1)) & 858993459u64;
+            bits = (bits | (bits >> 2)) & 252645135u64;
+            bits = (bits | (bits >> 4)) & 16711935u64;
+            bits = (bits | (bits >> 8)) & 65535u64;
+            bits
+        }
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -4601,6 +4851,38 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<__m256i, [i32; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        unsafe {
+            _mm256_cvtepi8_epi32({
+                let bit_bytes = _mm_set1_epi8(bits as i8);
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            })
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        unsafe { _mm256_movemask_ps(_mm256_castsi256_ps(a.into())) as u32 as u64 }
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -4978,6 +5260,38 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<__m256i, [i64; 4usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        unsafe {
+            _mm256_cvtepi8_epi64({
+                let bit_bytes = _mm_set1_epi8(bits as i8);
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            })
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        unsafe { _mm256_movemask_pd(_mm256_castsi256_pd(a.into())) as u32 as u64 }
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         unsafe { _mm256_and_si256(a.into(), b.into()).simd_into(self) }
     }
@@ -6076,6 +6390,19 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<[__m256i; 2usize], [i8; 64usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        let lo = self.from_bitmask_mask8x32(bits);
+        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
+        self.combine_mask8x32(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x64(a);
+        let lo = self.to_bitmask_mask8x32(lo);
+        let hi = self.to_bitmask_mask8x32(hi);
+        lo | (hi << 32usize)
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -6814,6 +7141,19 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<[__m256i; 2usize], [i16; 32usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        let lo = self.from_bitmask_mask16x16(bits);
+        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
+        self.combine_mask16x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x32(a);
+        let lo = self.to_bitmask_mask16x16(lo);
+        let hi = self.to_bitmask_mask16x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7516,6 +7856,19 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<[__m256i; 2usize], [i32; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        let lo = self.from_bitmask_mask32x8(bits);
+        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
+        self.combine_mask32x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x16(a);
+        let lo = self.to_bitmask_mask32x8(lo);
+        let hi = self.to_bitmask_mask32x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -7929,6 +8282,19 @@ impl Simd for Avx2 {
         unsafe { core::mem::transmute::<[__m256i; 2usize], [i64; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
+        let lo = self.from_bitmask_mask64x4(bits);
+        let hi = self.from_bitmask_mask64x4(bits >> 4usize);
+        self.combine_mask64x4(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x8(a);
+        let lo = self.to_bitmask_mask64x4(lo);
+        let hi = self.to_bitmask_mask64x4(hi);
+        lo | (hi << 4usize)
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/fallback.rs b/fearless_simd/src/generated/fallback.rs
index 43e8fd1ba..1dc4b5d96 100644
--- a/fearless_simd/src/generated/fallback.rs
+++ b/fearless_simd/src/generated/fallback.rs
@@ -1818,6 +1818,25 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
+        let lanes: [i8; 16usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
+        let lanes = self.as_array_mask8x16(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 16usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         [
             i8::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -2964,6 +2983,25 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
+        let lanes: [i16; 8usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
+        let lanes = self.as_array_mask16x8(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 8usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         [
             i16::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -3802,6 +3840,25 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
+        let lanes: [i32; 4usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
+        let lanes = self.as_array_mask32x4(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 4usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         [
             i32::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -4196,6 +4253,25 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        let lanes: [i64; 2usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        let lanes = self.as_array_mask64x2(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 2usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         [
             i64::bitand(a.val.0[0usize], &b.val.0[0usize]),
@@ -5176,6 +5252,19 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        let lo = self.from_bitmask_mask8x16(bits);
+        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
+        self.combine_mask8x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x32(a);
+        let lo = self.to_bitmask_mask8x16(lo);
+        let hi = self.to_bitmask_mask8x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -5824,6 +5913,19 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        let lo = self.from_bitmask_mask16x8(bits);
+        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
+        self.combine_mask16x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x16(a);
+        let lo = self.to_bitmask_mask16x8(lo);
+        let hi = self.to_bitmask_mask16x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -6452,6 +6554,19 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        let lo = self.from_bitmask_mask32x4(bits);
+        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
+        self.combine_mask32x4(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x8(a);
+        let lo = self.to_bitmask_mask32x4(lo);
+        let hi = self.to_bitmask_mask32x4(hi);
+        lo | (hi << 4usize)
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -6853,6 +6968,19 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        let lo = self.from_bitmask_mask64x2(bits);
+        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
+        self.combine_mask64x2(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x4(a);
+        let lo = self.to_bitmask_mask64x2(lo);
+        let hi = self.to_bitmask_mask64x2(hi);
+        lo | (hi << 2usize)
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -7921,6 +8049,19 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        let lo = self.from_bitmask_mask8x32(bits);
+        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
+        self.combine_mask8x32(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x64(a);
+        let lo = self.to_bitmask_mask8x32(lo);
+        let hi = self.to_bitmask_mask8x32(hi);
+        lo | (hi << 32usize)
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -8597,6 +8738,19 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        let lo = self.from_bitmask_mask16x16(bits);
+        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
+        self.combine_mask16x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x32(a);
+        let lo = self.to_bitmask_mask16x16(lo);
+        let hi = self.to_bitmask_mask16x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -9237,6 +9391,19 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        let lo = self.from_bitmask_mask32x8(bits);
+        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
+        self.combine_mask32x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x16(a);
+        let lo = self.to_bitmask_mask32x8(lo);
+        let hi = self.to_bitmask_mask32x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -9624,6 +9791,19 @@ impl Simd for Fallback {
         a.val.0
     }
     #[inline(always)]
+    fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
+        let lo = self.from_bitmask_mask64x4(bits);
+        let hi = self.from_bitmask_mask64x4(bits >> 4usize);
+        self.combine_mask64x4(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x8(a);
+        let lo = self.to_bitmask_mask64x4(lo);
+        let hi = self.to_bitmask_mask64x4(hi);
+        lo | (hi << 4usize)
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index ad46f2f55..f4cd64937 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -786,6 +786,25 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int8x16_t, [i8; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
+        let lanes: [i8; 16usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
+        let lanes = self.as_array_mask8x16(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 16usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         unsafe { vandq_s8(a.into(), b.into()).simd_into(self) }
     }
@@ -1272,6 +1291,25 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int16x8_t, [i16; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
+        let lanes: [i16; 8usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
+        let lanes = self.as_array_mask16x8(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 8usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         unsafe { vandq_s16(a.into(), b.into()).simd_into(self) }
     }
@@ -1762,6 +1800,25 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int32x4_t, [i32; 4usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
+        let lanes: [i32; 4usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
+        let lanes = self.as_array_mask32x4(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 4usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         unsafe { vandq_s32(a.into(), b.into()).simd_into(self) }
     }
@@ -2065,6 +2122,25 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int64x2_t, [i64; 2usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        let lanes: [i64; 2usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        let lanes = self.as_array_mask64x2(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 2usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         unsafe { vandq_s64(a.into(), b.into()).simd_into(self) }
     }
@@ -3146,6 +3222,19 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int8x16x2_t, [i8; 32usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        let lo = self.from_bitmask_mask8x16(bits);
+        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
+        self.combine_mask8x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x32(a);
+        let lo = self.to_bitmask_mask8x16(lo);
+        let hi = self.to_bitmask_mask8x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -3874,6 +3963,19 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int16x8x2_t, [i16; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        let lo = self.from_bitmask_mask16x8(bits);
+        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
+        self.combine_mask16x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x16(a);
+        let lo = self.to_bitmask_mask16x8(lo);
+        let hi = self.to_bitmask_mask16x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -4595,6 +4697,19 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int32x4x2_t, [i32; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        let lo = self.from_bitmask_mask32x4(bits);
+        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
+        self.combine_mask32x4(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x8(a);
+        let lo = self.to_bitmask_mask32x4(lo);
+        let hi = self.to_bitmask_mask32x4(hi);
+        lo | (hi << 4usize)
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -5046,6 +5161,19 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int64x2x2_t, [i64; 4usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        let lo = self.from_bitmask_mask64x2(bits);
+        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
+        self.combine_mask64x2(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x4(a);
+        let lo = self.to_bitmask_mask64x2(lo);
+        let hi = self.to_bitmask_mask64x2(hi);
+        lo | (hi << 2usize)
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -6199,6 +6327,19 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int8x16x4_t, [i8; 64usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        let lo = self.from_bitmask_mask8x32(bits);
+        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
+        self.combine_mask8x32(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x64(a);
+        let lo = self.to_bitmask_mask8x32(lo);
+        let hi = self.to_bitmask_mask8x32(hi);
+        lo | (hi << 32usize)
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -6958,6 +7099,19 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int16x8x4_t, [i16; 32usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        let lo = self.from_bitmask_mask16x16(bits);
+        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
+        self.combine_mask16x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x32(a);
+        let lo = self.to_bitmask_mask16x16(lo);
+        let hi = self.to_bitmask_mask16x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7699,6 +7853,19 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int32x4x4_t, [i32; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        let lo = self.from_bitmask_mask32x8(bits);
+        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
+        self.combine_mask32x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x16(a);
+        let lo = self.to_bitmask_mask32x8(lo);
+        let hi = self.to_bitmask_mask32x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8150,6 +8317,19 @@ impl Simd for Neon {
         unsafe { core::mem::transmute::<int64x2x4_t, [i64; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
+        let lo = self.from_bitmask_mask64x4(bits);
+        let hi = self.from_bitmask_mask64x4(bits >> 4usize);
+        self.combine_mask64x4(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x8(a);
+        let lo = self.to_bitmask_mask64x4(lo);
+        let hi = self.to_bitmask_mask64x4(hi);
+        lo | (hi << 4usize)
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 01a5ee0f9..57058e0eb 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -396,6 +396,10 @@ pub trait Simd:
     fn load_array_mask8x16(self, val: [i8; 16usize]) -> mask8x16<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask8x16(self, a: mask8x16<Self>) -> [i8; 16usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -595,6 +599,10 @@ pub trait Simd:
     fn load_array_mask16x8(self, val: [i16; 8usize]) -> mask16x8<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask16x8(self, a: mask16x8<Self>) -> [i16; 8usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -796,6 +804,10 @@ pub trait Simd:
     fn load_array_mask32x4(self, val: [i32; 4usize]) -> mask32x4<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask32x4(self, a: mask32x4<Self>) -> [i32; 4usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -921,6 +933,10 @@ pub trait Simd:
     fn load_array_mask64x2(self, val: [i64; 2usize]) -> mask64x2<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask64x2(self, a: mask64x2<Self>) -> [i64; 2usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1232,6 +1248,10 @@ pub trait Simd:
     fn load_array_mask8x32(self, val: [i8; 32usize]) -> mask8x32<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask8x32(self, a: mask8x32<Self>) -> [i8; 32usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1439,6 +1459,10 @@ pub trait Simd:
     fn load_array_mask16x16(self, val: [i16; 16usize]) -> mask16x16<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask16x16(self, a: mask16x16<Self>) -> [i16; 16usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1646,6 +1670,10 @@ pub trait Simd:
     fn load_array_mask32x8(self, val: [i32; 8usize]) -> mask32x8<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask32x8(self, a: mask32x8<Self>) -> [i32; 8usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -1775,6 +1803,10 @@ pub trait Simd:
     fn load_array_mask64x4(self, val: [i64; 4usize]) -> mask64x4<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask64x4(self, a: mask64x4<Self>) -> [i64; 4usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2088,6 +2120,10 @@ pub trait Simd:
     fn load_array_mask8x64(self, val: [i8; 64usize]) -> mask8x64<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask8x64(self, a: mask8x64<Self>) -> [i8; 64usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2293,6 +2329,10 @@ pub trait Simd:
     fn load_array_mask16x32(self, val: [i16; 32usize]) -> mask16x32<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask16x32(self, a: mask16x32<Self>) -> [i16; 32usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2498,6 +2538,10 @@ pub trait Simd:
     fn load_array_mask32x16(self, val: [i32; 16usize]) -> mask32x16<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask32x16(self, a: mask32x16<Self>) -> [i32; 16usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2623,6 +2667,10 @@ pub trait Simd:
     fn load_array_mask64x8(self, val: [i64; 8usize]) -> mask64x8<Self>;
     #[doc = "Convert a SIMD mask to signed integer mask lanes."]
     fn as_array_mask64x8(self, a: mask64x8<Self>) -> [i64; 8usize];
+    #[doc = "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored."]
+    fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self>;
+    #[doc = "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared."]
+    fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64;
     #[doc = "Compute the logical AND of two masks."]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self>;
     #[doc = "Compute the logical OR of two masks."]
@@ -2951,6 +2999,37 @@ pub trait SimdMask<S: Simd>:
     fn witness(&self) -> S;
     #[doc = r" Create a SIMD mask with all lanes set to the given boolean value."]
     fn splat(simd: S, val: bool) -> Self;
+    #[doc = r" Create a mask from a compact bitmask."]
+    #[doc = r""]
+    #[doc = r" Bit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits at"]
+    #[doc = r" index [`Self::N`] and above are ignored."]
+    fn from_bitmask(simd: S, bits: u64) -> Self;
+    #[doc = r" Convert this mask to a compact bitmask."]
+    #[doc = r""]
+    #[doc = r" Bit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits at"]
+    #[doc = r" index [`Self::N`] and above are cleared."]
+    fn to_bitmask(self) -> u64;
+    #[doc = r" Test whether one logical lane is set."]
+    #[doc = r""]
+    #[doc = r" Panics if `index` is greater than or equal to the number of lanes in the mask."]
+    #[inline(always)]
+    fn test(&self, index: usize) -> bool {
+        assert!(index < Self::N);
+        (((*self).to_bitmask() >> index) & 1) != 0
+    }
+    #[doc = r" Set the value of one logical lane."]
+    #[doc = r""]
+    #[doc = r" Panics if `index` is greater than or equal to the number of lanes in the mask."]
+    #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < Self::N);
+        let lane_mask = Self::from_bitmask(self.witness(), 1u64 << index);
+        if value {
+            *self = *self | lane_mask;
+        } else {
+            *self = *self & !lane_mask;
+        }
+    }
     #[doc = r" Create a SIMD mask from signed integer mask lanes."]
     #[doc = r""]
     #[doc = r" The slice must be exactly the size of the SIMD mask."]
diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs
index ec0e074ff..2893ba9d5 100644
--- a/fearless_simd/src/generated/simd_types.rs
+++ b/fearless_simd/src/generated/simd_types.rs
@@ -625,7 +625,7 @@ impl<S: Simd> crate::SimdCombine<S> for u8x16<S> {
         self.simd.combine_u8x16(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 16 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 16 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask8x16<S: Simd> {
     pub(crate) val: S::mask8x16,
@@ -674,6 +674,14 @@ impl<S: Simd> crate::SimdMask<S> for mask8x16<S> {
         simd.splat_mask8x16(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask8x16(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask8x16(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
         let slice: &[i8; 16] = slice.try_into().unwrap();
         simd.load_array_mask8x16(*slice)
@@ -1074,7 +1082,7 @@ impl<S: Simd> crate::SimdCombine<S> for u16x8<S> {
         self.simd.combine_u16x8(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 8 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 8 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask16x8<S: Simd> {
     pub(crate) val: S::mask16x8,
@@ -1123,6 +1131,14 @@ impl<S: Simd> crate::SimdMask<S> for mask16x8<S> {
         simd.splat_mask16x8(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask16x8(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask16x8(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
         let slice: &[i16; 8] = slice.try_into().unwrap();
         simd.load_array_mask16x8(*slice)
@@ -1547,7 +1563,7 @@ impl<S: Simd> crate::SimdCombine<S> for u32x4<S> {
         self.simd.combine_u32x4(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 4 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 4 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask32x4<S: Simd> {
     pub(crate) val: S::mask32x4,
@@ -1596,6 +1612,14 @@ impl<S: Simd> crate::SimdMask<S> for mask32x4<S> {
         simd.splat_mask32x4(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask32x4(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask32x4(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
         let slice: &[i32; 4] = slice.try_into().unwrap();
         simd.load_array_mask32x4(*slice)
@@ -1861,7 +1885,7 @@ impl<S: Simd> crate::SimdCombine<S> for f64x2<S> {
         self.simd.combine_f64x2(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 2 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 2 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask64x2<S: Simd> {
     pub(crate) val: S::mask64x2,
@@ -1910,6 +1934,14 @@ impl<S: Simd> crate::SimdMask<S> for mask64x2<S> {
         simd.splat_mask64x2(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask64x2(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask64x2(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
         let slice: &[i64; 2] = slice.try_into().unwrap();
         simd.load_array_mask64x2(*slice)
@@ -2580,7 +2612,7 @@ impl<S: Simd> crate::SimdCombine<S> for u8x32<S> {
         self.simd.combine_u8x32(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 32 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 32 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask8x32<S: Simd> {
     pub(crate) val: S::mask8x32,
@@ -2629,6 +2661,14 @@ impl<S: Simd> crate::SimdMask<S> for mask8x32<S> {
         simd.splat_mask8x32(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask8x32(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask8x32(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
         let slice: &[i8; 32] = slice.try_into().unwrap();
         simd.load_array_mask8x32(*slice)
@@ -3055,7 +3095,7 @@ impl<S: Simd> crate::SimdCombine<S> for u16x16<S> {
         self.simd.combine_u16x16(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 16 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 16 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask16x16<S: Simd> {
     pub(crate) val: S::mask16x16,
@@ -3104,6 +3144,14 @@ impl<S: Simd> crate::SimdMask<S> for mask16x16<S> {
         simd.splat_mask16x16(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask16x16(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask16x16(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
         let slice: &[i16; 16] = slice.try_into().unwrap();
         simd.load_array_mask16x16(*slice)
@@ -3542,7 +3590,7 @@ impl<S: Simd> crate::SimdCombine<S> for u32x8<S> {
         self.simd.combine_u32x8(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 8 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 8 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask32x8<S: Simd> {
     pub(crate) val: S::mask32x8,
@@ -3591,6 +3639,14 @@ impl<S: Simd> crate::SimdMask<S> for mask32x8<S> {
         simd.splat_mask32x8(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask32x8(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask32x8(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
         let slice: &[i32; 8] = slice.try_into().unwrap();
         simd.load_array_mask32x8(*slice)
@@ -3863,7 +3919,7 @@ impl<S: Simd> crate::SimdCombine<S> for f64x4<S> {
         self.simd.combine_f64x4(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask64x4<S: Simd> {
     pub(crate) val: S::mask64x4,
@@ -3912,6 +3968,14 @@ impl<S: Simd> crate::SimdMask<S> for mask64x4<S> {
         simd.splat_mask64x4(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask64x4(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask64x4(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
         let slice: &[i64; 4] = slice.try_into().unwrap();
         simd.load_array_mask64x4(*slice)
@@ -4570,7 +4634,7 @@ impl<S: Simd> crate::SimdSplit<S> for u8x64<S> {
         self.simd.split_u8x64(self)
     }
 }
-#[doc = "A SIMD mask of 64 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 64 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask8x64<S: Simd> {
     pub(crate) val: S::mask8x64,
@@ -4619,6 +4683,14 @@ impl<S: Simd> crate::SimdMask<S> for mask8x64<S> {
         simd.splat_mask8x64(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask8x64(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask8x64(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
         let slice: &[i8; 64] = slice.try_into().unwrap();
         simd.load_array_mask8x64(*slice)
@@ -5033,7 +5105,7 @@ impl<S: Simd> crate::SimdSplit<S> for u16x32<S> {
         self.simd.split_u16x32(self)
     }
 }
-#[doc = "A SIMD mask of 32 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 32 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask16x32<S: Simd> {
     pub(crate) val: S::mask16x32,
@@ -5082,6 +5154,14 @@ impl<S: Simd> crate::SimdMask<S> for mask16x32<S> {
         simd.splat_mask16x32(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask16x32(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask16x32(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
         let slice: &[i16; 32] = slice.try_into().unwrap();
         simd.load_array_mask16x32(*slice)
@@ -5520,7 +5600,7 @@ impl<S: Simd> crate::SimdSplit<S> for u32x16<S> {
         self.simd.split_u32x16(self)
     }
 }
-#[doc = "A SIMD mask of 16 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 16 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask32x16<S: Simd> {
     pub(crate) val: S::mask32x16,
@@ -5569,6 +5649,14 @@ impl<S: Simd> crate::SimdMask<S> for mask32x16<S> {
         simd.splat_mask32x16(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask32x16(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask32x16(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
         let slice: &[i32; 16] = slice.try_into().unwrap();
         simd.load_array_mask32x16(*slice)
@@ -5835,7 +5923,7 @@ impl<S: Simd> crate::SimdSplit<S> for f64x8<S> {
         self.simd.split_f64x8(self)
     }
 }
-#[doc = "A SIMD mask of 8 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 8 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask64x8<S: Simd> {
     pub(crate) val: S::mask64x8,
@@ -5884,6 +5972,14 @@ impl<S: Simd> crate::SimdMask<S> for mask64x8<S> {
         simd.splat_mask64x8(val)
     }
     #[inline(always)]
+    fn from_bitmask(simd: S, bits: u64) -> Self {
+        simd.from_bitmask_mask64x8(bits)
+    }
+    #[inline(always)]
+    fn to_bitmask(self) -> u64 {
+        self.simd.to_bitmask_mask64x8(self)
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
         let slice: &[i64; 8] = slice.try_into().unwrap();
         simd.load_array_mask64x8(*slice)
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index 6388d315c..eb894b43e 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -935,6 +935,46 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<__m128i, [i8; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
+        unsafe {
+            {
+                let bit_bytes = _mm_cvtsi32_si128(bits as i32);
+                let bit_bytes = _mm_shuffle_epi8(
+                    bit_bytes,
+                    _mm_setr_epi8(
+                        0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
+                        0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                        1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                    ),
+                );
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            }
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
+        unsafe { _mm_movemask_epi8(a.into()) as u32 as u64 }
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1435,6 +1475,45 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<__m128i, [i16; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
+        unsafe {
+            _mm_cvtepi8_epi16({
+                let bit_bytes = _mm_set1_epi8(bits as i8);
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            })
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
+        unsafe {
+            let mut bits = _mm_movemask_epi8(a.into()) as u32 as u64;
+            bits &= 21845u64;
+            bits = (bits | (bits >> 1)) & 13107u64;
+            bits = (bits | (bits >> 2)) & 3855u64;
+            bits = (bits | (bits >> 4)) & 255u64;
+            bits
+        }
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -1945,6 +2024,38 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<__m128i, [i32; 4usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
+        unsafe {
+            _mm_cvtepi8_epi32({
+                let bit_bytes = _mm_set1_epi8(bits as i8);
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            })
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
+        unsafe { _mm_movemask_ps(_mm_castsi128_ps(a.into())) as u32 as u64 }
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -2253,6 +2364,38 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<__m128i, [i64; 2usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        unsafe {
+            _mm_cvtepi8_epi64({
+                let bit_bytes = _mm_set1_epi8(bits as i8);
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+            })
+            .simd_into(self)
+        }
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        unsafe { _mm_movemask_pd(_mm_castsi128_pd(a.into())) as u32 as u64 }
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         unsafe { _mm_and_si128(a.into(), b.into()).simd_into(self) }
     }
@@ -3268,6 +3411,19 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<[__m128i; 2usize], [i8; 32usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        let lo = self.from_bitmask_mask8x16(bits);
+        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
+        self.combine_mask8x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x32(a);
+        let lo = self.to_bitmask_mask8x16(lo);
+        let hi = self.to_bitmask_mask8x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -3952,6 +4108,19 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<[__m128i; 2usize], [i16; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        let lo = self.from_bitmask_mask16x8(bits);
+        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
+        self.combine_mask16x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x16(a);
+        let lo = self.to_bitmask_mask16x8(lo);
+        let hi = self.to_bitmask_mask16x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -4627,6 +4796,19 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<[__m128i; 2usize], [i32; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        let lo = self.from_bitmask_mask32x4(bits);
+        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
+        self.combine_mask32x4(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x8(a);
+        let lo = self.to_bitmask_mask32x4(lo);
+        let hi = self.to_bitmask_mask32x4(hi);
+        lo | (hi << 4usize)
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -5054,6 +5236,19 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<[__m128i; 2usize], [i64; 4usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        let lo = self.from_bitmask_mask64x2(bits);
+        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
+        self.combine_mask64x2(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x4(a);
+        let lo = self.to_bitmask_mask64x2(lo);
+        let hi = self.to_bitmask_mask64x2(hi);
+        lo | (hi << 2usize)
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -6175,6 +6370,19 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<[__m128i; 4usize], [i8; 64usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        let lo = self.from_bitmask_mask8x32(bits);
+        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
+        self.combine_mask8x32(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x64(a);
+        let lo = self.to_bitmask_mask8x32(lo);
+        let hi = self.to_bitmask_mask8x32(hi);
+        lo | (hi << 32usize)
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -6905,6 +7113,19 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<[__m128i; 4usize], [i16; 32usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        let lo = self.from_bitmask_mask16x16(bits);
+        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
+        self.combine_mask16x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x32(a);
+        let lo = self.to_bitmask_mask16x16(lo);
+        let hi = self.to_bitmask_mask16x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7607,6 +7828,19 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<[__m128i; 4usize], [i32; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        let lo = self.from_bitmask_mask32x8(bits);
+        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
+        self.combine_mask32x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x16(a);
+        let lo = self.to_bitmask_mask32x8(lo);
+        let hi = self.to_bitmask_mask32x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -8020,6 +8254,19 @@ impl Simd for Sse4_2 {
         unsafe { core::mem::transmute::<[__m128i; 4usize], [i64; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
+        let lo = self.from_bitmask_mask64x4(bits);
+        let hi = self.from_bitmask_mask64x4(bits >> 4usize);
+        self.combine_mask64x4(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x8(a);
+        let lo = self.to_bitmask_mask64x4(lo);
+        let hi = self.to_bitmask_mask64x4(hi);
+        lo | (hi << 4usize)
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index 4eb7671bb..ba14075b0 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -852,6 +852,25 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<v128, [i8; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
+        let lanes: [i8; 16usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
+        let lanes = self.as_array_mask8x16(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 16usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -1341,6 +1360,25 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<v128, [i16; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
+        let lanes: [i16; 8usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
+        let lanes = self.as_array_mask16x8(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 8usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -1834,6 +1872,25 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<v128, [i32; 4usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
+        let lanes: [i32; 4usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
+        let lanes = self.as_array_mask32x4(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 4usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -2169,6 +2226,25 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<v128, [i64; 2usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
+        let lanes: [i64; 2usize] =
+            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+        lanes.simd_into(self)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
+        let lanes = self.as_array_mask64x2(a);
+        let mut bits = 0u64;
+        let mut i = 0;
+        while i < 2usize {
+            if lanes[i] != 0 {
+                bits |= 1u64 << i;
+            }
+            i += 1;
+        }
+        bits
+    }
+    #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
         v128_and(a.into(), b.into()).simd_into(self)
     }
@@ -3191,6 +3267,19 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<[v128; 2usize], [i8; 32usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x32(self, bits: u64) -> mask8x32<Self> {
+        let lo = self.from_bitmask_mask8x16(bits);
+        let hi = self.from_bitmask_mask8x16(bits >> 16usize);
+        self.combine_mask8x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x32(self, a: mask8x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x32(a);
+        let lo = self.to_bitmask_mask8x16(lo);
+        let hi = self.to_bitmask_mask8x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
     fn and_mask8x32(self, a: mask8x32<Self>, b: mask8x32<Self>) -> mask8x32<Self> {
         let (a0, a1) = self.split_mask8x32(a);
         let (b0, b1) = self.split_mask8x32(b);
@@ -3873,6 +3962,19 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<[v128; 2usize], [i16; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
+        let lo = self.from_bitmask_mask16x8(bits);
+        let hi = self.from_bitmask_mask16x8(bits >> 8usize);
+        self.combine_mask16x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x16(a);
+        let lo = self.to_bitmask_mask16x8(lo);
+        let hi = self.to_bitmask_mask16x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
         let (a0, a1) = self.split_mask16x16(a);
         let (b0, b1) = self.split_mask16x16(b);
@@ -4548,6 +4650,19 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<[v128; 2usize], [i32; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
+        let lo = self.from_bitmask_mask32x4(bits);
+        let hi = self.from_bitmask_mask32x4(bits >> 4usize);
+        self.combine_mask32x4(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x8(self, a: mask32x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x8(a);
+        let lo = self.to_bitmask_mask32x4(lo);
+        let hi = self.to_bitmask_mask32x4(hi);
+        lo | (hi << 4usize)
+    }
+    #[inline(always)]
     fn and_mask32x8(self, a: mask32x8<Self>, b: mask32x8<Self>) -> mask32x8<Self> {
         let (a0, a1) = self.split_mask32x8(a);
         let (b0, b1) = self.split_mask32x8(b);
@@ -4975,6 +5090,19 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<[v128; 2usize], [i64; 4usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
+        let lo = self.from_bitmask_mask64x2(bits);
+        let hi = self.from_bitmask_mask64x2(bits >> 2usize);
+        self.combine_mask64x2(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x4(self, a: mask64x4<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x4(a);
+        let lo = self.to_bitmask_mask64x2(lo);
+        let hi = self.to_bitmask_mask64x2(hi);
+        lo | (hi << 2usize)
+    }
+    #[inline(always)]
     fn and_mask64x4(self, a: mask64x4<Self>, b: mask64x4<Self>) -> mask64x4<Self> {
         let (a0, a1) = self.split_mask64x4(a);
         let (b0, b1) = self.split_mask64x4(b);
@@ -6104,6 +6232,19 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<[v128; 4usize], [i8; 64usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
+        let lo = self.from_bitmask_mask8x32(bits);
+        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
+        self.combine_mask8x32(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
+        let (lo, hi) = self.split_mask8x64(a);
+        let lo = self.to_bitmask_mask8x32(lo);
+        let hi = self.to_bitmask_mask8x32(hi);
+        lo | (hi << 32usize)
+    }
+    #[inline(always)]
     fn and_mask8x64(self, a: mask8x64<Self>, b: mask8x64<Self>) -> mask8x64<Self> {
         let (a0, a1) = self.split_mask8x64(a);
         let (b0, b1) = self.split_mask8x64(b);
@@ -6821,6 +6962,19 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<[v128; 4usize], [i16; 32usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask16x32(self, bits: u64) -> mask16x32<Self> {
+        let lo = self.from_bitmask_mask16x16(bits);
+        let hi = self.from_bitmask_mask16x16(bits >> 16usize);
+        self.combine_mask16x16(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
+        let (lo, hi) = self.split_mask16x32(a);
+        let lo = self.to_bitmask_mask16x16(lo);
+        let hi = self.to_bitmask_mask16x16(hi);
+        lo | (hi << 16usize)
+    }
+    #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
         let (a0, a1) = self.split_mask16x32(a);
         let (b0, b1) = self.split_mask16x32(b);
@@ -7520,6 +7674,19 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<[v128; 4usize], [i32; 16usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
+        let lo = self.from_bitmask_mask32x8(bits);
+        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
+        self.combine_mask32x8(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
+        let (lo, hi) = self.split_mask32x16(a);
+        let lo = self.to_bitmask_mask32x8(lo);
+        let hi = self.to_bitmask_mask32x8(hi);
+        lo | (hi << 8usize)
+    }
+    #[inline(always)]
     fn and_mask32x16(self, a: mask32x16<Self>, b: mask32x16<Self>) -> mask32x16<Self> {
         let (a0, a1) = self.split_mask32x16(a);
         let (b0, b1) = self.split_mask32x16(b);
@@ -7933,6 +8100,19 @@ impl Simd for WasmSimd128 {
         unsafe { core::mem::transmute::<[v128; 4usize], [i64; 8usize]>(a.val.0) }
     }
     #[inline(always)]
+    fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
+        let lo = self.from_bitmask_mask64x4(bits);
+        let hi = self.from_bitmask_mask64x4(bits >> 4usize);
+        self.combine_mask64x4(lo, hi)
+    }
+    #[inline(always)]
+    fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64 {
+        let (lo, hi) = self.split_mask64x8(a);
+        let lo = self.to_bitmask_mask64x4(lo);
+        let hi = self.to_bitmask_mask64x4(hi);
+        lo | (hi << 4usize)
+    }
+    #[inline(always)]
     fn and_mask64x8(self, a: mask64x8<Self>, b: mask64x8<Self>) -> mask64x8<Self> {
         let (a0, a1) = self.split_mask64x8(a);
         let (b0, b1) = self.split_mask64x8(b);
diff --git a/fearless_simd_gen/src/generic.rs b/fearless_simd_gen/src/generic.rs
index fd390f613..e2d2cfeef 100644
--- a/fearless_simd_gen/src/generic.rs
+++ b/fearless_simd_gen/src/generic.rs
@@ -187,6 +187,27 @@ pub(crate) fn generic_op(op: &Op, ty: &VecType) -> TokenStream {
                 }
             }
         }
+        OpSig::MaskFromBitmask => {
+            let half_len = half.len;
+            quote! {
+                #method_sig {
+                    let lo = self.#do_half(bits);
+                    let hi = self.#do_half(bits >> #half_len);
+                    self.#combine(lo, hi)
+                }
+            }
+        }
+        OpSig::MaskToBitmask => {
+            let half_len = half.len;
+            quote! {
+                #method_sig {
+                    let (lo, hi) = self.#split(a);
+                    let lo = self.#do_half(lo);
+                    let hi = self.#do_half(hi);
+                    lo | (hi << #half_len)
+                }
+            }
+        }
         OpSig::LoadInterleaved {
             block_size,
             block_count,
@@ -455,3 +476,36 @@ pub(crate) fn generic_from_bytes(method_sig: TokenStream, vec_ty: &VecType) -> T
         }
     }
 }
+
+/// Emits a lane-by-lane `from_bitmask` body: lane `i` is all-ones iff bit `i` of `bits` is set.
+pub(crate) fn generic_mask_from_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+    let (scalar, len) = (vec_ty.scalar.rust(vec_ty.scalar_bits), vec_ty.len);
+
+    quote! {
+        #method_sig {
+            let lanes: [#scalar; #len] =
+                core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
+            lanes.simd_into(self)
+        }
+    }
+}
+
+/// Emits a lane-by-lane `to_bitmask` body: bit `i` of the result is set iff lane `i` is non-zero.
+pub(crate) fn generic_mask_to_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+    let (lanes_of, lane_count) = (generic_op_name("as_array", vec_ty), vec_ty.len);
+
+    quote! {
+        #method_sig {
+            let lanes = self.#lanes_of(a);
+            let mut bits = 0u64;
+            let mut i = 0;
+            while i < #lane_count {
+                if lanes[i] != 0 {
+                    bits |= 1u64 << i;
+                }
+                i += 1;
+            }
+            bits
+        }
+    }
+}
diff --git a/fearless_simd_gen/src/mk_fallback.rs b/fearless_simd_gen/src/mk_fallback.rs
index 119850e27..269ef502a 100644
--- a/fearless_simd_gen/src/mk_fallback.rs
+++ b/fearless_simd_gen/src/mk_fallback.rs
@@ -3,7 +3,8 @@
 
 use crate::arch::fallback;
 use crate::generic::{
-    generic_from_bytes, generic_op_name, generic_to_bytes, integer_lane_mask_splat_arg,
+    generic_from_bytes, generic_mask_from_bitmask, generic_mask_to_bitmask, generic_op_name,
+    generic_to_bytes, integer_lane_mask_splat_arg,
 };
 use crate::level::Level;
 use crate::ops::{Op, OpSig, RefKind, valid_reinterpret};
@@ -455,6 +456,8 @@ impl Level for Fallback {
                     }
                 }
             }
+            OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty),
+            OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
index 7f74fb4e0..100d4725e 100644
--- a/fearless_simd_gen/src/mk_neon.rs
+++ b/fearless_simd_gen/src/mk_neon.rs
@@ -5,8 +5,9 @@ use proc_macro2::{Ident, Literal, Span, TokenStream};
 use quote::{ToTokens as _, format_ident, quote};
 
 use crate::generic::{
-    generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_store_array,
-    generic_to_bytes, integer_lane_mask_splat_arg,
+    generic_as_array, generic_from_array, generic_from_bytes, generic_mask_from_bitmask,
+    generic_mask_to_bitmask, generic_op_name, generic_store_array, generic_to_bytes,
+    integer_lane_mask_splat_arg,
 };
 use crate::level::Level;
 use crate::ops::{Op, SlideGranularity, valid_reinterpret};
@@ -530,6 +531,8 @@ impl Level for Neon {
                     }
                 }
             }
+            OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty),
+            OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty),
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs
index a973c01bd..2d9d00a60 100644
--- a/fearless_simd_gen/src/mk_simd_trait.rs
+++ b/fearless_simd_gen/src/mk_simd_trait.rs
@@ -308,6 +308,41 @@ fn mk_simd_mask() -> TokenStream {
             /// Create a SIMD mask with all lanes set to the given boolean value.
             fn splat(simd: S, val: bool) -> Self;
 
+            /// Create a mask from a compact bitmask.
+            ///
+            /// Bit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above
+            /// [`Self::N`] are ignored.
+            fn from_bitmask(simd: S, bits: u64) -> Self;
+
+            /// Convert this mask to a compact bitmask.
+            ///
+            /// Bit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above
+            /// [`Self::N`] are cleared.
+            fn to_bitmask(self) -> u64;
+
+            /// Test whether one logical lane is set.
+            ///
+            /// Panics if `index` is greater than or equal to the number of lanes in the mask.
+            #[inline(always)]
+            fn test(&self, index: usize) -> bool {
+                assert!(index < Self::N);
+                (((*self).to_bitmask() >> index) & 1) != 0
+            }
+
+            /// Sets the value of one logical lane.
+            ///
+            /// Panics if `index` is greater than or equal to the number of lanes in the mask.
+            #[inline(always)]
+            fn set(&mut self, index: usize, value: bool) {
+                assert!(index < Self::N);
+                let lane_mask = Self::from_bitmask(self.witness(), 1u64 << index);
+                if value {
+                    *self = *self | lane_mask;
+                } else {
+                    *self = *self & !lane_mask;
+                }
+            }
+
             /// Create a SIMD mask from signed integer mask lanes.
             ///
             /// The slice must be exactly the size of the SIMD mask.
diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs
index 22d484178..bfe49be3c 100644
--- a/fearless_simd_gen/src/mk_simd_types.rs
+++ b/fearless_simd_gen/src/mk_simd_types.rs
@@ -296,6 +296,8 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
     let scalar = ty.scalar.rust(ty.scalar_bits);
     let len = Literal::usize_unsuffixed(ty.len);
     let splat = generic_op_name("splat", ty);
+    let from_bitmask_op = generic_op_name("from_bitmask", ty);
+    let to_bitmask_op = generic_op_name("to_bitmask", ty);
     let from_array_op = generic_op_name("load_array", ty);
     let as_array_op = generic_op_name("as_array", ty);
     let mut methods = vec![];
@@ -335,6 +337,16 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
                 simd.#splat(val)
             }
 
+            #[inline(always)]
+            fn from_bitmask(simd: S, bits: u64) -> Self {
+                simd.#from_bitmask_op(bits)
+            }
+
+            #[inline(always)]
+            fn to_bitmask(self) -> u64 {
+                self.simd.#to_bitmask_op(self)
+            }
+
             #[inline(always)]
             fn from_slice(simd: S, slice: &[#scalar]) -> Self {
                 let slice: &[#scalar; #len] = slice.try_into().unwrap();
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index eb7a3e333..f3c72f823 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -7,8 +7,8 @@ use quote::{format_ident, quote};
 use crate::arch::wasm::{arch_prefix, v128_intrinsic};
 use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
-    generic_from_bytes, generic_op_name, generic_store_array, generic_to_bytes,
-    integer_lane_mask_splat_arg, scalar_binary,
+    generic_from_bytes, generic_mask_from_bitmask, generic_mask_to_bitmask, generic_op_name,
+    generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary,
 };
 use crate::level::Level;
 use crate::ops::{Op, Quantifier, SlideGranularity, valid_reinterpret};
@@ -512,6 +512,8 @@ impl Level for WasmSimd128 {
                     }
                 }
             }
+            OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty),
+            OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 7b139bdde..e0d8c1cc1 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -201,6 +201,8 @@ impl Level for X86 {
                 quantifier,
                 condition,
             } => self.handle_mask_reduce(method_sig, vec_ty, quantifier, condition),
+            OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty),
+            OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,
@@ -224,6 +226,82 @@ impl Level for X86 {
     }
 }
 
+/// Builds the expression that turns each low bit of `bits` into an all-ones or all-zeroes byte.
+fn mask_from_bitmask_bytes(vec_ty: &VecType) -> TokenStream {
+    let lanes = vec_ty.len;
+    match lanes {
+        0..=8 => {
+            let pattern = mask_bit_pattern_128();
+            quote! {
+                {
+                    let bit_bytes = _mm_set1_epi8(bits as i8);
+                    let bit_mask = #pattern;
+                    _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                }
+            }
+        }
+        9..=16 => {
+            let (pattern, byte_select) = (mask_bit_pattern_128(), mask_byte_shuffle_128(lanes));
+            quote! {
+                {
+                    let bit_bytes = _mm_cvtsi32_si128(bits as i32);
+                    let bit_bytes = _mm_shuffle_epi8(bit_bytes, #byte_select);
+                    let bit_mask = #pattern;
+                    _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                }
+            }
+        }
+        _ => {
+            assert_eq!(
+                (vec_ty.n_bits(), vec_ty.scalar_bits, lanes),
+                (256, 8, 32),
+                "only 32-lane masks need a 256-bit inverse movemask"
+            );
+            let (pattern, byte_select) = (mask_bit_pattern_256(), mask_byte_shuffle_256());
+            quote! {
+                {
+                    let bit_bytes = _mm256_broadcastsi128_si256(_mm_cvtsi32_si128(bits as i32));
+                    let bit_bytes = _mm256_shuffle_epi8(bit_bytes, #byte_select);
+                    let bit_mask = #pattern;
+                    _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
+                }
+            }
+        }
+    }
+}
+
+/// Emits a `_mm_setr_epi8` constant whose byte `i` holds the single bit `1 << (i % 8)`.
+///
+/// Each byte is written as a `u16` literal cast to `i8`; bit 7 (128) is out of range for `i8`.
+fn mask_bit_pattern_128() -> TokenStream {
+    let lanes = (0..16).map(|i| 1u16 << (i % 8)).map(|bit| quote! { #bit as i8 });
+    quote! { _mm_setr_epi8(#(#lanes),*) }
+}
+
+/// Emits a `_mm256_setr_epi8` constant whose byte `i` holds the single bit `1 << (i % 8)`.
+///
+/// Each byte is written as a `u16` literal cast to `i8`; bit 7 (128) is out of range for `i8`.
+fn mask_bit_pattern_256() -> TokenStream {
+    let lanes = (0..32).map(|i| 1u16 << (i % 8)).map(|bit| quote! { #bit as i8 });
+    quote! { _mm256_setr_epi8(#(#lanes),*) }
+}
+
+/// Emits a `_mm_setr_epi8` shuffle index vector: byte `i` selects source byte `i / 8`, with `i`
+/// first clamped to the last lane so no index goes past the byte holding that lane's bit.
+fn mask_byte_shuffle_128(lane_count: usize) -> TokenStream {
+    let last = lane_count - 1;
+    let lanes = (0..16).map(|i| (i.min(last) / 8) as u8).map(|byte| quote! { #byte as i8 });
+    quote! { _mm_setr_epi8(#(#lanes),*) }
+}
+
+/// Emits a `_mm256_setr_epi8` shuffle index vector in which byte `i` selects source byte `i / 8`.
+///
+/// With 32 result bytes the indices range over 0..=3, one per byte of a 32-bit bitmask.
+fn mask_byte_shuffle_256() -> TokenStream {
+    let lanes = (0..32).map(|i| (i / 8) as u8).map(|byte| quote! { #byte as i8 });
+    quote! { _mm256_setr_epi8(#(#lanes),*) }
+}
+
 impl X86 {
     pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
         let intrinsic = set1_intrinsic(vec_ty);
@@ -242,6 +320,115 @@ impl X86 {
         }
     }
 
+    /// Generates a `from_bitmask` method for x86 mask types: the per-lane bytes produced by
+    /// `mask_from_bitmask_bytes` are used as-is for 8-bit lanes and are widened through
+    /// `extend_intrinsic` for wider lanes.
+    pub(crate) fn handle_mask_from_bitmask(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+    ) -> TokenStream {
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "mask bitmask conversion only operates on masks"
+        );
+
+        let byte_lanes = mask_from_bitmask_bytes(vec_ty);
+        let widened = match vec_ty.scalar_bits {
+            8 => byte_lanes,
+            lane_bits => {
+                let widen = extend_intrinsic(ScalarType::Int, 8, lane_bits, vec_ty.n_bits());
+                quote! { #widen(#byte_lanes) }
+            }
+        };
+
+        quote! {
+            #method_sig {
+                unsafe {
+                    #widened.simd_into(self)
+                }
+            }
+        }
+    }
+
+    /// Generates a `to_bitmask` method for x86 mask types from the `movemask` intrinsics.
+    ///
+    /// 16-bit lanes use the byte-wise `movemask`; its result is compacted to one bit per lane.
+    pub(crate) fn handle_mask_to_bitmask(
+        &self,
+        method_sig: TokenStream,
+        vec_ty: &VecType,
+    ) -> TokenStream {
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "mask bitmask conversion only operates on masks"
+        );
+
+        match vec_ty.scalar_bits {
+            8 => {
+                let bits_ty = vec_ty.reinterpret(ScalarType::Int, 8);
+                let movemask = simple_intrinsic("movemask", &bits_ty);
+                quote! {
+                    #method_sig {
+                        unsafe { #movemask(a.into()) as u32 as u64 }
+                    }
+                }
+            }
+            16 => {
+                let bits_ty = vec_ty.reinterpret(ScalarType::Int, 8);
+                let movemask = simple_intrinsic("movemask", &bits_ty);
+                let (even_bits, pair_bits, nibble_bits, byte_bits, word_bits) = match vec_ty.len {
+                    8 => (0x5555u64, 0x3333u64, 0x0f0fu64, 0x00ffu64, 0),
+                    16 => (
+                        0x5555_5555u64,
+                        0x3333_3333u64,
+                        0x0f0f_0f0fu64,
+                        0x00ff_00ffu64,
+                        0x0000_ffffu64,
+                    ),
+                    _ => unimplemented!("to_bitmask for 16-bit masks with {} lanes", vec_ty.len),
+                };
+                let merge_words = (vec_ty.len > 8)
+                    .then(|| quote! { bits = (bits | (bits >> 8)) & #word_bits; });
+
+                quote! {
+                    #method_sig {
+                        unsafe {
+                            // `_mm*_movemask_epi8` returns one bit per byte. For 16-bit masks both bytes have the
+                            // same sign bit, so keep one byte bit per lane and compact those bits.
+                            let mut bits = #movemask(a.into()) as u32 as u64;
+                            bits &= #even_bits;
+                            bits = (bits | (bits >> 1)) & #pair_bits;
+                            bits = (bits | (bits >> 2)) & #nibble_bits;
+                            bits = (bits | (bits >> 4)) & #byte_bits;
+                            #merge_words
+                            bits
+                        }
+                    }
+                }
+            }
+            32 | 64 => {
+                let float_ty = vec_ty.cast(ScalarType::Float);
+                let movemask = simple_intrinsic("movemask", &float_ty);
+                let cast = cast_ident(
+                    ScalarType::Mask,
+                    ScalarType::Float,
+                    vec_ty.scalar_bits,
+                    vec_ty.scalar_bits,
+                    vec_ty.n_bits(),
+                );
+                quote! {
+                    #method_sig {
+                        unsafe { #movemask(#cast(a.into())) as u32 as u64 }
+                    }
+                }
+            }
+            _ => unreachable!("unsupported mask lane width: {} bits", vec_ty.scalar_bits),
+        }
+    }
+
     pub(crate) fn handle_compare(
         &self,
         method_sig: TokenStream,
diff --git a/fearless_simd_gen/src/ops.rs b/fearless_simd_gen/src/ops.rs
index 521b0b5d2..79fa35c3c 100644
--- a/fearless_simd_gen/src/ops.rs
+++ b/fearless_simd_gen/src/ops.rs
@@ -106,6 +106,10 @@ pub(crate) enum OpSig {
         quantifier: Quantifier,
         condition: bool,
     },
+    /// Takes a compact bitmask and returns the corresponding mask vector type.
+    MaskFromBitmask,
+    /// Takes a mask vector type and returns its compact bitmask representation.
+    MaskToBitmask,
     /// Takes an argument of an array of a certain scalar type, with the length (`block_size` * `block_count`) / [scalar
     /// type's byte size]. Returns a vector type of that scalar type and length.
     ///
@@ -265,6 +269,14 @@ impl Op {
                 let arg0 = &arg_names[0];
                 quote! { (self, #arg0: #ty<Self>) -> bool }
             }
+            OpSig::MaskFromBitmask => {
+                let arg0 = &arg_names[0];
+                quote! { (self, #arg0: u64) -> #ty<Self> }
+            }
+            OpSig::MaskToBitmask => {
+                let arg0 = &arg_names[0];
+                quote! { (self, #arg0: #ty<Self>) -> u64 }
+            }
             OpSig::Shift => {
                 let arg0 = &arg_names[0];
                 let arg1 = &arg_names[1];
@@ -341,6 +353,7 @@ impl Op {
             OpSig::LoadInterleaved { .. } | OpSig::StoreInterleaved { .. } | OpSig::StoreArray => {
                 return None;
             }
+            OpSig::MaskFromBitmask | OpSig::MaskToBitmask => return None,
             OpSig::Unary
             | OpSig::Cvt { .. }
             | OpSig::Reinterpret { .. }
@@ -558,6 +571,18 @@ const MASK_REPRESENTATION_OPS: &[Op] = &[
         },
         "Convert a SIMD mask to signed integer mask lanes.",
     ),
+    Op::new(
+        "from_bitmask",
+        OpKind::AssociatedOnly,
+        OpSig::MaskFromBitmask,
+        "Create a SIMD mask from a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are ignored.",
+    ),
+    Op::new(
+        "to_bitmask",
+        OpKind::AssociatedOnly,
+        OpSig::MaskToBitmask,
+        "Convert a SIMD mask to a compact bitmask.\n\nBit `i` maps to lane `i`, with lane 0 in the least significant bit. Bits above the number of lanes in this mask are cleared.",
+    ),
 ];
 
 const FLOAT_OPS: &[Op] = &[
@@ -1504,12 +1529,14 @@ impl OpSig {
     fn simd_trait_arg_names(&self) -> &'static [&'static str] {
         match self {
             Self::Splat | Self::FromArray { .. } => &["val"],
+            Self::MaskFromBitmask => &["bits"],
             Self::Unary
             | Self::Split { .. }
             | Self::Cvt { .. }
             | Self::Reinterpret { .. }
             | Self::WidenNarrow { .. }
             | Self::MaskReduce { .. }
+            | Self::MaskToBitmask
             | Self::AsArray { .. }
             | Self::FromBytes
             | Self::ToBytes => &["a"],
@@ -1533,6 +1560,8 @@ impl OpSig {
             Self::LoadInterleaved { .. }
             | Self::StoreInterleaved { .. }
             | Self::FromArray { .. }
+            | Self::MaskFromBitmask
+            | Self::MaskToBitmask
             | Self::FromBytes { .. }
             | Self::StoreArray => &[],
             Self::Unary
@@ -1593,6 +1622,8 @@ impl OpSig {
             | Self::Reinterpret { .. }
             | Self::WidenNarrow { .. }
             | Self::Shift
+            | Self::MaskFromBitmask
+            | Self::MaskToBitmask
             | Self::LoadInterleaved { .. }
             | Self::StoreInterleaved { .. }
             | Self::FromArray { .. }
diff --git a/fearless_simd_gen/src/types.rs b/fearless_simd_gen/src/types.rs
index 3b20e3104..5dfea6adc 100644
--- a/fearless_simd_gen/src/types.rs
+++ b/fearless_simd_gen/src/types.rs
@@ -179,7 +179,7 @@ impl VecType {
             let scalar_bits = self.scalar_bits;
             format!(
                 "A SIMD mask of {len} logical lanes corresponding to {scalar_bits}-bit vector elements.\n\n\
-                The storage representation of this type is intentionally opaque. For compatibility with existing APIs, it may be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1).",
+                The storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1).",
             )
         } else {
             let scalar_name = self.scalar.rust_name(self.scalar_bits);
diff --git a/fearless_simd_tests/tests/mask_methods.rs b/fearless_simd_tests/tests/mask_methods.rs
new file mode 100644
index 000000000..ef1453021
--- /dev/null
+++ b/fearless_simd_tests/tests/mask_methods.rs
@@ -0,0 +1,62 @@
+// Copyright 2026 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use fearless_simd::*;
+use fearless_simd_dev_macros::simd_test;
+
+/// Returns a `u64` with the low `len` bits set; 64 is special-cased as `1u64 << 64` overflows.
+fn mask_bits(len: usize) -> u64 {
+    match len {
+        64 => u64::MAX,
+        _ => (1u64 << len) - 1,
+    }
+}
+
+/// Round-trips `$bits` through the `$len`-lane mask type `$mask`, then exercises `test`/`set`.
+macro_rules! check_mask_methods {
+    ($simd:expr, $mask:ident, $len:expr, $bits:expr) => {{
+        let all_bits = mask_bits($len);
+        let mut expected = $bits & all_bits;
+        let mut mask = <$mask<_> as SimdMask<_>>::from_bitmask($simd, $bits);
+        assert_eq!(mask.to_bitmask(), expected);
+        for i in 0..$len {
+            assert_eq!(mask.test(i), ((expected >> i) & 1) != 0);
+        }
+        // Clear the first lane.
+        mask.set(0, false);
+        expected &= !1;
+        assert_eq!(mask.to_bitmask(), expected);
+        // Set the last lane.
+        mask.set($len - 1, true);
+        expected |= 1u64 << ($len - 1);
+        assert_eq!(mask.to_bitmask(), expected);
+        // Turn lane 1 on and then off again, checking `test` and the bitmask after each step.
+        mask.set(1, true);
+        expected |= 1u64 << 1;
+        assert!(mask.test(1));
+        assert_eq!(mask.to_bitmask(), expected);
+
+        mask.set(1, false);
+        expected &= !(1u64 << 1);
+        assert!(!mask.test(1));
+        assert_eq!(mask.to_bitmask(), expected);
+    }};
+}
+
+#[simd_test]
+fn mask_bitmask_roundtrip_test_set<S: Simd>(simd: S) {
+    check_mask_methods!(simd, mask8x16, 16, 0x1_aa55_8001);
+    check_mask_methods!(simd, mask16x8, 8, 0x1_a5);
+    check_mask_methods!(simd, mask32x4, 4, 0x1d);
+    check_mask_methods!(simd, mask64x2, 2, 0x6);
+    // 256-bit vectors: as for 128-bit above, every pattern has bits set past the last lane.
+    check_mask_methods!(simd, mask8x32, 32, 0x1_8000_aa55);
+    check_mask_methods!(simd, mask16x16, 16, 0x1_aa55_8001);
+    check_mask_methods!(simd, mask32x8, 8, 0x1_a5);
+    check_mask_methods!(simd, mask64x4, 4, 0x1d);
+    // 512-bit vectors: `mask8x64` uses all 64 bits; the rest have bits set past the last lane.
+    check_mask_methods!(simd, mask8x64, 64, 0x8000_0001_5555_aaab);
+    check_mask_methods!(simd, mask16x32, 32, 0x1_8000_aa55);
+    check_mask_methods!(simd, mask32x16, 16, 0x1_aa55_8001);
+    check_mask_methods!(simd, mask64x8, 8, 0x1_a5);
+}
diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs
index 4d2f053d8..ac410b667 100644
--- a/fearless_simd_tests/tests/mod.rs
+++ b/fearless_simd_tests/tests/mod.rs
@@ -10,6 +10,7 @@ use fearless_simd::*;
 use fearless_simd_dev_macros::simd_test;
 
 mod harness;
+mod mask_methods;
 mod soundness;
 
 // Ensure that we can cast between generic native-width vectors
diff --git a/fearless_simd_tests/tests/soundness.rs b/fearless_simd_tests/tests/soundness.rs
index 04cd04c1c..3c78f52b2 100644
--- a/fearless_simd_tests/tests/soundness.rs
+++ b/fearless_simd_tests/tests/soundness.rs
@@ -63,6 +63,23 @@ macro_rules! for_each_simd_type {
     };
 }
 
+macro_rules! for_each_mask_type {
+    ($test:ident, $simd:expr) => { // expands `$test!($simd, <mask type>, <lane count>)` per type
+        $test!($simd, mask8x16, 16); // 128-bit vectors
+        $test!($simd, mask16x8, 8);
+        $test!($simd, mask32x4, 4);
+        $test!($simd, mask64x2, 2);
+        $test!($simd, mask8x32, 32); // 256-bit vectors
+        $test!($simd, mask16x16, 16);
+        $test!($simd, mask32x8, 8);
+        $test!($simd, mask64x4, 4);
+        $test!($simd, mask8x64, 64); // 512-bit vectors
+        $test!($simd, mask16x32, 32);
+        $test!($simd, mask32x16, 16);
+        $test!($simd, mask64x8, 8);
+    };
+}
+
 macro_rules! check_from_slice_short {
     ($simd:expr, $vec:ident, $len:expr) => {
         assert_panics(stringify!($vec::from_slice), || {
@@ -82,6 +99,26 @@ macro_rules! check_store_slice_short {
     }};
 }
 
+macro_rules! check_mask_test_oob {
+    ($simd:expr, $mask:ident, $len:expr) => {{
+        let mask = $mask::splat($simd, false);
+        // Callers pass the lane count as `$len`, one past the last valid index: `test` must panic.
+        assert_panics(stringify!($mask::test), || {
+            let _ = mask.test($len);
+        });
+    }};
+}
+
+macro_rules! check_mask_set_oob {
+    ($simd:expr, $mask:ident, $len:expr) => {{
+        let mut mask = $mask::splat($simd, false);
+        // Callers pass the lane count as `$len`, one past the last valid index: `set` must panic.
+        assert_panics(stringify!($mask::set), || {
+            mask.set($len, true);
+        });
+    }};
+}
+
 #[simd_test]
 fn from_slice_rejects_short_slice<S: Simd>(simd: S) {
     for_each_simd_type!(check_from_slice_short, simd);
@@ -91,3 +128,13 @@ fn from_slice_rejects_short_slice<S: Simd>(simd: S) {
 fn store_slice_rejects_short_slice<S: Simd>(simd: S) {
     for_each_simd_type!(check_store_slice_short, simd);
 }
+
+#[simd_test]
+fn mask_test_rejects_out_of_bounds<S: Simd>(simd: S) {
+    for_each_mask_type!(check_mask_test_oob, simd); // must panic for every mask type
+}
+
+#[simd_test]
+fn mask_set_rejects_out_of_bounds<S: Simd>(simd: S) {
+    for_each_mask_type!(check_mask_set_oob, simd); // must panic for every mask type
+}

From da174ba4e3c1a46929ac27b473bebb550166bdb1 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 14:37:39 +0100
Subject: [PATCH 02/16] Implement set() via array roundtrip instead, matching
 std::simd assembly

---
 fearless_simd/src/generated/simd_trait.rs | 11 +--
 fearless_simd/src/generated/simd_types.rs | 84 +++++++++++++++++++++++
 fearless_simd_gen/src/mk_simd_trait.rs    | 11 +--
 fearless_simd_gen/src/mk_simd_types.rs    | 11 +++
 4 files changed, 97 insertions(+), 20 deletions(-)

diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 57058e0eb..6d0b013a2 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -3020,16 +3020,7 @@ pub trait SimdMask<S: Simd>:
     #[doc = r" Sets the value of one logical lane."]
     #[doc = r""]
     #[doc = r" Panics if `index` is greater than or equal to the number of lanes in the mask."]
-    #[inline(always)]
-    fn set(&mut self, index: usize, value: bool) {
-        assert!(index < Self::N);
-        let lane_mask = Self::from_bitmask(self.witness(), 1u64 << index);
-        if value {
-            *self = *self | lane_mask;
-        } else {
-            *self = *self & !lane_mask;
-        }
-    }
+    fn set(&mut self, index: usize, value: bool);
     #[doc = r" Create a SIMD mask from signed integer mask lanes."]
     #[doc = r""]
     #[doc = r" The slice must be exactly the size of the SIMD mask."]
diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs
index 2893ba9d5..90cb80cfd 100644
--- a/fearless_simd/src/generated/simd_types.rs
+++ b/fearless_simd/src/generated/simd_types.rs
@@ -682,6 +682,13 @@ impl<S: Simd> crate::SimdMask<S> for mask8x16<S> {
         self.simd.to_bitmask_mask8x16(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 16);
+        let mut lanes = self.simd.as_array_mask8x16(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask8x16(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
         let slice: &[i8; 16] = slice.try_into().unwrap();
         simd.load_array_mask8x16(*slice)
@@ -1139,6 +1146,13 @@ impl<S: Simd> crate::SimdMask<S> for mask16x8<S> {
         self.simd.to_bitmask_mask16x8(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 8);
+        let mut lanes = self.simd.as_array_mask16x8(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask16x8(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
         let slice: &[i16; 8] = slice.try_into().unwrap();
         simd.load_array_mask16x8(*slice)
@@ -1620,6 +1634,13 @@ impl<S: Simd> crate::SimdMask<S> for mask32x4<S> {
         self.simd.to_bitmask_mask32x4(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 4);
+        let mut lanes = self.simd.as_array_mask32x4(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask32x4(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
         let slice: &[i32; 4] = slice.try_into().unwrap();
         simd.load_array_mask32x4(*slice)
@@ -1942,6 +1963,13 @@ impl<S: Simd> crate::SimdMask<S> for mask64x2<S> {
         self.simd.to_bitmask_mask64x2(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 2);
+        let mut lanes = self.simd.as_array_mask64x2(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask64x2(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
         let slice: &[i64; 2] = slice.try_into().unwrap();
         simd.load_array_mask64x2(*slice)
@@ -2669,6 +2697,13 @@ impl<S: Simd> crate::SimdMask<S> for mask8x32<S> {
         self.simd.to_bitmask_mask8x32(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 32);
+        let mut lanes = self.simd.as_array_mask8x32(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask8x32(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
         let slice: &[i8; 32] = slice.try_into().unwrap();
         simd.load_array_mask8x32(*slice)
@@ -3152,6 +3187,13 @@ impl<S: Simd> crate::SimdMask<S> for mask16x16<S> {
         self.simd.to_bitmask_mask16x16(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 16);
+        let mut lanes = self.simd.as_array_mask16x16(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask16x16(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
         let slice: &[i16; 16] = slice.try_into().unwrap();
         simd.load_array_mask16x16(*slice)
@@ -3647,6 +3689,13 @@ impl<S: Simd> crate::SimdMask<S> for mask32x8<S> {
         self.simd.to_bitmask_mask32x8(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 8);
+        let mut lanes = self.simd.as_array_mask32x8(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask32x8(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
         let slice: &[i32; 8] = slice.try_into().unwrap();
         simd.load_array_mask32x8(*slice)
@@ -3976,6 +4025,13 @@ impl<S: Simd> crate::SimdMask<S> for mask64x4<S> {
         self.simd.to_bitmask_mask64x4(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 4);
+        let mut lanes = self.simd.as_array_mask64x4(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask64x4(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
         let slice: &[i64; 4] = slice.try_into().unwrap();
         simd.load_array_mask64x4(*slice)
@@ -4691,6 +4747,13 @@ impl<S: Simd> crate::SimdMask<S> for mask8x64<S> {
         self.simd.to_bitmask_mask8x64(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 64);
+        let mut lanes = self.simd.as_array_mask8x64(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask8x64(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i8]) -> Self {
         let slice: &[i8; 64] = slice.try_into().unwrap();
         simd.load_array_mask8x64(*slice)
@@ -5162,6 +5225,13 @@ impl<S: Simd> crate::SimdMask<S> for mask16x32<S> {
         self.simd.to_bitmask_mask16x32(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 32);
+        let mut lanes = self.simd.as_array_mask16x32(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask16x32(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i16]) -> Self {
         let slice: &[i16; 32] = slice.try_into().unwrap();
         simd.load_array_mask16x32(*slice)
@@ -5657,6 +5727,13 @@ impl<S: Simd> crate::SimdMask<S> for mask32x16<S> {
         self.simd.to_bitmask_mask32x16(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 16);
+        let mut lanes = self.simd.as_array_mask32x16(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask32x16(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i32]) -> Self {
         let slice: &[i32; 16] = slice.try_into().unwrap();
         simd.load_array_mask32x16(*slice)
@@ -5980,6 +6057,13 @@ impl<S: Simd> crate::SimdMask<S> for mask64x8<S> {
         self.simd.to_bitmask_mask64x8(self)
     }
     #[inline(always)]
+    fn set(&mut self, index: usize, value: bool) {
+        assert!(index < 8);
+        let mut lanes = self.simd.as_array_mask64x8(*self);
+        lanes[index] = if value { !0 } else { 0 };
+        *self = self.simd.load_array_mask64x8(lanes);
+    }
+    #[inline(always)]
     fn from_slice(simd: S, slice: &[i64]) -> Self {
         let slice: &[i64; 8] = slice.try_into().unwrap();
         simd.load_array_mask64x8(*slice)
diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs
index 2d9d00a60..7e42af2f7 100644
--- a/fearless_simd_gen/src/mk_simd_trait.rs
+++ b/fearless_simd_gen/src/mk_simd_trait.rs
@@ -332,16 +332,7 @@ fn mk_simd_mask() -> TokenStream {
             /// Sets the value of one logical lane.
             ///
             /// Panics if `index` is greater than or equal to the number of lanes in the mask.
-            #[inline(always)]
-            fn set(&mut self, index: usize, value: bool) {
-                assert!(index < Self::N);
-                let lane_mask = Self::from_bitmask(self.witness(), 1u64 << index);
-                if value {
-                    *self = *self | lane_mask;
-                } else {
-                    *self = *self & !lane_mask;
-                }
-            }
+            fn set(&mut self, index: usize, value: bool);
 
             /// Create a SIMD mask from signed integer mask lanes.
             ///
diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs
index bfe49be3c..541c69842 100644
--- a/fearless_simd_gen/src/mk_simd_types.rs
+++ b/fearless_simd_gen/src/mk_simd_types.rs
@@ -322,6 +322,9 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
         }
     }
 
+    // Current backends store masks as signed integer lanes, so `set` uses a generic
+    // spill/update/reload path. Future compact predicate backends such as AVX-512 can
+    // switch this implementation to `to_bitmask`/`from_bitmask`.
     quote! {
         impl<S: Simd> crate::SimdMask<S> for #name<S> {
             type Element = #scalar;
@@ -347,6 +350,14 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
                 self.simd.#to_bitmask_op(self)
             }
 
+            #[inline(always)]
+            fn set(&mut self, index: usize, value: bool) {
+                assert!(index < #len);
+                let mut lanes = self.simd.#as_array_op(*self);
+                lanes[index] = if value { !0 } else { 0 };
+                *self = self.simd.#from_array_op(lanes);
+            }
+
             #[inline(always)]
             fn from_slice(simd: S, slice: &[#scalar]) -> Self {
                 let slice: &[#scalar; #len] = slice.try_into().unwrap();

From 9b0cfb0a4af6d6fb47999c9d460dae20bc556d11 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 15:03:14 +0100
Subject: [PATCH 03/16] Optimize to_bitmask on NEON

---
 fearless_simd/src/generated/neon.rs | 55 ++++++++---------------
 fearless_simd_gen/src/mk_neon.rs    | 67 +++++++++++++++++++++++++++--
 2 files changed, 83 insertions(+), 39 deletions(-)

diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index f4cd64937..dc1abd883 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -793,16 +793,14 @@ impl Simd for Neon {
     }
     #[inline(always)]
     fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
-        let lanes = self.as_array_mask8x16(a);
-        let mut bits = 0u64;
-        let mut i = 0;
-        while i < 16usize {
-            if lanes[i] != 0 {
-                bits |= 1u64 << i;
-            }
-            i += 1;
+        unsafe {
+            let weights =
+                vld1q_u8([1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128].as_ptr());
+            let bits = vandq_u8(vreinterpretq_u8_s8(a.into()), weights);
+            let lo = vaddv_u8(vget_low_u8(bits)) as u64;
+            let hi = vaddv_u8(vget_high_u8(bits)) as u64;
+            lo | (hi << 8)
         }
-        bits
     }
     #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
@@ -1298,16 +1296,11 @@ impl Simd for Neon {
     }
     #[inline(always)]
     fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
-        let lanes = self.as_array_mask16x8(a);
-        let mut bits = 0u64;
-        let mut i = 0;
-        while i < 8usize {
-            if lanes[i] != 0 {
-                bits |= 1u64 << i;
-            }
-            i += 1;
+        unsafe {
+            let weights = vld1q_u16([1, 2, 4, 8, 16, 32, 64, 128].as_ptr());
+            let bits = vandq_u16(vreinterpretq_u16_s16(a.into()), weights);
+            vaddvq_u16(bits) as u64
         }
-        bits
     }
     #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
@@ -1807,16 +1800,11 @@ impl Simd for Neon {
     }
     #[inline(always)]
     fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
-        let lanes = self.as_array_mask32x4(a);
-        let mut bits = 0u64;
-        let mut i = 0;
-        while i < 4usize {
-            if lanes[i] != 0 {
-                bits |= 1u64 << i;
-            }
-            i += 1;
+        unsafe {
+            let weights = vld1q_u32([1, 2, 4, 8].as_ptr());
+            let bits = vandq_u32(vreinterpretq_u32_s32(a.into()), weights);
+            vaddvq_u32(bits) as u64
         }
-        bits
     }
     #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
@@ -2129,16 +2117,11 @@ impl Simd for Neon {
     }
     #[inline(always)]
     fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
-        let lanes = self.as_array_mask64x2(a);
-        let mut bits = 0u64;
-        let mut i = 0;
-        while i < 2usize {
-            if lanes[i] != 0 {
-                bits |= 1u64 << i;
-            }
-            i += 1;
+        unsafe {
+            let weights = vld1q_u64([1, 2].as_ptr());
+            let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights);
+            vaddvq_u64(bits)
         }
-        bits
     }
     #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
index 100d4725e..fc9df7f5e 100644
--- a/fearless_simd_gen/src/mk_neon.rs
+++ b/fearless_simd_gen/src/mk_neon.rs
@@ -6,8 +6,7 @@ use quote::{ToTokens as _, format_ident, quote};
 
 use crate::generic::{
     generic_as_array, generic_from_array, generic_from_bytes, generic_mask_from_bitmask,
-    generic_mask_to_bitmask, generic_op_name, generic_store_array, generic_to_bytes,
-    integer_lane_mask_splat_arg,
+    generic_op_name, generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg,
 };
 use crate::level::Level;
 use crate::ops::{Op, SlideGranularity, valid_reinterpret};
@@ -532,7 +531,7 @@ impl Level for Neon {
                 }
             }
             OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty),
-            OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty),
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
                 generic_as_array(method_sig, vec_ty, kind, self.max_block_size(), |vec_ty| {
@@ -564,6 +563,68 @@ impl Level for Neon {
     }
 }
 
+impl Neon {
+    fn handle_mask_to_bitmask(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "mask bitmask conversion only operates on masks"
+        );
+        assert_eq!(
+            vec_ty.n_bits(),
+            self.native_width(),
+            "wide masks should use the generic split implementation"
+        );
+
+        match vec_ty.scalar_bits {
+            8 => quote! {
+                #method_sig {
+                    unsafe {
+                        let weights = vld1q_u8([
+                            1, 2, 4, 8, 16, 32, 64, 128,
+                            1, 2, 4, 8, 16, 32, 64, 128,
+                        ].as_ptr());
+                        let bits = vandq_u8(vreinterpretq_u8_s8(a.into()), weights);
+                        let lo = vaddv_u8(vget_low_u8(bits)) as u64;
+                        let hi = vaddv_u8(vget_high_u8(bits)) as u64;
+                        lo | (hi << 8)
+                    }
+                }
+            },
+            16 => quote! {
+                #method_sig {
+                    unsafe {
+                        let weights = vld1q_u16([
+                            1, 2, 4, 8, 16, 32, 64, 128,
+                        ].as_ptr());
+                        let bits = vandq_u16(vreinterpretq_u16_s16(a.into()), weights);
+                        vaddvq_u16(bits) as u64
+                    }
+                }
+            },
+            32 => quote! {
+                #method_sig {
+                    unsafe {
+                        let weights = vld1q_u32([1, 2, 4, 8].as_ptr());
+                        let bits = vandq_u32(vreinterpretq_u32_s32(a.into()), weights);
+                        vaddvq_u32(bits) as u64
+                    }
+                }
+            },
+            64 => quote! {
+                #method_sig {
+                    unsafe {
+                        let weights = vld1q_u64([1, 2].as_ptr());
+                        let bits = vandq_u64(vreinterpretq_u64_s64(a.into()), weights);
+                        vaddvq_u64(bits)
+                    }
+                }
+            },
+            _ => unimplemented!(),
+        }
+    }
+}
+
 fn mk_slide_helpers() -> TokenStream {
     let shifts = (0_usize..16).map(|shift| {
         let shift_i32 = i32::try_from(shift).unwrap();

From 4612f8ca2c1dfa979d2ba318d00a3dff4f3b280c Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 15:20:07 +0100
Subject: [PATCH 04/16] Optimize from_bitmask on NEON

---
 fearless_simd/src/generated/neon.rs | 42 +++++++++++++-----
 fearless_simd_gen/src/mk_neon.rs    | 68 +++++++++++++++++++++++++++--
 2 files changed, 95 insertions(+), 15 deletions(-)

diff --git a/fearless_simd/src/generated/neon.rs b/fearless_simd/src/generated/neon.rs
index dc1abd883..984082976 100644
--- a/fearless_simd/src/generated/neon.rs
+++ b/fearless_simd/src/generated/neon.rs
@@ -787,9 +787,18 @@ impl Simd for Neon {
     }
     #[inline(always)]
     fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
-        let lanes: [i8; 16usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
-        lanes.simd_into(self)
+        unsafe {
+            let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
+            let lo = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
+            let hi = vshlq_u16(vdupq_n_u16((bits >> 8) as u16), shifts);
+            let lo = vcltq_s16(vreinterpretq_s16_u16(lo), vdupq_n_s16(0));
+            let hi = vcltq_s16(vreinterpretq_s16_u16(hi), vdupq_n_s16(0));
+            vcombine_s8(
+                vmovn_s16(vreinterpretq_s16_u16(lo)),
+                vmovn_s16(vreinterpretq_s16_u16(hi)),
+            )
+            .simd_into(self)
+        }
     }
     #[inline(always)]
     fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
@@ -1290,9 +1299,12 @@ impl Simd for Neon {
     }
     #[inline(always)]
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
-        let lanes: [i16; 8usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
-        lanes.simd_into(self)
+        unsafe {
+            let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
+            let shifted = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
+            let mask = vcltq_s16(vreinterpretq_s16_u16(shifted), vdupq_n_s16(0));
+            vreinterpretq_s16_u16(mask).simd_into(self)
+        }
     }
     #[inline(always)]
     fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
@@ -1794,9 +1806,12 @@ impl Simd for Neon {
     }
     #[inline(always)]
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
-        let lanes: [i32; 4usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
-        lanes.simd_into(self)
+        unsafe {
+            let shifts = vld1q_s32([31, 30, 29, 28].as_ptr());
+            let shifted = vshlq_u32(vdupq_n_u32(bits as u32), shifts);
+            let mask = vcltq_s32(vreinterpretq_s32_u32(shifted), vdupq_n_s32(0));
+            vreinterpretq_s32_u32(mask).simd_into(self)
+        }
     }
     #[inline(always)]
     fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
@@ -2111,9 +2126,12 @@ impl Simd for Neon {
     }
     #[inline(always)]
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
-        let lanes: [i64; 2usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
-        lanes.simd_into(self)
+        unsafe {
+            let shifts = vld1q_s64([63, 62].as_ptr());
+            let shifted = vshlq_u64(vdupq_n_u64(bits), shifts);
+            let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0));
+            vreinterpretq_s64_u64(mask).simd_into(self)
+        }
     }
     #[inline(always)]
     fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
diff --git a/fearless_simd_gen/src/mk_neon.rs b/fearless_simd_gen/src/mk_neon.rs
index fc9df7f5e..9765c06df 100644
--- a/fearless_simd_gen/src/mk_neon.rs
+++ b/fearless_simd_gen/src/mk_neon.rs
@@ -5,8 +5,8 @@ use proc_macro2::{Ident, Literal, Span, TokenStream};
 use quote::{ToTokens as _, format_ident, quote};
 
 use crate::generic::{
-    generic_as_array, generic_from_array, generic_from_bytes, generic_mask_from_bitmask,
-    generic_op_name, generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg,
+    generic_as_array, generic_from_array, generic_from_bytes, generic_op_name, generic_store_array,
+    generic_to_bytes, integer_lane_mask_splat_arg,
 };
 use crate::level::Level;
 use crate::ops::{Op, SlideGranularity, valid_reinterpret};
@@ -530,7 +530,7 @@ impl Level for Neon {
                     }
                 }
             }
-            OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty),
+            OpSig::MaskFromBitmask => self.handle_mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => self.handle_mask_to_bitmask(method_sig, vec_ty),
             OpSig::FromArray { kind } => generic_from_array(method_sig, vec_ty, kind),
             OpSig::AsArray { kind } => {
@@ -564,6 +564,68 @@ impl Level for Neon {
 }
 
 impl Neon {
+    fn handle_mask_from_bitmask(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+        assert_eq!(
+            vec_ty.scalar,
+            ScalarType::Mask,
+            "mask bitmask conversion only operates on masks"
+        );
+        assert_eq!(
+            vec_ty.n_bits(),
+            self.native_width(),
+            "wide masks should use the generic split implementation"
+        );
+
+        match vec_ty.scalar_bits {
+            8 => quote! {
+                #method_sig {
+                    unsafe {
+                        let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
+                        let lo = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
+                        let hi = vshlq_u16(vdupq_n_u16((bits >> 8) as u16), shifts);
+                        let lo = vcltq_s16(vreinterpretq_s16_u16(lo), vdupq_n_s16(0));
+                        let hi = vcltq_s16(vreinterpretq_s16_u16(hi), vdupq_n_s16(0));
+                        vcombine_s8(
+                            vmovn_s16(vreinterpretq_s16_u16(lo)),
+                            vmovn_s16(vreinterpretq_s16_u16(hi)),
+                        ).simd_into(self)
+                    }
+                }
+            },
+            16 => quote! {
+                #method_sig {
+                    unsafe {
+                        let shifts = vld1q_s16([15, 14, 13, 12, 11, 10, 9, 8].as_ptr());
+                        let shifted = vshlq_u16(vdupq_n_u16(bits as u16), shifts);
+                        let mask = vcltq_s16(vreinterpretq_s16_u16(shifted), vdupq_n_s16(0));
+                        vreinterpretq_s16_u16(mask).simd_into(self)
+                    }
+                }
+            },
+            32 => quote! {
+                #method_sig {
+                    unsafe {
+                        let shifts = vld1q_s32([31, 30, 29, 28].as_ptr());
+                        let shifted = vshlq_u32(vdupq_n_u32(bits as u32), shifts);
+                        let mask = vcltq_s32(vreinterpretq_s32_u32(shifted), vdupq_n_s32(0));
+                        vreinterpretq_s32_u32(mask).simd_into(self)
+                    }
+                }
+            },
+            64 => quote! {
+                #method_sig {
+                    unsafe {
+                        let shifts = vld1q_s64([63, 62].as_ptr());
+                        let shifted = vshlq_u64(vdupq_n_u64(bits), shifts);
+                        let mask = vcltq_s64(vreinterpretq_s64_u64(shifted), vdupq_n_s64(0));
+                        vreinterpretq_s64_u64(mask).simd_into(self)
+                    }
+                }
+            },
+            _ => unimplemented!(),
+        }
+    }
+
     fn handle_mask_to_bitmask(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
         assert_eq!(
             vec_ty.scalar,

From 5dae6cc8673c36e370dd455aee7be93bcff17c6a Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 15:48:40 +0100
Subject: [PATCH 05/16] Expand to/from_bitmask roundtrip tests

---
 fearless_simd_tests/tests/mask_methods.rs | 122 ++++++++++++++++++++++
 1 file changed, 122 insertions(+)

diff --git a/fearless_simd_tests/tests/mask_methods.rs b/fearless_simd_tests/tests/mask_methods.rs
index ef1453021..faa188c5b 100644
--- a/fearless_simd_tests/tests/mask_methods.rs
+++ b/fearless_simd_tests/tests/mask_methods.rs
@@ -12,6 +12,92 @@ fn mask_bits(len: usize) -> u64 {
     }
 }
 
+const CHUNK_PATTERNS_16: [u64; 8] = [
+    0x0000, 0x0001, 0x00ff, 0x5555, 0x8000, 0xaaaa, 0xff00, 0xffff,
+];
+
+fn for_each_exhaustive_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
+    assert!(
+        len <= 16,
+        "exhaustive bitmask roundtrip tests are only practical up to 16 lanes"
+    );
+
+    let all_bits = mask_bits(len);
+    for bits in 0..(1u64 << len) {
+        f(bits);
+        f(bits | !all_bits);
+    }
+}
+
+fn for_each_chunked_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
+    assert!(
+        len % 16 == 0,
+        "chunked bitmask roundtrip tests expect 16-lane chunks"
+    );
+    assert!(
+        len <= 64,
+        "chunked bitmask roundtrip tests only support u64 bitmasks"
+    );
+
+    let chunks = len / 16;
+    let mut pattern_count = 1usize;
+    for _ in 0..chunks {
+        pattern_count *= CHUNK_PATTERNS_16.len();
+    }
+
+    for mut pattern_index in 0..pattern_count {
+        let mut bits = 0u64;
+        for chunk in 0..chunks {
+            let chunk_pattern = CHUNK_PATTERNS_16[pattern_index % CHUNK_PATTERNS_16.len()];
+            pattern_index /= CHUNK_PATTERNS_16.len();
+            bits |= chunk_pattern << (chunk * 16);
+        }
+        f(bits);
+    }
+}
+
+fn for_each_wide_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
+    let all_bits = mask_bits(len);
+    let mut check = |bits| {
+        f(bits);
+        f(bits | !all_bits);
+    };
+
+    check(0);
+    check(all_bits);
+    check(all_bits & 0x5555_5555_5555_5555);
+    check(all_bits & 0xaaaa_aaaa_aaaa_aaaa);
+
+    for bit in 0..len {
+        let bits = 1u64 << bit;
+        check(bits);
+        check(all_bits ^ bits);
+    }
+
+    for_each_chunked_bitmask(len, check);
+}
+
+macro_rules! check_bitmask_roundtrip {
+    ($simd:expr, $mask:ident, $len:expr, $bits:expr) => {{
+        let raw_bits = $bits;
+        let expected = raw_bits & mask_bits($len);
+        let mask = <$mask<_> as SimdMask<_>>::from_bitmask($simd, raw_bits);
+
+        assert_eq!(
+            mask.to_bitmask(),
+            expected,
+            "{}::from_bitmask({raw_bits:#018x}).to_bitmask()",
+            stringify!($mask)
+        );
+        assert_eq!(
+            <$mask<_> as SimdMask<_>>::from_bitmask($simd, mask.to_bitmask()).to_bitmask(),
+            expected,
+            "{}::from_bitmask({raw_bits:#018x}).to_bitmask() roundtripped again",
+            stringify!($mask)
+        );
+    }};
+}
+
 macro_rules! check_mask_methods {
     ($simd:expr, $mask:ident, $len:expr, $bits:expr) => {{
         let all_bits = mask_bits($len);
@@ -43,6 +129,42 @@ macro_rules! check_mask_methods {
     }};
 }
 
+#[simd_test]
+fn mask_bitmask_roundtrip_exhaustive<S: Simd>(simd: S) {
+    for_each_exhaustive_bitmask(16, |bits| {
+        check_bitmask_roundtrip!(simd, mask8x16, 16, bits);
+        check_bitmask_roundtrip!(simd, mask16x16, 16, bits);
+        check_bitmask_roundtrip!(simd, mask32x16, 16, bits);
+    });
+
+    for_each_exhaustive_bitmask(8, |bits| {
+        check_bitmask_roundtrip!(simd, mask16x8, 8, bits);
+        check_bitmask_roundtrip!(simd, mask32x8, 8, bits);
+        check_bitmask_roundtrip!(simd, mask64x8, 8, bits);
+    });
+
+    for_each_exhaustive_bitmask(4, |bits| {
+        check_bitmask_roundtrip!(simd, mask32x4, 4, bits);
+        check_bitmask_roundtrip!(simd, mask64x4, 4, bits);
+    });
+
+    for_each_exhaustive_bitmask(2, |bits| {
+        check_bitmask_roundtrip!(simd, mask64x2, 2, bits);
+    });
+}
+
+#[simd_test]
+fn mask_bitmask_roundtrip_wide_patterns<S: Simd>(simd: S) {
+    for_each_wide_bitmask(32, |bits| {
+        check_bitmask_roundtrip!(simd, mask8x32, 32, bits);
+        check_bitmask_roundtrip!(simd, mask16x32, 32, bits);
+    });
+
+    for_each_wide_bitmask(64, |bits| {
+        check_bitmask_roundtrip!(simd, mask8x64, 64, bits);
+    });
+}
+
 #[simd_test]
 fn mask_bitmask_roundtrip_test_set<S: Simd>(simd: S) {
     check_mask_methods!(simd, mask8x16, 16, 0x1_aa55_8001);

From 9b7a6c5d11dd9e66bb2fe565bf6ecb25a6ac3f10 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 16:02:17 +0100
Subject: [PATCH 06/16] Further optimize from_bitmask for x86; allow more
 granular fall-through to the generic implementation to make that possible

---
 fearless_simd/src/generated/avx2.rs   | 346 +++++++++++++++-----------
 fearless_simd/src/generated/sse4_2.rs | 167 ++++++++-----
 fearless_simd_gen/src/level.rs        |   7 +-
 fearless_simd_gen/src/mk_x86.rs       | 274 +++++++++++++++++++-
 4 files changed, 567 insertions(+), 227 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 44cb960f2..1d40a17de 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -1428,28 +1428,20 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
         unsafe {
-            _mm_cvtepi8_epi16({
-                let bit_bytes = _mm_set1_epi8(bits as i8);
-                let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
+            {
+                let bit_lanes = _mm_set1_epi16(bits as i16);
+                let bit_mask = _mm_setr_epi16(
+                    1u64 as i16,
+                    2u64 as i16,
+                    4u64 as i16,
+                    8u64 as i16,
+                    16u64 as i16,
+                    32u64 as i16,
+                    64u64 as i16,
+                    128u64 as i16,
                 );
-                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-            })
+                _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+            }
             .simd_into(self)
         }
     }
@@ -1968,28 +1960,11 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
         unsafe {
-            _mm_cvtepi8_epi32({
-                let bit_bytes = _mm_set1_epi8(bits as i8);
-                let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                );
-                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-            })
+            {
+                let bit_lanes = _mm_set1_epi32(bits as i32);
+                let bit_mask = _mm_setr_epi32(1u64 as i32, 2u64 as i32, 4u64 as i32, 8u64 as i32);
+                _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+            }
             .simd_into(self)
         }
     }
@@ -2302,28 +2277,11 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
         unsafe {
-            _mm_cvtepi8_epi64({
-                let bit_bytes = _mm_set1_epi8(bits as i8);
-                let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                );
-                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-            })
+            {
+                let bit_lanes = _mm_set1_epi64x(bits as i64);
+                let bit_mask = _mm_set_epi64x(2, 1);
+                _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+            }
             .simd_into(self)
         }
     }
@@ -4167,36 +4125,28 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn from_bitmask_mask16x16(self, bits: u64) -> mask16x16<Self> {
         unsafe {
-            _mm256_cvtepi8_epi16({
-                let bit_bytes = _mm_cvtsi32_si128(bits as i32);
-                let bit_bytes = _mm_shuffle_epi8(
-                    bit_bytes,
-                    _mm_setr_epi8(
-                        0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
-                        0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
-                        1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
-                    ),
-                );
-                let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
+            {
+                let bit_lanes = _mm256_set1_epi16(bits as i16);
+                let bit_mask = _mm256_setr_epi16(
+                    1u64 as i16,
+                    2u64 as i16,
+                    4u64 as i16,
+                    8u64 as i16,
+                    16u64 as i16,
+                    32u64 as i16,
+                    64u64 as i16,
+                    128u64 as i16,
+                    256u64 as i16,
+                    512u64 as i16,
+                    1024u64 as i16,
+                    2048u64 as i16,
+                    4096u64 as i16,
+                    8192u64 as i16,
+                    16384u64 as i16,
+                    32768u64 as i16,
                 );
-                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-            })
+                _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+            }
             .simd_into(self)
         }
     }
@@ -4853,28 +4803,20 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn from_bitmask_mask32x8(self, bits: u64) -> mask32x8<Self> {
         unsafe {
-            _mm256_cvtepi8_epi32({
-                let bit_bytes = _mm_set1_epi8(bits as i8);
-                let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
+            {
+                let bit_lanes = _mm256_set1_epi32(bits as i32);
+                let bit_mask = _mm256_setr_epi32(
+                    1u64 as i32,
+                    2u64 as i32,
+                    4u64 as i32,
+                    8u64 as i32,
+                    16u64 as i32,
+                    32u64 as i32,
+                    64u64 as i32,
+                    128u64 as i32,
                 );
-                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-            })
+                _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+            }
             .simd_into(self)
         }
     }
@@ -5262,28 +5204,11 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
         unsafe {
-            _mm256_cvtepi8_epi64({
-                let bit_bytes = _mm_set1_epi8(bits as i8);
-                let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                );
-                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-            })
+            {
+                let bit_lanes = _mm256_set1_epi64x(bits as i64);
+                let bit_mask = _mm256_set_epi64x(8, 4, 2, 1);
+                _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+            }
             .simd_into(self)
         }
     }
@@ -6391,9 +6316,80 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
-        let lo = self.from_bitmask_mask8x32(bits);
-        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
-        self.combine_mask8x32(lo, hi)
+        unsafe {
+            {
+                let bit_bytes = _mm256_set1_epi64x(bits as i64);
+                let bit_mask = _mm256_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                mask8x64 {
+                    val: crate::support::Aligned512([
+                        {
+                            let bit_bytes = _mm256_shuffle_epi8(
+                                bit_bytes,
+                                _mm256_setr_epi8(
+                                    0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
+                                    0u8 as i8, 0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8,
+                                    1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                                    1u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8,
+                                    2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 3u8 as i8,
+                                    3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8,
+                                    3u8 as i8, 3u8 as i8,
+                                ),
+                            );
+                            _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
+                        },
+                        {
+                            let bit_bytes = _mm256_shuffle_epi8(
+                                bit_bytes,
+                                _mm256_setr_epi8(
+                                    4u8 as i8, 4u8 as i8, 4u8 as i8, 4u8 as i8, 4u8 as i8,
+                                    4u8 as i8, 4u8 as i8, 4u8 as i8, 5u8 as i8, 5u8 as i8,
+                                    5u8 as i8, 5u8 as i8, 5u8 as i8, 5u8 as i8, 5u8 as i8,
+                                    5u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8,
+                                    6u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8, 7u8 as i8,
+                                    7u8 as i8, 7u8 as i8, 7u8 as i8, 7u8 as i8, 7u8 as i8,
+                                    7u8 as i8, 7u8 as i8,
+                                ),
+                            );
+                            _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
+                        },
+                    ]),
+                    simd: self,
+                }
+            }
+        }
     }
     #[inline(always)]
     fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
@@ -7857,9 +7853,42 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn from_bitmask_mask32x16(self, bits: u64) -> mask32x16<Self> {
-        let lo = self.from_bitmask_mask32x8(bits);
-        let hi = self.from_bitmask_mask32x8(bits >> 8usize);
-        self.combine_mask32x8(lo, hi)
+        unsafe {
+            {
+                let bit_lanes = _mm256_set1_epi32(bits as i32);
+                mask32x16 {
+                    val: crate::support::Aligned512([
+                        {
+                            let bit_mask = _mm256_setr_epi32(
+                                1u64 as i32,
+                                2u64 as i32,
+                                4u64 as i32,
+                                8u64 as i32,
+                                16u64 as i32,
+                                32u64 as i32,
+                                64u64 as i32,
+                                128u64 as i32,
+                            );
+                            _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                        },
+                        {
+                            let bit_mask = _mm256_setr_epi32(
+                                256u64 as i32,
+                                512u64 as i32,
+                                1024u64 as i32,
+                                2048u64 as i32,
+                                4096u64 as i32,
+                                8192u64 as i32,
+                                16384u64 as i32,
+                                32768u64 as i32,
+                            );
+                            _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                        },
+                    ]),
+                    simd: self,
+                }
+            }
+        }
     }
     #[inline(always)]
     fn to_bitmask_mask32x16(self, a: mask32x16<Self>) -> u64 {
@@ -8283,9 +8312,34 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
-        let lo = self.from_bitmask_mask64x4(bits);
-        let hi = self.from_bitmask_mask64x4(bits >> 4usize);
-        self.combine_mask64x4(lo, hi)
+        unsafe {
+            {
+                let bit_lanes = _mm256_set1_epi64x(bits as i64);
+                mask64x8 {
+                    val: crate::support::Aligned512([
+                        {
+                            let bit_mask = _mm256_set_epi64x(
+                                8u64 as i64,
+                                4u64 as i64,
+                                2u64 as i64,
+                                1u64 as i64,
+                            );
+                            _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                        },
+                        {
+                            let bit_mask = _mm256_set_epi64x(
+                                128u64 as i64,
+                                64u64 as i64,
+                                32u64 as i64,
+                                16u64 as i64,
+                            );
+                            _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                        },
+                    ]),
+                    simd: self,
+                }
+            }
+        }
     }
     #[inline(always)]
     fn to_bitmask_mask64x8(self, a: mask64x8<Self>) -> u64 {
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index eb894b43e..abaa82fca 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -1477,28 +1477,20 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
         unsafe {
-            _mm_cvtepi8_epi16({
-                let bit_bytes = _mm_set1_epi8(bits as i8);
-                let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
+            {
+                let bit_lanes = _mm_set1_epi16(bits as i16);
+                let bit_mask = _mm_setr_epi16(
+                    1u64 as i16,
+                    2u64 as i16,
+                    4u64 as i16,
+                    8u64 as i16,
+                    16u64 as i16,
+                    32u64 as i16,
+                    64u64 as i16,
+                    128u64 as i16,
                 );
-                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-            })
+                _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+            }
             .simd_into(self)
         }
     }
@@ -2026,28 +2018,11 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
         unsafe {
-            _mm_cvtepi8_epi32({
-                let bit_bytes = _mm_set1_epi8(bits as i8);
-                let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                );
-                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-            })
+            {
+                let bit_lanes = _mm_set1_epi32(bits as i32);
+                let bit_mask = _mm_setr_epi32(1u64 as i32, 2u64 as i32, 4u64 as i32, 8u64 as i32);
+                _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+            }
             .simd_into(self)
         }
     }
@@ -2366,28 +2341,11 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
         unsafe {
-            _mm_cvtepi8_epi64({
-                let bit_bytes = _mm_set1_epi8(bits as i8);
-                let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                );
-                _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
-            })
+            {
+                let bit_lanes = _mm_set1_epi64x(bits as i64);
+                let bit_mask = _mm_set_epi64x(2, 1);
+                _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+            }
             .simd_into(self)
         }
     }
@@ -6371,9 +6329,82 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
-        let lo = self.from_bitmask_mask8x32(bits);
-        let hi = self.from_bitmask_mask8x32(bits >> 32usize);
-        self.combine_mask8x32(lo, hi)
+        unsafe {
+            {
+                let bit_bytes = _mm_set1_epi64x(bits as i64);
+                let bit_mask = _mm_setr_epi8(
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                    1u16 as i8,
+                    2u16 as i8,
+                    4u16 as i8,
+                    8u16 as i8,
+                    16u16 as i8,
+                    32u16 as i8,
+                    64u16 as i8,
+                    128u16 as i8,
+                );
+                mask8x64 {
+                    val: crate::support::Aligned512([
+                        {
+                            let bit_bytes = _mm_shuffle_epi8(
+                                bit_bytes,
+                                _mm_setr_epi8(
+                                    0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
+                                    0u8 as i8, 0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8,
+                                    1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                                    1u8 as i8,
+                                ),
+                            );
+                            _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                        },
+                        {
+                            let bit_bytes = _mm_shuffle_epi8(
+                                bit_bytes,
+                                _mm_setr_epi8(
+                                    2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8,
+                                    2u8 as i8, 2u8 as i8, 2u8 as i8, 3u8 as i8, 3u8 as i8,
+                                    3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8,
+                                    3u8 as i8,
+                                ),
+                            );
+                            _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                        },
+                        {
+                            let bit_bytes = _mm_shuffle_epi8(
+                                bit_bytes,
+                                _mm_setr_epi8(
+                                    4u8 as i8, 4u8 as i8, 4u8 as i8, 4u8 as i8, 4u8 as i8,
+                                    4u8 as i8, 4u8 as i8, 4u8 as i8, 5u8 as i8, 5u8 as i8,
+                                    5u8 as i8, 5u8 as i8, 5u8 as i8, 5u8 as i8, 5u8 as i8,
+                                    5u8 as i8,
+                                ),
+                            );
+                            _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                        },
+                        {
+                            let bit_bytes = _mm_shuffle_epi8(
+                                bit_bytes,
+                                _mm_setr_epi8(
+                                    6u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8,
+                                    6u8 as i8, 6u8 as i8, 6u8 as i8, 7u8 as i8, 7u8 as i8,
+                                    7u8 as i8, 7u8 as i8, 7u8 as i8, 7u8 as i8, 7u8 as i8,
+                                    7u8 as i8,
+                                ),
+                            );
+                            _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                        },
+                    ]),
+                    simd: self,
+                }
+            }
+        }
     }
     #[inline(always)]
     fn to_bitmask_mask8x64(self, a: mask8x64<Self>) -> u64 {
diff --git a/fearless_simd_gen/src/level.rs b/fearless_simd_gen/src/level.rs
index c4800698f..61ec20303 100644
--- a/fearless_simd_gen/src/level.rs
+++ b/fearless_simd_gen/src/level.rs
@@ -46,6 +46,11 @@ pub(crate) trait Level {
     /// Generate a single operation's method on the `Simd` implementation.
     fn make_method(&self, op: Op, vec_ty: &VecType) -> TokenStream;
 
+    /// Determine whether an operation should defer to the generic split/combine implementation.
+    fn should_use_generic_op(&self, op: &Op, vec_ty: &VecType) -> bool { // overridable per level
+        op.sig.should_use_generic_op(vec_ty, self.native_width()) // default: signature decides
+    }
+
     fn token(&self) -> Ident {
         Ident::new(self.name(), Span::call_site())
     }
@@ -91,7 +96,7 @@ pub(crate) trait Level {
         let mut methods = vec![];
         for vec_ty in SIMD_TYPES {
             for op in ops_for_type(vec_ty) {
-                if op.sig.should_use_generic_op(vec_ty, native_width) {
+                if self.should_use_generic_op(&op, vec_ty) {
                     methods.push(generic_op(&op, vec_ty));
                     continue;
                 }
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index e0d8c1cc1..47929ce98 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -168,6 +168,26 @@ impl Level for X86 {
         }
     }
 
+    fn should_use_generic_op(&self, op: &Op, vec_ty: &VecType) -> bool { // x86 override
+        let should_use_generic = op.sig.should_use_generic_op(vec_ty, self.native_width());
+        if !should_use_generic { // natively lowered ops are never forced back to generic
+            return false;
+        }
+
+        // Some 512-bit masks can be constructed directly from one broadcast, avoiding the
+        // shift-and-rebroadcast shape from generic split/combine.
+        if matches!(op.sig, OpSig::MaskFromBitmask)
+            && vec_ty.scalar == ScalarType::Mask
+            && vec_ty.n_bits() == 512
+            && (vec_ty.scalar_bits == 8 // 8-bit lanes: no level check, applies to every variant
+                || (*self == Self::Avx2 && matches!(vec_ty.scalar_bits, 32 | 64))) // 32/64-bit: AVX2 only
+        {
+            return false; // emitted by the dedicated wide lowering instead of split/combine
+        }
+
+        true
+    }
+
     fn make_method(&self, op: Op, vec_ty: &VecType) -> TokenStream {
         let Op { sig, method, .. } = op;
         let method_sig = op.simd_trait_method_sig(vec_ty);
@@ -270,6 +290,202 @@ fn mask_from_bitmask_bytes(vec_ty: &VecType) -> TokenStream {
     }
 }
 
+fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream { // per-lane bit test, no byte widening
+    let lane_count = vec_ty.len;
+    let scalar_bits = vec_ty.scalar_bits;
+
+    match (vec_ty.n_bits(), scalar_bits) { // (vector width, lane width), both in bits
+        (128, 16) => {
+            let lanes = (0..lane_count).map(|i| {
+                let bit = 1u64 << i; // lane i tests bit i of the bitmask
+                quote! { #bit as i16 }
+            });
+            quote! {
+                {
+                    let bit_lanes = _mm_set1_epi16(bits as i16);
+                    let bit_mask = _mm_setr_epi16(#(#lanes),*);
+                    _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+                }
+            }
+        }
+        (256, 16) => {
+            let lanes = (0..lane_count).map(|i| {
+                let bit = 1u64 << i; // lane i tests bit i of the bitmask
+                quote! { #bit as i16 }
+            });
+            quote! {
+                {
+                    let bit_lanes = _mm256_set1_epi16(bits as i16);
+                    let bit_mask = _mm256_setr_epi16(#(#lanes),*);
+                    _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                }
+            }
+        }
+        (128, 32) => {
+            let lanes = (0..lane_count).map(|i| {
+                let bit = 1u64 << i; // lane i tests bit i of the bitmask
+                quote! { #bit as i32 }
+            });
+            quote! {
+                {
+                    let bit_lanes = _mm_set1_epi32(bits as i32);
+                    let bit_mask = _mm_setr_epi32(#(#lanes),*);
+                    _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+                }
+            }
+        }
+        (256, 32) => {
+            let lanes = (0..lane_count).map(|i| {
+                let bit = 1u64 << i; // lane i tests bit i of the bitmask
+                quote! { #bit as i32 }
+            });
+            quote! {
+                {
+                    let bit_lanes = _mm256_set1_epi32(bits as i32);
+                    let bit_mask = _mm256_setr_epi32(#(#lanes),*);
+                    _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                }
+            }
+        }
+        (128, 64) => {
+            assert_eq!(lane_count, 2); // the literal (2, 1) below is written for exactly two lanes
+            quote! {
+                {
+                    let bit_lanes = _mm_set1_epi64x(bits as i64);
+                    let bit_mask = _mm_set_epi64x(2, 1); // set_ (not setr_): high lane first, so lane 0 = 1
+                    _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
+                }
+            }
+        }
+        (256, 64) => {
+            assert_eq!(lane_count, 4); // the literal (8, 4, 2, 1) below is written for four lanes
+            quote! {
+                {
+                    let bit_lanes = _mm256_set1_epi64x(bits as i64);
+                    let bit_mask = _mm256_set_epi64x(8, 4, 2, 1); // high lane first, so lane 0 = 1
+                    _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                }
+            }
+        }
+        _ => unimplemented!(), // no per-lane lowering is defined here for any other shape
+    }
+}
+
+fn mask_from_bitmask_wide_avx2(vec_ty: &VecType) -> TokenStream { // 512-bit masks, 32/64-bit lanes
+    assert_eq!(vec_ty.n_bits(), 512);
+    assert!(
+        matches!(vec_ty.scalar_bits, 32 | 64),
+        "only 32-bit and 64-bit AVX2 masks use direct wide lowering"
+    );
+
+    let ty = vec_ty.rust();
+    let lanes_per_chunk = 256 / vec_ty.scalar_bits; // lanes held by one 256-bit register
+    let chunks = (0..2).map(|chunk| { // two 256-bit halves make up the 512-bit value
+        let chunk_start = chunk * lanes_per_chunk; // index of this half's first lane
+        match vec_ty.scalar_bits {
+            32 => {
+                let lanes = (0..lanes_per_chunk).map(|i| {
+                    let bit = 1u64 << (chunk_start + i); // global lane index selects the bit
+                    quote! { #bit as i32 }
+                });
+                quote! {
+                    {
+                        let bit_mask = _mm256_setr_epi32(#(#lanes),*);
+                        _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                    }
+                }
+            }
+            64 => {
+                let lanes = (0..lanes_per_chunk).rev().map(|i| { // reversed: set_epi64x is high-to-low
+                    let bit = 1u64 << (chunk_start + i); // global lane index selects the bit
+                    quote! { #bit as i64 }
+                });
+                quote! {
+                    {
+                        let bit_mask = _mm256_set_epi64x(#(#lanes),*);
+                        _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
+                    }
+                }
+            }
+            _ => unreachable!(), // excluded by the assert at the top of the function
+        }
+    });
+    let set1 = match vec_ty.scalar_bits { // the single broadcast of `bits` shared by both halves
+        32 => quote! { _mm256_set1_epi32(bits as i32) },
+        64 => quote! { _mm256_set1_epi64x(bits as i64) },
+        _ => unreachable!(),
+    };
+
+    quote! {
+        {
+            let bit_lanes = #set1;
+            #ty {
+                val: crate::support::Aligned512([#(#chunks),*]),
+                simd: self,
+            }
+        }
+    }
+}
+
+fn mask_from_bitmask_wide_bytes(native_width: usize, vec_ty: &VecType) -> TokenStream { // mask8x64
+    assert_eq!(vec_ty.n_bits(), 512);
+    assert_eq!(
+        vec_ty.scalar_bits, 8,
+        "only mask8x64 uses direct wide byte-mask lowering"
+    );
+
+    let ty = vec_ty.rust();
+    match native_width { // selects 128-bit or 256-bit intrinsics for the chunks
+        128 => {
+            let bit_mask = mask_bit_pattern_128();
+            let chunks = (0..4).map(|chunk| { // four 128-bit chunks of 16 lanes each
+                let shuffle = mask_byte_shuffle_128_offset(16, chunk * 2); // 16 lanes = 2 bitmask bytes
+                quote! {
+                    {
+                        let bit_bytes = _mm_shuffle_epi8(bit_bytes, #shuffle);
+                        _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
+                    }
+                }
+            });
+
+            quote! {
+                {
+                    let bit_bytes = _mm_set1_epi64x(bits as i64); // broadcast once, reshuffled per chunk
+                    let bit_mask = #bit_mask;
+                    #ty {
+                        val: crate::support::Aligned512([#(#chunks),*]),
+                        simd: self,
+                    }
+                }
+            }
+        }
+        256 => {
+            let bit_mask = mask_bit_pattern_256();
+            let chunks = (0..2).map(|chunk| { // two 256-bit chunks of 32 lanes each
+                let shuffle = mask_byte_shuffle_256_offset(chunk * 4); // 32 lanes = 4 bitmask bytes
+                quote! {
+                    {
+                        let bit_bytes = _mm256_shuffle_epi8(bit_bytes, #shuffle);
+                        _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
+                    }
+                }
+            });
+
+            quote! {
+                {
+                    let bit_bytes = _mm256_set1_epi64x(bits as i64); // all 8 bytes in each 128-bit half
+                    let bit_mask = #bit_mask;
+                    #ty {
+                        val: crate::support::Aligned512([#(#chunks),*]),
+                        simd: self,
+                    }
+                }
+            }
+        }
+        _ => unreachable!(), // NOTE(review): presumes X86 native widths are only 128 or 256 — confirm
+    }
+}
+
 fn mask_bit_pattern_128() -> TokenStream {
     let lanes = (0..16).map(|i| {
         let bit = 1u16 << (i % 8);
@@ -286,22 +502,30 @@ fn mask_bit_pattern_256() -> TokenStream {
     quote! { _mm256_setr_epi8(#(#lanes),*) }
 }
 
-fn mask_byte_shuffle_128(lane_count: usize) -> TokenStream {
+fn mask_byte_shuffle_128_offset(lane_count: usize, byte_offset: usize) -> TokenStream {
     let lanes = (0..16).map(|i| {
-        let byte = (i.min(lane_count - 1) / 8) as u8;
+        let byte = (byte_offset + i.min(lane_count - 1) / 8) as u8; // lane clamped, 8 lanes per byte
         quote! { #byte as i8 }
     });
     quote! { _mm_setr_epi8(#(#lanes),*) }
 }
 
-fn mask_byte_shuffle_256() -> TokenStream {
+fn mask_byte_shuffle_128(lane_count: usize) -> TokenStream {
+    mask_byte_shuffle_128_offset(lane_count, 0) // offset 0: start at the lowest bitmask byte
+}
+
+fn mask_byte_shuffle_256_offset(byte_offset: usize) -> TokenStream {
     let lanes = (0..32).map(|i| {
-        let byte = (i / 8) as u8;
+        let byte = (byte_offset + i / 8) as u8; // 8 lanes share one bitmask byte
         quote! { #byte as i8 }
     });
     quote! { _mm256_setr_epi8(#(#lanes),*) }
 }
 
+fn mask_byte_shuffle_256() -> TokenStream {
+    mask_byte_shuffle_256_offset(0) // offset 0: start at the lowest bitmask byte
+}
+
 impl X86 {
     pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
         let intrinsic = set1_intrinsic(vec_ty);
@@ -331,16 +555,42 @@ impl X86 {
             "mask bitmask conversion only operates on masks"
         );
 
-        let bytes = mask_from_bitmask_bytes(vec_ty);
-        let expr = if vec_ty.scalar_bits == 8 {
-            quote! {
-                #bytes.simd_into(self)
+        if vec_ty.n_bits() == 512 && vec_ty.scalar_bits == 8 {
+            let expr = mask_from_bitmask_wide_bytes(self.native_width(), vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #expr
+                    }
+                }
+            };
+        }
+
+        if *self == Self::Avx2 && vec_ty.n_bits() == 512 && matches!(vec_ty.scalar_bits, 32 | 64) {
+            let expr = mask_from_bitmask_wide_avx2(vec_ty);
+            return quote! {
+                #method_sig {
+                    unsafe {
+                        #expr
+                    }
+                }
+            };
+        }
+
+        let expr = match vec_ty.scalar_bits {
+            8 => {
+                let bytes = mask_from_bitmask_bytes(vec_ty);
+                quote! {
+                    #bytes.simd_into(self)
+                }
             }
-        } else {
-            let extend = extend_intrinsic(ScalarType::Int, 8, vec_ty.scalar_bits, vec_ty.n_bits());
-            quote! {
-                #extend(#bytes).simd_into(self)
+            16 | 32 | 64 => {
+                let lanes = mask_from_bitmask_lanes(vec_ty);
+                quote! {
+                    #lanes.simd_into(self)
+                }
             }
+            _ => unreachable!(),
         };
 
         quote! {

From b61d4e4349672c2a126082a303e6028defc12bbe Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 16:21:49 +0100
Subject: [PATCH 07/16] Optimize the 16-bit cases of to_bitmask() on x86

---
 fearless_simd/src/generated/avx2.rs   | 35 +++++------
 fearless_simd/src/generated/sse4_2.rs | 33 ++++++-----
 fearless_simd_gen/src/mk_x86.rs       | 85 ++++++++++++++++++---------
 3 files changed, 94 insertions(+), 59 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index 1d40a17de..c031ccb56 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -1448,12 +1448,10 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
         unsafe {
-            let mut bits = _mm_movemask_epi8(a.into()) as u32 as u64;
-            bits &= 21845u64;
-            bits = (bits | (bits >> 1)) & 13107u64;
-            bits = (bits | (bits >> 2)) & 3855u64;
-            bits = (bits | (bits >> 4)) & 255u64;
-            bits
+            {
+                let packed = _mm_packs_epi16(a.into(), a.into());
+                _mm_movemask_epi8(packed) as u8 as u64
+            }
         }
     }
     #[inline(always)]
@@ -4153,13 +4151,11 @@ impl Simd for Avx2 {
     #[inline(always)]
     fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
         unsafe {
-            let mut bits = _mm256_movemask_epi8(a.into()) as u32 as u64;
-            bits &= 1431655765u64;
-            bits = (bits | (bits >> 1)) & 858993459u64;
-            bits = (bits | (bits >> 2)) & 252645135u64;
-            bits = (bits | (bits >> 4)) & 16711935u64;
-            bits = (bits | (bits >> 8)) & 65535u64;
-            bits
+            {
+                let halves: [__m128i; 2usize] = core::mem::transmute(a.val.0);
+                let packed = _mm_packs_epi16(halves[0], halves[1]);
+                _mm_movemask_epi8(packed) as u32 as u64
+            }
         }
     }
     #[inline(always)]
@@ -7144,10 +7140,15 @@ impl Simd for Avx2 {
     }
     #[inline(always)]
     fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
-        let (lo, hi) = self.split_mask16x32(a);
-        let lo = self.to_bitmask_mask16x16(lo);
-        let hi = self.to_bitmask_mask16x16(hi);
-        lo | (hi << 16usize)
+        unsafe {
+            {
+                let lo = _mm256_movemask_epi8(a.val.0[0]) as u32;
+                let hi = _mm256_movemask_epi8(a.val.0[1]) as u32;
+                let lo = _pext_u32(lo, 0x5555_5555u32) as u64;
+                let hi = _pext_u32(hi, 0x5555_5555u32) as u64;
+                lo | (hi << 16usize)
+            }
+        }
     }
     #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index abaa82fca..aa9c9bbf7 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -1497,12 +1497,10 @@ impl Simd for Sse4_2 {
     #[inline(always)]
     fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
         unsafe {
-            let mut bits = _mm_movemask_epi8(a.into()) as u32 as u64;
-            bits &= 21845u64;
-            bits = (bits | (bits >> 1)) & 13107u64;
-            bits = (bits | (bits >> 2)) & 3855u64;
-            bits = (bits | (bits >> 4)) & 255u64;
-            bits
+            {
+                let packed = _mm_packs_epi16(a.into(), a.into());
+                _mm_movemask_epi8(packed) as u8 as u64
+            }
         }
     }
     #[inline(always)]
@@ -4073,10 +4071,12 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn to_bitmask_mask16x16(self, a: mask16x16<Self>) -> u64 {
-        let (lo, hi) = self.split_mask16x16(a);
-        let lo = self.to_bitmask_mask16x8(lo);
-        let hi = self.to_bitmask_mask16x8(hi);
-        lo | (hi << 8usize)
+        unsafe {
+            {
+                let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]);
+                _mm_movemask_epi8(packed) as u32 as u64
+            }
+        }
     }
     #[inline(always)]
     fn and_mask16x16(self, a: mask16x16<Self>, b: mask16x16<Self>) -> mask16x16<Self> {
@@ -7151,10 +7151,15 @@ impl Simd for Sse4_2 {
     }
     #[inline(always)]
     fn to_bitmask_mask16x32(self, a: mask16x32<Self>) -> u64 {
-        let (lo, hi) = self.split_mask16x32(a);
-        let lo = self.to_bitmask_mask16x16(lo);
-        let hi = self.to_bitmask_mask16x16(hi);
-        lo | (hi << 16usize)
+        unsafe {
+            {
+                let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]);
+                let hi = _mm_packs_epi16(a.val.0[2], a.val.0[3]);
+                let lo = _mm_movemask_epi8(lo) as u32 as u64;
+                let hi = _mm_movemask_epi8(hi) as u32 as u64;
+                lo | (hi << 16usize)
+            }
+        }
     }
     #[inline(always)]
     fn and_mask16x32(self, a: mask16x32<Self>, b: mask16x32<Self>) -> mask16x32<Self> {
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 47929ce98..348643b48 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -185,6 +185,13 @@ impl Level for X86 {
             return false;
         }
 
+        if matches!(op.sig, OpSig::MaskToBitmask)
+            && vec_ty.scalar == ScalarType::Mask
+            && vec_ty.scalar_bits == 16
+        {
+            return false;
+        }
+
         true
     }
 
@@ -486,6 +493,54 @@ fn mask_from_bitmask_wide_bytes(native_width: usize, vec_ty: &VecType) -> TokenS
     }
 }
 
+fn mask_to_bitmask_words(native_width: usize, vec_ty: &VecType) -> TokenStream {
+    assert_eq!(
+        vec_ty.scalar_bits, 16,
+        "only 16-bit masks use word packing to produce bitmasks"
+    );
+
+    match (native_width, vec_ty.n_bits()) {
+        (128 | 256, 128) => quote! {
+            {
+                let packed = _mm_packs_epi16(a.into(), a.into());
+                _mm_movemask_epi8(packed) as u8 as u64
+            }
+        },
+        (128, 256) => quote! {
+            {
+                let packed = _mm_packs_epi16(a.val.0[0], a.val.0[1]);
+                _mm_movemask_epi8(packed) as u32 as u64
+            }
+        },
+        (128, 512) => quote! {
+            {
+                let lo = _mm_packs_epi16(a.val.0[0], a.val.0[1]);
+                let hi = _mm_packs_epi16(a.val.0[2], a.val.0[3]);
+                let lo = _mm_movemask_epi8(lo) as u32 as u64;
+                let hi = _mm_movemask_epi8(hi) as u32 as u64;
+                lo | (hi << 16usize)
+            }
+        },
+        (256, 256) => quote! {
+            {
+                let halves: [__m128i; 2usize] = core::mem::transmute(a.val.0);
+                let packed = _mm_packs_epi16(halves[0], halves[1]);
+                _mm_movemask_epi8(packed) as u32 as u64
+            }
+        },
+        (256, 512) => quote! {
+            {
+                let lo = _mm256_movemask_epi8(a.val.0[0]) as u32;
+                let hi = _mm256_movemask_epi8(a.val.0[1]) as u32;
+                let lo = _pext_u32(lo, 0x5555_5555u32) as u64;
+                let hi = _pext_u32(hi, 0x5555_5555u32) as u64;
+                lo | (hi << 16usize)
+            }
+        },
+        _ => unimplemented!(),
+    }
+}
+
 fn mask_bit_pattern_128() -> TokenStream {
     let lanes = (0..16).map(|i| {
         let bit = 1u16 << (i % 8);
@@ -624,37 +679,11 @@ impl X86 {
                 }
             }
             16 => {
-                let bits_ty = vec_ty.reinterpret(ScalarType::Int, 8);
-                let movemask = simple_intrinsic("movemask", &bits_ty);
-                let (even_bits, pair_bits, nibble_bits, byte_bits, word_bits) = match vec_ty.len {
-                    8 => (0x5555u64, 0x3333u64, 0x0f0fu64, 0x00ffu64, 0),
-                    16 => (
-                        0x5555_5555u64,
-                        0x3333_3333u64,
-                        0x0f0f_0f0fu64,
-                        0x00ff_00ffu64,
-                        0x0000_ffffu64,
-                    ),
-                    _ => unimplemented!(),
-                };
-                let merge_words = (vec_ty.len > 8).then(|| {
-                    quote! {
-                        bits = (bits | (bits >> 8)) & #word_bits;
-                    }
-                });
-
+                let bits = mask_to_bitmask_words(self.native_width(), vec_ty);
                 quote! {
                     #method_sig {
                         unsafe {
-                            // `_mm*_movemask_epi8` returns one bit per byte. For 16-bit masks both bytes have the
-                            // same sign bit, so keep one byte bit per lane and compact those bits.
-                            let mut bits = #movemask(a.into()) as u32 as u64;
-                            bits &= #even_bits;
-                            bits = (bits | (bits >> 1)) & #pair_bits;
-                            bits = (bits | (bits >> 2)) & #nibble_bits;
-                            bits = (bits | (bits >> 4)) & #byte_bits;
-                            #merge_words
-                            bits
+                            #bits
                         }
                     }
                 }

From 8f53a164bd1fa09dbf20cf33055d31059b07c692 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 16:45:12 +0100
Subject: [PATCH 08/16] Move specialization exception checks closer to where
 they're used

---
 fearless_simd_gen/src/mk_x86.rs | 49 +++++++++++++++++++--------------
 1 file changed, 29 insertions(+), 20 deletions(-)

diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index 348643b48..d43e506fc 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -174,25 +174,11 @@ impl Level for X86 {
             return false;
         }
 
-        // Some 512-bit masks can be constructed directly from one broadcast, avoiding the
-        // shift-and-rebroadcast shape from generic split/combine.
-        if matches!(op.sig, OpSig::MaskFromBitmask)
-            && vec_ty.scalar == ScalarType::Mask
-            && vec_ty.n_bits() == 512
-            && (vec_ty.scalar_bits == 8
-                || (*self == Self::Avx2 && matches!(vec_ty.scalar_bits, 32 | 64)))
-        {
-            return false;
+        match op.sig {
+            OpSig::MaskFromBitmask => !self.has_specialized_mask_from_bitmask(vec_ty),
+            OpSig::MaskToBitmask => !self.has_specialized_mask_to_bitmask(vec_ty),
+            _ => true,
         }
-
-        if matches!(op.sig, OpSig::MaskToBitmask)
-            && vec_ty.scalar == ScalarType::Mask
-            && vec_ty.scalar_bits == 16
-        {
-            return false;
-        }
-
-        true
     }
 
     fn make_method(&self, op: Op, vec_ty: &VecType) -> TokenStream {
@@ -599,6 +585,29 @@ impl X86 {
         }
     }
 
+    fn has_specialized_mask_from_bitmask(&self, vec_ty: &VecType) -> bool {
+        self.has_wide_byte_mask_from_bitmask(vec_ty) || self.has_wide_avx2_mask_from_bitmask(vec_ty)
+    }
+
+    fn has_wide_byte_mask_from_bitmask(&self, vec_ty: &VecType) -> bool {
+        // 512-bit byte masks can be constructed directly from one broadcast, avoiding the
+        // shift-and-rebroadcast shape from generic split/combine.
+        vec_ty.scalar == ScalarType::Mask && vec_ty.n_bits() == 512 && vec_ty.scalar_bits == 8
+    }
+
+    fn has_wide_avx2_mask_from_bitmask(&self, vec_ty: &VecType) -> bool {
+        // AVX2 can construct these 512-bit masks directly from one broadcast, avoiding the
+        // split/combine shape that shifts and broadcasts each half separately.
+        *self == Self::Avx2
+            && vec_ty.scalar == ScalarType::Mask
+            && vec_ty.n_bits() == 512
+            && matches!(vec_ty.scalar_bits, 32 | 64)
+    }
+
+    fn has_specialized_mask_to_bitmask(&self, vec_ty: &VecType) -> bool {
+        vec_ty.scalar == ScalarType::Mask && vec_ty.scalar_bits == 16
+    }
+
     pub(crate) fn handle_mask_from_bitmask(
         &self,
         method_sig: TokenStream,
@@ -610,7 +619,7 @@ impl X86 {
             "mask bitmask conversion only operates on masks"
         );
 
-        if vec_ty.n_bits() == 512 && vec_ty.scalar_bits == 8 {
+        if self.has_wide_byte_mask_from_bitmask(vec_ty) {
             let expr = mask_from_bitmask_wide_bytes(self.native_width(), vec_ty);
             return quote! {
                 #method_sig {
@@ -621,7 +630,7 @@ impl X86 {
             };
         }
 
-        if *self == Self::Avx2 && vec_ty.n_bits() == 512 && matches!(vec_ty.scalar_bits, 32 | 64) {
+        if self.has_wide_avx2_mask_from_bitmask(vec_ty) {
             let expr = mask_from_bitmask_wide_avx2(vec_ty);
             return quote! {
                 #method_sig {

From ebce9f05dcbdc7e01aa9cc2330c5c8b25d8ac0b2 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 17:11:28 +0100
Subject: [PATCH 09/16] Optimize from_bitmask for WASM

---
 fearless_simd/src/generated/wasm.rs | 30 +++++++++-------
 fearless_simd_gen/src/mk_wasm.rs    | 56 +++++++++++++++++++++++++++--
 2 files changed, 71 insertions(+), 15 deletions(-)

diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index ba14075b0..bf1068613 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -853,9 +853,12 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn from_bitmask_mask8x16(self, bits: u64) -> mask8x16<Self> {
-        let lanes: [i8; 16usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
-        lanes.simd_into(self)
+        let lo = i8x16_splat(bits as i8);
+        let hi = i8x16_splat((bits >> 8) as i8);
+        let bytes = u8x16_shuffle::<0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16, 16>(lo, hi);
+        let powers = u8x16(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+        let selected = v128_and(bytes, powers);
+        i8x16_ne(selected, i8x16_splat(0)).simd_into(self)
     }
     #[inline(always)]
     fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
@@ -1361,9 +1364,10 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn from_bitmask_mask16x8(self, bits: u64) -> mask16x8<Self> {
-        let lanes: [i16; 8usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
-        lanes.simd_into(self)
+        let bitset = i16x8_splat(bits as i16);
+        let powers = u16x8(1, 2, 4, 8, 16, 32, 64, 128);
+        let selected = v128_and(bitset, powers);
+        i16x8_ne(selected, i16x8_splat(0)).simd_into(self)
     }
     #[inline(always)]
     fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
@@ -1873,9 +1877,10 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn from_bitmask_mask32x4(self, bits: u64) -> mask32x4<Self> {
-        let lanes: [i32; 4usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
-        lanes.simd_into(self)
+        let bitset = i32x4_splat(bits as i32);
+        let powers = u32x4(1, 2, 4, 8);
+        let selected = v128_and(bitset, powers);
+        i32x4_ne(selected, i32x4_splat(0)).simd_into(self)
     }
     #[inline(always)]
     fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
@@ -2227,9 +2232,10 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
-        let lanes: [i64; 2usize] =
-            core::array::from_fn(|i| if ((bits >> i) & 1) != 0 { !0 } else { 0 });
-        lanes.simd_into(self)
+        let bitset = i64x2_splat(bits as i64);
+        let powers = u64x2(1, 2);
+        let selected = v128_and(bitset, powers);
+        i64x2_ne(selected, i64x2_splat(0)).simd_into(self)
     }
     #[inline(always)]
     fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index f3c72f823..7f490136b 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -7,8 +7,8 @@ use quote::{format_ident, quote};
 use crate::arch::wasm::{arch_prefix, v128_intrinsic};
 use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
-    generic_from_bytes, generic_mask_from_bitmask, generic_mask_to_bitmask, generic_op_name,
-    generic_store_array, generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary,
+    generic_from_bytes, generic_mask_to_bitmask, generic_op_name, generic_store_array,
+    generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary,
 };
 use crate::level::Level;
 use crate::ops::{Op, Quantifier, SlideGranularity, valid_reinterpret};
@@ -21,6 +21,56 @@ use crate::{
 #[derive(Clone, Copy)]
 pub(crate) struct WasmSimd128;
 
+fn mask_from_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+    assert_eq!(
+        vec_ty.scalar,
+        ScalarType::Mask,
+        "mask bitmask conversion only operates on masks"
+    );
+    assert_eq!(
+        vec_ty.n_bits(),
+        128,
+        "WASM SIMD mask bitmask lowering only handles one native vector"
+    );
+
+    let expr = match vec_ty.scalar_bits {
+        8 => quote! {
+            let lo = i8x16_splat(bits as i8);
+            let hi = i8x16_splat((bits >> 8) as i8);
+            let bytes =
+                u8x16_shuffle::<0, 0, 0, 0, 0, 0, 0, 0, 16, 16, 16, 16, 16, 16, 16, 16>(lo, hi);
+            let powers = u8x16(1, 2, 4, 8, 16, 32, 64, 128, 1, 2, 4, 8, 16, 32, 64, 128);
+            let selected = v128_and(bytes, powers);
+            i8x16_ne(selected, i8x16_splat(0)).simd_into(self)
+        },
+        16 => quote! {
+            let bitset = i16x8_splat(bits as i16);
+            let powers = u16x8(1, 2, 4, 8, 16, 32, 64, 128);
+            let selected = v128_and(bitset, powers);
+            i16x8_ne(selected, i16x8_splat(0)).simd_into(self)
+        },
+        32 => quote! {
+            let bitset = i32x4_splat(bits as i32);
+            let powers = u32x4(1, 2, 4, 8);
+            let selected = v128_and(bitset, powers);
+            i32x4_ne(selected, i32x4_splat(0)).simd_into(self)
+        },
+        64 => quote! {
+            let bitset = i64x2_splat(bits as i64);
+            let powers = u64x2(1, 2);
+            let selected = v128_and(bitset, powers);
+            i64x2_ne(selected, i64x2_splat(0)).simd_into(self)
+        },
+        _ => unreachable!("WASM only supports mask lane widths of 8, 16, 32, and 64 bits"),
+    };
+
+    quote! {
+        #method_sig {
+            #expr
+        }
+    }
+}
+
 impl Level for WasmSimd128 {
     fn name(&self) -> &'static str {
         "WasmSimd128"
@@ -512,7 +562,7 @@ impl Level for WasmSimd128 {
                     }
                 }
             }
-            OpSig::MaskFromBitmask => generic_mask_from_bitmask(method_sig, vec_ty),
+            OpSig::MaskFromBitmask => mask_from_bitmask(method_sig, vec_ty),
             OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,

From 8dc5eaa9aa5220a41bf7166219ffdb52a0c46f61 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 17:17:28 +0100
Subject: [PATCH 10/16] Optimize to_bitmask for WASM

---
 fearless_simd/src/generated/wasm.rs | 44 +++--------------------------
 fearless_simd_gen/src/mk_wasm.rs    | 27 ++++++++++++++++--
 2 files changed, 28 insertions(+), 43 deletions(-)

diff --git a/fearless_simd/src/generated/wasm.rs b/fearless_simd/src/generated/wasm.rs
index bf1068613..da6b718dc 100644
--- a/fearless_simd/src/generated/wasm.rs
+++ b/fearless_simd/src/generated/wasm.rs
@@ -862,16 +862,7 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn to_bitmask_mask8x16(self, a: mask8x16<Self>) -> u64 {
-        let lanes = self.as_array_mask8x16(a);
-        let mut bits = 0u64;
-        let mut i = 0;
-        while i < 16usize {
-            if lanes[i] != 0 {
-                bits |= 1u64 << i;
-            }
-            i += 1;
-        }
-        bits
+        i8x16_bitmask(a.into()) as u64
     }
     #[inline(always)]
     fn and_mask8x16(self, a: mask8x16<Self>, b: mask8x16<Self>) -> mask8x16<Self> {
@@ -1371,16 +1362,7 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn to_bitmask_mask16x8(self, a: mask16x8<Self>) -> u64 {
-        let lanes = self.as_array_mask16x8(a);
-        let mut bits = 0u64;
-        let mut i = 0;
-        while i < 8usize {
-            if lanes[i] != 0 {
-                bits |= 1u64 << i;
-            }
-            i += 1;
-        }
-        bits
+        i16x8_bitmask(a.into()) as u64
     }
     #[inline(always)]
     fn and_mask16x8(self, a: mask16x8<Self>, b: mask16x8<Self>) -> mask16x8<Self> {
@@ -1884,16 +1866,7 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn to_bitmask_mask32x4(self, a: mask32x4<Self>) -> u64 {
-        let lanes = self.as_array_mask32x4(a);
-        let mut bits = 0u64;
-        let mut i = 0;
-        while i < 4usize {
-            if lanes[i] != 0 {
-                bits |= 1u64 << i;
-            }
-            i += 1;
-        }
-        bits
+        i32x4_bitmask(a.into()) as u64
     }
     #[inline(always)]
     fn and_mask32x4(self, a: mask32x4<Self>, b: mask32x4<Self>) -> mask32x4<Self> {
@@ -2239,16 +2212,7 @@ impl Simd for WasmSimd128 {
     }
     #[inline(always)]
     fn to_bitmask_mask64x2(self, a: mask64x2<Self>) -> u64 {
-        let lanes = self.as_array_mask64x2(a);
-        let mut bits = 0u64;
-        let mut i = 0;
-        while i < 2usize {
-            if lanes[i] != 0 {
-                bits |= 1u64 << i;
-            }
-            i += 1;
-        }
-        bits
+        i64x2_bitmask(a.into()) as u64
     }
     #[inline(always)]
     fn and_mask64x2(self, a: mask64x2<Self>, b: mask64x2<Self>) -> mask64x2<Self> {
diff --git a/fearless_simd_gen/src/mk_wasm.rs b/fearless_simd_gen/src/mk_wasm.rs
index 7f490136b..200e30ee5 100644
--- a/fearless_simd_gen/src/mk_wasm.rs
+++ b/fearless_simd_gen/src/mk_wasm.rs
@@ -7,8 +7,8 @@ use quote::{format_ident, quote};
 use crate::arch::wasm::{arch_prefix, v128_intrinsic};
 use crate::generic::{
     generic_as_array, generic_block_combine, generic_block_split, generic_from_array,
-    generic_from_bytes, generic_mask_to_bitmask, generic_op_name, generic_store_array,
-    generic_to_bytes, integer_lane_mask_splat_arg, scalar_binary,
+    generic_from_bytes, generic_op_name, generic_store_array, generic_to_bytes,
+    integer_lane_mask_splat_arg, scalar_binary,
 };
 use crate::level::Level;
 use crate::ops::{Op, Quantifier, SlideGranularity, valid_reinterpret};
@@ -71,6 +71,27 @@ fn mask_from_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
     }
 }
 
+fn mask_to_bitmask(method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
+    assert_eq!(
+        vec_ty.scalar,
+        ScalarType::Mask,
+        "mask bitmask conversion only operates on masks"
+    );
+    assert_eq!(
+        vec_ty.n_bits(),
+        128,
+        "WASM SIMD mask bitmask lowering only handles one native vector"
+    );
+
+    let intrinsic = format_ident!("i{}x{}_bitmask", vec_ty.scalar_bits, vec_ty.len);
+
+    quote! {
+        #method_sig {
+            #intrinsic(a.into()) as u64
+        }
+    }
+}
+
 impl Level for WasmSimd128 {
     fn name(&self) -> &'static str {
         "WasmSimd128"
@@ -563,7 +584,7 @@ impl Level for WasmSimd128 {
                 }
             }
             OpSig::MaskFromBitmask => mask_from_bitmask(method_sig, vec_ty),
-            OpSig::MaskToBitmask => generic_mask_to_bitmask(method_sig, vec_ty),
+            OpSig::MaskToBitmask => mask_to_bitmask(method_sig, vec_ty),
             OpSig::LoadInterleaved {
                 block_size,
                 block_count,

From 95cb1746e07454e17fa1f2826c417540df2f7b2f Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 17:26:42 +0100
Subject: [PATCH 11/16] Placate clippy in a straightforward way

---
 fearless_simd/src/generated.rs            |   1 +
 fearless_simd/src/generated/avx2.rs       | 422 +++++++++++++---------
 fearless_simd/src/generated/simd_trait.rs |   6 +-
 fearless_simd/src/generated/simd_types.rs |  72 +++-
 fearless_simd/src/generated/sse4_2.rs     | 190 ++++++----
 fearless_simd_gen/src/mk_simd_trait.rs    |   6 +-
 fearless_simd_gen/src/mk_simd_types.rs    |   6 +-
 fearless_simd_gen/src/mk_x86.rs           |  68 ++--
 fearless_simd_tests/tests/mask_methods.rs |  18 +-
 9 files changed, 510 insertions(+), 279 deletions(-)

diff --git a/fearless_simd/src/generated.rs b/fearless_simd/src/generated.rs
index 9d342539a..0fe782230 100644
--- a/fearless_simd/src/generated.rs
+++ b/fearless_simd/src/generated.rs
@@ -6,6 +6,7 @@
     clippy::cast_possible_truncation,
     clippy::unseparated_literal_suffix,
     clippy::use_self,
+    clippy::wrong_self_convention,
     reason = "TODO: https://github.com/linebender/fearless_simd/issues/40"
 )]
 #![cfg_attr(
diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index c031ccb56..ac70f2d93 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -902,28 +902,41 @@ impl Simd for Avx2 {
                 let bit_bytes = _mm_shuffle_epi8(
                     bit_bytes,
                     _mm_setr_epi8(
-                        0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
-                        0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
-                        1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
                     ),
                 );
                 let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
                 );
                 _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
             }
@@ -1431,14 +1444,14 @@ impl Simd for Avx2 {
             {
                 let bit_lanes = _mm_set1_epi16(bits as i16);
                 let bit_mask = _mm_setr_epi16(
-                    1u64 as i16,
-                    2u64 as i16,
-                    4u64 as i16,
-                    8u64 as i16,
-                    16u64 as i16,
-                    32u64 as i16,
-                    64u64 as i16,
-                    128u64 as i16,
+                    1u16.cast_signed(),
+                    2u16.cast_signed(),
+                    4u16.cast_signed(),
+                    8u16.cast_signed(),
+                    16u16.cast_signed(),
+                    32u16.cast_signed(),
+                    64u16.cast_signed(),
+                    128u16.cast_signed(),
                 );
                 _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
@@ -1960,7 +1973,12 @@ impl Simd for Avx2 {
         unsafe {
             {
                 let bit_lanes = _mm_set1_epi32(bits as i32);
-                let bit_mask = _mm_setr_epi32(1u64 as i32, 2u64 as i32, 4u64 as i32, 8u64 as i32);
+                let bit_mask = _mm_setr_epi32(
+                    1u32.cast_signed(),
+                    2u32.cast_signed(),
+                    4u32.cast_signed(),
+                    8u32.cast_signed(),
+                );
                 _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
             .simd_into(self)
@@ -2276,7 +2294,7 @@ impl Simd for Avx2 {
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
         unsafe {
             {
-                let bit_lanes = _mm_set1_epi64x(bits as i64);
+                let bit_lanes = _mm_set1_epi64x(bits.cast_signed());
                 let bit_mask = _mm_set_epi64x(2, 1);
                 _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
@@ -3376,47 +3394,73 @@ impl Simd for Avx2 {
                 let bit_bytes = _mm256_shuffle_epi8(
                     bit_bytes,
                     _mm256_setr_epi8(
-                        0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
-                        0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
-                        1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 2u8 as i8, 2u8 as i8,
-                        2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8,
-                        3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8,
-                        3u8 as i8, 3u8 as i8,
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        2u8.cast_signed(),
+                        2u8.cast_signed(),
+                        2u8.cast_signed(),
+                        2u8.cast_signed(),
+                        2u8.cast_signed(),
+                        2u8.cast_signed(),
+                        2u8.cast_signed(),
+                        2u8.cast_signed(),
+                        3u8.cast_signed(),
+                        3u8.cast_signed(),
+                        3u8.cast_signed(),
+                        3u8.cast_signed(),
+                        3u8.cast_signed(),
+                        3u8.cast_signed(),
+                        3u8.cast_signed(),
+                        3u8.cast_signed(),
                     ),
                 );
                 let bit_mask = _mm256_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
                 );
                 _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
             }
@@ -4126,22 +4170,22 @@ impl Simd for Avx2 {
             {
                 let bit_lanes = _mm256_set1_epi16(bits as i16);
                 let bit_mask = _mm256_setr_epi16(
-                    1u64 as i16,
-                    2u64 as i16,
-                    4u64 as i16,
-                    8u64 as i16,
-                    16u64 as i16,
-                    32u64 as i16,
-                    64u64 as i16,
-                    128u64 as i16,
-                    256u64 as i16,
-                    512u64 as i16,
-                    1024u64 as i16,
-                    2048u64 as i16,
-                    4096u64 as i16,
-                    8192u64 as i16,
-                    16384u64 as i16,
-                    32768u64 as i16,
+                    1u16.cast_signed(),
+                    2u16.cast_signed(),
+                    4u16.cast_signed(),
+                    8u16.cast_signed(),
+                    16u16.cast_signed(),
+                    32u16.cast_signed(),
+                    64u16.cast_signed(),
+                    128u16.cast_signed(),
+                    256u16.cast_signed(),
+                    512u16.cast_signed(),
+                    1024u16.cast_signed(),
+                    2048u16.cast_signed(),
+                    4096u16.cast_signed(),
+                    8192u16.cast_signed(),
+                    16384u16.cast_signed(),
+                    32768u16.cast_signed(),
                 );
                 _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
             }
@@ -4802,14 +4846,14 @@ impl Simd for Avx2 {
             {
                 let bit_lanes = _mm256_set1_epi32(bits as i32);
                 let bit_mask = _mm256_setr_epi32(
-                    1u64 as i32,
-                    2u64 as i32,
-                    4u64 as i32,
-                    8u64 as i32,
-                    16u64 as i32,
-                    32u64 as i32,
-                    64u64 as i32,
-                    128u64 as i32,
+                    1u32.cast_signed(),
+                    2u32.cast_signed(),
+                    4u32.cast_signed(),
+                    8u32.cast_signed(),
+                    16u32.cast_signed(),
+                    32u32.cast_signed(),
+                    64u32.cast_signed(),
+                    128u32.cast_signed(),
                 );
                 _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
             }
@@ -5201,7 +5245,7 @@ impl Simd for Avx2 {
     fn from_bitmask_mask64x4(self, bits: u64) -> mask64x4<Self> {
         unsafe {
             {
-                let bit_lanes = _mm256_set1_epi64x(bits as i64);
+                let bit_lanes = _mm256_set1_epi64x(bits.cast_signed());
                 let bit_mask = _mm256_set_epi64x(8, 4, 2, 1);
                 _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
             }
@@ -6314,40 +6358,40 @@ impl Simd for Avx2 {
     fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
         unsafe {
             {
-                let bit_bytes = _mm256_set1_epi64x(bits as i64);
+                let bit_bytes = _mm256_set1_epi64x(bits.cast_signed());
                 let bit_mask = _mm256_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
                 );
                 mask8x64 {
                     val: crate::support::Aligned512([
@@ -6355,13 +6399,38 @@ impl Simd for Avx2 {
                             let bit_bytes = _mm256_shuffle_epi8(
                                 bit_bytes,
                                 _mm256_setr_epi8(
-                                    0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
-                                    0u8 as i8, 0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8,
-                                    1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
-                                    1u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8,
-                                    2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 3u8 as i8,
-                                    3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8,
-                                    3u8 as i8, 3u8 as i8,
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
                                 ),
                             );
                             _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
@@ -6370,13 +6439,38 @@ impl Simd for Avx2 {
                             let bit_bytes = _mm256_shuffle_epi8(
                                 bit_bytes,
                                 _mm256_setr_epi8(
-                                    4u8 as i8, 4u8 as i8, 4u8 as i8, 4u8 as i8, 4u8 as i8,
-                                    4u8 as i8, 4u8 as i8, 4u8 as i8, 5u8 as i8, 5u8 as i8,
-                                    5u8 as i8, 5u8 as i8, 5u8 as i8, 5u8 as i8, 5u8 as i8,
-                                    5u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8,
-                                    6u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8, 7u8 as i8,
-                                    7u8 as i8, 7u8 as i8, 7u8 as i8, 7u8 as i8, 7u8 as i8,
-                                    7u8 as i8, 7u8 as i8,
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
                                 ),
                             );
                             _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
@@ -7861,27 +7955,27 @@ impl Simd for Avx2 {
                     val: crate::support::Aligned512([
                         {
                             let bit_mask = _mm256_setr_epi32(
-                                1u64 as i32,
-                                2u64 as i32,
-                                4u64 as i32,
-                                8u64 as i32,
-                                16u64 as i32,
-                                32u64 as i32,
-                                64u64 as i32,
-                                128u64 as i32,
+                                1u32.cast_signed(),
+                                2u32.cast_signed(),
+                                4u32.cast_signed(),
+                                8u32.cast_signed(),
+                                16u32.cast_signed(),
+                                32u32.cast_signed(),
+                                64u32.cast_signed(),
+                                128u32.cast_signed(),
                             );
                             _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
                         },
                         {
                             let bit_mask = _mm256_setr_epi32(
-                                256u64 as i32,
-                                512u64 as i32,
-                                1024u64 as i32,
-                                2048u64 as i32,
-                                4096u64 as i32,
-                                8192u64 as i32,
-                                16384u64 as i32,
-                                32768u64 as i32,
+                                256u32.cast_signed(),
+                                512u32.cast_signed(),
+                                1024u32.cast_signed(),
+                                2048u32.cast_signed(),
+                                4096u32.cast_signed(),
+                                8192u32.cast_signed(),
+                                16384u32.cast_signed(),
+                                32768u32.cast_signed(),
                             );
                             _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
                         },
@@ -8315,24 +8409,24 @@ impl Simd for Avx2 {
     fn from_bitmask_mask64x8(self, bits: u64) -> mask64x8<Self> {
         unsafe {
             {
-                let bit_lanes = _mm256_set1_epi64x(bits as i64);
+                let bit_lanes = _mm256_set1_epi64x(bits.cast_signed());
                 mask64x8 {
                     val: crate::support::Aligned512([
                         {
                             let bit_mask = _mm256_set_epi64x(
-                                8u64 as i64,
-                                4u64 as i64,
-                                2u64 as i64,
-                                1u64 as i64,
+                                8u64.cast_signed(),
+                                4u64.cast_signed(),
+                                2u64.cast_signed(),
+                                1u64.cast_signed(),
                             );
                             _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
                         },
                         {
                             let bit_mask = _mm256_set_epi64x(
-                                128u64 as i64,
-                                64u64 as i64,
-                                32u64 as i64,
-                                16u64 as i64,
+                                128u64.cast_signed(),
+                                64u64.cast_signed(),
+                                32u64.cast_signed(),
+                                16u64.cast_signed(),
                             );
                             _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
                         },
diff --git a/fearless_simd/src/generated/simd_trait.rs b/fearless_simd/src/generated/simd_trait.rs
index 6d0b013a2..47b3ce962 100644
--- a/fearless_simd/src/generated/simd_trait.rs
+++ b/fearless_simd/src/generated/simd_trait.rs
@@ -3014,7 +3014,11 @@ pub trait SimdMask<S: Simd>:
     #[doc = r" Panics if `index` is greater than or equal to the number of lanes in the mask."]
     #[inline(always)]
     fn test(&self, index: usize) -> bool {
-        assert!(index < Self::N);
+        assert!(
+            index < Self::N,
+            "mask lane index {index} is out of bounds for {} lanes",
+            Self::N
+        );
         (((*self).to_bitmask() >> index) & 1) != 0
     }
     #[doc = r" Sets the value of one logical lane."]
diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs
index 90cb80cfd..66d9807c4 100644
--- a/fearless_simd/src/generated/simd_types.rs
+++ b/fearless_simd/src/generated/simd_types.rs
@@ -683,7 +683,11 @@ impl<S: Simd> crate::SimdMask<S> for mask8x16<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 16);
+        assert!(
+            index < 16,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16
+        );
         let mut lanes = self.simd.as_array_mask8x16(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask8x16(lanes);
@@ -1147,7 +1151,11 @@ impl<S: Simd> crate::SimdMask<S> for mask16x8<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 8);
+        assert!(
+            index < 8,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8
+        );
         let mut lanes = self.simd.as_array_mask16x8(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask16x8(lanes);
@@ -1635,7 +1643,11 @@ impl<S: Simd> crate::SimdMask<S> for mask32x4<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 4);
+        assert!(
+            index < 4,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4
+        );
         let mut lanes = self.simd.as_array_mask32x4(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask32x4(lanes);
@@ -1964,7 +1976,11 @@ impl<S: Simd> crate::SimdMask<S> for mask64x2<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 2);
+        assert!(
+            index < 2,
+            "mask lane index {index} is out of bounds for {} lanes",
+            2
+        );
         let mut lanes = self.simd.as_array_mask64x2(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask64x2(lanes);
@@ -2698,7 +2714,11 @@ impl<S: Simd> crate::SimdMask<S> for mask8x32<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 32);
+        assert!(
+            index < 32,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32
+        );
         let mut lanes = self.simd.as_array_mask8x32(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask8x32(lanes);
@@ -3188,7 +3208,11 @@ impl<S: Simd> crate::SimdMask<S> for mask16x16<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 16);
+        assert!(
+            index < 16,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16
+        );
         let mut lanes = self.simd.as_array_mask16x16(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask16x16(lanes);
@@ -3690,7 +3714,11 @@ impl<S: Simd> crate::SimdMask<S> for mask32x8<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 8);
+        assert!(
+            index < 8,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8
+        );
         let mut lanes = self.simd.as_array_mask32x8(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask32x8(lanes);
@@ -4026,7 +4054,11 @@ impl<S: Simd> crate::SimdMask<S> for mask64x4<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 4);
+        assert!(
+            index < 4,
+            "mask lane index {index} is out of bounds for {} lanes",
+            4
+        );
         let mut lanes = self.simd.as_array_mask64x4(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask64x4(lanes);
@@ -4748,7 +4780,11 @@ impl<S: Simd> crate::SimdMask<S> for mask8x64<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 64);
+        assert!(
+            index < 64,
+            "mask lane index {index} is out of bounds for {} lanes",
+            64
+        );
         let mut lanes = self.simd.as_array_mask8x64(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask8x64(lanes);
@@ -5226,7 +5262,11 @@ impl<S: Simd> crate::SimdMask<S> for mask16x32<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 32);
+        assert!(
+            index < 32,
+            "mask lane index {index} is out of bounds for {} lanes",
+            32
+        );
         let mut lanes = self.simd.as_array_mask16x32(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask16x32(lanes);
@@ -5728,7 +5768,11 @@ impl<S: Simd> crate::SimdMask<S> for mask32x16<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 16);
+        assert!(
+            index < 16,
+            "mask lane index {index} is out of bounds for {} lanes",
+            16
+        );
         let mut lanes = self.simd.as_array_mask32x16(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask32x16(lanes);
@@ -6058,7 +6102,11 @@ impl<S: Simd> crate::SimdMask<S> for mask64x8<S> {
     }
     #[inline(always)]
     fn set(&mut self, index: usize, value: bool) {
-        assert!(index < 8);
+        assert!(
+            index < 8,
+            "mask lane index {index} is out of bounds for {} lanes",
+            8
+        );
         let mut lanes = self.simd.as_array_mask64x8(*self);
         lanes[index] = if value { !0 } else { 0 };
         *self = self.simd.load_array_mask64x8(lanes);
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index aa9c9bbf7..6f270a21d 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -942,28 +942,41 @@ impl Simd for Sse4_2 {
                 let bit_bytes = _mm_shuffle_epi8(
                     bit_bytes,
                     _mm_setr_epi8(
-                        0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
-                        0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
-                        1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        0u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
+                        1u8.cast_signed(),
                     ),
                 );
                 let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
                 );
                 _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
             }
@@ -1480,14 +1493,14 @@ impl Simd for Sse4_2 {
             {
                 let bit_lanes = _mm_set1_epi16(bits as i16);
                 let bit_mask = _mm_setr_epi16(
-                    1u64 as i16,
-                    2u64 as i16,
-                    4u64 as i16,
-                    8u64 as i16,
-                    16u64 as i16,
-                    32u64 as i16,
-                    64u64 as i16,
-                    128u64 as i16,
+                    1u16.cast_signed(),
+                    2u16.cast_signed(),
+                    4u16.cast_signed(),
+                    8u16.cast_signed(),
+                    16u16.cast_signed(),
+                    32u16.cast_signed(),
+                    64u16.cast_signed(),
+                    128u16.cast_signed(),
                 );
                 _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
@@ -2018,7 +2031,12 @@ impl Simd for Sse4_2 {
         unsafe {
             {
                 let bit_lanes = _mm_set1_epi32(bits as i32);
-                let bit_mask = _mm_setr_epi32(1u64 as i32, 2u64 as i32, 4u64 as i32, 8u64 as i32);
+                let bit_mask = _mm_setr_epi32(
+                    1u32.cast_signed(),
+                    2u32.cast_signed(),
+                    4u32.cast_signed(),
+                    8u32.cast_signed(),
+                );
                 _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
             .simd_into(self)
@@ -2340,7 +2358,7 @@ impl Simd for Sse4_2 {
     fn from_bitmask_mask64x2(self, bits: u64) -> mask64x2<Self> {
         unsafe {
             {
-                let bit_lanes = _mm_set1_epi64x(bits as i64);
+                let bit_lanes = _mm_set1_epi64x(bits.cast_signed());
                 let bit_mask = _mm_set_epi64x(2, 1);
                 _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
@@ -6331,24 +6349,24 @@ impl Simd for Sse4_2 {
     fn from_bitmask_mask8x64(self, bits: u64) -> mask8x64<Self> {
         unsafe {
             {
-                let bit_bytes = _mm_set1_epi64x(bits as i64);
+                let bit_bytes = _mm_set1_epi64x(bits.cast_signed());
                 let bit_mask = _mm_setr_epi8(
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
-                    1u16 as i8,
-                    2u16 as i8,
-                    4u16 as i8,
-                    8u16 as i8,
-                    16u16 as i8,
-                    32u16 as i8,
-                    64u16 as i8,
-                    128u16 as i8,
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
+                    1u8.cast_signed(),
+                    2u8.cast_signed(),
+                    4u8.cast_signed(),
+                    8u8.cast_signed(),
+                    16u8.cast_signed(),
+                    32u8.cast_signed(),
+                    64u8.cast_signed(),
+                    128u8.cast_signed(),
                 );
                 mask8x64 {
                     val: crate::support::Aligned512([
@@ -6356,10 +6374,22 @@ impl Simd for Sse4_2 {
                             let bit_bytes = _mm_shuffle_epi8(
                                 bit_bytes,
                                 _mm_setr_epi8(
-                                    0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8, 0u8 as i8,
-                                    0u8 as i8, 0u8 as i8, 0u8 as i8, 1u8 as i8, 1u8 as i8,
-                                    1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8, 1u8 as i8,
-                                    1u8 as i8,
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    0u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
+                                    1u8.cast_signed(),
                                 ),
                             );
                             _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
@@ -6368,10 +6398,22 @@ impl Simd for Sse4_2 {
                             let bit_bytes = _mm_shuffle_epi8(
                                 bit_bytes,
                                 _mm_setr_epi8(
-                                    2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8, 2u8 as i8,
-                                    2u8 as i8, 2u8 as i8, 2u8 as i8, 3u8 as i8, 3u8 as i8,
-                                    3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8, 3u8 as i8,
-                                    3u8 as i8,
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    2u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
+                                    3u8.cast_signed(),
                                 ),
                             );
                             _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
@@ -6380,10 +6422,22 @@ impl Simd for Sse4_2 {
                             let bit_bytes = _mm_shuffle_epi8(
                                 bit_bytes,
                                 _mm_setr_epi8(
-                                    4u8 as i8, 4u8 as i8, 4u8 as i8, 4u8 as i8, 4u8 as i8,
-                                    4u8 as i8, 4u8 as i8, 4u8 as i8, 5u8 as i8, 5u8 as i8,
-                                    5u8 as i8, 5u8 as i8, 5u8 as i8, 5u8 as i8, 5u8 as i8,
-                                    5u8 as i8,
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    4u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
+                                    5u8.cast_signed(),
                                 ),
                             );
                             _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
@@ -6392,10 +6446,22 @@ impl Simd for Sse4_2 {
                             let bit_bytes = _mm_shuffle_epi8(
                                 bit_bytes,
                                 _mm_setr_epi8(
-                                    6u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8, 6u8 as i8,
-                                    6u8 as i8, 6u8 as i8, 6u8 as i8, 7u8 as i8, 7u8 as i8,
-                                    7u8 as i8, 7u8 as i8, 7u8 as i8, 7u8 as i8, 7u8 as i8,
-                                    7u8 as i8,
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    6u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
+                                    7u8.cast_signed(),
                                 ),
                             );
                             _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
diff --git a/fearless_simd_gen/src/mk_simd_trait.rs b/fearless_simd_gen/src/mk_simd_trait.rs
index 7e42af2f7..fb118cf49 100644
--- a/fearless_simd_gen/src/mk_simd_trait.rs
+++ b/fearless_simd_gen/src/mk_simd_trait.rs
@@ -325,7 +325,11 @@ fn mk_simd_mask() -> TokenStream {
             /// Panics if `index` is greater than or equal to the number of lanes in the mask.
             #[inline(always)]
             fn test(&self, index: usize) -> bool {
-                assert!(index < Self::N);
+                assert!(
+                    index < Self::N,
+                    "mask lane index {index} is out of bounds for {} lanes",
+                    Self::N
+                );
                 (((*self).to_bitmask() >> index) & 1) != 0
             }
 
diff --git a/fearless_simd_gen/src/mk_simd_types.rs b/fearless_simd_gen/src/mk_simd_types.rs
index 541c69842..73885ad79 100644
--- a/fearless_simd_gen/src/mk_simd_types.rs
+++ b/fearless_simd_gen/src/mk_simd_types.rs
@@ -352,7 +352,11 @@ fn simd_mask_impl(ty: &VecType) -> TokenStream {
 
             #[inline(always)]
             fn set(&mut self, index: usize, value: bool) {
-                assert!(index < #len);
+                assert!(
+                    index < #len,
+                    "mask lane index {index} is out of bounds for {} lanes",
+                    #len
+                );
                 let mut lanes = self.simd.#as_array_op(*self);
                 lanes[index] = if value { !0 } else { 0 };
                 *self = self.simd.#from_array_op(lanes);
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index d43e506fc..b5e2f215f 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -290,8 +290,8 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
     match (vec_ty.n_bits(), scalar_bits) {
         (128, 16) => {
             let lanes = (0..lane_count).map(|i| {
-                let bit = 1u64 << i;
-                quote! { #bit as i16 }
+                let bit = 1_u16 << i;
+                quote! { #bit.cast_signed() }
             });
             quote! {
                 {
@@ -303,8 +303,8 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
         }
         (256, 16) => {
             let lanes = (0..lane_count).map(|i| {
-                let bit = 1u64 << i;
-                quote! { #bit as i16 }
+                let bit = 1_u16 << i;
+                quote! { #bit.cast_signed() }
             });
             quote! {
                 {
@@ -316,8 +316,8 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
         }
         (128, 32) => {
             let lanes = (0..lane_count).map(|i| {
-                let bit = 1u64 << i;
-                quote! { #bit as i32 }
+                let bit = 1_u32 << i;
+                quote! { #bit.cast_signed() }
             });
             quote! {
                 {
@@ -329,8 +329,8 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
         }
         (256, 32) => {
             let lanes = (0..lane_count).map(|i| {
-                let bit = 1u64 << i;
-                quote! { #bit as i32 }
+                let bit = 1_u32 << i;
+                quote! { #bit.cast_signed() }
             });
             quote! {
                 {
@@ -341,20 +341,20 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
             }
         }
         (128, 64) => {
-            assert_eq!(lane_count, 2);
+            assert_eq!(lane_count, 2, "128-bit 64-bit masks must have two lanes");
             quote! {
                 {
-                    let bit_lanes = _mm_set1_epi64x(bits as i64);
+                    let bit_lanes = _mm_set1_epi64x(bits.cast_signed());
                     let bit_mask = _mm_set_epi64x(2, 1);
                     _mm_cmpeq_epi64(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
                 }
             }
         }
         (256, 64) => {
-            assert_eq!(lane_count, 4);
+            assert_eq!(lane_count, 4, "256-bit 64-bit masks must have four lanes");
             quote! {
                 {
-                    let bit_lanes = _mm256_set1_epi64x(bits as i64);
+                    let bit_lanes = _mm256_set1_epi64x(bits.cast_signed());
                     let bit_mask = _mm256_set_epi64x(8, 4, 2, 1);
                     _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
                 }
@@ -365,7 +365,11 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
 }
 
 fn mask_from_bitmask_wide_avx2(vec_ty: &VecType) -> TokenStream {
-    assert_eq!(vec_ty.n_bits(), 512);
+    assert_eq!(
+        vec_ty.n_bits(),
+        512,
+        "only 512-bit masks use direct wide AVX2 bitmask lowering"
+    );
     assert!(
         matches!(vec_ty.scalar_bits, 32 | 64),
         "only 32-bit and 64-bit AVX2 masks use direct wide lowering"
@@ -378,8 +382,8 @@ fn mask_from_bitmask_wide_avx2(vec_ty: &VecType) -> TokenStream {
         match vec_ty.scalar_bits {
             32 => {
                 let lanes = (0..lanes_per_chunk).map(|i| {
-                    let bit = 1u64 << (chunk_start + i);
-                    quote! { #bit as i32 }
+                    let bit = 1_u32 << (chunk_start + i);
+                    quote! { #bit.cast_signed() }
                 });
                 quote! {
                     {
@@ -390,8 +394,8 @@ fn mask_from_bitmask_wide_avx2(vec_ty: &VecType) -> TokenStream {
             }
             64 => {
                 let lanes = (0..lanes_per_chunk).rev().map(|i| {
-                    let bit = 1u64 << (chunk_start + i);
-                    quote! { #bit as i64 }
+                    let bit = 1_u64 << (chunk_start + i);
+                    quote! { #bit.cast_signed() }
                 });
                 quote! {
                     {
@@ -405,7 +409,7 @@ fn mask_from_bitmask_wide_avx2(vec_ty: &VecType) -> TokenStream {
     });
     let set1 = match vec_ty.scalar_bits {
         32 => quote! { _mm256_set1_epi32(bits as i32) },
-        64 => quote! { _mm256_set1_epi64x(bits as i64) },
+        64 => quote! { _mm256_set1_epi64x(bits.cast_signed()) },
         _ => unreachable!(),
     };
 
@@ -421,7 +425,11 @@ fn mask_from_bitmask_wide_avx2(vec_ty: &VecType) -> TokenStream {
 }
 
 fn mask_from_bitmask_wide_bytes(native_width: usize, vec_ty: &VecType) -> TokenStream {
-    assert_eq!(vec_ty.n_bits(), 512);
+    assert_eq!(
+        vec_ty.n_bits(),
+        512,
+        "only 512-bit masks use direct wide byte-mask lowering"
+    );
     assert_eq!(
         vec_ty.scalar_bits, 8,
         "only mask8x64 uses direct wide byte-mask lowering"
@@ -443,7 +451,7 @@ fn mask_from_bitmask_wide_bytes(native_width: usize, vec_ty: &VecType) -> TokenS
 
             quote! {
                 {
-                    let bit_bytes = _mm_set1_epi64x(bits as i64);
+                    let bit_bytes = _mm_set1_epi64x(bits.cast_signed());
                     let bit_mask = #bit_mask;
                     #ty {
                         val: crate::support::Aligned512([#(#chunks),*]),
@@ -466,7 +474,7 @@ fn mask_from_bitmask_wide_bytes(native_width: usize, vec_ty: &VecType) -> TokenS
 
             quote! {
                 {
-                    let bit_bytes = _mm256_set1_epi64x(bits as i64);
+                    let bit_bytes = _mm256_set1_epi64x(bits.cast_signed());
                     let bit_mask = #bit_mask;
                     #ty {
                         val: crate::support::Aligned512([#(#chunks),*]),
@@ -529,24 +537,25 @@ fn mask_to_bitmask_words(native_width: usize, vec_ty: &VecType) -> TokenStream {
 
 fn mask_bit_pattern_128() -> TokenStream {
     let lanes = (0..16).map(|i| {
-        let bit = 1u16 << (i % 8);
-        quote! { #bit as i8 }
+        let bit = 1_u8 << (i % 8);
+        quote! { #bit.cast_signed() }
     });
     quote! { _mm_setr_epi8(#(#lanes),*) }
 }
 
 fn mask_bit_pattern_256() -> TokenStream {
     let lanes = (0..32).map(|i| {
-        let bit = 1u16 << (i % 8);
-        quote! { #bit as i8 }
+        let bit = 1_u8 << (i % 8);
+        quote! { #bit.cast_signed() }
     });
     quote! { _mm256_setr_epi8(#(#lanes),*) }
 }
 
 fn mask_byte_shuffle_128_offset(lane_count: usize, byte_offset: usize) -> TokenStream {
     let lanes = (0..16).map(|i| {
-        let byte = (byte_offset + i.min(lane_count - 1) / 8) as u8;
-        quote! { #byte as i8 }
+        let byte = u8::try_from(byte_offset + i.min(lane_count - 1) / 8)
+            .expect("SSE byte shuffle index must fit in u8");
+        quote! { #byte.cast_signed() }
     });
     quote! { _mm_setr_epi8(#(#lanes),*) }
 }
@@ -557,8 +566,9 @@ fn mask_byte_shuffle_128(lane_count: usize) -> TokenStream {
 
 fn mask_byte_shuffle_256_offset(byte_offset: usize) -> TokenStream {
     let lanes = (0..32).map(|i| {
-        let byte = (byte_offset + i / 8) as u8;
-        quote! { #byte as i8 }
+        let byte =
+            u8::try_from(byte_offset + i / 8).expect("AVX2 byte shuffle index must fit in u8");
+        quote! { #byte.cast_signed() }
     });
     quote! { _mm256_setr_epi8(#(#lanes),*) }
 }
diff --git a/fearless_simd_tests/tests/mask_methods.rs b/fearless_simd_tests/tests/mask_methods.rs
index faa188c5b..dfc0c82b5 100644
--- a/fearless_simd_tests/tests/mask_methods.rs
+++ b/fearless_simd_tests/tests/mask_methods.rs
@@ -8,7 +8,7 @@ fn mask_bits(len: usize) -> u64 {
     if len == 64 {
         u64::MAX
     } else {
-        (1u64 << len) - 1
+        (1_u64 << len) - 1
     }
 }
 
@@ -23,7 +23,7 @@ fn for_each_exhaustive_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
     );
 
     let all_bits = mask_bits(len);
-    for bits in 0..(1u64 << len) {
+    for bits in 0..(1_u64 << len) {
         f(bits);
         f(bits | !all_bits);
     }
@@ -31,7 +31,7 @@ fn for_each_exhaustive_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
 
 fn for_each_chunked_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
     assert!(
-        len % 16 == 0,
+        len.is_multiple_of(16),
         "chunked bitmask roundtrip tests expect 16-lane chunks"
     );
     assert!(
@@ -40,13 +40,13 @@ fn for_each_chunked_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
     );
 
     let chunks = len / 16;
-    let mut pattern_count = 1usize;
+    let mut pattern_count = 1_usize;
     for _ in 0..chunks {
         pattern_count *= CHUNK_PATTERNS_16.len();
     }
 
     for mut pattern_index in 0..pattern_count {
-        let mut bits = 0u64;
+        let mut bits = 0_u64;
         for chunk in 0..chunks {
             let chunk_pattern = CHUNK_PATTERNS_16[pattern_index % CHUNK_PATTERNS_16.len()];
             pattern_index /= CHUNK_PATTERNS_16.len();
@@ -69,7 +69,7 @@ fn for_each_wide_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
     check(all_bits & 0xaaaa_aaaa_aaaa_aaaa);
 
     for bit in 0..len {
-        let bits = 1u64 << bit;
+        let bits = 1_u64 << bit;
         check(bits);
         check(all_bits ^ bits);
     }
@@ -114,16 +114,16 @@ macro_rules! check_mask_methods {
         assert_eq!(mask.to_bitmask(), expected);
 
         mask.set($len - 1, true);
-        expected |= 1u64 << ($len - 1);
+        expected |= 1_u64 << ($len - 1);
         assert_eq!(mask.to_bitmask(), expected);
 
         mask.set(1, true);
-        expected |= 1u64 << 1;
+        expected |= 1_u64 << 1;
         assert!(mask.test(1));
         assert_eq!(mask.to_bitmask(), expected);
 
         mask.set(1, false);
-        expected &= !(1u64 << 1);
+        expected &= !(1_u64 << 1);
         assert!(!mask.test(1));
         assert_eq!(mask.to_bitmask(), expected);
     }};

From 7d73a6aeb0e4424584d5e09ca162d59d7c48db91 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 17:31:30 +0100
Subject: [PATCH 12/16] Make mask constants less hideous

---
 fearless_simd/src/generated/avx2.rs   | 292 ++------------------------
 fearless_simd/src/generated/sse4_2.rs | 153 +-------------
 fearless_simd_gen/src/mk_x86.rs       |  38 +++-
 3 files changed, 60 insertions(+), 423 deletions(-)

diff --git a/fearless_simd/src/generated/avx2.rs b/fearless_simd/src/generated/avx2.rs
index ac70f2d93..030e7bfbf 100644
--- a/fearless_simd/src/generated/avx2.rs
+++ b/fearless_simd/src/generated/avx2.rs
@@ -901,43 +901,10 @@ impl Simd for Avx2 {
                 let bit_bytes = _mm_cvtsi32_si128(bits as i32);
                 let bit_bytes = _mm_shuffle_epi8(
                     bit_bytes,
-                    _mm_setr_epi8(
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                    ),
-                );
-                let bit_mask = _mm_setr_epi8(
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
+                    _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1),
                 );
+                let bit_mask =
+                    _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128);
                 _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
             }
             .simd_into(self)
@@ -1443,16 +1410,7 @@ impl Simd for Avx2 {
         unsafe {
             {
                 let bit_lanes = _mm_set1_epi16(bits as i16);
-                let bit_mask = _mm_setr_epi16(
-                    1u16.cast_signed(),
-                    2u16.cast_signed(),
-                    4u16.cast_signed(),
-                    8u16.cast_signed(),
-                    16u16.cast_signed(),
-                    32u16.cast_signed(),
-                    64u16.cast_signed(),
-                    128u16.cast_signed(),
-                );
+                let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128);
                 _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
             .simd_into(self)
@@ -1973,12 +1931,7 @@ impl Simd for Avx2 {
         unsafe {
             {
                 let bit_lanes = _mm_set1_epi32(bits as i32);
-                let bit_mask = _mm_setr_epi32(
-                    1u32.cast_signed(),
-                    2u32.cast_signed(),
-                    4u32.cast_signed(),
-                    8u32.cast_signed(),
-                );
+                let bit_mask = _mm_setr_epi32(1, 2, 4, 8);
                 _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
             .simd_into(self)
@@ -3394,73 +3347,13 @@ impl Simd for Avx2 {
                 let bit_bytes = _mm256_shuffle_epi8(
                     bit_bytes,
                     _mm256_setr_epi8(
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        2u8.cast_signed(),
-                        2u8.cast_signed(),
-                        2u8.cast_signed(),
-                        2u8.cast_signed(),
-                        2u8.cast_signed(),
-                        2u8.cast_signed(),
-                        2u8.cast_signed(),
-                        2u8.cast_signed(),
-                        3u8.cast_signed(),
-                        3u8.cast_signed(),
-                        3u8.cast_signed(),
-                        3u8.cast_signed(),
-                        3u8.cast_signed(),
-                        3u8.cast_signed(),
-                        3u8.cast_signed(),
-                        3u8.cast_signed(),
+                        0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 3,
+                        3, 3, 3, 3, 3, 3, 3,
                     ),
                 );
                 let bit_mask = _mm256_setr_epi8(
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
+                    1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32,
+                    64, -128, 1, 2, 4, 8, 16, 32, 64, -128,
                 );
                 _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
             }
@@ -4170,22 +4063,7 @@ impl Simd for Avx2 {
             {
                 let bit_lanes = _mm256_set1_epi16(bits as i16);
                 let bit_mask = _mm256_setr_epi16(
-                    1u16.cast_signed(),
-                    2u16.cast_signed(),
-                    4u16.cast_signed(),
-                    8u16.cast_signed(),
-                    16u16.cast_signed(),
-                    32u16.cast_signed(),
-                    64u16.cast_signed(),
-                    128u16.cast_signed(),
-                    256u16.cast_signed(),
-                    512u16.cast_signed(),
-                    1024u16.cast_signed(),
-                    2048u16.cast_signed(),
-                    4096u16.cast_signed(),
-                    8192u16.cast_signed(),
-                    16384u16.cast_signed(),
-                    32768u16.cast_signed(),
+                    1, 2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, -32768,
                 );
                 _mm256_cmpeq_epi16(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
             }
@@ -4845,16 +4723,7 @@ impl Simd for Avx2 {
         unsafe {
             {
                 let bit_lanes = _mm256_set1_epi32(bits as i32);
-                let bit_mask = _mm256_setr_epi32(
-                    1u32.cast_signed(),
-                    2u32.cast_signed(),
-                    4u32.cast_signed(),
-                    8u32.cast_signed(),
-                    16u32.cast_signed(),
-                    32u32.cast_signed(),
-                    64u32.cast_signed(),
-                    128u32.cast_signed(),
-                );
+                let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
                 _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
             }
             .simd_into(self)
@@ -6360,38 +6229,8 @@ impl Simd for Avx2 {
             {
                 let bit_bytes = _mm256_set1_epi64x(bits.cast_signed());
                 let bit_mask = _mm256_setr_epi8(
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
+                    1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32,
+                    64, -128, 1, 2, 4, 8, 16, 32, 64, -128,
                 );
                 mask8x64 {
                     val: crate::support::Aligned512([
@@ -6399,38 +6238,8 @@ impl Simd for Avx2 {
                             let bit_bytes = _mm256_shuffle_epi8(
                                 bit_bytes,
                                 _mm256_setr_epi8(
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
+                                    0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2,
+                                    2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3,
                                 ),
                             );
                             _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
@@ -6439,38 +6248,8 @@ impl Simd for Avx2 {
                             let bit_bytes = _mm256_shuffle_epi8(
                                 bit_bytes,
                                 _mm256_setr_epi8(
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
+                                    4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5, 6, 6, 6, 6, 6,
+                                    6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7,
                                 ),
                             );
                             _mm256_cmpeq_epi8(_mm256_and_si256(bit_bytes, bit_mask), bit_mask)
@@ -7954,29 +7733,12 @@ impl Simd for Avx2 {
                 mask32x16 {
                     val: crate::support::Aligned512([
                         {
-                            let bit_mask = _mm256_setr_epi32(
-                                1u32.cast_signed(),
-                                2u32.cast_signed(),
-                                4u32.cast_signed(),
-                                8u32.cast_signed(),
-                                16u32.cast_signed(),
-                                32u32.cast_signed(),
-                                64u32.cast_signed(),
-                                128u32.cast_signed(),
-                            );
+                            let bit_mask = _mm256_setr_epi32(1, 2, 4, 8, 16, 32, 64, 128);
                             _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
                         },
                         {
-                            let bit_mask = _mm256_setr_epi32(
-                                256u32.cast_signed(),
-                                512u32.cast_signed(),
-                                1024u32.cast_signed(),
-                                2048u32.cast_signed(),
-                                4096u32.cast_signed(),
-                                8192u32.cast_signed(),
-                                16384u32.cast_signed(),
-                                32768u32.cast_signed(),
-                            );
+                            let bit_mask =
+                                _mm256_setr_epi32(256, 512, 1024, 2048, 4096, 8192, 16384, 32768);
                             _mm256_cmpeq_epi32(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
                         },
                     ]),
@@ -8413,21 +8175,11 @@ impl Simd for Avx2 {
                 mask64x8 {
                     val: crate::support::Aligned512([
                         {
-                            let bit_mask = _mm256_set_epi64x(
-                                8u64.cast_signed(),
-                                4u64.cast_signed(),
-                                2u64.cast_signed(),
-                                1u64.cast_signed(),
-                            );
+                            let bit_mask = _mm256_set_epi64x(8, 4, 2, 1);
                             _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
                         },
                         {
-                            let bit_mask = _mm256_set_epi64x(
-                                128u64.cast_signed(),
-                                64u64.cast_signed(),
-                                32u64.cast_signed(),
-                                16u64.cast_signed(),
-                            );
+                            let bit_mask = _mm256_set_epi64x(128, 64, 32, 16);
                             _mm256_cmpeq_epi64(_mm256_and_si256(bit_lanes, bit_mask), bit_mask)
                         },
                     ]),
diff --git a/fearless_simd/src/generated/sse4_2.rs b/fearless_simd/src/generated/sse4_2.rs
index 6f270a21d..746cbca55 100644
--- a/fearless_simd/src/generated/sse4_2.rs
+++ b/fearless_simd/src/generated/sse4_2.rs
@@ -941,43 +941,10 @@ impl Simd for Sse4_2 {
                 let bit_bytes = _mm_cvtsi32_si128(bits as i32);
                 let bit_bytes = _mm_shuffle_epi8(
                     bit_bytes,
-                    _mm_setr_epi8(
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        0u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                        1u8.cast_signed(),
-                    ),
-                );
-                let bit_mask = _mm_setr_epi8(
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
+                    _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1),
                 );
+                let bit_mask =
+                    _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128);
                 _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
             }
             .simd_into(self)
@@ -1492,16 +1459,7 @@ impl Simd for Sse4_2 {
         unsafe {
             {
                 let bit_lanes = _mm_set1_epi16(bits as i16);
-                let bit_mask = _mm_setr_epi16(
-                    1u16.cast_signed(),
-                    2u16.cast_signed(),
-                    4u16.cast_signed(),
-                    8u16.cast_signed(),
-                    16u16.cast_signed(),
-                    32u16.cast_signed(),
-                    64u16.cast_signed(),
-                    128u16.cast_signed(),
-                );
+                let bit_mask = _mm_setr_epi16(1, 2, 4, 8, 16, 32, 64, 128);
                 _mm_cmpeq_epi16(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
             .simd_into(self)
@@ -2031,12 +1989,7 @@ impl Simd for Sse4_2 {
         unsafe {
             {
                 let bit_lanes = _mm_set1_epi32(bits as i32);
-                let bit_mask = _mm_setr_epi32(
-                    1u32.cast_signed(),
-                    2u32.cast_signed(),
-                    4u32.cast_signed(),
-                    8u32.cast_signed(),
-                );
+                let bit_mask = _mm_setr_epi32(1, 2, 4, 8);
                 _mm_cmpeq_epi32(_mm_and_si128(bit_lanes, bit_mask), bit_mask)
             }
             .simd_into(self)
@@ -6350,119 +6303,35 @@ impl Simd for Sse4_2 {
         unsafe {
             {
                 let bit_bytes = _mm_set1_epi64x(bits.cast_signed());
-                let bit_mask = _mm_setr_epi8(
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                    1u8.cast_signed(),
-                    2u8.cast_signed(),
-                    4u8.cast_signed(),
-                    8u8.cast_signed(),
-                    16u8.cast_signed(),
-                    32u8.cast_signed(),
-                    64u8.cast_signed(),
-                    128u8.cast_signed(),
-                );
+                let bit_mask =
+                    _mm_setr_epi8(1, 2, 4, 8, 16, 32, 64, -128, 1, 2, 4, 8, 16, 32, 64, -128);
                 mask8x64 {
                     val: crate::support::Aligned512([
                         {
                             let bit_bytes = _mm_shuffle_epi8(
                                 bit_bytes,
-                                _mm_setr_epi8(
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    0u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                    1u8.cast_signed(),
-                                ),
+                                _mm_setr_epi8(0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1),
                             );
                             _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
                         },
                         {
                             let bit_bytes = _mm_shuffle_epi8(
                                 bit_bytes,
-                                _mm_setr_epi8(
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    2u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                    3u8.cast_signed(),
-                                ),
+                                _mm_setr_epi8(2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3),
                             );
                             _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
                         },
                         {
                             let bit_bytes = _mm_shuffle_epi8(
                                 bit_bytes,
-                                _mm_setr_epi8(
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    4u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                    5u8.cast_signed(),
-                                ),
+                                _mm_setr_epi8(4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 5, 5, 5, 5),
                             );
                             _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
                         },
                         {
                             let bit_bytes = _mm_shuffle_epi8(
                                 bit_bytes,
-                                _mm_setr_epi8(
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    6u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                    7u8.cast_signed(),
-                                ),
+                                _mm_setr_epi8(6, 6, 6, 6, 6, 6, 6, 6, 7, 7, 7, 7, 7, 7, 7, 7),
                             );
                             _mm_cmpeq_epi8(_mm_and_si128(bit_bytes, bit_mask), bit_mask)
                         },
diff --git a/fearless_simd_gen/src/mk_x86.rs b/fearless_simd_gen/src/mk_x86.rs
index b5e2f215f..fa4639f0a 100644
--- a/fearless_simd_gen/src/mk_x86.rs
+++ b/fearless_simd_gen/src/mk_x86.rs
@@ -14,7 +14,7 @@ use crate::generic::{
 use crate::level::Level;
 use crate::ops::{Op, OpSig, Quantifier, SlideGranularity, valid_reinterpret};
 use crate::types::{ScalarType, VecType};
-use proc_macro2::{Ident, Span, TokenStream};
+use proc_macro2::{Ident, Literal, Span, TokenStream};
 use quote::{ToTokens as _, format_ident, quote};
 
 #[derive(Clone, Copy, PartialEq, Eq)]
@@ -291,7 +291,7 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
         (128, 16) => {
             let lanes = (0..lane_count).map(|i| {
                 let bit = 1_u16 << i;
-                quote! { #bit.cast_signed() }
+                signed_literal(bit.into(), 16)
             });
             quote! {
                 {
@@ -304,7 +304,7 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
         (256, 16) => {
             let lanes = (0..lane_count).map(|i| {
                 let bit = 1_u16 << i;
-                quote! { #bit.cast_signed() }
+                signed_literal(bit.into(), 16)
             });
             quote! {
                 {
@@ -317,7 +317,7 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
         (128, 32) => {
             let lanes = (0..lane_count).map(|i| {
                 let bit = 1_u32 << i;
-                quote! { #bit.cast_signed() }
+                signed_literal(bit.into(), 32)
             });
             quote! {
                 {
@@ -330,7 +330,7 @@ fn mask_from_bitmask_lanes(vec_ty: &VecType) -> TokenStream {
         (256, 32) => {
             let lanes = (0..lane_count).map(|i| {
                 let bit = 1_u32 << i;
-                quote! { #bit.cast_signed() }
+                signed_literal(bit.into(), 32)
             });
             quote! {
                 {
@@ -383,7 +383,7 @@ fn mask_from_bitmask_wide_avx2(vec_ty: &VecType) -> TokenStream {
             32 => {
                 let lanes = (0..lanes_per_chunk).map(|i| {
                     let bit = 1_u32 << (chunk_start + i);
-                    quote! { #bit.cast_signed() }
+                    signed_literal(bit.into(), 32)
                 });
                 quote! {
                     {
@@ -395,7 +395,7 @@ fn mask_from_bitmask_wide_avx2(vec_ty: &VecType) -> TokenStream {
             64 => {
                 let lanes = (0..lanes_per_chunk).rev().map(|i| {
                     let bit = 1_u64 << (chunk_start + i);
-                    quote! { #bit.cast_signed() }
+                    signed_literal(bit, 64)
                 });
                 quote! {
                     {
@@ -538,7 +538,7 @@ fn mask_to_bitmask_words(native_width: usize, vec_ty: &VecType) -> TokenStream {
 fn mask_bit_pattern_128() -> TokenStream {
     let lanes = (0..16).map(|i| {
         let bit = 1_u8 << (i % 8);
-        quote! { #bit.cast_signed() }
+        signed_literal(bit.into(), 8)
     });
     quote! { _mm_setr_epi8(#(#lanes),*) }
 }
@@ -546,7 +546,7 @@ fn mask_bit_pattern_128() -> TokenStream {
 fn mask_bit_pattern_256() -> TokenStream {
     let lanes = (0..32).map(|i| {
         let bit = 1_u8 << (i % 8);
-        quote! { #bit.cast_signed() }
+        signed_literal(bit.into(), 8)
     });
     quote! { _mm256_setr_epi8(#(#lanes),*) }
 }
@@ -555,7 +555,7 @@ fn mask_byte_shuffle_128_offset(lane_count: usize, byte_offset: usize) -> TokenS
     let lanes = (0..16).map(|i| {
         let byte = u8::try_from(byte_offset + i.min(lane_count - 1) / 8)
             .expect("SSE byte shuffle index must fit in u8");
-        quote! { #byte.cast_signed() }
+        signed_literal(byte.into(), 8)
     });
     quote! { _mm_setr_epi8(#(#lanes),*) }
 }
@@ -568,7 +568,7 @@ fn mask_byte_shuffle_256_offset(byte_offset: usize) -> TokenStream {
     let lanes = (0..32).map(|i| {
         let byte =
             u8::try_from(byte_offset + i / 8).expect("AVX2 byte shuffle index must fit in u8");
-        quote! { #byte.cast_signed() }
+        signed_literal(byte.into(), 8)
     });
     quote! { _mm256_setr_epi8(#(#lanes),*) }
 }
@@ -577,6 +577,22 @@ fn mask_byte_shuffle_256() -> TokenStream {
     mask_byte_shuffle_256_offset(0)
 }
 
+/// Emits the low `bits` bits of `value`, sign-extended, as an unsuffixed integer literal.
+/// Panics unless `bits` is in `1..=64`.
+fn signed_literal(value: u64, bits: u32) -> TokenStream {
+    // `bits == 0` must be rejected too: `shift` would be 64 and overflow the shifts below.
+    assert!((1..=64).contains(&bits), "literal width must be 1..=64");
+    let shift = 64 - bits;
+    // Move the field into the top bits, then shift back arithmetically to sign-extend it.
+    let value = (value << shift).cast_signed() >> shift;
+    let magnitude = Literal::u64_unsuffixed(value.unsigned_abs());
+    if value < 0 {
+        quote! { -#magnitude }
+    } else {
+        quote! { #magnitude }
+    }
+}
+
 impl X86 {
     pub(crate) fn handle_splat(&self, method_sig: TokenStream, vec_ty: &VecType) -> TokenStream {
         let intrinsic = set1_intrinsic(vec_ty);

From b4899e3d3029e5a968583e836e1f881f50b877ae Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 17:35:17 +0100
Subject: [PATCH 13/16] Fix doc link

---
 fearless_simd/src/generated/simd_types.rs | 24 +++++++++++------------
 fearless_simd_gen/src/types.rs            |  2 +-
 2 files changed, 13 insertions(+), 13 deletions(-)

diff --git a/fearless_simd/src/generated/simd_types.rs b/fearless_simd/src/generated/simd_types.rs
index 66d9807c4..a71b080e6 100644
--- a/fearless_simd/src/generated/simd_types.rs
+++ b/fearless_simd/src/generated/simd_types.rs
@@ -625,7 +625,7 @@ impl<S: Simd> crate::SimdCombine<S> for u8x16<S> {
         self.simd.combine_u8x16(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 16 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 16 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask8x16<S: Simd> {
     pub(crate) val: S::mask8x16,
@@ -1093,7 +1093,7 @@ impl<S: Simd> crate::SimdCombine<S> for u16x8<S> {
         self.simd.combine_u16x8(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 8 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 8 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask16x8<S: Simd> {
     pub(crate) val: S::mask16x8,
@@ -1585,7 +1585,7 @@ impl<S: Simd> crate::SimdCombine<S> for u32x4<S> {
         self.simd.combine_u32x4(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 4 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 4 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask32x4<S: Simd> {
     pub(crate) val: S::mask32x4,
@@ -1918,7 +1918,7 @@ impl<S: Simd> crate::SimdCombine<S> for f64x2<S> {
         self.simd.combine_f64x2(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 2 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 2 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask64x2<S: Simd> {
     pub(crate) val: S::mask64x2,
@@ -2656,7 +2656,7 @@ impl<S: Simd> crate::SimdCombine<S> for u8x32<S> {
         self.simd.combine_u8x32(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 32 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 32 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask8x32<S: Simd> {
     pub(crate) val: S::mask8x32,
@@ -3150,7 +3150,7 @@ impl<S: Simd> crate::SimdCombine<S> for u16x16<S> {
         self.simd.combine_u16x16(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 16 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 16 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask16x16<S: Simd> {
     pub(crate) val: S::mask16x16,
@@ -3656,7 +3656,7 @@ impl<S: Simd> crate::SimdCombine<S> for u32x8<S> {
         self.simd.combine_u32x8(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 8 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 8 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask32x8<S: Simd> {
     pub(crate) val: S::mask32x8,
@@ -3996,7 +3996,7 @@ impl<S: Simd> crate::SimdCombine<S> for f64x4<S> {
         self.simd.combine_f64x4(self, rhs.simd_into(self.simd))
     }
 }
-#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 4 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask64x4<S: Simd> {
     pub(crate) val: S::mask64x4,
@@ -4722,7 +4722,7 @@ impl<S: Simd> crate::SimdSplit<S> for u8x64<S> {
         self.simd.split_u8x64(self)
     }
 }
-#[doc = "A SIMD mask of 64 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 64 logical lanes corresponding to 8-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask8x64<S: Simd> {
     pub(crate) val: S::mask8x64,
@@ -5204,7 +5204,7 @@ impl<S: Simd> crate::SimdSplit<S> for u16x32<S> {
         self.simd.split_u16x32(self)
     }
 }
-#[doc = "A SIMD mask of 32 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 32 logical lanes corresponding to 16-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask16x32<S: Simd> {
     pub(crate) val: S::mask16x32,
@@ -5710,7 +5710,7 @@ impl<S: Simd> crate::SimdSplit<S> for u32x16<S> {
         self.simd.split_u32x16(self)
     }
 }
-#[doc = "A SIMD mask of 16 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 16 logical lanes corresponding to 32-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask32x16<S: Simd> {
     pub(crate) val: S::mask32x16,
@@ -6044,7 +6044,7 @@ impl<S: Simd> crate::SimdSplit<S> for f64x8<S> {
         self.simd.split_f64x8(self)
     }
 }
-#[doc = "A SIMD mask of 8 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
+#[doc = "A SIMD mask of 8 logical lanes corresponding to 64-bit vector elements.\n\nThe storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1)."]
 #[derive(Clone, Copy)]
 pub struct mask64x8<S: Simd> {
     pub(crate) val: S::mask64x8,
diff --git a/fearless_simd_gen/src/types.rs b/fearless_simd_gen/src/types.rs
index 5dfea6adc..ab1d7e829 100644
--- a/fearless_simd_gen/src/types.rs
+++ b/fearless_simd_gen/src/types.rs
@@ -179,7 +179,7 @@ impl VecType {
             let scalar_bits = self.scalar_bits;
             format!(
                 "A SIMD mask of {len} logical lanes corresponding to {scalar_bits}-bit vector elements.\n\n\
-                The storage representation of this type is intentionally opaque. Use [`Self::from_bitmask`] and [`Self::to_bitmask`] for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1).",
+                The storage representation of this type is intentionally opaque. Use [`SimdMask::from_bitmask`](crate::SimdMask::from_bitmask) and [`SimdMask::to_bitmask`](crate::SimdMask::to_bitmask) for compact representation interop. For compatibility with existing APIs, it may also be converted to and from signed integer lanes where false is encoded as all zeroes (integer value 0) and true is encoded as all ones (integer value -1).",
             )
         } else {
             let scalar_name = self.scalar.rust_name(self.scalar_bits);

From 258da0ac498c84670742ce9740a93cf3d41bd7c0 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sat, 23 May 2026 23:57:43 +0100
Subject: [PATCH 14/16] Replace a complex exhaustive test for masks with a more
 verbose but simpler form with all interesting values written out

---
 .../tests/harness/lm_generated.rs             |   1 +
 .../harness/lm_generated/mask_methods.rs      | 437 ++++++++++++++++++
 fearless_simd_tests/tests/mask_methods.rs     | 184 --------
 3 files changed, 438 insertions(+), 184 deletions(-)
 create mode 100644 fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
 delete mode 100644 fearless_simd_tests/tests/mask_methods.rs

diff --git a/fearless_simd_tests/tests/harness/lm_generated.rs b/fearless_simd_tests/tests/harness/lm_generated.rs
index 2db66f9ca..789a8eb99 100644
--- a/fearless_simd_tests/tests/harness/lm_generated.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated.rs
@@ -2,5 +2,6 @@
 // SPDX-License-Identifier: Apache-2.0 OR MIT
 
 mod extended_512;
+mod mask_methods;
 mod mod_256;
 mod mod_512;
diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
new file mode 100644
index 000000000..7d9549cd3
--- /dev/null
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
@@ -0,0 +1,437 @@
+// Copyright 2026 the Fearless_SIMD Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use fearless_simd::*;
+use fearless_simd_dev_macros::simd_test;
+
+#[simd_test]
+fn mask8x16_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask8x16::from_bitmask(simd, 0x0000);
+    assert_eq!(mask.to_bitmask(), 0x0000);
+
+    let mask = mask8x16::from_bitmask(simd, 0x0001);
+    assert_eq!(mask.to_bitmask(), 0x0001);
+
+    let mask = mask8x16::from_bitmask(simd, 0x8000);
+    assert_eq!(mask.to_bitmask(), 0x8000);
+
+    let mask = mask8x16::from_bitmask(simd, 0x00ff);
+    assert_eq!(mask.to_bitmask(), 0x00ff);
+
+    let mask = mask8x16::from_bitmask(simd, 0xff00);
+    assert_eq!(mask.to_bitmask(), 0xff00);
+
+    let mask = mask8x16::from_bitmask(simd, 0x5555);
+    assert_eq!(mask.to_bitmask(), 0x5555);
+
+    let mask = mask8x16::from_bitmask(simd, 0xaaaa);
+    assert_eq!(mask.to_bitmask(), 0xaaaa);
+
+    let mask = mask8x16::from_bitmask(simd, 0xaa55);
+    assert_eq!(mask.to_bitmask(), 0xaa55);
+
+    let mask = mask8x16::from_bitmask(simd, 0xffff);
+    assert_eq!(mask.to_bitmask(), 0xffff);
+
+    let mask = mask8x16::from_bitmask(simd, 0xffff_0000);
+    assert_eq!(mask.to_bitmask(), 0x0000);
+
+    let mask = mask8x16::from_bitmask(simd, 0xffff_aa55);
+    assert_eq!(mask.to_bitmask(), 0xaa55);
+
+    let mask = mask8x16::from_bitmask(simd, 0xffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0xffff);
+}
+
+#[simd_test]
+fn mask16x8_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask16x8::from_bitmask(simd, 0x00);
+    assert_eq!(mask.to_bitmask(), 0x00);
+
+    let mask = mask16x8::from_bitmask(simd, 0x01);
+    assert_eq!(mask.to_bitmask(), 0x01);
+
+    let mask = mask16x8::from_bitmask(simd, 0x80);
+    assert_eq!(mask.to_bitmask(), 0x80);
+
+    let mask = mask16x8::from_bitmask(simd, 0x0f);
+    assert_eq!(mask.to_bitmask(), 0x0f);
+
+    let mask = mask16x8::from_bitmask(simd, 0xf0);
+    assert_eq!(mask.to_bitmask(), 0xf0);
+
+    let mask = mask16x8::from_bitmask(simd, 0x55);
+    assert_eq!(mask.to_bitmask(), 0x55);
+
+    let mask = mask16x8::from_bitmask(simd, 0xaa);
+    assert_eq!(mask.to_bitmask(), 0xaa);
+
+    let mask = mask16x8::from_bitmask(simd, 0xa5);
+    assert_eq!(mask.to_bitmask(), 0xa5);
+
+    let mask = mask16x8::from_bitmask(simd, 0xff);
+    assert_eq!(mask.to_bitmask(), 0xff);
+
+    let mask = mask16x8::from_bitmask(simd, 0xff00);
+    assert_eq!(mask.to_bitmask(), 0x00);
+
+    let mask = mask16x8::from_bitmask(simd, 0xffa5);
+    assert_eq!(mask.to_bitmask(), 0xa5);
+
+    let mask = mask16x8::from_bitmask(simd, 0xffff);
+    assert_eq!(mask.to_bitmask(), 0xff);
+}
+
+#[simd_test]
+fn mask32x4_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask32x4::from_bitmask(simd, 0x0);
+    assert_eq!(mask.to_bitmask(), 0x0);
+
+    let mask = mask32x4::from_bitmask(simd, 0x1);
+    assert_eq!(mask.to_bitmask(), 0x1);
+
+    let mask = mask32x4::from_bitmask(simd, 0x8);
+    assert_eq!(mask.to_bitmask(), 0x8);
+
+    let mask = mask32x4::from_bitmask(simd, 0x5);
+    assert_eq!(mask.to_bitmask(), 0x5);
+
+    let mask = mask32x4::from_bitmask(simd, 0xa);
+    assert_eq!(mask.to_bitmask(), 0xa);
+
+    let mask = mask32x4::from_bitmask(simd, 0xd);
+    assert_eq!(mask.to_bitmask(), 0xd);
+
+    let mask = mask32x4::from_bitmask(simd, 0xf);
+    assert_eq!(mask.to_bitmask(), 0xf);
+
+    let mask = mask32x4::from_bitmask(simd, 0xf0);
+    assert_eq!(mask.to_bitmask(), 0x0);
+
+    let mask = mask32x4::from_bitmask(simd, 0xfd);
+    assert_eq!(mask.to_bitmask(), 0xd);
+
+    let mask = mask32x4::from_bitmask(simd, 0xff);
+    assert_eq!(mask.to_bitmask(), 0xf);
+}
+
+#[simd_test]
+fn mask64x2_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask64x2::from_bitmask(simd, 0x0);
+    assert_eq!(mask.to_bitmask(), 0x0);
+
+    let mask = mask64x2::from_bitmask(simd, 0x1);
+    assert_eq!(mask.to_bitmask(), 0x1);
+
+    let mask = mask64x2::from_bitmask(simd, 0x2);
+    assert_eq!(mask.to_bitmask(), 0x2);
+
+    let mask = mask64x2::from_bitmask(simd, 0x3);
+    assert_eq!(mask.to_bitmask(), 0x3);
+
+    let mask = mask64x2::from_bitmask(simd, 0xfc);
+    assert_eq!(mask.to_bitmask(), 0x0);
+
+    let mask = mask64x2::from_bitmask(simd, 0xfd);
+    assert_eq!(mask.to_bitmask(), 0x1);
+
+    let mask = mask64x2::from_bitmask(simd, 0xff);
+    assert_eq!(mask.to_bitmask(), 0x3);
+}
+
+#[simd_test]
+fn mask8x32_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask8x32::from_bitmask(simd, 0x0000_0000);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000);
+
+    let mask = mask8x32::from_bitmask(simd, 0x0000_0001);
+    assert_eq!(mask.to_bitmask(), 0x0000_0001);
+
+    let mask = mask8x32::from_bitmask(simd, 0x8000_0000);
+    assert_eq!(mask.to_bitmask(), 0x8000_0000);
+
+    let mask = mask8x32::from_bitmask(simd, 0x0000_ffff);
+    assert_eq!(mask.to_bitmask(), 0x0000_ffff);
+
+    let mask = mask8x32::from_bitmask(simd, 0xffff_0000);
+    assert_eq!(mask.to_bitmask(), 0xffff_0000);
+
+    let mask = mask8x32::from_bitmask(simd, 0x5555_5555);
+    assert_eq!(mask.to_bitmask(), 0x5555_5555);
+
+    let mask = mask8x32::from_bitmask(simd, 0xaaaa_aaaa);
+    assert_eq!(mask.to_bitmask(), 0xaaaa_aaaa);
+
+    let mask = mask8x32::from_bitmask(simd, 0x8000_aa55);
+    assert_eq!(mask.to_bitmask(), 0x8000_aa55);
+
+    let mask = mask8x32::from_bitmask(simd, 0xffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0xffff_ffff);
+
+    let mask = mask8x32::from_bitmask(simd, 0xffff_ffff_0000_0000);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000);
+
+    let mask = mask8x32::from_bitmask(simd, 0xffff_ffff_8000_aa55);
+    assert_eq!(mask.to_bitmask(), 0x8000_aa55);
+
+    let mask = mask8x32::from_bitmask(simd, 0xffff_ffff_ffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0xffff_ffff);
+}
+
+#[simd_test]
+fn mask16x16_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask16x16::from_bitmask(simd, 0x0000);
+    assert_eq!(mask.to_bitmask(), 0x0000);
+
+    let mask = mask16x16::from_bitmask(simd, 0x0001);
+    assert_eq!(mask.to_bitmask(), 0x0001);
+
+    let mask = mask16x16::from_bitmask(simd, 0x8000);
+    assert_eq!(mask.to_bitmask(), 0x8000);
+
+    let mask = mask16x16::from_bitmask(simd, 0x00ff);
+    assert_eq!(mask.to_bitmask(), 0x00ff);
+
+    let mask = mask16x16::from_bitmask(simd, 0xff00);
+    assert_eq!(mask.to_bitmask(), 0xff00);
+
+    let mask = mask16x16::from_bitmask(simd, 0x5555);
+    assert_eq!(mask.to_bitmask(), 0x5555);
+
+    let mask = mask16x16::from_bitmask(simd, 0xaaaa);
+    assert_eq!(mask.to_bitmask(), 0xaaaa);
+
+    let mask = mask16x16::from_bitmask(simd, 0xaa55);
+    assert_eq!(mask.to_bitmask(), 0xaa55);
+
+    let mask = mask16x16::from_bitmask(simd, 0xffff);
+    assert_eq!(mask.to_bitmask(), 0xffff);
+
+    let mask = mask16x16::from_bitmask(simd, 0xffff_0000);
+    assert_eq!(mask.to_bitmask(), 0x0000);
+
+    let mask = mask16x16::from_bitmask(simd, 0xffff_aa55);
+    assert_eq!(mask.to_bitmask(), 0xaa55);
+
+    let mask = mask16x16::from_bitmask(simd, 0xffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0xffff);
+}
+
+#[simd_test]
+fn mask32x8_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask32x8::from_bitmask(simd, 0x00);
+    assert_eq!(mask.to_bitmask(), 0x00);
+
+    let mask = mask32x8::from_bitmask(simd, 0x01);
+    assert_eq!(mask.to_bitmask(), 0x01);
+
+    let mask = mask32x8::from_bitmask(simd, 0x80);
+    assert_eq!(mask.to_bitmask(), 0x80);
+
+    let mask = mask32x8::from_bitmask(simd, 0x0f);
+    assert_eq!(mask.to_bitmask(), 0x0f);
+
+    let mask = mask32x8::from_bitmask(simd, 0xf0);
+    assert_eq!(mask.to_bitmask(), 0xf0);
+
+    let mask = mask32x8::from_bitmask(simd, 0x55);
+    assert_eq!(mask.to_bitmask(), 0x55);
+
+    let mask = mask32x8::from_bitmask(simd, 0xaa);
+    assert_eq!(mask.to_bitmask(), 0xaa);
+
+    let mask = mask32x8::from_bitmask(simd, 0xa5);
+    assert_eq!(mask.to_bitmask(), 0xa5);
+
+    let mask = mask32x8::from_bitmask(simd, 0xff);
+    assert_eq!(mask.to_bitmask(), 0xff);
+
+    let mask = mask32x8::from_bitmask(simd, 0xff00);
+    assert_eq!(mask.to_bitmask(), 0x00);
+
+    let mask = mask32x8::from_bitmask(simd, 0xffa5);
+    assert_eq!(mask.to_bitmask(), 0xa5);
+
+    let mask = mask32x8::from_bitmask(simd, 0xffff);
+    assert_eq!(mask.to_bitmask(), 0xff);
+}
+
+#[simd_test]
+fn mask64x4_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask64x4::from_bitmask(simd, 0x0);
+    assert_eq!(mask.to_bitmask(), 0x0);
+
+    let mask = mask64x4::from_bitmask(simd, 0x1);
+    assert_eq!(mask.to_bitmask(), 0x1);
+
+    let mask = mask64x4::from_bitmask(simd, 0x8);
+    assert_eq!(mask.to_bitmask(), 0x8);
+
+    let mask = mask64x4::from_bitmask(simd, 0x5);
+    assert_eq!(mask.to_bitmask(), 0x5);
+
+    let mask = mask64x4::from_bitmask(simd, 0xa);
+    assert_eq!(mask.to_bitmask(), 0xa);
+
+    let mask = mask64x4::from_bitmask(simd, 0xd);
+    assert_eq!(mask.to_bitmask(), 0xd);
+
+    let mask = mask64x4::from_bitmask(simd, 0xf);
+    assert_eq!(mask.to_bitmask(), 0xf);
+
+    let mask = mask64x4::from_bitmask(simd, 0xf0);
+    assert_eq!(mask.to_bitmask(), 0x0);
+
+    let mask = mask64x4::from_bitmask(simd, 0xfd);
+    assert_eq!(mask.to_bitmask(), 0xd);
+
+    let mask = mask64x4::from_bitmask(simd, 0xff);
+    assert_eq!(mask.to_bitmask(), 0xf);
+}
+
+#[simd_test]
+fn mask8x64_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask8x64::from_bitmask(simd, 0x0000_0000_0000_0000);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000_0000_0000);
+
+    let mask = mask8x64::from_bitmask(simd, 0x0000_0000_0000_0001);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000_0000_0001);
+
+    let mask = mask8x64::from_bitmask(simd, 0x8000_0000_0000_0000);
+    assert_eq!(mask.to_bitmask(), 0x8000_0000_0000_0000);
+
+    let mask = mask8x64::from_bitmask(simd, 0x0000_0000_ffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000_ffff_ffff);
+
+    let mask = mask8x64::from_bitmask(simd, 0xffff_ffff_0000_0000);
+    assert_eq!(mask.to_bitmask(), 0xffff_ffff_0000_0000);
+
+    let mask = mask8x64::from_bitmask(simd, 0x5555_5555_5555_5555);
+    assert_eq!(mask.to_bitmask(), 0x5555_5555_5555_5555);
+
+    let mask = mask8x64::from_bitmask(simd, 0xaaaa_aaaa_aaaa_aaaa);
+    assert_eq!(mask.to_bitmask(), 0xaaaa_aaaa_aaaa_aaaa);
+
+    let mask = mask8x64::from_bitmask(simd, 0x8000_0001_5555_aaab);
+    assert_eq!(mask.to_bitmask(), 0x8000_0001_5555_aaab);
+
+    let mask = mask8x64::from_bitmask(simd, 0xffff_ffff_ffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0xffff_ffff_ffff_ffff);
+}
+
+#[simd_test]
+fn mask16x32_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask16x32::from_bitmask(simd, 0x0000_0000);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000);
+
+    let mask = mask16x32::from_bitmask(simd, 0x0000_0001);
+    assert_eq!(mask.to_bitmask(), 0x0000_0001);
+
+    let mask = mask16x32::from_bitmask(simd, 0x8000_0000);
+    assert_eq!(mask.to_bitmask(), 0x8000_0000);
+
+    let mask = mask16x32::from_bitmask(simd, 0x0000_ffff);
+    assert_eq!(mask.to_bitmask(), 0x0000_ffff);
+
+    let mask = mask16x32::from_bitmask(simd, 0xffff_0000);
+    assert_eq!(mask.to_bitmask(), 0xffff_0000);
+
+    let mask = mask16x32::from_bitmask(simd, 0x5555_5555);
+    assert_eq!(mask.to_bitmask(), 0x5555_5555);
+
+    let mask = mask16x32::from_bitmask(simd, 0xaaaa_aaaa);
+    assert_eq!(mask.to_bitmask(), 0xaaaa_aaaa);
+
+    let mask = mask16x32::from_bitmask(simd, 0x8000_aa55);
+    assert_eq!(mask.to_bitmask(), 0x8000_aa55);
+
+    let mask = mask16x32::from_bitmask(simd, 0xffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0xffff_ffff);
+
+    let mask = mask16x32::from_bitmask(simd, 0xffff_ffff_0000_0000);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000);
+
+    let mask = mask16x32::from_bitmask(simd, 0xffff_ffff_8000_aa55);
+    assert_eq!(mask.to_bitmask(), 0x8000_aa55);
+
+    let mask = mask16x32::from_bitmask(simd, 0xffff_ffff_ffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0xffff_ffff);
+}
+
+#[simd_test]
+fn mask32x16_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask32x16::from_bitmask(simd, 0x0000);
+    assert_eq!(mask.to_bitmask(), 0x0000);
+
+    let mask = mask32x16::from_bitmask(simd, 0x0001);
+    assert_eq!(mask.to_bitmask(), 0x0001);
+
+    let mask = mask32x16::from_bitmask(simd, 0x8000);
+    assert_eq!(mask.to_bitmask(), 0x8000);
+
+    let mask = mask32x16::from_bitmask(simd, 0x00ff);
+    assert_eq!(mask.to_bitmask(), 0x00ff);
+
+    let mask = mask32x16::from_bitmask(simd, 0xff00);
+    assert_eq!(mask.to_bitmask(), 0xff00);
+
+    let mask = mask32x16::from_bitmask(simd, 0x5555);
+    assert_eq!(mask.to_bitmask(), 0x5555);
+
+    let mask = mask32x16::from_bitmask(simd, 0xaaaa);
+    assert_eq!(mask.to_bitmask(), 0xaaaa);
+
+    let mask = mask32x16::from_bitmask(simd, 0xaa55);
+    assert_eq!(mask.to_bitmask(), 0xaa55);
+
+    let mask = mask32x16::from_bitmask(simd, 0xffff);
+    assert_eq!(mask.to_bitmask(), 0xffff);
+
+    let mask = mask32x16::from_bitmask(simd, 0xffff_0000);
+    assert_eq!(mask.to_bitmask(), 0x0000);
+
+    let mask = mask32x16::from_bitmask(simd, 0xffff_aa55);
+    assert_eq!(mask.to_bitmask(), 0xaa55);
+
+    let mask = mask32x16::from_bitmask(simd, 0xffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0xffff);
+}
+
+#[simd_test]
+fn mask64x8_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask64x8::from_bitmask(simd, 0x00);
+    assert_eq!(mask.to_bitmask(), 0x00);
+
+    let mask = mask64x8::from_bitmask(simd, 0x01);
+    assert_eq!(mask.to_bitmask(), 0x01);
+
+    let mask = mask64x8::from_bitmask(simd, 0x80);
+    assert_eq!(mask.to_bitmask(), 0x80);
+
+    let mask = mask64x8::from_bitmask(simd, 0x0f);
+    assert_eq!(mask.to_bitmask(), 0x0f);
+
+    let mask = mask64x8::from_bitmask(simd, 0xf0);
+    assert_eq!(mask.to_bitmask(), 0xf0);
+
+    let mask = mask64x8::from_bitmask(simd, 0x55);
+    assert_eq!(mask.to_bitmask(), 0x55);
+
+    let mask = mask64x8::from_bitmask(simd, 0xaa);
+    assert_eq!(mask.to_bitmask(), 0xaa);
+
+    let mask = mask64x8::from_bitmask(simd, 0xa5);
+    assert_eq!(mask.to_bitmask(), 0xa5);
+
+    let mask = mask64x8::from_bitmask(simd, 0xff);
+    assert_eq!(mask.to_bitmask(), 0xff);
+
+    let mask = mask64x8::from_bitmask(simd, 0xff00);
+    assert_eq!(mask.to_bitmask(), 0x00);
+
+    let mask = mask64x8::from_bitmask(simd, 0xffa5);
+    assert_eq!(mask.to_bitmask(), 0xa5);
+
+    let mask = mask64x8::from_bitmask(simd, 0xffff);
+    assert_eq!(mask.to_bitmask(), 0xff);
+}
diff --git a/fearless_simd_tests/tests/mask_methods.rs b/fearless_simd_tests/tests/mask_methods.rs
deleted file mode 100644
index dfc0c82b5..000000000
--- a/fearless_simd_tests/tests/mask_methods.rs
+++ /dev/null
@@ -1,184 +0,0 @@
-// Copyright 2026 the Fearless_SIMD Authors
-// SPDX-License-Identifier: Apache-2.0 OR MIT
-
-use fearless_simd::*;
-use fearless_simd_dev_macros::simd_test;
-
-fn mask_bits(len: usize) -> u64 {
-    if len == 64 {
-        u64::MAX
-    } else {
-        (1_u64 << len) - 1
-    }
-}
-
-const CHUNK_PATTERNS_16: [u64; 8] = [
-    0x0000, 0x0001, 0x00ff, 0x5555, 0x8000, 0xaaaa, 0xff00, 0xffff,
-];
-
-fn for_each_exhaustive_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
-    assert!(
-        len <= 16,
-        "exhaustive bitmask roundtrip tests are only practical up to 16 lanes"
-    );
-
-    let all_bits = mask_bits(len);
-    for bits in 0..(1_u64 << len) {
-        f(bits);
-        f(bits | !all_bits);
-    }
-}
-
-fn for_each_chunked_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
-    assert!(
-        len.is_multiple_of(16),
-        "chunked bitmask roundtrip tests expect 16-lane chunks"
-    );
-    assert!(
-        len <= 64,
-        "chunked bitmask roundtrip tests only support u64 bitmasks"
-    );
-
-    let chunks = len / 16;
-    let mut pattern_count = 1_usize;
-    for _ in 0..chunks {
-        pattern_count *= CHUNK_PATTERNS_16.len();
-    }
-
-    for mut pattern_index in 0..pattern_count {
-        let mut bits = 0_u64;
-        for chunk in 0..chunks {
-            let chunk_pattern = CHUNK_PATTERNS_16[pattern_index % CHUNK_PATTERNS_16.len()];
-            pattern_index /= CHUNK_PATTERNS_16.len();
-            bits |= chunk_pattern << (chunk * 16);
-        }
-        f(bits);
-    }
-}
-
-fn for_each_wide_bitmask<F: FnMut(u64)>(len: usize, mut f: F) {
-    let all_bits = mask_bits(len);
-    let mut check = |bits| {
-        f(bits);
-        f(bits | !all_bits);
-    };
-
-    check(0);
-    check(all_bits);
-    check(all_bits & 0x5555_5555_5555_5555);
-    check(all_bits & 0xaaaa_aaaa_aaaa_aaaa);
-
-    for bit in 0..len {
-        let bits = 1_u64 << bit;
-        check(bits);
-        check(all_bits ^ bits);
-    }
-
-    for_each_chunked_bitmask(len, check);
-}
-
-macro_rules! check_bitmask_roundtrip {
-    ($simd:expr, $mask:ident, $len:expr, $bits:expr) => {{
-        let raw_bits = $bits;
-        let expected = raw_bits & mask_bits($len);
-        let mask = <$mask<_> as SimdMask<_>>::from_bitmask($simd, raw_bits);
-
-        assert_eq!(
-            mask.to_bitmask(),
-            expected,
-            "{}::from_bitmask({raw_bits:#018x}).to_bitmask()",
-            stringify!($mask)
-        );
-        assert_eq!(
-            <$mask<_> as SimdMask<_>>::from_bitmask($simd, mask.to_bitmask()).to_bitmask(),
-            expected,
-            "{}::from_bitmask({raw_bits:#018x}).to_bitmask() roundtripped again",
-            stringify!($mask)
-        );
-    }};
-}
-
-macro_rules! check_mask_methods {
-    ($simd:expr, $mask:ident, $len:expr, $bits:expr) => {{
-        let all_bits = mask_bits($len);
-        let mut expected = $bits & all_bits;
-        let mut mask = <$mask<_> as SimdMask<_>>::from_bitmask($simd, $bits);
-
-        assert_eq!(mask.to_bitmask(), expected);
-        for i in 0..$len {
-            assert_eq!(mask.test(i), ((expected >> i) & 1) != 0);
-        }
-
-        mask.set(0, false);
-        expected &= !1;
-        assert_eq!(mask.to_bitmask(), expected);
-
-        mask.set($len - 1, true);
-        expected |= 1_u64 << ($len - 1);
-        assert_eq!(mask.to_bitmask(), expected);
-
-        mask.set(1, true);
-        expected |= 1_u64 << 1;
-        assert!(mask.test(1));
-        assert_eq!(mask.to_bitmask(), expected);
-
-        mask.set(1, false);
-        expected &= !(1_u64 << 1);
-        assert!(!mask.test(1));
-        assert_eq!(mask.to_bitmask(), expected);
-    }};
-}
-
-#[simd_test]
-fn mask_bitmask_roundtrip_exhaustive<S: Simd>(simd: S) {
-    for_each_exhaustive_bitmask(16, |bits| {
-        check_bitmask_roundtrip!(simd, mask8x16, 16, bits);
-        check_bitmask_roundtrip!(simd, mask16x16, 16, bits);
-        check_bitmask_roundtrip!(simd, mask32x16, 16, bits);
-    });
-
-    for_each_exhaustive_bitmask(8, |bits| {
-        check_bitmask_roundtrip!(simd, mask16x8, 8, bits);
-        check_bitmask_roundtrip!(simd, mask32x8, 8, bits);
-        check_bitmask_roundtrip!(simd, mask64x8, 8, bits);
-    });
-
-    for_each_exhaustive_bitmask(4, |bits| {
-        check_bitmask_roundtrip!(simd, mask32x4, 4, bits);
-        check_bitmask_roundtrip!(simd, mask64x4, 4, bits);
-    });
-
-    for_each_exhaustive_bitmask(2, |bits| {
-        check_bitmask_roundtrip!(simd, mask64x2, 2, bits);
-    });
-}
-
-#[simd_test]
-fn mask_bitmask_roundtrip_wide_patterns<S: Simd>(simd: S) {
-    for_each_wide_bitmask(32, |bits| {
-        check_bitmask_roundtrip!(simd, mask8x32, 32, bits);
-        check_bitmask_roundtrip!(simd, mask16x32, 32, bits);
-    });
-
-    for_each_wide_bitmask(64, |bits| {
-        check_bitmask_roundtrip!(simd, mask8x64, 64, bits);
-    });
-}
-
-#[simd_test]
-fn mask_bitmask_roundtrip_test_set<S: Simd>(simd: S) {
-    check_mask_methods!(simd, mask8x16, 16, 0x1_aa55_8001);
-    check_mask_methods!(simd, mask16x8, 8, 0x1_a5);
-    check_mask_methods!(simd, mask32x4, 4, 0x1d);
-    check_mask_methods!(simd, mask64x2, 2, 0x6);
-
-    check_mask_methods!(simd, mask8x32, 32, 0x1_8000_aa55);
-    check_mask_methods!(simd, mask16x16, 16, 0x1_aa55_8001);
-    check_mask_methods!(simd, mask32x8, 8, 0x1_a5);
-    check_mask_methods!(simd, mask64x4, 4, 0x1d);
-
-    check_mask_methods!(simd, mask8x64, 64, 0x8000_0001_5555_aaab);
-    check_mask_methods!(simd, mask16x32, 32, 0x1_8000_aa55);
-    check_mask_methods!(simd, mask32x16, 16, 0x1_aa55_8001);
-    check_mask_methods!(simd, mask64x8, 8, 0x1_a5);
-}

From 5235116895e915609d8600ddd46caa34e775fc07 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 00:03:33 +0100
Subject: [PATCH 15/16] Replace lists of interesting values with exhaustive
 loops for smaller mask sizes in tests

---
 .../harness/lm_generated/mask_methods.rs      | 392 ++++--------------
 fearless_simd_tests/tests/mod.rs              |   1 -
 2 files changed, 70 insertions(+), 323 deletions(-)

diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
index 7d9549cd3..747a3e47e 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
@@ -6,137 +6,74 @@ use fearless_simd_dev_macros::simd_test;
 
 #[simd_test]
 fn mask8x16_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask8x16::from_bitmask(simd, 0x0000);
-    assert_eq!(mask.to_bitmask(), 0x0000);
-
-    let mask = mask8x16::from_bitmask(simd, 0x0001);
-    assert_eq!(mask.to_bitmask(), 0x0001);
-
-    let mask = mask8x16::from_bitmask(simd, 0x8000);
-    assert_eq!(mask.to_bitmask(), 0x8000);
-
-    let mask = mask8x16::from_bitmask(simd, 0x00ff);
-    assert_eq!(mask.to_bitmask(), 0x00ff);
-
-    let mask = mask8x16::from_bitmask(simd, 0xff00);
-    assert_eq!(mask.to_bitmask(), 0xff00);
-
-    let mask = mask8x16::from_bitmask(simd, 0x5555);
-    assert_eq!(mask.to_bitmask(), 0x5555);
-
-    let mask = mask8x16::from_bitmask(simd, 0xaaaa);
-    assert_eq!(mask.to_bitmask(), 0xaaaa);
-
-    let mask = mask8x16::from_bitmask(simd, 0xaa55);
-    assert_eq!(mask.to_bitmask(), 0xaa55);
-
-    let mask = mask8x16::from_bitmask(simd, 0xffff);
-    assert_eq!(mask.to_bitmask(), 0xffff);
-
-    let mask = mask8x16::from_bitmask(simd, 0xffff_0000);
-    assert_eq!(mask.to_bitmask(), 0x0000);
-
-    let mask = mask8x16::from_bitmask(simd, 0xffff_aa55);
-    assert_eq!(mask.to_bitmask(), 0xaa55);
-
-    let mask = mask8x16::from_bitmask(simd, 0xffff_ffff);
-    assert_eq!(mask.to_bitmask(), 0xffff);
+    for bits in 0..=0xffff_u64 {
+        let mask = mask8x16::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits);
+    }
 }
 
 #[simd_test]
 fn mask16x8_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask16x8::from_bitmask(simd, 0x00);
-    assert_eq!(mask.to_bitmask(), 0x00);
-
-    let mask = mask16x8::from_bitmask(simd, 0x01);
-    assert_eq!(mask.to_bitmask(), 0x01);
-
-    let mask = mask16x8::from_bitmask(simd, 0x80);
-    assert_eq!(mask.to_bitmask(), 0x80);
-
-    let mask = mask16x8::from_bitmask(simd, 0x0f);
-    assert_eq!(mask.to_bitmask(), 0x0f);
-
-    let mask = mask16x8::from_bitmask(simd, 0xf0);
-    assert_eq!(mask.to_bitmask(), 0xf0);
-
-    let mask = mask16x8::from_bitmask(simd, 0x55);
-    assert_eq!(mask.to_bitmask(), 0x55);
-
-    let mask = mask16x8::from_bitmask(simd, 0xaa);
-    assert_eq!(mask.to_bitmask(), 0xaa);
-
-    let mask = mask16x8::from_bitmask(simd, 0xa5);
-    assert_eq!(mask.to_bitmask(), 0xa5);
-
-    let mask = mask16x8::from_bitmask(simd, 0xff);
-    assert_eq!(mask.to_bitmask(), 0xff);
-
-    let mask = mask16x8::from_bitmask(simd, 0xff00);
-    assert_eq!(mask.to_bitmask(), 0x00);
-
-    let mask = mask16x8::from_bitmask(simd, 0xffa5);
-    assert_eq!(mask.to_bitmask(), 0xa5);
-
-    let mask = mask16x8::from_bitmask(simd, 0xffff);
-    assert_eq!(mask.to_bitmask(), 0xff);
+    for bits in 0..=0xffff_u64 {
+        let mask = mask16x8::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits & 0xff);
+    }
 }
 
 #[simd_test]
 fn mask32x4_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask32x4::from_bitmask(simd, 0x0);
-    assert_eq!(mask.to_bitmask(), 0x0);
-
-    let mask = mask32x4::from_bitmask(simd, 0x1);
-    assert_eq!(mask.to_bitmask(), 0x1);
-
-    let mask = mask32x4::from_bitmask(simd, 0x8);
-    assert_eq!(mask.to_bitmask(), 0x8);
-
-    let mask = mask32x4::from_bitmask(simd, 0x5);
-    assert_eq!(mask.to_bitmask(), 0x5);
-
-    let mask = mask32x4::from_bitmask(simd, 0xa);
-    assert_eq!(mask.to_bitmask(), 0xa);
-
-    let mask = mask32x4::from_bitmask(simd, 0xd);
-    assert_eq!(mask.to_bitmask(), 0xd);
-
-    let mask = mask32x4::from_bitmask(simd, 0xf);
-    assert_eq!(mask.to_bitmask(), 0xf);
-
-    let mask = mask32x4::from_bitmask(simd, 0xf0);
-    assert_eq!(mask.to_bitmask(), 0x0);
-
-    let mask = mask32x4::from_bitmask(simd, 0xfd);
-    assert_eq!(mask.to_bitmask(), 0xd);
-
-    let mask = mask32x4::from_bitmask(simd, 0xff);
-    assert_eq!(mask.to_bitmask(), 0xf);
+    for bits in 0..=0xffff_u64 {
+        let mask = mask32x4::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits & 0xf);
+    }
 }
 
 #[simd_test]
 fn mask64x2_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask64x2::from_bitmask(simd, 0x0);
-    assert_eq!(mask.to_bitmask(), 0x0);
-
-    let mask = mask64x2::from_bitmask(simd, 0x1);
-    assert_eq!(mask.to_bitmask(), 0x1);
+    for bits in 0..=0xffff_u64 {
+        let mask = mask64x2::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits & 0x3);
+    }
+}
 
-    let mask = mask64x2::from_bitmask(simd, 0x2);
-    assert_eq!(mask.to_bitmask(), 0x2);
+#[simd_test]
+fn mask16x16_bitmask_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        let mask = mask16x16::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits);
+    }
+}
 
-    let mask = mask64x2::from_bitmask(simd, 0x3);
-    assert_eq!(mask.to_bitmask(), 0x3);
+#[simd_test]
+fn mask32x8_bitmask_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        let mask = mask32x8::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits & 0xff);
+    }
+}
 
-    let mask = mask64x2::from_bitmask(simd, 0xfc);
-    assert_eq!(mask.to_bitmask(), 0x0);
+#[simd_test]
+fn mask64x4_bitmask_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        let mask = mask64x4::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits & 0xf);
+    }
+}
 
-    let mask = mask64x2::from_bitmask(simd, 0xfd);
-    assert_eq!(mask.to_bitmask(), 0x1);
+#[simd_test]
+fn mask32x16_bitmask_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        let mask = mask32x16::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits);
+    }
+}
 
-    let mask = mask64x2::from_bitmask(simd, 0xff);
-    assert_eq!(mask.to_bitmask(), 0x3);
+#[simd_test]
+fn mask64x8_bitmask_roundtrip<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_u64 {
+        let mask = mask64x8::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits & 0xff);
+    }
 }
 
 #[simd_test]
@@ -178,147 +115,6 @@ fn mask8x32_bitmask_roundtrip<S: Simd>(simd: S) {
     assert_eq!(mask.to_bitmask(), 0xffff_ffff);
 }
 
-#[simd_test]
-fn mask16x16_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask16x16::from_bitmask(simd, 0x0000);
-    assert_eq!(mask.to_bitmask(), 0x0000);
-
-    let mask = mask16x16::from_bitmask(simd, 0x0001);
-    assert_eq!(mask.to_bitmask(), 0x0001);
-
-    let mask = mask16x16::from_bitmask(simd, 0x8000);
-    assert_eq!(mask.to_bitmask(), 0x8000);
-
-    let mask = mask16x16::from_bitmask(simd, 0x00ff);
-    assert_eq!(mask.to_bitmask(), 0x00ff);
-
-    let mask = mask16x16::from_bitmask(simd, 0xff00);
-    assert_eq!(mask.to_bitmask(), 0xff00);
-
-    let mask = mask16x16::from_bitmask(simd, 0x5555);
-    assert_eq!(mask.to_bitmask(), 0x5555);
-
-    let mask = mask16x16::from_bitmask(simd, 0xaaaa);
-    assert_eq!(mask.to_bitmask(), 0xaaaa);
-
-    let mask = mask16x16::from_bitmask(simd, 0xaa55);
-    assert_eq!(mask.to_bitmask(), 0xaa55);
-
-    let mask = mask16x16::from_bitmask(simd, 0xffff);
-    assert_eq!(mask.to_bitmask(), 0xffff);
-
-    let mask = mask16x16::from_bitmask(simd, 0xffff_0000);
-    assert_eq!(mask.to_bitmask(), 0x0000);
-
-    let mask = mask16x16::from_bitmask(simd, 0xffff_aa55);
-    assert_eq!(mask.to_bitmask(), 0xaa55);
-
-    let mask = mask16x16::from_bitmask(simd, 0xffff_ffff);
-    assert_eq!(mask.to_bitmask(), 0xffff);
-}
-
-#[simd_test]
-fn mask32x8_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask32x8::from_bitmask(simd, 0x00);
-    assert_eq!(mask.to_bitmask(), 0x00);
-
-    let mask = mask32x8::from_bitmask(simd, 0x01);
-    assert_eq!(mask.to_bitmask(), 0x01);
-
-    let mask = mask32x8::from_bitmask(simd, 0x80);
-    assert_eq!(mask.to_bitmask(), 0x80);
-
-    let mask = mask32x8::from_bitmask(simd, 0x0f);
-    assert_eq!(mask.to_bitmask(), 0x0f);
-
-    let mask = mask32x8::from_bitmask(simd, 0xf0);
-    assert_eq!(mask.to_bitmask(), 0xf0);
-
-    let mask = mask32x8::from_bitmask(simd, 0x55);
-    assert_eq!(mask.to_bitmask(), 0x55);
-
-    let mask = mask32x8::from_bitmask(simd, 0xaa);
-    assert_eq!(mask.to_bitmask(), 0xaa);
-
-    let mask = mask32x8::from_bitmask(simd, 0xa5);
-    assert_eq!(mask.to_bitmask(), 0xa5);
-
-    let mask = mask32x8::from_bitmask(simd, 0xff);
-    assert_eq!(mask.to_bitmask(), 0xff);
-
-    let mask = mask32x8::from_bitmask(simd, 0xff00);
-    assert_eq!(mask.to_bitmask(), 0x00);
-
-    let mask = mask32x8::from_bitmask(simd, 0xffa5);
-    assert_eq!(mask.to_bitmask(), 0xa5);
-
-    let mask = mask32x8::from_bitmask(simd, 0xffff);
-    assert_eq!(mask.to_bitmask(), 0xff);
-}
-
-#[simd_test]
-fn mask64x4_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask64x4::from_bitmask(simd, 0x0);
-    assert_eq!(mask.to_bitmask(), 0x0);
-
-    let mask = mask64x4::from_bitmask(simd, 0x1);
-    assert_eq!(mask.to_bitmask(), 0x1);
-
-    let mask = mask64x4::from_bitmask(simd, 0x8);
-    assert_eq!(mask.to_bitmask(), 0x8);
-
-    let mask = mask64x4::from_bitmask(simd, 0x5);
-    assert_eq!(mask.to_bitmask(), 0x5);
-
-    let mask = mask64x4::from_bitmask(simd, 0xa);
-    assert_eq!(mask.to_bitmask(), 0xa);
-
-    let mask = mask64x4::from_bitmask(simd, 0xd);
-    assert_eq!(mask.to_bitmask(), 0xd);
-
-    let mask = mask64x4::from_bitmask(simd, 0xf);
-    assert_eq!(mask.to_bitmask(), 0xf);
-
-    let mask = mask64x4::from_bitmask(simd, 0xf0);
-    assert_eq!(mask.to_bitmask(), 0x0);
-
-    let mask = mask64x4::from_bitmask(simd, 0xfd);
-    assert_eq!(mask.to_bitmask(), 0xd);
-
-    let mask = mask64x4::from_bitmask(simd, 0xff);
-    assert_eq!(mask.to_bitmask(), 0xf);
-}
-
-#[simd_test]
-fn mask8x64_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask8x64::from_bitmask(simd, 0x0000_0000_0000_0000);
-    assert_eq!(mask.to_bitmask(), 0x0000_0000_0000_0000);
-
-    let mask = mask8x64::from_bitmask(simd, 0x0000_0000_0000_0001);
-    assert_eq!(mask.to_bitmask(), 0x0000_0000_0000_0001);
-
-    let mask = mask8x64::from_bitmask(simd, 0x8000_0000_0000_0000);
-    assert_eq!(mask.to_bitmask(), 0x8000_0000_0000_0000);
-
-    let mask = mask8x64::from_bitmask(simd, 0x0000_0000_ffff_ffff);
-    assert_eq!(mask.to_bitmask(), 0x0000_0000_ffff_ffff);
-
-    let mask = mask8x64::from_bitmask(simd, 0xffff_ffff_0000_0000);
-    assert_eq!(mask.to_bitmask(), 0xffff_ffff_0000_0000);
-
-    let mask = mask8x64::from_bitmask(simd, 0x5555_5555_5555_5555);
-    assert_eq!(mask.to_bitmask(), 0x5555_5555_5555_5555);
-
-    let mask = mask8x64::from_bitmask(simd, 0xaaaa_aaaa_aaaa_aaaa);
-    assert_eq!(mask.to_bitmask(), 0xaaaa_aaaa_aaaa_aaaa);
-
-    let mask = mask8x64::from_bitmask(simd, 0x8000_0001_5555_aaab);
-    assert_eq!(mask.to_bitmask(), 0x8000_0001_5555_aaab);
-
-    let mask = mask8x64::from_bitmask(simd, 0xffff_ffff_ffff_ffff);
-    assert_eq!(mask.to_bitmask(), 0xffff_ffff_ffff_ffff);
-}
-
 #[simd_test]
 fn mask16x32_bitmask_roundtrip<S: Simd>(simd: S) {
     let mask = mask16x32::from_bitmask(simd, 0x0000_0000);
@@ -359,79 +155,31 @@ fn mask16x32_bitmask_roundtrip<S: Simd>(simd: S) {
 }
 
 #[simd_test]
-fn mask32x16_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask32x16::from_bitmask(simd, 0x0000);
-    assert_eq!(mask.to_bitmask(), 0x0000);
-
-    let mask = mask32x16::from_bitmask(simd, 0x0001);
-    assert_eq!(mask.to_bitmask(), 0x0001);
-
-    let mask = mask32x16::from_bitmask(simd, 0x8000);
-    assert_eq!(mask.to_bitmask(), 0x8000);
-
-    let mask = mask32x16::from_bitmask(simd, 0x00ff);
-    assert_eq!(mask.to_bitmask(), 0x00ff);
-
-    let mask = mask32x16::from_bitmask(simd, 0xff00);
-    assert_eq!(mask.to_bitmask(), 0xff00);
-
-    let mask = mask32x16::from_bitmask(simd, 0x5555);
-    assert_eq!(mask.to_bitmask(), 0x5555);
-
-    let mask = mask32x16::from_bitmask(simd, 0xaaaa);
-    assert_eq!(mask.to_bitmask(), 0xaaaa);
-
-    let mask = mask32x16::from_bitmask(simd, 0xaa55);
-    assert_eq!(mask.to_bitmask(), 0xaa55);
-
-    let mask = mask32x16::from_bitmask(simd, 0xffff);
-    assert_eq!(mask.to_bitmask(), 0xffff);
-
-    let mask = mask32x16::from_bitmask(simd, 0xffff_0000);
-    assert_eq!(mask.to_bitmask(), 0x0000);
-
-    let mask = mask32x16::from_bitmask(simd, 0xffff_aa55);
-    assert_eq!(mask.to_bitmask(), 0xaa55);
-
-    let mask = mask32x16::from_bitmask(simd, 0xffff_ffff);
-    assert_eq!(mask.to_bitmask(), 0xffff);
-}
-
-#[simd_test]
-fn mask64x8_bitmask_roundtrip<S: Simd>(simd: S) {
-    let mask = mask64x8::from_bitmask(simd, 0x00);
-    assert_eq!(mask.to_bitmask(), 0x00);
-
-    let mask = mask64x8::from_bitmask(simd, 0x01);
-    assert_eq!(mask.to_bitmask(), 0x01);
-
-    let mask = mask64x8::from_bitmask(simd, 0x80);
-    assert_eq!(mask.to_bitmask(), 0x80);
-
-    let mask = mask64x8::from_bitmask(simd, 0x0f);
-    assert_eq!(mask.to_bitmask(), 0x0f);
+fn mask8x64_bitmask_roundtrip<S: Simd>(simd: S) {
+    let mask = mask8x64::from_bitmask(simd, 0x0000_0000_0000_0000);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000_0000_0000);
 
-    let mask = mask64x8::from_bitmask(simd, 0xf0);
-    assert_eq!(mask.to_bitmask(), 0xf0);
+    let mask = mask8x64::from_bitmask(simd, 0x0000_0000_0000_0001);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000_0000_0001);
 
-    let mask = mask64x8::from_bitmask(simd, 0x55);
-    assert_eq!(mask.to_bitmask(), 0x55);
+    let mask = mask8x64::from_bitmask(simd, 0x8000_0000_0000_0000);
+    assert_eq!(mask.to_bitmask(), 0x8000_0000_0000_0000);
 
-    let mask = mask64x8::from_bitmask(simd, 0xaa);
-    assert_eq!(mask.to_bitmask(), 0xaa);
+    let mask = mask8x64::from_bitmask(simd, 0x0000_0000_ffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0x0000_0000_ffff_ffff);
 
-    let mask = mask64x8::from_bitmask(simd, 0xa5);
-    assert_eq!(mask.to_bitmask(), 0xa5);
+    let mask = mask8x64::from_bitmask(simd, 0xffff_ffff_0000_0000);
+    assert_eq!(mask.to_bitmask(), 0xffff_ffff_0000_0000);
 
-    let mask = mask64x8::from_bitmask(simd, 0xff);
-    assert_eq!(mask.to_bitmask(), 0xff);
+    let mask = mask8x64::from_bitmask(simd, 0x5555_5555_5555_5555);
+    assert_eq!(mask.to_bitmask(), 0x5555_5555_5555_5555);
 
-    let mask = mask64x8::from_bitmask(simd, 0xff00);
-    assert_eq!(mask.to_bitmask(), 0x00);
+    let mask = mask8x64::from_bitmask(simd, 0xaaaa_aaaa_aaaa_aaaa);
+    assert_eq!(mask.to_bitmask(), 0xaaaa_aaaa_aaaa_aaaa);
 
-    let mask = mask64x8::from_bitmask(simd, 0xffa5);
-    assert_eq!(mask.to_bitmask(), 0xa5);
+    let mask = mask8x64::from_bitmask(simd, 0x8000_0001_5555_aaab);
+    assert_eq!(mask.to_bitmask(), 0x8000_0001_5555_aaab);
 
-    let mask = mask64x8::from_bitmask(simd, 0xffff);
-    assert_eq!(mask.to_bitmask(), 0xff);
+    let mask = mask8x64::from_bitmask(simd, 0xffff_ffff_ffff_ffff);
+    assert_eq!(mask.to_bitmask(), 0xffff_ffff_ffff_ffff);
 }
diff --git a/fearless_simd_tests/tests/mod.rs b/fearless_simd_tests/tests/mod.rs
index ac410b667..4d2f053d8 100644
--- a/fearless_simd_tests/tests/mod.rs
+++ b/fearless_simd_tests/tests/mod.rs
@@ -10,7 +10,6 @@ use fearless_simd::*;
 use fearless_simd_dev_macros::simd_test;
 
 mod harness;
-mod mask_methods;
 mod soundness;
 
 // Ensure that we can cast between generic native-width vectors

From c8b5f172ccab5762feb9df6401f20f01429dd062 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Sun, 24 May 2026 11:46:32 +0100
Subject: [PATCH 16/16] Add exhaustive roundtrip tests for more mask types

---
 .../harness/lm_generated/mask_methods.rs      | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs b/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
index 747a3e47e..15963b2a3 100644
--- a/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
+++ b/fearless_simd_tests/tests/harness/lm_generated/mask_methods.rs
@@ -76,6 +76,25 @@ fn mask64x8_bitmask_roundtrip<S: Simd>(simd: S) {
     }
 }
 
+#[simd_test]
+#[ignore] // takes too long to run on CI
+fn mask8x32_bitmask_roundtrip_exhaustive<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_ffff_u64 {
+        let mask = mask8x32::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits);
+    }
+}
+
+#[simd_test]
+#[ignore] // takes too long to run on CI
+fn mask16x32_bitmask_roundtrip_exhaustive<S: Simd>(simd: S) {
+    for bits in 0..=0xffff_ffff_u64 {
+        let mask = mask16x32::from_bitmask(simd, bits);
+        assert_eq!(mask.to_bitmask(), bits);
+    }
+}
+
+// Hand-picked bit patterns that always run; the exhaustive variant is #[ignore]d.
 #[simd_test]
 fn mask8x32_bitmask_roundtrip<S: Simd>(simd: S) {
     let mask = mask8x32::from_bitmask(simd, 0x0000_0000);
@@ -115,6 +134,7 @@ fn mask8x32_bitmask_roundtrip<S: Simd>(simd: S) {
     assert_eq!(mask.to_bitmask(), 0xffff_ffff);
 }
 
+// Hand-picked bit patterns that always run; the exhaustive variant is #[ignore]d.
 #[simd_test]
 fn mask16x32_bitmask_roundtrip<S: Simd>(simd: S) {
     let mask = mask16x32::from_bitmask(simd, 0x0000_0000);