diff --git a/.cargo/config.toml b/.cargo/config.toml
index 4c044c259..479d9fd44 100644
--- a/.cargo/config.toml
+++ b/.cargo/config.toml
@@ -1,3 +1,6 @@
+[target.thumbv7neon-unknown-linux-gnueabihf]
+linker = "arm-linux-gnueabihf-gcc"
+
 [target.wasm32-wasip1]
 runner = "wasmtime --dir=. --dir=.. --env ARBTEST_BUDGET_MS"
 rustflags = ["-C", "target-feature=+simd128,+relaxed-simd"]
diff --git a/.github/workflows/pull_request.yml b/.github/workflows/pull_request.yml
index 1701cf1cd..7828fef52 100644
--- a/.github/workflows/pull_request.yml
+++ b/.github/workflows/pull_request.yml
@@ -67,6 +67,14 @@ jobs:
       matrix:
         features: [all, none]
         os: [ubuntu-24.04-arm, ubuntu-latest]
+        include:
+          - os: ubuntu-24.04-arm
+            nightly: true
+            features: neon
+            target: thumbv7neon-unknown-linux-gnueabihf
+          - os: ubuntu-latest
+            features: simd128
+            target: wasm32-wasip1
     steps:
       - name: Checkout
         uses: actions/checkout@34e114876b0b11c390a56381ad16ebd13914f8d5 # v4
@@ -78,6 +86,7 @@ jobs:
         uses: dtolnay/rust-toolchain@29eef336d9b2848a0b548edc03f92a220660cdb8 # stable
         with:
           components: clippy
+          targets: ${{ matrix.target }}
 
       - name: Rust cache
         uses: Swatinem/rust-cache@98c8021b550208e191a6a3145459bfc9fb29c4c0 # v2.8.0
@@ -87,11 +96,17 @@ jobs:
 
       - name: Clippy with all features
         if: ${{ matrix.features == 'all' }}
-        run: cargo clippy --release --all-targets --all-features --tests --all -- -D warnings
+        run: cargo clippy --release --all-targets --all-features --all -- -D warnings
 
       - name: Clippy with no features
         if: ${{ matrix.features == 'none' }}
-        run: cargo clippy --release --all-targets --no-default-features --tests --all -- -D warnings
+        run: cargo clippy --release --all-targets --no-default-features --all -- -D warnings
+
+      - name: Clippy with ${{ matrix.features }} features
+        if: ${{ matrix.target }}
+        env:
+          RUSTC_BOOTSTRAP: ${{ matrix.nightly && 1 || 0 }}
+        run: cargo clippy --release -p jxl_simd --no-default-features --features ${{ matrix.features }} ${{ matrix.nightly && '--features nightly' }} --target ${{ matrix.target }} -- -D warnings
 
   test:
     runs-on: ${{ matrix.os || 'ubuntu-latest' }}
@@ -109,6 +124,9 @@ jobs:
             simd: avx512
           - os: ubuntu-24.04-arm
             simd: neon
+          - os: ubuntu-24.04-arm
+            simd: nightly-neon
+            target: thumbv7neon-unknown-linux-gnueabihf
           - os: ubuntu-latest
             simd: simd128
             target: wasm32-wasip1
@@ -129,6 +147,12 @@ jobs:
         with:
           targets: ${{ matrix.target }}
 
+      - name: Install ARMv7 toolchain
+        if: ${{ matrix.simd == 'nightly-neon' }}
+        run: |
+          sudo dpkg --add-architecture armhf
+          sudo apt install --update -y gcc-arm-linux-gnueabihf libc6-dev:armhf
+
       - name: Install wasmtime
         if: ${{ matrix.target == 'wasm32-wasip1' }}
         uses: bytecodealliance/actions/wasmtime/setup@v1
@@ -142,6 +166,16 @@ jobs:
         if: ${{ matrix.simd != 'none' && !matrix.target }}
         run: cargo test --release --all --no-fail-fast --no-default-features --features ${{ matrix.simd }}
 
+      - name: Tests with SIMD feature ${{ matrix.simd }} (ARMv7)
+        if: ${{ matrix.simd == 'nightly-neon' }}
+        env:
+          RUSTC_BOOTSTRAP: 1
+        run: |
+          cargo test --release --workspace --no-fail-fast --no-default-features --features nightly,neon --target ${{ matrix.target }} -- \
+            --skip compare_incremental_tirr_photo \
+            --skip compare_pipelines_progressive \
+            --skip compare_pipelines_tirr_photo
+
       - name: Tests with SIMD feature ${{ matrix.simd }} (wasm ${{ matrix.wasm_features }})
         if: ${{ matrix.target == 'wasm32-wasip1' }}
         env:
@@ -149,7 +183,6 @@ jobs:
         run: |
           cargo test --release --workspace --exclude jxl_cli --exclude jxl_cms --no-fail-fast --no-default-features --features ${{ matrix.simd }} --target ${{ matrix.target }} -- \
             --skip tirr_photo \
-            --skip huge_image \
             --skip fuzzer_smallbuffer
 
       - name: Tests with no features
diff --git a/AUTHORS b/AUTHORS
index a57e97210..e514dc557 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -19,6 +19,7 @@ Google LLC <*@google.com>
 Caio <c410.f3r@gmail.com>
 Galaxy4594 <164440799+Galaxy4594@users.noreply.github.com>
 Ewout ter Hoeven <E.M.terHoeven@student.tudelft.nl>
+Foolbar <118464521+FooIbar@users.noreply.github.com>
 Helmut Januschka <helmut@januschka.com>
 Inflation <2375962+inflation@users.noreply.github.com>
 Jacob Abel <jacobabel@nullpo.dev>
diff --git a/jxl/Cargo.toml b/jxl/Cargo.toml
index 51a006e7a..55924898d 100644
--- a/jxl/Cargo.toml
+++ b/jxl/Cargo.toml
@@ -34,12 +34,13 @@ jxl_macros = { path = "../jxl_macros", version = "=0.4.3", features = ["test"] }
 [features]
 default = ["all-simd"]
 
-all-simd = ["jxl_simd/all-simd"]
-sse42 = ["jxl_simd/sse42"]
-avx = ["jxl_simd/avx"]
-avx512 = ["jxl_simd/avx512"]
-neon = ["jxl_simd/neon"]
-simd128 = ["jxl_simd/simd128"]
+all-simd = ["jxl_transforms/all-simd"]
+sse42 = ["jxl_transforms/sse42"]
+avx = ["jxl_transforms/avx"]
+avx512 = ["jxl_transforms/avx512"]
+neon = ["jxl_transforms/neon"]
+simd128 = ["jxl_transforms/simd128"]
+nightly = ["jxl_transforms/nightly"]
 
 [lints]
 workspace = true
diff --git a/jxl/src/image/test.rs b/jxl/src/image/test.rs
index ca9179a22..3a88a26a5 100644
--- a/jxl/src/image/test.rs
+++ b/jxl/src/image/test.rs
@@ -31,6 +31,10 @@ impl<T: ImageDataType> Image<T> {
     }
 }
 
+#[cfg_attr(
+    target_pointer_width = "32",
+    ignore = "will overflow on 32-bit targets"
+)]
 #[test]
 fn huge_image() {
     assert!(Image::<u8>::new((1 << 28, 1 << 28)).is_err());
diff --git a/jxl/src/lib.rs b/jxl/src/lib.rs
index 6787b0b2e..427a4d6a0 100644
--- a/jxl/src/lib.rs
+++ b/jxl/src/lib.rs
@@ -4,6 +4,16 @@
 // license that can be found in the LICENSE file.
 
 #![deny(unsafe_code)]
+#![cfg_attr(
+    all(
+        target_arch = "arm",
+        target_feature = "v7",
+        feature = "nightly",
+        feature = "neon"
+    ),
+    feature(arm_target_feature)
+)]
+
 pub mod api;
 pub mod bit_reader;
 pub mod color;
diff --git a/jxl_cli/Cargo.toml b/jxl_cli/Cargo.toml
index 604710698..525d339bb 100644
--- a/jxl_cli/Cargo.toml
+++ b/jxl_cli/Cargo.toml
@@ -37,6 +37,7 @@ avx = ["jxl/avx"]
 avx512 = ["jxl/avx512"]
 neon = ["jxl/neon"]
 simd128 = ["jxl/simd128"]
+nightly = ["jxl/nightly"]
 
 [lints]
 workspace = true
diff --git a/jxl_simd/Cargo.toml b/jxl_simd/Cargo.toml
index 3298d12e0..64c0601e7 100644
--- a/jxl_simd/Cargo.toml
+++ b/jxl_simd/Cargo.toml
@@ -23,6 +23,7 @@ avx = ["sse42"]
 avx512 = ["avx"]
 neon = []
 simd128 = []
+nightly = []
 
 [lints]
 workspace = true
diff --git a/jxl_simd/src/aarch64/mod.rs b/jxl_simd/src/arm/mod.rs
similarity index 96%
rename from jxl_simd/src/aarch64/mod.rs
rename to jxl_simd/src/arm/mod.rs
index bf1f97a5e..eed8392b9 100644
--- a/jxl_simd/src/aarch64/mod.rs
+++ b/jxl_simd/src/arm/mod.rs
@@ -42,6 +42,7 @@ macro_rules! simd_function_body_neon {
             return $name(d, $($val),*);
         } else if let Some(d) = $crate::NeonDescriptor::new() {
             #[target_feature(enable = "neon")]
+			#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
             fn neon(d: $crate::NeonDescriptor, $($arg: $ty),*) $(-> $ret)? {
                 $name(d, $($val),*)
             }
@@ -87,6 +88,7 @@ macro_rules! test_neon {
                 use $crate::SimdDescriptor;
                 let Some(d) = $crate::NeonDescriptor::new() else { return; };
                 #[target_feature(enable = "neon")]
+                #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
                 fn inner(d: $crate::NeonDescriptor) {
                     $name(d)
                 }
diff --git a/jxl_simd/src/aarch64/neon.rs b/jxl_simd/src/arm/neon.rs
similarity index 73%
rename from jxl_simd/src/aarch64/neon.rs
rename to jxl_simd/src/arm/neon.rs
index e4a56bb0a..32ff09b7d 100644
--- a/jxl_simd/src/aarch64/neon.rs
+++ b/jxl_simd/src/arm/neon.rs
@@ -3,14 +3,16 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
-use std::{
-    arch::aarch64::*,
-    ops::{
-        Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Div,
-        DivAssign, Mul, MulAssign, Neg, Sub, SubAssign,
-    },
+use std::ops::{
+    Add, AddAssign, BitAnd, BitAndAssign, BitOr, BitOrAssign, BitXor, BitXorAssign, Div, DivAssign,
+    Mul, MulAssign, Neg, Sub, SubAssign,
 };
 
+#[cfg(target_arch = "aarch64")]
+use std::arch::aarch64::*;
+#[cfg(target_arch = "arm")]
+use std::arch::arm::*;
+
 use crate::U32SimdVec;
 
 use super::super::{F32SimdVec, I32SimdVec, SimdDescriptor, SimdMask, U8SimdVec, U16SimdVec};
@@ -51,7 +53,12 @@ impl SimdDescriptor for NeonDescriptor {
     type Descriptor128 = Self;
 
     fn new() -> Option<Self> {
-        if std::arch::is_aarch64_feature_detected!("neon") {
+        #[cfg(target_arch = "aarch64")]
+        let has_neon = std::arch::is_aarch64_feature_detected!("neon");
+        #[cfg(target_arch = "arm")]
+        let has_neon = std::arch::is_arm_feature_detected!("neon");
+
+        if has_neon {
             // SAFETY: we just checked neon.
             Some(unsafe { Self::new_unchecked() })
         } else {
@@ -69,6 +76,7 @@ impl SimdDescriptor for NeonDescriptor {
 
     fn call<R>(self, f: impl FnOnce(Self) -> R) -> R {
         #[target_feature(enable = "neon")]
+        #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
         #[inline(never)]
         unsafe fn inner<R>(d: NeonDescriptor, f: impl FnOnce(NeonDescriptor) -> R) -> R {
             f(d)
@@ -88,6 +96,7 @@ macro_rules! fn_neon {
         #[inline(always)]
         fn $name(self: $self_ty, $($arg: $ty),*) $(-> $ret)? {
             #[target_feature(enable = "neon")]
+			#[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
             #[inline]
             fn inner($this: $self_ty, $($arg: $ty),*) $(-> $ret)? {
                 $body
@@ -102,6 +111,20 @@ macro_rules! fn_neon {
 #[repr(transparent)]
 pub struct F32VecNeon(float32x4_t, NeonDescriptor);
 
+#[cfg(target_arch = "arm")]
+#[inline(always)]
+fn f32_vec_from_array(array: [f32; 4]) -> float32x4_t {
+    // SAFETY: Both types have an identical layout.
+    unsafe { core::mem::transmute(array) }
+}
+
+#[cfg(target_arch = "arm")]
+#[inline(always)]
+fn f32_vec_to_array(vec: float32x4_t) -> [f32; 4] {
+    // SAFETY: Both types have an identical layout.
+    unsafe { core::mem::transmute(vec) }
+}
+
 impl F32SimdVec for F32VecNeon {
     type Descriptor = NeonDescriptor;
 
@@ -180,6 +203,7 @@ impl F32SimdVec for F32VecNeon {
         dest: &mut [f32],
     ) {
         #[target_feature(enable = "neon")]
+        #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
         #[inline]
         fn store_interleaved_8_impl(
             a: float32x4_t,
@@ -195,60 +219,86 @@ impl F32SimdVec for F32VecNeon {
             assert!(dest.len() >= 8 * F32VecNeon::LEN);
             // NEON doesn't have vst8, so we use manual interleaving
             // For 4-wide vectors, output is 32 elements: [a0,b0,c0,d0,e0,f0,g0,h0, a1,...]
+            let (out0, out1, out2, out3, out4, out5, out6, out7);
+
+            #[cfg(target_arch = "aarch64")]
+            {
+                // Use zip to interleave pairs
+                let ae_lo = vzip1q_f32(a, e); // [a0, e0, a1, e1]
+                let ae_hi = vzip2q_f32(a, e); // [a2, e2, a3, e3]
+                let bf_lo = vzip1q_f32(b, f);
+                let bf_hi = vzip2q_f32(b, f);
+                let cg_lo = vzip1q_f32(c, g);
+                let cg_hi = vzip2q_f32(c, g);
+                let dh_lo = vzip1q_f32(d, h);
+                let dh_hi = vzip2q_f32(d, h);
+
+                // Now interleave ae with bf, and cg with dh
+                let aebf_0 = vzip1q_f32(ae_lo, bf_lo); // [a0, b0, e0, f0]
+                let aebf_1 = vzip2q_f32(ae_lo, bf_lo); // [a1, b1, e1, f1]
+                let aebf_2 = vzip1q_f32(ae_hi, bf_hi);
+                let aebf_3 = vzip2q_f32(ae_hi, bf_hi);
+                let cgdh_0 = vzip1q_f32(cg_lo, dh_lo); // [c0, d0, g0, h0]
+                let cgdh_1 = vzip2q_f32(cg_lo, dh_lo);
+                let cgdh_2 = vzip1q_f32(cg_hi, dh_hi);
+                let cgdh_3 = vzip2q_f32(cg_hi, dh_hi);
+
+                // Final interleave to get [a0,b0,c0,d0,e0,f0,g0,h0]
+                out0 = vreinterpretq_f32_f64(vzip1q_f64(
+                    vreinterpretq_f64_f32(aebf_0),
+                    vreinterpretq_f64_f32(cgdh_0),
+                ));
+                out1 = vreinterpretq_f32_f64(vzip2q_f64(
+                    vreinterpretq_f64_f32(aebf_0),
+                    vreinterpretq_f64_f32(cgdh_0),
+                ));
+                out2 = vreinterpretq_f32_f64(vzip1q_f64(
+                    vreinterpretq_f64_f32(aebf_1),
+                    vreinterpretq_f64_f32(cgdh_1),
+                ));
+                out3 = vreinterpretq_f32_f64(vzip2q_f64(
+                    vreinterpretq_f64_f32(aebf_1),
+                    vreinterpretq_f64_f32(cgdh_1),
+                ));
+                out4 = vreinterpretq_f32_f64(vzip1q_f64(
+                    vreinterpretq_f64_f32(aebf_2),
+                    vreinterpretq_f64_f32(cgdh_2),
+                ));
+                out5 = vreinterpretq_f32_f64(vzip2q_f64(
+                    vreinterpretq_f64_f32(aebf_2),
+                    vreinterpretq_f64_f32(cgdh_2),
+                ));
+                out6 = vreinterpretq_f32_f64(vzip1q_f64(
+                    vreinterpretq_f64_f32(aebf_3),
+                    vreinterpretq_f64_f32(cgdh_3),
+                ));
+                out7 = vreinterpretq_f32_f64(vzip2q_f64(
+                    vreinterpretq_f64_f32(aebf_3),
+                    vreinterpretq_f64_f32(cgdh_3),
+                ));
+            }
 
-            // Use zip to interleave pairs
-            let ae_lo = vzip1q_f32(a, e); // [a0, e0, a1, e1]
-            let ae_hi = vzip2q_f32(a, e); // [a2, e2, a3, e3]
-            let bf_lo = vzip1q_f32(b, f);
-            let bf_hi = vzip2q_f32(b, f);
-            let cg_lo = vzip1q_f32(c, g);
-            let cg_hi = vzip2q_f32(c, g);
-            let dh_lo = vzip1q_f32(d, h);
-            let dh_hi = vzip2q_f32(d, h);
-
-            // Now interleave ae with bf, and cg with dh
-            let aebf_0 = vzip1q_f32(ae_lo, bf_lo); // [a0, b0, e0, f0]
-            let aebf_1 = vzip2q_f32(ae_lo, bf_lo); // [a1, b1, e1, f1]
-            let aebf_2 = vzip1q_f32(ae_hi, bf_hi);
-            let aebf_3 = vzip2q_f32(ae_hi, bf_hi);
-            let cgdh_0 = vzip1q_f32(cg_lo, dh_lo); // [c0, d0, g0, h0]
-            let cgdh_1 = vzip2q_f32(cg_lo, dh_lo);
-            let cgdh_2 = vzip1q_f32(cg_hi, dh_hi);
-            let cgdh_3 = vzip2q_f32(cg_hi, dh_hi);
-
-            // Final interleave to get [a0,b0,c0,d0,e0,f0,g0,h0]
-            let out0 = vreinterpretq_f32_f64(vzip1q_f64(
-                vreinterpretq_f64_f32(aebf_0),
-                vreinterpretq_f64_f32(cgdh_0),
-            ));
-            let out1 = vreinterpretq_f32_f64(vzip2q_f64(
-                vreinterpretq_f64_f32(aebf_0),
-                vreinterpretq_f64_f32(cgdh_0),
-            ));
-            let out2 = vreinterpretq_f32_f64(vzip1q_f64(
-                vreinterpretq_f64_f32(aebf_1),
-                vreinterpretq_f64_f32(cgdh_1),
-            ));
-            let out3 = vreinterpretq_f32_f64(vzip2q_f64(
-                vreinterpretq_f64_f32(aebf_1),
-                vreinterpretq_f64_f32(cgdh_1),
-            ));
-            let out4 = vreinterpretq_f32_f64(vzip1q_f64(
-                vreinterpretq_f64_f32(aebf_2),
-                vreinterpretq_f64_f32(cgdh_2),
-            ));
-            let out5 = vreinterpretq_f32_f64(vzip2q_f64(
-                vreinterpretq_f64_f32(aebf_2),
-                vreinterpretq_f64_f32(cgdh_2),
-            ));
-            let out6 = vreinterpretq_f32_f64(vzip1q_f64(
-                vreinterpretq_f64_f32(aebf_3),
-                vreinterpretq_f64_f32(cgdh_3),
-            ));
-            let out7 = vreinterpretq_f32_f64(vzip2q_f64(
-                vreinterpretq_f64_f32(aebf_3),
-                vreinterpretq_f64_f32(cgdh_3),
-            ));
+            #[cfg(target_arch = "arm")]
+            {
+                let ae = vzipq_f32(a, e);
+                let bf = vzipq_f32(b, f);
+                let cg = vzipq_f32(c, g);
+                let dh = vzipq_f32(d, h);
+
+                let aebf_lo = vzipq_f32(ae.0, bf.0);
+                let aebf_hi = vzipq_f32(ae.1, bf.1);
+                let cgdh_lo = vzipq_f32(cg.0, dh.0);
+                let cgdh_hi = vzipq_f32(cg.1, dh.1);
+
+                out0 = vcombine_f32(vget_low_f32(aebf_lo.0), vget_low_f32(cgdh_lo.0));
+                out1 = vcombine_f32(vget_high_f32(aebf_lo.0), vget_high_f32(cgdh_lo.0));
+                out2 = vcombine_f32(vget_low_f32(aebf_lo.1), vget_low_f32(cgdh_lo.1));
+                out3 = vcombine_f32(vget_high_f32(aebf_lo.1), vget_high_f32(cgdh_lo.1));
+                out4 = vcombine_f32(vget_low_f32(aebf_hi.0), vget_low_f32(cgdh_hi.0));
+                out5 = vcombine_f32(vget_high_f32(aebf_hi.0), vget_high_f32(cgdh_hi.0));
+                out6 = vcombine_f32(vget_low_f32(aebf_hi.1), vget_low_f32(cgdh_hi.1));
+                out7 = vcombine_f32(vget_high_f32(aebf_hi.1), vget_high_f32(cgdh_hi.1));
+            }
 
             // SAFETY: we just checked that dest has enough space.
             unsafe {
@@ -298,6 +348,7 @@ impl F32SimdVec for F32VecNeon {
     #[inline(always)]
     fn transpose_square(d: NeonDescriptor, data: &mut [[f32; 4]], stride: usize) {
         #[target_feature(enable = "neon")]
+        #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
         #[inline]
         fn transpose4x4f32(d: NeonDescriptor, data: &mut [[f32; 4]], stride: usize) {
             assert!(data.len() > 3 * stride);
@@ -307,26 +358,43 @@ impl F32SimdVec for F32VecNeon {
             let p2 = F32VecNeon::load_array(d, &data[2 * stride]).0;
             let p3 = F32VecNeon::load_array(d, &data[3 * stride]).0;
 
-            // Stage 1: Transpose within each of 2x2 blocks
-            let tr0 = vreinterpretq_f64_f32(vtrn1q_f32(p0, p1));
-            let tr1 = vreinterpretq_f64_f32(vtrn2q_f32(p0, p1));
-            let tr2 = vreinterpretq_f64_f32(vtrn1q_f32(p2, p3));
-            let tr3 = vreinterpretq_f64_f32(vtrn2q_f32(p2, p3));
+            let (out0, out1, out2, out3);
+
+            #[cfg(target_arch = "aarch64")]
+            {
+                // Stage 1: Transpose within each of 2x2 blocks
+                let tr0 = vreinterpretq_f64_f32(vtrn1q_f32(p0, p1));
+                let tr1 = vreinterpretq_f64_f32(vtrn2q_f32(p0, p1));
+                let tr2 = vreinterpretq_f64_f32(vtrn1q_f32(p2, p3));
+                let tr3 = vreinterpretq_f64_f32(vtrn2q_f32(p2, p3));
+
+                // Stage 2: Transpose 2x2 grid of 2x2 blocks
+                out0 = vreinterpretq_f32_f64(vzip1q_f64(tr0, tr2));
+                out1 = vreinterpretq_f32_f64(vzip1q_f64(tr1, tr3));
+                out2 = vreinterpretq_f32_f64(vzip2q_f64(tr0, tr2));
+                out3 = vreinterpretq_f32_f64(vzip2q_f64(tr1, tr3));
+            }
 
-            // Stage 2: Transpose 2x2 grid of 2x2 blocks
-            let p0 = vreinterpretq_f32_f64(vzip1q_f64(tr0, tr2));
-            let p1 = vreinterpretq_f32_f64(vzip1q_f64(tr1, tr3));
-            let p2 = vreinterpretq_f32_f64(vzip2q_f64(tr0, tr2));
-            let p3 = vreinterpretq_f32_f64(vzip2q_f64(tr1, tr3));
+            #[cfg(target_arch = "arm")]
+            {
+                let tr01 = vtrnq_f32(p0, p1);
+                let tr23 = vtrnq_f32(p2, p3);
 
-            F32VecNeon(p0, d).store_array(&mut data[0]);
-            F32VecNeon(p1, d).store_array(&mut data[1 * stride]);
-            F32VecNeon(p2, d).store_array(&mut data[2 * stride]);
-            F32VecNeon(p3, d).store_array(&mut data[3 * stride]);
+                out0 = vcombine_f32(vget_low_f32(tr01.0), vget_low_f32(tr23.0));
+                out1 = vcombine_f32(vget_low_f32(tr01.1), vget_low_f32(tr23.1));
+                out2 = vcombine_f32(vget_high_f32(tr01.0), vget_high_f32(tr23.0));
+                out3 = vcombine_f32(vget_high_f32(tr01.1), vget_high_f32(tr23.1));
+            }
+
+            F32VecNeon(out0, d).store_array(&mut data[0]);
+            F32VecNeon(out1, d).store_array(&mut data[1 * stride]);
+            F32VecNeon(out2, d).store_array(&mut data[2 * stride]);
+            F32VecNeon(out3, d).store_array(&mut data[3 * stride]);
         }
 
         /// Potentially faster variant of `transpose4x4f32` where `stride == 1`.
         #[target_feature(enable = "neon")]
+        #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
         #[inline]
         fn transpose4x4f32_contiguous(d: NeonDescriptor, data: &mut [[f32; 4]]) {
             assert!(data.len() > 3);
@@ -358,11 +426,19 @@ impl F32SimdVec for F32VecNeon {
 
     fn_neon! {
         fn mul_add(this: F32VecNeon, mul: F32VecNeon, add: F32VecNeon) -> F32VecNeon {
-            F32VecNeon(vfmaq_f32(add.0, this.0, mul.0), this.1)
+            #[cfg(target_arch = "aarch64")]
+            let res = vfmaq_f32(add.0, this.0, mul.0);
+            #[cfg(target_arch = "arm")]
+            let res = vmlaq_f32(add.0, this.0, mul.0);
+            F32VecNeon(res, this.1)
         }
 
         fn neg_mul_add(this: F32VecNeon, mul: F32VecNeon, add: F32VecNeon) -> F32VecNeon {
-            F32VecNeon(vfmsq_f32(add.0, this.0, mul.0), this.1)
+            #[cfg(target_arch = "aarch64")]
+            let res = vfmsq_f32(add.0, this.0, mul.0);
+            #[cfg(target_arch = "arm")]
+            let res = vmlsq_f32(add.0, this.0, mul.0);
+            F32VecNeon(res, this.1)
         }
 
         fn abs(this: F32VecNeon) -> F32VecNeon {
@@ -370,11 +446,31 @@ impl F32SimdVec for F32VecNeon {
         }
 
         fn floor(this: F32VecNeon) -> F32VecNeon {
-            F32VecNeon(vrndmq_f32(this.0), this.1)
+            #[cfg(target_arch = "aarch64")]
+            let res = vrndmq_f32(this.0);
+            #[cfg(target_arch = "arm")]
+            let res = {
+                let mut tmp = f32_vec_to_array(this.0);
+                for x in &mut tmp {
+                    *x = x.floor();
+                }
+                f32_vec_from_array(tmp)
+            };
+            F32VecNeon(res, this.1)
         }
 
         fn sqrt(this: F32VecNeon) -> F32VecNeon {
-            F32VecNeon(vsqrtq_f32(this.0), this.1)
+            #[cfg(target_arch = "aarch64")]
+            let res = vsqrtq_f32(this.0);
+            #[cfg(target_arch = "arm")]
+            let res = {
+                let mut tmp = f32_vec_to_array(this.0);
+                for x in &mut tmp {
+                    *x = x.sqrt();
+                }
+                f32_vec_from_array(tmp)
+            };
+            F32VecNeon(res, this.1)
         }
 
         fn neg(this: F32VecNeon) -> F32VecNeon {
@@ -411,7 +507,10 @@ impl F32SimdVec for F32VecNeon {
         fn round_store_u8(this: F32VecNeon, dest: &mut [u8]) {
             assert!(dest.len() >= F32VecNeon::LEN);
             // Round to nearest integer
+            #[cfg(target_arch = "aarch64")]
             let rounded = vrndnq_f32(this.0);
+            #[cfg(target_arch = "arm")]
+            let rounded = vaddq_f32(this.0, vdupq_n_f32(0.5));
             // Convert to i32, then to u16, then to u8
             let i32s = vcvtq_s32_f32(rounded);
             let u16s = vqmovun_s32(i32s);
@@ -426,7 +525,10 @@ impl F32SimdVec for F32VecNeon {
         fn round_store_u16(this: F32VecNeon, dest: &mut [u16]) {
             assert!(dest.len() >= F32VecNeon::LEN);
             // Round to nearest integer
+            #[cfg(target_arch = "aarch64")]
             let rounded = vrndnq_f32(this.0);
+            #[cfg(target_arch = "arm")]
+            let rounded = vaddq_f32(this.0, vdupq_n_f32(0.5));
             // Convert to i32, then to u16
             let i32s = vcvtq_s32_f32(rounded);
             let u16s = vqmovun_s32(i32s);
@@ -439,12 +541,14 @@ impl F32SimdVec for F32VecNeon {
 
         fn store_f16_bits(this: F32VecNeon, dest: &mut [u16]) {
             assert!(dest.len() >= F32VecNeon::LEN);
+
+            #[cfg(target_arch = "aarch64")]
             // Use inline asm because Rust stdarch incorrectly requires fp16 target feature
             // for vcvt_f16_f32 (fixed in https://github.com/rust-lang/stdarch/pull/1978)
-            let f16_bits: uint16x4_t;
             // SAFETY: NEON is available (guaranteed by descriptor), dest has enough space,
             // vst1_u16 supports unaligned stores.
             unsafe {
+                let f16_bits: uint16x4_t;
                 std::arch::asm!(
                     "fcvtn {out:v}.4h, {inp:v}.4s",
                     inp = in(vreg) this.0,
@@ -453,15 +557,26 @@ impl F32SimdVec for F32VecNeon {
                 );
                 vst1_u16(dest.as_mut_ptr(), f16_bits);
             }
+
+            #[cfg(target_arch = "arm")]
+            {
+                let tmp = f32_vec_to_array(this.0);
+                for (d, t) in dest.iter_mut().zip(tmp) {
+                    *d = crate::f16::from_f32(t).to_bits();
+                }
+            }
         }
     }
 
     #[inline(always)]
     fn load_f16_bits(d: Self::Descriptor, mem: &[u16]) -> Self {
         assert!(mem.len() >= Self::LEN);
+
+        let result: float32x4_t;
+
+        #[cfg(target_arch = "aarch64")]
         // Use inline asm because Rust stdarch incorrectly requires fp16 target feature
         // for vcvt_f32_f16 (fixed in https://github.com/rust-lang/stdarch/pull/1978)
-        let result: float32x4_t;
         // SAFETY: NEON is available (guaranteed by descriptor), mem has enough space.
         // vld1_u16 supports unaligned loads.
         unsafe {
@@ -473,12 +588,20 @@ impl F32SimdVec for F32VecNeon {
                 options(pure, nomem, nostack),
             );
         }
+
+        #[cfg(target_arch = "arm")]
+        {
+            let tmp = core::array::from_fn(|i| crate::f16::from_bits(mem[i]).to_f32());
+            result = f32_vec_from_array(tmp);
+        }
+
         F32VecNeon(result, d)
     }
 
     #[inline(always)]
     fn prepare_table_bf16_8(_d: NeonDescriptor, table: &[f32; 8]) -> Bf16Table8Neon {
         #[target_feature(enable = "neon")]
+        #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
         #[inline]
         fn prepare_impl(table: &[f32; 8]) -> uint8x16_t {
             // Convert f32 table to BF16 packed in 128 bits (16 bytes for 8 entries)
@@ -507,6 +630,7 @@ impl F32SimdVec for F32VecNeon {
     #[inline(always)]
     fn table_lookup_bf16_8(d: NeonDescriptor, table: Bf16Table8Neon, indices: I32VecNeon) -> Self {
         #[target_feature(enable = "neon")]
+        #[cfg_attr(target_arch = "arm", target_feature(enable = "v7"))]
         #[inline]
         fn lookup_impl(bf16_table: uint8x16_t, indices: int32x4_t) -> float32x4_t {
             // Build shuffle mask efficiently using arithmetic on 32-bit indices.
@@ -523,9 +647,19 @@ impl F32SimdVec for F32VecNeon {
             let shl25 = vshlq_n_u32::<25>(indices_u32);
             let base = vdupq_n_u32(0x01008080);
             let shuffle_mask = vorrq_u32(vorrq_u32(shl17, shl25), base);
+            let shuffle_mask_u8 = vreinterpretq_u8_u32(shuffle_mask);
 
             // Perform the table lookup (out of range indices give 0)
-            let result = vqtbl1q_u8(bf16_table, vreinterpretq_u8_u32(shuffle_mask));
+            #[cfg(target_arch = "aarch64")]
+            let result = vqtbl1q_u8(bf16_table, shuffle_mask_u8);
+
+            #[cfg(target_arch = "arm")]
+            let result = {
+                let table = uint8x8x2_t(vget_low_u8(bf16_table), vget_high_u8(bf16_table));
+                let res_low = vtbl2_u8(table, vget_low_u8(shuffle_mask_u8));
+                let res_high = vtbl2_u8(table, vget_high_u8(shuffle_mask_u8));
+                vcombine_u8(res_low, res_high)
+            };
 
             // Result has bf16 in high 16 bits of each 32-bit lane = valid f32
             vreinterpretq_f32_u8(result)
@@ -566,7 +700,18 @@ impl Div<F32VecNeon> for F32VecNeon {
     type Output = Self;
     fn_neon! {
         fn div(this: F32VecNeon, rhs: F32VecNeon) -> F32VecNeon {
-            F32VecNeon(vdivq_f32(this.0, rhs.0), this.1)
+            #[cfg(target_arch = "aarch64")]
+            let res = vdivq_f32(this.0, rhs.0);
+            #[cfg(target_arch = "arm")]
+            let res = {
+                let mut a = f32_vec_to_array(this.0);
+                let b = f32_vec_to_array(rhs.0);
+                for (x, y) in a.iter_mut().zip(b) {
+                    *x /= y;
+                }
+                f32_vec_from_array(a)
+            };
+            F32VecNeon(res, this.1)
         }
     }
 }
@@ -598,7 +743,19 @@ impl MulAssign<F32VecNeon> for F32VecNeon {
 impl DivAssign<F32VecNeon> for F32VecNeon {
     fn_neon! {
         fn div_assign(this: &mut F32VecNeon, rhs: F32VecNeon) {
-            this.0 = vdivq_f32(this.0, rhs.0);
+            #[cfg(target_arch = "aarch64")]
+            {
+                this.0 = vdivq_f32(this.0, rhs.0);
+            }
+            #[cfg(target_arch = "arm")]
+            {
+                let mut a = f32_vec_to_array(this.0);
+                let b = f32_vec_to_array(rhs.0);
+                for (x, y) in a.iter_mut().zip(b) {
+                    *x /= y;
+                }
+                this.0 = f32_vec_from_array(a);
+            }
         }
     }
 }
@@ -656,7 +813,11 @@ impl I32SimdVec for I32VecNeon {
         }
 
         fn lt_zero(this: I32VecNeon) -> MaskNeon {
-            MaskNeon(vcltzq_s32(this.0), this.1)
+            #[cfg(target_arch = "aarch64")]
+            let res = vcltzq_s32(this.0);
+            #[cfg(target_arch = "arm")]
+            let res = vcltq_s32(this.0, vdupq_n_s32(0));
+            MaskNeon(res, this.1)
         }
 
         fn eq(this: I32VecNeon, other: I32VecNeon) -> MaskNeon {
@@ -664,15 +825,26 @@ impl I32SimdVec for I32VecNeon {
         }
 
         fn eq_zero(this: I32VecNeon) -> MaskNeon {
-            MaskNeon(vceqzq_s32(this.0), this.1)
+            #[cfg(target_arch = "aarch64")]
+            let res = vceqzq_s32(this.0);
+            #[cfg(target_arch = "arm")]
+            let res = vceqq_s32(this.0, vdupq_n_s32(0));
+            MaskNeon(res, this.1)
         }
 
         fn mul_wide_take_high(this: I32VecNeon, rhs: I32VecNeon) -> I32VecNeon {
             let l = vmull_s32(vget_low_s32(this.0), vget_low_s32(rhs.0));
             let l = vreinterpretq_s32_s64(l);
+            #[cfg(target_arch = "aarch64")]
             let h = vmull_high_s32(this.0, rhs.0);
+            #[cfg(target_arch = "arm")]
+            let h = vmull_s32(vget_high_s32(this.0), vget_high_s32(rhs.0));
             let h = vreinterpretq_s32_s64(h);
-            I32VecNeon(vuzp2q_s32(l, h), this.1)
+            #[cfg(target_arch = "aarch64")]
+            let res = vuzp2q_s32(l, h);
+            #[cfg(target_arch = "arm")]
+            let res = vuzpq_s32(l, h).1;
+            I32VecNeon(res, this.1)
         }
     }
 
@@ -1010,7 +1182,15 @@ impl SimdMask for MaskNeon {
         }
 
         fn all(this: MaskNeon) -> bool {
-            vminvq_u32(this.0) == u32::MAX
+            #[cfg(target_arch = "aarch64")]
+            {
+                vminvq_u32(this.0) == u32::MAX
+            }
+            #[cfg(target_arch = "arm")]
+            {
+                let res = vand_u32(vget_low_u32(this.0), vget_high_u32(this.0));
+                (vget_lane_u32::<0>(res) & vget_lane_u32::<1>(res)) == u32::MAX
+            }
         }
     }
 }
diff --git a/jxl_simd/src/lib.rs b/jxl_simd/src/lib.rs
index 3c3b5367c..0224aeb40 100644
--- a/jxl_simd/src/lib.rs
+++ b/jxl_simd/src/lib.rs
@@ -4,6 +4,19 @@
 // license that can be found in the LICENSE file.
 
 #![allow(clippy::too_many_arguments)]
+#![cfg_attr(
+    all(
+        target_arch = "arm",
+        target_feature = "v7",
+        feature = "nightly",
+        feature = "neon"
+    ),
+    feature(
+        arm_target_feature,
+        stdarch_arm_feature_detection,
+        stdarch_arm_neon_intrinsics
+    )
+)]
 
 use std::{
     fmt::Debug,
@@ -16,8 +29,11 @@ use std::{
 #[cfg(target_arch = "x86_64")]
 mod x86_64;
 
-#[cfg(target_arch = "aarch64")]
-mod aarch64;
+#[cfg(any(
+    target_arch = "aarch64",
+    all(target_arch = "arm", target_feature = "v7", feature = "nightly")
+))]
+mod arm;
 
 #[cfg(target_arch = "wasm32")]
 mod wasm32;
@@ -34,8 +50,14 @@ pub use x86_64::avx512::Avx512Descriptor;
 #[cfg(all(target_arch = "x86_64", feature = "sse42"))]
 pub use x86_64::sse42::Sse42Descriptor;
 
-#[cfg(all(target_arch = "aarch64", feature = "neon"))]
-pub use aarch64::neon::NeonDescriptor;
+#[cfg(all(
+    any(
+        target_arch = "aarch64",
+        all(target_arch = "arm", target_feature = "v7", feature = "nightly")
+    ),
+    feature = "neon"
+))]
+pub use arm::neon::NeonDescriptor;
 
 #[cfg(all(target_arch = "wasm32", feature = "simd128"))]
 pub use wasm32::simd128::Simd128Descriptor;
diff --git a/jxl_simd/src/scalar.rs b/jxl_simd/src/scalar.rs
index 67c4186d1..60c292a59 100644
--- a/jxl_simd/src/scalar.rs
+++ b/jxl_simd/src/scalar.rs
@@ -440,7 +440,8 @@ impl SimdMask for bool {
 #[cfg(not(any(
     target_arch = "x86_64",
     target_arch = "aarch64",
-    target_arch = "wasm32"
+    target_arch = "wasm32",
+    all(target_arch = "arm", target_feature = "v7", feature = "nightly")
 )))]
 #[macro_export]
 macro_rules! simd_function {
@@ -463,7 +464,8 @@ macro_rules! simd_function {
 #[cfg(not(any(
     target_arch = "x86_64",
     target_arch = "aarch64",
-    target_arch = "wasm32"
+    target_arch = "wasm32",
+    all(target_arch = "arm", target_feature = "v7", feature = "nightly")
 )))]
 #[macro_export]
 macro_rules! test_all_instruction_sets {
@@ -483,7 +485,8 @@ macro_rules! test_all_instruction_sets {
 #[cfg(not(any(
     target_arch = "x86_64",
     target_arch = "aarch64",
-    target_arch = "wasm32"
+    target_arch = "wasm32",
+    all(target_arch = "arm", target_feature = "v7", feature = "nightly")
 )))]
 #[macro_export]
 macro_rules! bench_all_instruction_sets {
diff --git a/jxl_transforms/Cargo.toml b/jxl_transforms/Cargo.toml
index fae8061d4..7d0044b1d 100644
--- a/jxl_transforms/Cargo.toml
+++ b/jxl_transforms/Cargo.toml
@@ -33,6 +33,7 @@ avx = ["jxl_simd/avx"]
 avx512 = ["jxl_simd/avx512"]
 neon = ["jxl_simd/neon"]
 simd128 = ["jxl_simd/simd128"]
+nightly = ["jxl_simd/nightly"]
 
 [[bench]]
 name = "dct"
diff --git a/jxl_transforms/benches/dct.rs b/jxl_transforms/benches/dct.rs
index f8f1c0e89..f876d62c4 100644
--- a/jxl_transforms/benches/dct.rs
+++ b/jxl_transforms/benches/dct.rs
@@ -4,6 +4,15 @@
 // license that can be found in the LICENSE file.
 
 #![allow(clippy::identity_op)]
+#![cfg_attr(
+    all(
+        target_arch = "arm",
+        target_feature = "v7",
+        feature = "nightly",
+        feature = "neon"
+    ),
+    feature(arm_target_feature)
+)]
 
 use criterion::measurement::Measurement;
 use criterion::{criterion_group, criterion_main, BenchmarkGroup, BenchmarkId, Criterion};
diff --git a/jxl_transforms/src/lib.rs b/jxl_transforms/src/lib.rs
index 38849b9a0..b72d78546 100644
--- a/jxl_transforms/src/lib.rs
+++ b/jxl_transforms/src/lib.rs
@@ -3,6 +3,16 @@
 // Use of this source code is governed by a BSD-style
 // license that can be found in the LICENSE file.
 
+#![cfg_attr(
+    all(
+        target_arch = "arm",
+        target_feature = "v7",
+        feature = "nightly",
+        feature = "neon"
+    ),
+    feature(arm_target_feature)
+)]
+
 mod idct2d;
 mod reinterpreting_dct2d;
 pub mod transform;