From 46065505e6e591fd8730c3f38f700ae095dcd621 Mon Sep 17 00:00:00 2001
From: Laurenz Stampfl <laurenz.stampfl+github@gmail.com>
Date: Sat, 30 May 2026 11:16:16 +0200
Subject: [PATCH 1/7] Always inline three kernel methods

---
 sparse_strips/vello_common/src/clip.rs | 1 +
 sparse_strips/vello_common/src/rect.rs | 1 +
 sparse_strips/vello_common/src/tile.rs | 1 +
 3 files changed, 3 insertions(+)
diff --git a/sparse_strips/vello_common/src/clip.rs b/sparse_strips/vello_common/src/clip.rs
index a305e76f84..3988090b6d 100644
--- a/sparse_strips/vello_common/src/clip.rs
+++ b/sparse_strips/vello_common/src/clip.rs
@@ -230,6 +230,7 @@ pub fn intersect(
 ///
 /// This is all that this method does. It just looks more complicated as the logic for iterating
 /// in lock step is a bit tricky.
+#[inline(always)]
 fn intersect_impl<S: Simd>(
     simd: S,
     path_1: PathDataRef<'_>,
diff --git a/sparse_strips/vello_common/src/rect.rs b/sparse_strips/vello_common/src/rect.rs
index 55f98b7160..f4dfaeb33b 100644
--- a/sparse_strips/vello_common/src/rect.rs
+++ b/sparse_strips/vello_common/src/rect.rs
@@ -40,6 +40,7 @@ pub fn render(level: Level, rect: Rect, strip_buf: &mut Vec<Strip>, alpha_buf: &
 ///
 /// The x-alpha masks for the left/right edge tiles are y-independent, so they
 /// are precomputed once and reused across all interior rows.
+#[inline(always)]
 fn render_impl<S: Simd>(s: S, rect: Rect, strip_buf: &mut Vec<Strip>, alpha_buf: &mut Vec<u8>) {
     if rect.is_zero_area() {
         return;
diff --git a/sparse_strips/vello_common/src/tile.rs b/sparse_strips/vello_common/src/tile.rs
index 59f8885ecb..9c32a8943e 100644
--- a/sparse_strips/vello_common/src/tile.rs
+++ b/sparse_strips/vello_common/src/tile.rs
@@ -503,6 +503,7 @@ impl Tiles {
         ))
     }
 
+    #[inline(always)]
     fn make_tiles_analytic_aa_impl<S: Simd>(
         &mut self,
         s: S,

From a06ea8a3b365f5438740aeb1a04b8777b5b65410 Mon Sep 17 00:00:00 2001
From: Laurenz Stampfl <laurenz.stampfl+github@gmail.com>
Date: Sat, 30 May 2026 11:28:35 +0200
Subject: [PATCH 2/7] Vectorize `new` methods

---
 .../vello_cpu/src/fine/common/gradient/mod.rs |  27 +++--
 .../src/fine/common/gradient/radial.rs        |  43 ++++---
 .../src/fine/common/gradient/sweep.rs         |  13 +-
 .../vello_cpu/src/fine/common/image.rs        | 114 ++++++++++--------
 .../src/fine/common/rounded_blurred_rect.rs   |  99 ++++++++-------
 .../vello_cpu/src/fine/lowp/gradient.rs       |  23 ++--
 .../vello_cpu/src/fine/lowp/image.rs          | 103 ++++++++--------
 7 files changed, 235 insertions(+), 187 deletions(-)

diff --git a/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs b/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs
index 60cdbf8518..0386b17eff 100644
--- a/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs
@@ -53,17 +53,22 @@ pub(crate) struct GradientPainter<'a, S: Simd> {
 
 impl<'a, S: Simd> GradientPainter<'a, S> {
     pub(crate) fn new(simd: S, gradient: &'a EncodedGradient, t_vals: &'a [f32]) -> Self {
-        let lut = gradient.f32_lut(simd);
-        let scale_factor: f32x8<S> = f32x8::splat(simd, lut.scale_factor());
-
-        Self {
-            gradient,
-            scale_factor,
-            lut,
-            t_vals: t_vals.chunks_exact(8),
-            has_undefined: gradient.has_undefined,
-            simd,
-        }
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let lut = gradient.f32_lut(simd);
+                let scale_factor: f32x8<S> = f32x8::splat(simd, lut.scale_factor());
+
+                Self {
+                    gradient,
+                    scale_factor,
+                    lut,
+                    t_vals: t_vals.chunks_exact(8),
+                    has_undefined: gradient.has_undefined,
+                    simd,
+                }
+            },
+        )
     }
 }
 
diff --git a/sparse_strips/vello_cpu/src/fine/common/gradient/radial.rs b/sparse_strips/vello_cpu/src/fine/common/gradient/radial.rs
index 78ce5fb648..b7db4d30e4 100644
--- a/sparse_strips/vello_cpu/src/fine/common/gradient/radial.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/gradient/radial.rs
@@ -26,26 +26,31 @@ pub(crate) struct SimdRadialKind<S: Simd> {
 
 impl<S: Simd> SimdRadialKind<S> {
     pub(crate) fn new(simd: S, kind: &RadialKind) -> Self {
-        let inner = match kind {
-            RadialKind::Radial { bias, scale } => SimdRadialKindInner::Radial {
-                bias: f32x8::splat(simd, *bias),
-                scale: f32x8::splat(simd, *scale),
-            },
-            RadialKind::Strip { scaled_r0_squared } => SimdRadialKindInner::Strip {
-                scaled_r0_squared: f32x8::splat(simd, *scaled_r0_squared),
-            },
-            RadialKind::Focal {
-                focal_data,
-                fp0,
-                fp1,
-            } => SimdRadialKindInner::Focal {
-                fp0: f32x8::splat(simd, *fp0),
-                fp1: f32x8::splat(simd, *fp1),
-                focal_data: *focal_data,
-            },
-        };
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let inner = match kind {
+                    RadialKind::Radial { bias, scale } => SimdRadialKindInner::Radial {
+                        bias: f32x8::splat(simd, *bias),
+                        scale: f32x8::splat(simd, *scale),
+                    },
+                    RadialKind::Strip { scaled_r0_squared } => SimdRadialKindInner::Strip {
+                        scaled_r0_squared: f32x8::splat(simd, *scaled_r0_squared),
+                    },
+                    RadialKind::Focal {
+                        focal_data,
+                        fp0,
+                        fp1,
+                    } => SimdRadialKindInner::Focal {
+                        fp0: f32x8::splat(simd, *fp0),
+                        fp1: f32x8::splat(simd, *fp1),
+                        focal_data: *focal_data,
+                    },
+                };
 
-        Self { inner }
+                Self { inner }
+            },
+        )
     }
 }
 
diff --git a/sparse_strips/vello_cpu/src/fine/common/gradient/sweep.rs b/sparse_strips/vello_cpu/src/fine/common/gradient/sweep.rs
index 02a0a401ff..7b38029e55 100644
--- a/sparse_strips/vello_cpu/src/fine/common/gradient/sweep.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/gradient/sweep.rs
@@ -15,11 +15,14 @@ pub(crate) struct SimdSweepKind<S: Simd> {
 
 impl<S: Simd> SimdSweepKind<S> {
     pub(crate) fn new(simd: S, kind: &SweepKind) -> Self {
-        Self {
-            start_angle: f32x8::splat(simd, kind.start_angle),
-            inv_angle_delta: f32x8::splat(simd, kind.inv_angle_delta),
-            simd,
-        }
+        simd.vectorize(
+            #[inline(always)]
+            || Self {
+                start_angle: f32x8::splat(simd, kind.start_angle),
+                inv_angle_delta: f32x8::splat(simd, kind.inv_angle_delta),
+                simd,
+            },
+        )
     }
 }
 
diff --git a/sparse_strips/vello_cpu/src/fine/common/image.rs b/sparse_strips/vello_cpu/src/fine/common/image.rs
index dad257b071..671f9061cb 100644
--- a/sparse_strips/vello_cpu/src/fine/common/image.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/image.rs
@@ -29,33 +29,38 @@ impl<'a, S: Simd> PlainNNImagePainter<'a, S> {
     ) -> Self {
         let data = ImagePainterData::new(simd, image, pixmap, start_x, start_y);
 
-        let y_positions = extend(
-            simd,
-            f32x4::splat_pos(
-                simd,
-                data.cur_pos.y as f32,
-                data.x_advances.1,
-                data.y_advances.1,
-            ),
-            image.sampler.y_extend,
-            data.height,
-            data.height_inv,
-        );
-
-        let cur_x_pos = f32x4::splat_pos(
-            simd,
-            data.cur_pos.x as f32,
-            data.x_advances.0,
-            data.y_advances.0,
-        );
-
-        Self {
-            data,
-            advance: image.x_advance.x as f32,
-            y_positions,
-            cur_x_pos,
-            simd,
-        }
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let y_positions = extend(
+                    simd,
+                    f32x4::splat_pos(
+                        simd,
+                        data.cur_pos.y as f32,
+                        data.x_advances.1,
+                        data.y_advances.1,
+                    ),
+                    image.sampler.y_extend,
+                    data.height,
+                    data.height_inv,
+                );
+
+                let cur_x_pos = f32x4::splat_pos(
+                    simd,
+                    data.cur_pos.x as f32,
+                    data.x_advances.0,
+                    data.y_advances.0,
+                );
+
+                Self {
+                    data,
+                    advance: image.x_advance.x as f32,
+                    y_positions,
+                    cur_x_pos,
+                    simd,
+                }
+            },
+        )
     }
 }
 
@@ -366,31 +371,36 @@ impl<'a, S: Simd> ImagePainterData<'a, S> {
         start_x: f64,
         start_y: f64,
     ) -> Self {
-        let width = pixmap.width() as f32;
-        let height = pixmap.height() as f32;
-        let start_pos = image.transform * Point::new(start_x, start_y);
-
-        let width_inv = f32x4::splat(simd, 1.0 / width);
-        let height_inv = f32x4::splat(simd, 1.0 / height);
-        let width = f32x4::splat(simd, width);
-        let width_u32 = u32x4::splat(simd, pixmap.width() as u32);
-        let height = f32x4::splat(simd, height);
-
-        let x_advances = (image.x_advance.x as f32, image.x_advance.y as f32);
-        let y_advances = (image.y_advance.x as f32, image.y_advance.y as f32);
-
-        Self {
-            cur_pos: start_pos,
-            pixmap,
-            x_advances,
-            y_advances,
-            image,
-            width,
-            height,
-            width_u32,
-            width_inv,
-            height_inv,
-        }
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let width = pixmap.width() as f32;
+                let height = pixmap.height() as f32;
+                let start_pos = image.transform * Point::new(start_x, start_y);
+
+                let width_inv = f32x4::splat(simd, 1.0 / width);
+                let height_inv = f32x4::splat(simd, 1.0 / height);
+                let width = f32x4::splat(simd, width);
+                let width_u32 = u32x4::splat(simd, pixmap.width() as u32);
+                let height = f32x4::splat(simd, height);
+
+                let x_advances = (image.x_advance.x as f32, image.x_advance.y as f32);
+                let y_advances = (image.y_advance.x as f32, image.y_advance.y as f32);
+
+                Self {
+                    cur_pos: start_pos,
+                    pixmap,
+                    x_advances,
+                    y_advances,
+                    image,
+                    width,
+                    height,
+                    width_u32,
+                    width_inv,
+                    height_inv,
+                }
+            },
+        )
     }
 }
 
diff --git a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs
index dc84f22a00..5dfc93a292 100644
--- a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs
@@ -29,23 +29,33 @@ impl<S: Simd> BlurredRoundedRectFiller<S> {
         start_x: f64,
         start_y: f64,
     ) -> Self {
-        let start_pos = rect.transform * Point::new(start_x, start_y);
-        let color_components = rect.color.as_premul_f32().components;
-        let r = f32x8::splat(simd, color_components[0]);
-        let g = f32x8::splat(simd, color_components[1]);
-        let b = f32x8::splat(simd, color_components[2]);
-        let a = f32x8::splat(simd, color_components[3]);
-        let simd_rect = SimdRoundedBlurredRect::new(rect, simd);
-        let alpha_calculator =
-            AlphaCalculator::new(start_pos, rect.x_advance, rect.y_advance, simd_rect, simd);
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let start_pos = rect.transform * Point::new(start_x, start_y);
+                let color_components = rect.color.as_premul_f32().components;
+                let r = f32x8::splat(simd, color_components[0]);
+                let g = f32x8::splat(simd, color_components[1]);
+                let b = f32x8::splat(simd, color_components[2]);
+                let a = f32x8::splat(simd, color_components[3]);
+                let simd_rect = SimdRoundedBlurredRect::new(rect, simd);
+                let alpha_calculator = AlphaCalculator::new(
+                    start_pos,
+                    rect.x_advance,
+                    rect.y_advance,
+                    simd_rect,
+                    simd,
+                );
 
-        Self {
-            alpha_calculator,
-            r,
-            g,
-            b,
-            a,
-        }
+                Self {
+                    alpha_calculator,
+                    r,
+                    g,
+                    b,
+                    a,
+                }
+            },
+        )
     }
 }
 
@@ -177,33 +187,38 @@ struct SimdRoundedBlurredRect<S: Simd> {
 
 impl<S: Simd> SimdRoundedBlurredRect<S> {
     fn new(encoded: &EncodedBlurredRoundedRectangle, s: S) -> Self {
-        let h = f32x8::splat(s, encoded.h);
-        let w = f32x8::splat(s, encoded.w);
-        let width = f32x8::splat(s, encoded.width);
-        let height = f32x8::splat(s, encoded.height);
-        let r1 = f32x8::splat(s, encoded.r1);
-        let exponent = encoded.exponent;
-        let recip_exponent = encoded.recip_exponent;
-        let scale = f32x8::splat(s, encoded.scale);
-        let min_edge = f32x8::splat(s, encoded.min_edge);
-        let std_dev_inv = f32x8::splat(s, encoded.std_dev_inv);
-        let v0 = f32x8::splat(s, 0.0);
-        let v1 = f32x8::splat(s, 0.5);
+        s.vectorize(
+            #[inline(always)]
+            || {
+                let h = f32x8::splat(s, encoded.h);
+                let w = f32x8::splat(s, encoded.w);
+                let width = f32x8::splat(s, encoded.width);
+                let height = f32x8::splat(s, encoded.height);
+                let r1 = f32x8::splat(s, encoded.r1);
+                let exponent = encoded.exponent;
+                let recip_exponent = encoded.recip_exponent;
+                let scale = f32x8::splat(s, encoded.scale);
+                let min_edge = f32x8::splat(s, encoded.min_edge);
+                let std_dev_inv = f32x8::splat(s, encoded.std_dev_inv);
+                let v0 = f32x8::splat(s, 0.0);
+                let v1 = f32x8::splat(s, 0.5);
 
-        Self {
-            exponent,
-            recip_exponent,
-            scale,
-            std_dev_inv,
-            min_edge,
-            w,
-            v0,
-            v1,
-            h,
-            width,
-            height,
-            r1,
-        }
+                Self {
+                    exponent,
+                    recip_exponent,
+                    scale,
+                    std_dev_inv,
+                    min_edge,
+                    w,
+                    v0,
+                    v1,
+                    h,
+                    width,
+                    height,
+                    r1,
+                }
+            },
+        )
     }
 }
 
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/gradient.rs b/sparse_strips/vello_cpu/src/fine/lowp/gradient.rs
index 210818a524..3ff1fa599a 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/gradient.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/gradient.rs
@@ -20,16 +20,21 @@ pub(crate) struct GradientPainter<'a, S: Simd> {
 
 impl<'a, S: Simd> GradientPainter<'a, S> {
     pub(crate) fn new(simd: S, gradient: &'a EncodedGradient, t_vals: &'a [f32]) -> Self {
-        let lut = gradient.u8_lut(simd);
-        let scale_factor = f32x16::splat(simd, lut.scale_factor());
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let lut = gradient.u8_lut(simd);
+                let scale_factor = f32x16::splat(simd, lut.scale_factor());
 
-        Self {
-            gradient,
-            scale_factor,
-            lut: lut.lut(),
-            t_vals: t_vals.chunks_exact(16),
-            simd,
-        }
+                Self {
+                    gradient,
+                    scale_factor,
+                    lut: lut.lut(),
+                    t_vals: t_vals.chunks_exact(16),
+                    simd,
+                }
+            },
+        )
     }
 }
 
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/image.rs b/sparse_strips/vello_cpu/src/fine/lowp/image.rs
index f68e2162cf..1b0226a62c 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/image.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/image.rs
@@ -145,55 +145,60 @@ impl<'a, S: Simd> PlainBilinearImagePainter<'a, S> {
     ) -> Self {
         let data = ImagePainterData::new(simd, image, pixmap, start_x, start_y);
 
-        // For axis-aligned images, y doesn't change across the strip
-        let y_positions = f32x4::splat_pos(
-            simd,
-            data.cur_pos.y as f32,
-            data.x_advances.1,
-            data.y_advances.1,
-        );
-
-        // Pre-compute y extend positions
-        let y_pos1 = extend(
-            simd,
-            y_positions - 0.5,
-            image.sampler.y_extend,
-            data.height,
-            data.height_inv,
-        );
-        let y_pos2 = extend(
-            simd,
-            y_positions + 0.5,
-            image.sampler.y_extend,
-            data.height,
-            data.height_inv,
-        );
-
-        // Pre-compute y interpolation weights
-        let fy = f32_to_u8(element_wise_splat(
-            simd,
-            fract_floor(y_positions + 0.5).mul_add(255.0, 0.5),
-        ));
-        let fy = simd.widen_u8x16(fy);
-        let fy_inv = u16x16::splat(simd, 255) - fy;
-
-        let cur_x_pos = f32x4::splat_pos(
-            simd,
-            data.cur_pos.x as f32,
-            data.x_advances.0,
-            data.y_advances.0,
-        );
-
-        Self {
-            data,
-            y_pos1,
-            y_pos2,
-            fy,
-            fy_inv,
-            cur_x_pos,
-            advance: image.x_advance.x as f32,
-            simd,
-        }
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                // For axis-aligned images, y doesn't change across the strip
+                let y_positions = f32x4::splat_pos(
+                    simd,
+                    data.cur_pos.y as f32,
+                    data.x_advances.1,
+                    data.y_advances.1,
+                );
+
+                // Pre-compute y extend positions
+                let y_pos1 = extend(
+                    simd,
+                    y_positions - 0.5,
+                    image.sampler.y_extend,
+                    data.height,
+                    data.height_inv,
+                );
+                let y_pos2 = extend(
+                    simd,
+                    y_positions + 0.5,
+                    image.sampler.y_extend,
+                    data.height,
+                    data.height_inv,
+                );
+
+                // Pre-compute y interpolation weights
+                let fy = f32_to_u8(element_wise_splat(
+                    simd,
+                    fract_floor(y_positions + 0.5).mul_add(255.0, 0.5),
+                ));
+                let fy = simd.widen_u8x16(fy);
+                let fy_inv = u16x16::splat(simd, 255) - fy;
+
+                let cur_x_pos = f32x4::splat_pos(
+                    simd,
+                    data.cur_pos.x as f32,
+                    data.x_advances.0,
+                    data.y_advances.0,
+                );
+
+                Self {
+                    data,
+                    y_pos1,
+                    y_pos2,
+                    fy,
+                    fy_inv,
+                    cur_x_pos,
+                    advance: image.x_advance.x as f32,
+                    simd,
+                }
+            },
+        )
     }
 }
 

From 797a30e13f7b8c7edec9ea7e4db78d39cc82439c Mon Sep 17 00:00:00 2001
From: Laurenz Stampfl <laurenz.stampfl+github@gmail.com>
Date: Sat, 30 May 2026 11:45:32 +0200
Subject: [PATCH 3/7] Always inline next methods

---
 .../vello_cpu/src/fine/common/image.rs        |  2 +
 .../src/fine/common/rounded_blurred_rect.rs   | 54 ++++++++++++-------
 .../vello_cpu/src/fine/lowp/image.rs          |  1 +
 3 files changed, 38 insertions(+), 19 deletions(-)

diff --git a/sparse_strips/vello_cpu/src/fine/common/image.rs b/sparse_strips/vello_cpu/src/fine/common/image.rs
index 671f9061cb..58bb151bdc 100644
--- a/sparse_strips/vello_cpu/src/fine/common/image.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/image.rs
@@ -111,6 +111,7 @@ impl<'a, S: Simd> NNImagePainter<'a, S> {
 impl<S: Simd> Iterator for NNImagePainter<'_, S> {
     type Item = u8x16<S>;
 
+    #[inline(always)]
     fn next(&mut self) -> Option<Self::Item> {
         let x_positions = extend(
             self.simd,
@@ -180,6 +181,7 @@ impl<'a, S: Simd, const QUALITY: u8> FilteredImagePainter<'a, S, QUALITY> {
 impl<S: Simd, const QUALITY: u8> Iterator for FilteredImagePainter<'_, S, QUALITY> {
     type Item = f32x16<S>;
 
+    #[inline(always)]
     fn next(&mut self) -> Option<Self::Item> {
         let x_positions = f32x4::splat_pos(
             self.simd,
diff --git a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs
index 5dfc93a292..fc78cf45e5 100644
--- a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs
@@ -62,6 +62,7 @@ impl<S: Simd> BlurredRoundedRectFiller<S> {
 impl<S: Simd> Iterator for BlurredRoundedRectFiller<S> {
     type Item = ShaderResultF32<S>;
 
+    #[inline(always)]
     fn next(&mut self) -> Option<Self::Item> {
         let next = self.alpha_calculator.next().unwrap();
         let r = self.r * next;
@@ -75,30 +76,44 @@ impl<S: Simd> Iterator for BlurredRoundedRectFiller<S> {
 
 impl<S: Simd> crate::fine::Painter for BlurredRoundedRectFiller<S> {
     fn paint_u8(&mut self, buf: &mut [u8]) {
-        for chunk in buf.chunks_exact_mut(64) {
-            let first = self.next().unwrap();
-            let simd = first.r.simd;
-            let second = self.next().unwrap();
+        self.a.simd.vectorize(
+            #[inline(always)]
+            || {
+                for chunk in buf.chunks_exact_mut(64) {
+                    let first = self.next().unwrap();
+                    let simd = first.r.simd;
+                    let second = self.next().unwrap();
 
-            let r = u8x16::from_f32(simd, simd.combine_f32x8(first.r, second.r));
-            let g = u8x16::from_f32(simd, simd.combine_f32x8(first.g, second.g));
-            let b = u8x16::from_f32(simd, simd.combine_f32x8(first.b, second.b));
-            let a = u8x16::from_f32(simd, simd.combine_f32x8(first.a, second.a));
+                    let r = u8x16::from_f32(simd, simd.combine_f32x8(first.r, second.r));
+                    let g = u8x16::from_f32(simd, simd.combine_f32x8(first.g, second.g));
+                    let b = u8x16::from_f32(simd, simd.combine_f32x8(first.b, second.b));
+                    let a = u8x16::from_f32(simd, simd.combine_f32x8(first.a, second.a));
 
-            let combined = simd.combine_u8x32(simd.combine_u8x16(r, g), simd.combine_u8x16(b, a));
+                    let combined =
+                        simd.combine_u8x32(simd.combine_u8x16(r, g), simd.combine_u8x16(b, a));
 
-            simd.store_interleaved_128_u8x64(combined, (&mut chunk[..]).try_into().unwrap());
-        }
+                    simd.store_interleaved_128_u8x64(
+                        combined,
+                        (&mut chunk[..]).try_into().unwrap(),
+                    );
+                }
+            },
+        );
     }
 
     fn paint_f32(&mut self, buf: &mut [f32]) {
-        for chunk in buf.chunks_exact_mut(32) {
-            let (c1, c2) = self.next().unwrap().get();
-            c1.simd
-                .store_interleaved_128_f32x16(c1, (&mut chunk[..16]).try_into().unwrap());
-            c2.simd
-                .store_interleaved_128_f32x16(c2, (&mut chunk[16..]).try_into().unwrap());
-        }
+        self.a.simd.vectorize(
+            #[inline(always)]
+            || {
+                for chunk in buf.chunks_exact_mut(32) {
+                    let (c1, c2) = self.next().unwrap().get();
+                    c1.simd
+                        .store_interleaved_128_f32x16(c1, (&mut chunk[..16]).try_into().unwrap());
+                    c2.simd
+                        .store_interleaved_128_f32x16(c2, (&mut chunk[16..]).try_into().unwrap());
+                }
+            },
+        );
     }
 }
 
@@ -132,6 +147,7 @@ impl<S: Simd> AlphaCalculator<S> {
 impl<S: Simd> Iterator for AlphaCalculator<S> {
     type Item = f32x8<S>;
 
+    #[inline(always)]
     fn next(&mut self) -> Option<Self::Item> {
         let i = f32x8::splat_pos(
             self.simd,
@@ -225,13 +241,13 @@ impl<S: Simd> SimdRoundedBlurredRect<S> {
 trait FloatExt<S: Simd> {
     // See https://raphlinus.github.io/audio/2018/09/05/sigmoid.html for a little
     // explanation of this approximation to the erf function.
-    // Doing `inline(always)` seems to reduce performance for some reason.
     /// Approximate the erf function.
     fn compute_erf7(simd: S, x: Self) -> Self;
     fn powf(self, x: f32) -> Self;
 }
 
 impl<S: Simd> FloatExt<S> for f32x8<S> {
+    #[inline(always)]
     fn compute_erf7(simd: S, x: Self) -> Self {
         // Clamp `x`, because for large `x` the terms here become `inf`, causing the result to be 0 or
         // `NaN`. This clamping doesn't lose any information, because `erf(±10) ≈ 1` well within `f64`
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/image.rs b/sparse_strips/vello_cpu/src/fine/lowp/image.rs
index 1b0226a62c..c5bbfc7349 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/image.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/image.rs
@@ -34,6 +34,7 @@ impl<'a, S: Simd> BilinearImagePainter<'a, S> {
 impl<S: Simd> Iterator for BilinearImagePainter<'_, S> {
     type Item = u8x16<S>;
 
+    #[inline(always)]
     fn next(&mut self) -> Option<Self::Item> {
         let x_positions = f32x4::splat_pos(
             self.simd,

From 9049df262eb99692d95a0fa857d1912970d10768 Mon Sep 17 00:00:00 2001
From: Laurenz Stampfl <laurenz.stampfl+github@gmail.com>
Date: Sat, 30 May 2026 11:48:31 +0200
Subject: [PATCH 4/7] Fix blending inlining

---
 .../vello_cpu/src/fine/highp/blend.rs         | 92 ++++++++++++-------
 .../vello_cpu/src/fine/highp/compose.rs       | 91 ++++++++++--------
 sparse_strips/vello_cpu/src/fine/highp/mod.rs | 17 ++--
 .../vello_cpu/src/fine/lowp/blend.rs          | 49 ++++++----
 .../vello_cpu/src/fine/lowp/compose.rs        | 63 ++++++++-----
 5 files changed, 191 insertions(+), 121 deletions(-)

diff --git a/sparse_strips/vello_cpu/src/fine/highp/blend.rs b/sparse_strips/vello_cpu/src/fine/highp/blend.rs
index cb2174da11..93b1f0107f 100644
--- a/sparse_strips/vello_cpu/src/fine/highp/blend.rs
+++ b/sparse_strips/vello_cpu/src/fine/highp/blend.rs
@@ -23,29 +23,23 @@ impl<S: Simd> Channels<S> {
     }
 }
 
-// TODO: blending is still extremely slow, investigate whether there is something obvious we are
-// missing that other renderers do.
 pub(crate) fn mix<S: Simd>(src_c: f32x16<S>, bg: f32x16<S>, blend_mode: BlendMode) -> f32x16<S> {
+    src_c.simd.vectorize(
+        #[inline(always)]
+        || mix_inner(src_c, bg, blend_mode),
+    )
+}
+
+#[inline(always)]
+fn mix_inner<S: Simd>(src_c: f32x16<S>, bg: f32x16<S>, blend_mode: BlendMode) -> f32x16<S> {
     if matches!(blend_mode.mix, Mix::Normal) {
         return src_c;
     }
     // See https://www.w3.org/TR/compositing-1/#blending
     let simd = src_c.simd;
 
-    let split = |input: f32x16<S>| {
-        let mut storage = [0.0; 16];
-        simd.store_interleaved_128_f32x16(input, &mut storage);
-        let input_v = f32x16::from_slice(simd, &storage);
-
-        let p1 = simd.split_f32x16(input_v);
-        let (r, g) = simd.split_f32x8(p1.0);
-        let (b, a) = simd.split_f32x8(p1.1);
-
-        (Channels { r, g, b }, a)
-    };
-
-    let (bg_channels, bg_a) = split(bg);
-    let (src_channels, src_a) = split(src_c);
+    let (bg_channels, bg_a) = split(simd, bg);
+    let (src_channels, src_a) = split(simd, src_c);
 
     let unpremultiplied_bg = bg_channels.unpremultiply(bg_a);
     let unpremultiplied_src = src_channels.unpremultiply(src_a);
@@ -53,18 +47,9 @@ pub(crate) fn mix<S: Simd>(src_c: f32x16<S>, bg: f32x16<S>, blend_mode: BlendMod
     let mut res_bg = unpremultiplied_bg;
     let mix_src = blend_mode.mix(unpremultiplied_src, unpremultiplied_bg);
 
-    let apply_alpha = |unpremultiplied_src_channel: f32x4<S>,
-                       mix_src_channel: f32x4<S>,
-                       dest_channel: &mut f32x4<S>| {
-        let p1 = (1.0 - bg_a) * unpremultiplied_src_channel;
-        let p2 = bg_a * mix_src_channel;
-
-        *dest_channel = (p1 + p2).premultiply(src_a);
-    };
-
-    apply_alpha(unpremultiplied_src.r, mix_src.r, &mut res_bg.r);
-    apply_alpha(unpremultiplied_src.g, mix_src.g, &mut res_bg.g);
-    apply_alpha(unpremultiplied_src.b, mix_src.b, &mut res_bg.b);
+    res_bg.r = apply_alpha(bg_a, src_a, unpremultiplied_src.r, mix_src.r);
+    res_bg.g = apply_alpha(bg_a, src_a, unpremultiplied_src.g, mix_src.g);
+    res_bg.b = apply_alpha(bg_a, src_a, unpremultiplied_src.b, mix_src.b);
 
     let combined = simd.combine_f32x8(
         simd.combine_f32x4(res_bg.r, res_bg.g),
@@ -76,11 +61,38 @@ pub(crate) fn mix<S: Simd>(src_c: f32x16<S>, bg: f32x16<S>, blend_mode: BlendMod
     f32x16::from_slice(simd, &storage)
 }
 
+#[inline(always)]
+fn split<S: Simd>(simd: S, input: f32x16<S>) -> (Channels<S>, f32x4<S>) {
+    let mut storage = [0.0; 16];
+    simd.store_interleaved_128_f32x16(input, &mut storage);
+    let input_v = f32x16::from_slice(simd, &storage);
+
+    let p1 = simd.split_f32x16(input_v);
+    let (r, g) = simd.split_f32x8(p1.0);
+    let (b, a) = simd.split_f32x8(p1.1);
+
+    (Channels { r, g, b }, a)
+}
+
+#[inline(always)]
+fn apply_alpha<S: Simd>(
+    bg_a: f32x4<S>,
+    src_a: f32x4<S>,
+    unpremultiplied_src_channel: f32x4<S>,
+    mix_src_channel: f32x4<S>,
+) -> f32x4<S> {
+    let p1 = (1.0 - bg_a) * unpremultiplied_src_channel;
+    let p2 = bg_a * mix_src_channel;
+
+    (p1 + p2).premultiply(src_a)
+}
+
 trait MixExt {
     fn mix<S: Simd>(&self, src: Channels<S>, bg: Channels<S>) -> Channels<S>;
 }
 
 impl MixExt for BlendMode {
+    #[inline(always)]
     fn mix<S: Simd>(&self, src: Channels<S>, bg: Channels<S>) -> Channels<S> {
         match self.mix {
             Mix::Normal => src,
@@ -118,6 +130,7 @@ impl Screen {
 }
 
 impl HardLight {
+    #[inline(always)]
     fn single<S: Simd>(src: f32x4<S>, bg: f32x4<S>) -> f32x4<S> {
         let two = f32x4::splat(src.simd, 2.0);
 
@@ -254,14 +267,17 @@ non_separable_mix!(Luminosity, |cs: &mut Channels<S>, cb: &mut Channels<S>| {
     *cb
 });
 
+#[inline(always)]
 fn lum<S: Simd>(r: f32x4<S>, g: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
     0.3 * r + 0.59 * g + 0.11 * b
 }
 
+#[inline(always)]
 fn sat<S: Simd>(r: f32x4<S>, g: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
     r.max(g).max(b) - r.min(g).min(b)
 }
 
+#[inline(always)]
 fn clip_color<S: Simd>(r: &mut f32x4<S>, g: &mut f32x4<S>, b: &mut f32x4<S>) {
     let simd = r.simd;
 
@@ -284,6 +300,7 @@ fn clip_color<S: Simd>(r: &mut f32x4<S>, g: &mut f32x4<S>, b: &mut f32x4<S>) {
     }
 }
 
+#[inline(always)]
 fn set_lum<S: Simd>(r: &mut f32x4<S>, g: &mut f32x4<S>, b: &mut f32x4<S>, l: f32x4<S>) {
     let d = l - lum(*r, *g, *b);
     *r += d;
@@ -294,17 +311,24 @@ fn set_lum<S: Simd>(r: &mut f32x4<S>, g: &mut f32x4<S>, b: &mut f32x4<S>, l: f32
 }
 
 // Adapted from tiny-skia
+#[inline(always)]
 fn set_sat<S: Simd>(r: &mut f32x4<S>, g: &mut f32x4<S>, b: &mut f32x4<S>, s: f32x4<S>) {
-    let simd = r.simd;
-    let zero = f32x4::splat(simd, 0.0);
     let mn = r.min(g.min(*b));
     let mx = r.max(g.max(*b));
     let sat = mx - mn;
 
     // Map min channel to 0, max channel to s, and scale the middle proportionally.
-    let scale = |c| simd.select_f32x4(simd.simd_eq_f32x4(sat, zero), zero, (c - mn) * s / sat);
+    *r = scale_sat_channel(*r, mn, sat, s);
+    *g = scale_sat_channel(*g, mn, sat, s);
+    *b = scale_sat_channel(*b, mn, sat, s);
+}
 
-    *r = scale(*r);
-    *g = scale(*g);
-    *b = scale(*b);
+#[inline(always)]
+fn scale_sat_channel<S: Simd>(c: f32x4<S>, mn: f32x4<S>, sat: f32x4<S>, s: f32x4<S>) -> f32x4<S> {
+    let simd = c.simd;
+    simd.select_f32x4(
+        simd.simd_eq_f32x4(sat, f32x4::splat(simd, 0.0)),
+        f32x4::splat(simd, 0.0),
+        (c - mn) * s / sat,
+    )
 }
diff --git a/sparse_strips/vello_cpu/src/fine/highp/compose.rs b/sparse_strips/vello_cpu/src/fine/highp/compose.rs
index 32983c9abe..3e3b689059 100644
--- a/sparse_strips/vello_cpu/src/fine/highp/compose.rs
+++ b/sparse_strips/vello_cpu/src/fine/highp/compose.rs
@@ -23,48 +23,62 @@ impl ComposeExt for BlendMode {
         bg_c: f32x16<S>,
         alpha_mask: Option<f32x16<S>>,
     ) -> f32x16<S> {
-        // There some non-obvious subtleties worth highlighting here.
-        // We support two kinds of blending (in this case, we focus on compositing specifically):
-        // - Isolated blending, where layers as a whole are blended together with their backdrop.
-        //   If we are currently performing this kind of blending, `alpha_mask` will always be `None`.
-        //   After all, there is no concrete shape opacity associated with a layer. Instead, we are
-        //   just compositing the RGBA values at _all_ positions of the source layer with the backdrop
-        //   layer. For example, if the backdrop contains a green rectangle and source layer is just
-        //   empty, if we perform blending with `Compose::Clear`, then _everything_ will be cleared,
-        //   because we are compositing the whole source layer with the whole backdrop, and not
-        //   just the parts of the source layer that have actually be drawn on.
-        // - Non-isolated blending, where a single path is blended with the backdrop. In this case,
-        //   `alpha_mask` _might_ be `Some` and contain the alpha values of the strips we are currently
-        //   compositing. Remember that strips always have a fixed height of 4, because of this, the
-        //   strips might cover areas that aren't actually covered by the path (and just have an alpha
-        //   value of 0, or a value between 0-254 for anti-aliased parts). Because of this, for
-        //   non-isolated blending, we need to lerp the result with the backdrop using `alpha_mask`.
+        simd.vectorize(
+            #[inline(always)]
+            || compose_inner(*self, simd, src_c, bg_c, alpha_mask),
+        )
+    }
+}
 
-        let mut res = match self.compose {
-            Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c),
-            Compose::Clear => Clear::compose(simd, src_c, bg_c),
-            Compose::Copy => Copy::compose(simd, src_c, bg_c),
-            Compose::DestOver => DestOver::compose(simd, src_c, bg_c),
-            Compose::Dest => Dest::compose(simd, src_c, bg_c),
-            Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c),
-            Compose::DestIn => DestIn::compose(simd, src_c, bg_c),
-            Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c),
-            Compose::DestOut => DestOut::compose(simd, src_c, bg_c),
-            Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c),
-            Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c),
-            Compose::Xor => Xor::compose(simd, src_c, bg_c),
-            Compose::Plus => Plus::compose(simd, src_c, bg_c),
-            // Have not been able to find a formula for this, so just fallback to Plus.
-            Compose::PlusLighter => Plus::compose(simd, src_c, bg_c),
-        };
+#[inline(always)]
+fn compose_inner<S: Simd>(
+    blend_mode: BlendMode,
+    simd: S,
+    src_c: f32x16<S>,
+    bg_c: f32x16<S>,
+    alpha_mask: Option<f32x16<S>>,
+) -> f32x16<S> {
+    // There are some non-obvious subtleties worth highlighting here.
+    // We support two kinds of blending (in this case, we focus on compositing specifically):
+    // - Isolated blending, where layers as a whole are blended together with their backdrop.
+    //   If we are currently performing this kind of blending, `alpha_mask` will always be `None`.
+    //   After all, there is no concrete shape opacity associated with a layer. Instead, we are
+    //   just compositing the RGBA values at _all_ positions of the source layer with the backdrop
+    //   layer. For example, if the backdrop contains a green rectangle and source layer is just
+    //   empty, if we perform blending with `Compose::Clear`, then _everything_ will be cleared,
+    //   because we are compositing the whole source layer with the whole backdrop, and not
+    //   just the parts of the source layer that have actually been drawn on.
+    // - Non-isolated blending, where a single path is blended with the backdrop. In this case,
+    //   `alpha_mask` _might_ be `Some` and contain the alpha values of the strips we are currently
+    //   compositing. Remember that strips always have a fixed height of 4, because of this, the
+    //   strips might cover areas that aren't actually covered by the path (and just have an alpha
+    //   value of 0, or a value between 0-254 for anti-aliased parts). Because of this, for
+    //   non-isolated blending, we need to lerp the result with the backdrop using `alpha_mask`.
 
-        if let Some(alpha_mask) = alpha_mask {
-            let alpha_mask_inv = 1.0 - alpha_mask;
-            res = alpha_mask * res + alpha_mask_inv * bg_c;
-        }
+    let mut res = match blend_mode.compose {
+        Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c),
+        Compose::Clear => Clear::compose(simd, src_c, bg_c),
+        Compose::Copy => Copy::compose(simd, src_c, bg_c),
+        Compose::DestOver => DestOver::compose(simd, src_c, bg_c),
+        Compose::Dest => Dest::compose(simd, src_c, bg_c),
+        Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c),
+        Compose::DestIn => DestIn::compose(simd, src_c, bg_c),
+        Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c),
+        Compose::DestOut => DestOut::compose(simd, src_c, bg_c),
+        Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c),
+        Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c),
+        Compose::Xor => Xor::compose(simd, src_c, bg_c),
+        Compose::Plus => Plus::compose(simd, src_c, bg_c),
+        // Have not been able to find a formula for this, so just fallback to Plus.
+        Compose::PlusLighter => Plus::compose(simd, src_c, bg_c),
+    };
 
-        res
+    if let Some(alpha_mask) = alpha_mask {
+        let alpha_mask_inv = 1.0 - alpha_mask;
+        res = alpha_mask * res + alpha_mask_inv * bg_c;
     }
+
+    res
 }
 
 macro_rules! compose {
@@ -72,6 +86,7 @@ macro_rules! compose {
         struct $name;
 
         impl $name {
+            #[inline(always)]
             fn compose<S: Simd>(simd: S, src_c: f32x16<S>, bg_c: f32x16<S>) -> f32x16<S> {
                 let al_b = bg_c.splat_4th();
                 let al_s = src_c.splat_4th();
diff --git a/sparse_strips/vello_cpu/src/fine/highp/mod.rs b/sparse_strips/vello_cpu/src/fine/highp/mod.rs
index 0948001bee..eb2236af43 100644
--- a/sparse_strips/vello_cpu/src/fine/highp/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/highp/mod.rs
@@ -385,12 +385,17 @@ mod fill {
         src: T,
         blend_mode: BlendMode,
     ) {
-        for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) {
-            let bg_v = f32x16::from_slice(simd, next_dest);
-            let src_c = blend::mix(next_src, bg_v, blend_mode);
-            let res = blend_mode.compose(simd, src_c, bg_v, None);
-            res.store_slice(next_dest);
-        }
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) {
+                    let bg_v = f32x16::from_slice(simd, next_dest);
+                    let src_c = blend::mix(next_src, bg_v, blend_mode);
+                    let res = blend_mode.compose(simd, src_c, bg_v, None);
+                    res.store_slice(next_dest);
+                }
+            },
+        );
     }
 
     /// Performs the core alpha compositing calculation.
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/blend.rs b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs
index 318b8d0a0a..aa7dbdc9bb 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/blend.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs
@@ -6,32 +6,21 @@ use crate::peniko::{BlendMode, Mix};
 use vello_common::fearless_simd::*;
 use vello_common::util::{Div255Ext, f32_to_u8, normalized_mul_u8x32};
 
-// TODO: Make sure this vectorizes properly (also the f32 pipeline) by inlining if needed.
 pub(crate) fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
+    src_c.simd.vectorize(
+        #[inline(always)]
+        || mix_inner(src_c, bg_c, blend_mode),
+    )
+}
+
+#[inline(always)]
+fn mix_inner<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
     if let Some(res) = try_u8_mix(blend_mode, src_c, bg_c) {
         return res;
     }
 
     // Fallback for blend modes that aren't supported in u8.
 
-    let to_f32 = |val: u8x32<S>| {
-        let (a, b) = src_c.simd.split_u8x32(val);
-        let mut a = u8_to_f32(a);
-        let mut b = u8_to_f32(b);
-        a *= f32x16::splat(src_c.simd, 1.0 / 255.0);
-        b *= f32x16::splat(src_c.simd, 1.0 / 255.0);
-        (a, b)
-    };
-
-    let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| {
-        let val1 =
-            f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5)));
-        let val2 =
-            f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5)));
-
-        val1.simd.combine_u8x16(val1, val2)
-    };
-
     let (mut src_1, mut src_2) = to_f32(src_c);
     let (bg_1, bg_2) = to_f32(bg_c);
 
@@ -41,6 +30,28 @@ pub(crate) fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMod
     to_u8(src_1, src_2)
 }
 
+#[inline(always)]
+fn to_f32<S: Simd>(val: u8x32<S>) -> (f32x16<S>, f32x16<S>) {
+    let simd = val.simd;
+    let (a, b) = simd.split_u8x32(val);
+    let mut a = u8_to_f32(a);
+    let mut b = u8_to_f32(b);
+    a *= f32x16::splat(simd, 1.0 / 255.0);
+    b *= f32x16::splat(simd, 1.0 / 255.0);
+    (a, b)
+}
+
+#[inline(always)]
+fn to_u8<S: Simd>(val1: f32x16<S>, val2: f32x16<S>) -> u8x32<S> {
+    let val1 =
+        f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5)));
+    let val2 =
+        f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5)));
+
+    val1.simd.combine_u8x16(val1, val2)
+}
+
+#[inline(always)]
 fn try_u8_mix<S: Simd>(blend_mode: BlendMode, src_c: u8x32<S>, bg_c: u8x32<S>) -> Option<u8x32<S>> {
     // We implement the u8 fast path for blend modes that
     // 1) are separable.
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/compose.rs b/sparse_strips/vello_cpu/src/fine/lowp/compose.rs
index c03d43bd4a..c44de0dbb3 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/compose.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/compose.rs
@@ -25,33 +25,47 @@ impl ComposeExt for BlendMode {
         bg_c: u8x32<S>,
         alpha_mask: Option<u8x32<S>>,
     ) -> u8x32<S> {
-        let mut res = match self.compose {
-            Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c),
-            Compose::Clear => Clear::compose(simd, src_c, bg_c),
-            Compose::Copy => Copy::compose(simd, src_c, bg_c),
-            Compose::DestOver => DestOver::compose(simd, src_c, bg_c),
-            Compose::Dest => Dest::compose(simd, src_c, bg_c),
-            Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c),
-            Compose::DestIn => DestIn::compose(simd, src_c, bg_c),
-            Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c),
-            Compose::DestOut => DestOut::compose(simd, src_c, bg_c),
-            Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c),
-            Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c),
-            Compose::Xor => Xor::compose(simd, src_c, bg_c),
-            Compose::Plus => Plus::compose(simd, src_c, bg_c),
-            // Have not been able to find a formula for this, so just fallback to Plus.
-            Compose::PlusLighter => Plus::compose(simd, src_c, bg_c),
-        };
+        simd.vectorize(
+            #[inline(always)]
+            || compose_inner(*self, simd, src_c, bg_c, alpha_mask),
+        )
+    }
+}
 
-        if let Some(alpha_mask) = alpha_mask {
-            let alpha_mask_inv = 255 - alpha_mask;
-            let p1 = simd.widen_u8x32(alpha_mask) * simd.widen_u8x32(res);
-            let p2 = simd.widen_u8x32(alpha_mask_inv) * simd.widen_u8x32(bg_c);
-            res = simd.narrow_u16x32((p1 + p2).div_255());
-        }
+#[inline(always)]
+fn compose_inner<S: Simd>(
+    blend_mode: BlendMode,
+    simd: S,
+    src_c: u8x32<S>,
+    bg_c: u8x32<S>,
+    alpha_mask: Option<u8x32<S>>,
+) -> u8x32<S> {
+    let mut res = match blend_mode.compose {
+        Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c),
+        Compose::Clear => Clear::compose(simd, src_c, bg_c),
+        Compose::Copy => Copy::compose(simd, src_c, bg_c),
+        Compose::DestOver => DestOver::compose(simd, src_c, bg_c),
+        Compose::Dest => Dest::compose(simd, src_c, bg_c),
+        Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c),
+        Compose::DestIn => DestIn::compose(simd, src_c, bg_c),
+        Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c),
+        Compose::DestOut => DestOut::compose(simd, src_c, bg_c),
+        Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c),
+        Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c),
+        Compose::Xor => Xor::compose(simd, src_c, bg_c),
+        Compose::Plus => Plus::compose(simd, src_c, bg_c),
+        // Have not been able to find a formula for this, so just fallback to Plus.
+        Compose::PlusLighter => Plus::compose(simd, src_c, bg_c),
+    };
 
-        res
+    if let Some(alpha_mask) = alpha_mask {
+        let alpha_mask_inv = 255 - alpha_mask;
+        let p1 = simd.widen_u8x32(alpha_mask) * simd.widen_u8x32(res);
+        let p2 = simd.widen_u8x32(alpha_mask_inv) * simd.widen_u8x32(bg_c);
+        res = simd.narrow_u16x32((p1 + p2).div_255());
     }
+
+    res
 }
 
 macro_rules! compose {
@@ -59,6 +73,7 @@ macro_rules! compose {
         struct $name;
 
         impl $name {
+            #[inline(always)]
             fn compose<S: Simd>(simd: S, src_c: u8x32<S>, bg_c: u8x32<S>) -> u8x32<S> {
                 let al_b = bg_c.splat_4th();
                 let al_s = src_c.splat_4th();

From 97bd00a6ee38a0a233be41783a717e57419662b8 Mon Sep 17 00:00:00 2001
From: Laurenz Stampfl <laurenz.stampfl+github@gmail.com>
Date: Sat, 30 May 2026 12:21:41 +0200
Subject: [PATCH 5/7] More tweaks

---
 sparse_strips/vello_common/src/encode.rs      | 10 ++++++
 sparse_strips/vello_cpu/src/fine/highp/mod.rs | 32 ++++++++++---------
 sparse_strips/vello_cpu/src/fine/lowp/mod.rs  | 31 ++++++++++--------
 sparse_strips/vello_cpu/src/fine/mod.rs       | 19 +++++------
 4 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/sparse_strips/vello_common/src/encode.rs b/sparse_strips/vello_common/src/encode.rs
index 5161fe021a..d9160ce2c3 100644
--- a/sparse_strips/vello_common/src/encode.rs
+++ b/sparse_strips/vello_common/src/encode.rs
@@ -985,6 +985,7 @@ pub trait FromF32Color: Sized + Debug + Copy + Clone {
 impl FromF32Color for f32 {
     const ZERO: Self = 0.0;
 
+    #[inline(always)]
     fn from_f32<S: Simd>(color: f32x4<S>) -> [Self; 4] {
         color.into()
     }
@@ -993,6 +994,7 @@ impl FromF32Color for f32 {
 impl FromF32Color for u8 {
     const ZERO: Self = 0;
 
+    #[inline(always)]
     fn from_f32<S: Simd>(mut color: f32x4<S>) -> [Self; 4] {
         let simd = color.simd;
         color = color.mul_add(f32x4::splat(simd, 255.0), f32x4::splat(simd, 0.5));
@@ -1016,6 +1018,14 @@ pub struct GradientLut<T: FromF32Color> {
 impl<T: FromF32Color> GradientLut<T> {
     /// Create a new lookup table.
     fn new<S: Simd>(simd: S, ranges: &[GradientRange]) -> Self {
+        simd.vectorize(
+            #[inline(always)]
+            || Self::new_inner(simd, ranges),
+        )
+    }
+
+    #[inline(always)]
+    fn new_inner<S: Simd>(simd: S, ranges: &[GradientRange]) -> Self {
         let lut_size = determine_lut_size(ranges);
         let mut lut = vec![[T::ZERO; 4]; lut_size];
 
diff --git a/sparse_strips/vello_cpu/src/fine/highp/mod.rs b/sparse_strips/vello_cpu/src/fine/highp/mod.rs
index eb2236af43..eac28ca36c 100644
--- a/sparse_strips/vello_cpu/src/fine/highp/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/highp/mod.rs
@@ -162,28 +162,30 @@ impl<S: Simd> FineKernel<S> for F32Kernel {
         painter.paint_f32(dest);
     }
 
-    #[inline(always)]
     fn apply_tint(simd: S, dest: &mut [Self::Numeric], tint: &Tint) {
         let premul = tint.color.premultiply();
         let [r, g, b, a] = premul.components;
-        let tint_v = f32x16::block_splat(f32x4::from_slice(simd, &[r, g, b, a]));
 
         simd.vectorize(
             #[inline(always)]
-            || match tint.mode {
-                TintMode::AlphaMask => {
-                    for chunk in dest.chunks_exact_mut(16) {
-                        let pixel = f32x16::from_slice(simd, chunk);
-                        let alphas = pixel.splat_4th();
-                        let tinted = tint_v * alphas;
-                        tinted.store_slice(chunk);
+            || {
+                let tint_v = f32x16::block_splat(f32x4::from_slice(simd, &[r, g, b, a]));
+
+                match tint.mode {
+                    TintMode::AlphaMask => {
+                        for chunk in dest.chunks_exact_mut(16) {
+                            let pixel = f32x16::from_slice(simd, chunk);
+                            let alphas = pixel.splat_4th();
+                            let tinted = tint_v * alphas;
+                            tinted.store_slice(chunk);
+                        }
                     }
-                }
-                TintMode::Multiply => {
-                    for chunk in dest.chunks_exact_mut(16) {
-                        let pixel = f32x16::from_slice(simd, chunk);
-                        let tinted = pixel * tint_v;
-                        tinted.store_slice(chunk);
+                    TintMode::Multiply => {
+                        for chunk in dest.chunks_exact_mut(16) {
+                            let pixel = f32x16::from_slice(simd, chunk);
+                            let tinted = pixel * tint_v;
+                            tinted.store_slice(chunk);
+                        }
                     }
                 }
             },
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
index 3e9e88b79f..e6e8981684 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
@@ -203,24 +203,27 @@ impl<S: Simd> FineKernel<S> for U8Kernel {
         let [r, g, b, a] = premul.components;
         let to_u8 = |v: f32| (v * 255.0 + 0.5) as u8;
         let color = u32::from_ne_bytes([to_u8(r), to_u8(g), to_u8(b), to_u8(a)]);
-        let tint_v = u32x8::block_splat(u32x4::splat(simd, color)).to_bytes();
 
         simd.vectorize(
             #[inline(always)]
-            || match tint.mode {
-                TintMode::AlphaMask => {
-                    for chunk in dest.chunks_exact_mut(32) {
-                        let pixel = u8x32::from_slice(simd, chunk);
-                        let alphas = pixel.splat_4th();
-                        let tinted = tint_v.normalized_mul(alphas);
-                        tinted.store_slice(chunk);
+            || {
+                let tint_v = u32x8::block_splat(u32x4::splat(simd, color)).to_bytes();
+
+                match tint.mode {
+                    TintMode::AlphaMask => {
+                        for chunk in dest.chunks_exact_mut(32) {
+                            let pixel = u8x32::from_slice(simd, chunk);
+                            let alphas = pixel.splat_4th();
+                            let tinted = tint_v.normalized_mul(alphas);
+                            tinted.store_slice(chunk);
+                        }
                     }
-                }
-                TintMode::Multiply => {
-                    for chunk in dest.chunks_exact_mut(32) {
-                        let pixel = u8x32::from_slice(simd, chunk);
-                        let tinted = pixel.normalized_mul(tint_v);
-                        tinted.store_slice(chunk);
+                    TintMode::Multiply => {
+                        for chunk in dest.chunks_exact_mut(32) {
+                            let pixel = u8x32::from_slice(simd, chunk);
+                            let tinted = pixel.normalized_mul(tint_v);
+                            tinted.store_slice(chunk);
+                        }
                     }
                 }
             },
diff --git a/sparse_strips/vello_cpu/src/fine/mod.rs b/sparse_strips/vello_cpu/src/fine/mod.rs
index c16c8282df..0a87abf014 100644
--- a/sparse_strips/vello_cpu/src/fine/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/mod.rs
@@ -641,15 +641,12 @@ impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
             Cmd::Opacity(o) => {
                 if *o != 1.0 {
                     let blend_buf = self.blend_buf.last_mut().unwrap();
-
-                    T::apply_mask(
-                        self.simd,
-                        blend_buf,
-                        iter::repeat(T::NumericVec::from_f32(
-                            self.simd,
-                            f32x16::splat(self.simd, *o),
-                        )),
+                    let opacity = self.simd.vectorize(
+                        #[inline(always)]
+                        || T::NumericVec::from_f32(self.simd, f32x16::splat(self.simd, *o)),
                     );
+
+                    T::apply_mask(self.simd, blend_buf, iter::repeat(opacity));
                 }
             }
             Cmd::PushZeroClip(_) | Cmd::PopZeroClip => {
@@ -704,13 +701,17 @@ impl<S: Simd, T: FineKernel<S>> Fine<S, T> {
                 } else {
                     let start_x = self.wide_coords.0 * WideTile::WIDTH + x as u16;
                     let start_y = self.wide_coords.1 * Tile::HEIGHT;
+                    let src = self.simd.vectorize(
+                        #[inline(always)]
+                        || T::Composite::from_color(self.simd, color),
+                    );
 
                     T::blend(
                         self.simd,
                         blend_buf,
                         start_x,
                         start_y,
-                        iter::repeat(T::Composite::from_color(self.simd, color)),
+                        iter::repeat(src),
                         blend_mode,
                         alphas,
                         mask,

From ec4d32098b2c7008a45b79828507f04d7fb5a03d Mon Sep 17 00:00:00 2001
From: Laurenz Stampfl <laurenz.stampfl+github@gmail.com>
Date: Sat, 30 May 2026 14:04:49 +0200
Subject: [PATCH 6/7] Add comments

---
 sparse_strips/vello_common/src/encode.rs                | 2 ++
 sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs  | 2 ++
 sparse_strips/vello_cpu/src/dispatch/single_threaded.rs | 4 ++++
 3 files changed, 8 insertions(+)

diff --git a/sparse_strips/vello_common/src/encode.rs b/sparse_strips/vello_common/src/encode.rs
index d9160ce2c3..8805dee87a 100644
--- a/sparse_strips/vello_common/src/encode.rs
+++ b/sparse_strips/vello_common/src/encode.rs
@@ -778,12 +778,14 @@ pub struct EncodedGradient {
 
 impl EncodedGradient {
     /// Get the lookup table for sampling u8-based gradient values.
+    // No need to vectorize here, as vectorization happens in the constructor.
     pub fn u8_lut<S: Simd>(&self, simd: S) -> &GradientLut<u8> {
         self.u8_lut
             .get_or_init(|| GradientLut::new(simd, &self.ranges))
     }
 
     /// Get the lookup table for sampling f32-based gradient values.
+    // No need to vectorize here, as vectorization happens in the constructor.
     pub fn f32_lut<S: Simd>(&self, simd: S) -> &GradientLut<f32> {
         self.f32_lut
             .get_or_init(|| GradientLut::new(simd, &self.ranges))
diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
index 22fb125f90..e27b8a3749 100644
--- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
@@ -358,6 +358,8 @@ impl MultiThreadedDispatcher {
         }
     }
 
+    // No need to vectorize here, as vectorization happens in each of the 
+    // functions that are called within.
     fn rasterize_with<S: Simd, F: FineKernel<S>>(
         &self,
         simd: S,
diff --git a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
index 2a08d4b1b2..6f11c95683 100644
--- a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
@@ -118,6 +118,10 @@ impl SingleThreadedDispatcher {
         dispatch!(self.level, simd => self.rasterize_with::<_, U8Kernel>(simd, buffer, width, height, encoded_paints, image_resolver));
     }
 
+    // Note: We purposefully don't add `vectorize` to each of the functions
+    // like `rasterize_with`, `composite_at_offset`, etc. since vectorization
+    // instead is applied wherever necessary in child functions.
+    
     /// Core rasterization dispatcher that chooses between simple and filter-aware paths.
     ///
     /// # Type Parameters

From f204c08bcbc2ba7c53f801dc15253a6e6a7d0973 Mon Sep 17 00:00:00 2001
From: Laurenz Stampfl <laurenz.stampfl+github@gmail.com>
Date: Sat, 30 May 2026 14:09:16 +0200
Subject: [PATCH 7/7] Reformat

---
 sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs  | 2 +-
 sparse_strips/vello_cpu/src/dispatch/single_threaded.rs | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
index e27b8a3749..a587225e05 100644
--- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
@@ -358,7 +358,7 @@ impl MultiThreadedDispatcher {
         }
     }
 
-    // No need to vectorize here, as vectorization happens in each of the 
+    // No need to vectorize here, as vectorization happens in each of the
     // functions that are called within.
     fn rasterize_with<S: Simd, F: FineKernel<S>>(
         &self,
diff --git a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
index 6f11c95683..b0053cead1 100644
--- a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
@@ -121,7 +121,7 @@ impl SingleThreadedDispatcher {
     // Note: We purposefully don't add `vectorize` to each of the functions
     // like `rasterize_with`, `composite_at_offset`, etc. since vectorization
     // instead is applied wherever necessary in child functions.
-    
+
     /// Core rasterization dispatcher that chooses between simple and filter-aware paths.
     ///
     /// # Type Parameters