From bc699bb5795ff1d796b53b6ef6ce6f4f75816535 Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 27 May 2026 15:14:15 +0100
Subject: [PATCH 1/2] Fix benchmark proc macro to actually create a
 #[target_feature(enable = ...)] context

---
 sparse_strips/vello_dev_macros/src/bench.rs | 23 ++++++++++++++++-----
 1 file changed, 18 insertions(+), 5 deletions(-)

diff --git a/sparse_strips/vello_dev_macros/src/bench.rs b/sparse_strips/vello_dev_macros/src/bench.rs
index 67e12cf31a..e30abdd412 100644
--- a/sparse_strips/vello_dev_macros/src/bench.rs
+++ b/sparse_strips/vello_dev_macros/src/bench.rs
@@ -4,7 +4,7 @@
 use proc_macro::TokenStream;
 use proc_macro2::Ident;
 use quote::quote;
-use syn::{ItemFn, parse_macro_input};
+use syn::{ItemFn, parse_macro_input, parse_quote};
 
 pub(crate) fn vello_bench_inner(_: TokenStream, item: TokenStream) -> TokenStream {
     let mut input_fn = parse_macro_input!(item as ItemFn);
@@ -14,6 +14,7 @@ pub(crate) fn vello_bench_inner(_: TokenStream, item: TokenStream) -> TokenStrea
     let inner_fn_name = Ident::new(&format!("{input_fn_name}_inner"), input_fn_name.span());
 
     input_fn.sig.ident = inner_fn_name.clone();
+    input_fn.attrs.push(parse_quote!(#[inline(always)]));
 
     let expanded = quote! {
         #input_fn
@@ -37,14 +38,26 @@ pub(crate) fn vello_bench_inner(_: TokenStream, item: TokenStream) -> TokenStrea
                 format!("{}/{}_{}", module, suffix1, suffix2)
             }
 
+            #[inline(always)]
             fn run_integer<S: Simd>(b: &mut Bencher, simd: S) {
-                let mut fine = Fine::<S, U8Kernel>::new(simd);
-                #inner_fn_name(b, &mut fine);
+                simd.vectorize(
+                    #[inline(always)]
+                    || {
+                        let mut fine = Fine::<S, U8Kernel>::new(simd);
+                        #inner_fn_name(b, &mut fine);
+                    },
+                );
             }
 
+            #[inline(always)]
             fn run_float<S: Simd>(b: &mut Bencher, simd: S) {
-                let mut fine = Fine::<S, F32Kernel>::new(simd);
-                #inner_fn_name(b, &mut fine);
+                simd.vectorize(
+                    #[inline(always)]
+                    || {
+                        let mut fine = Fine::<S, F32Kernel>::new(simd);
+                        #inner_fn_name(b, &mut fine);
+                    },
+                );
             }
 
             // Uncomment this to enable u8_scalar benchmarks.

From 34c840dd767104fc0ee8243882694e335927a45b Mon Sep 17 00:00:00 2001
From: "Sergey \"Shnatsel\" Davidoff" <shnatsel@gmail.com>
Date: Wed, 27 May 2026 17:37:46 +0100
Subject: [PATCH 2/2] Insert vectorize() into all places that should be
 #[inline(always)] for feature propagation to work, but currently aren't

---
 sparse_strips/vello_common/src/clip.rs        | 256 ++++---
 sparse_strips/vello_common/src/encode.rs      | 173 +++--
 sparse_strips/vello_common/src/rect.rs        | 174 ++---
 sparse_strips/vello_common/src/tile.rs        | 712 +++++++++---------
 .../vello_cpu/src/dispatch/multi_threaded.rs  |  82 +-
 .../vello_cpu/src/dispatch/single_threaded.rs | 431 ++++++-----
 .../vello_cpu/src/fine/common/gradient/mod.rs |   1 +
 .../vello_cpu/src/fine/common/image.rs        |   1 +
 .../src/fine/common/rounded_blurred_rect.rs   |  62 +-
 .../vello_cpu/src/fine/highp/blend.rs         | 264 ++++---
 .../vello_cpu/src/fine/highp/compose.rs       | 110 +--
 sparse_strips/vello_cpu/src/fine/highp/mod.rs |  20 +-
 .../vello_cpu/src/fine/lowp/blend.rs          | 122 +--
 .../vello_cpu/src/fine/lowp/compose.rs        |  88 ++-
 sparse_strips/vello_cpu/src/fine/lowp/mod.rs  |   4 +
 15 files changed, 1368 insertions(+), 1132 deletions(-)

diff --git a/sparse_strips/vello_common/src/clip.rs b/sparse_strips/vello_common/src/clip.rs
index a305e76f84..c68f1489c9 100644
--- a/sparse_strips/vello_common/src/clip.rs
+++ b/sparse_strips/vello_common/src/clip.rs
@@ -230,135 +230,163 @@ pub fn intersect(
 ///
 /// This is all that this method does. It just looks more complicated as the logic for iterating
 /// in lock step is a bit tricky.
+#[inline(always)]
 fn intersect_impl<S: Simd>(
     simd: S,
     path_1: PathDataRef<'_>,
     path_2: PathDataRef<'_>,
     target: &mut StripStorage,
 ) {
-    // In case either path is empty, the clip path should be empty.
-    if path_1.strips.is_empty() || path_2.strips.is_empty() {
-        return;
-    }
-
-    // Ignore any y values that are outside the bounding box of either of the two paths, as
-    // those are guaranteed to have neither fill nor strip regions.
-    let mut cur_y = path_1.strips[0].strip_y().min(path_2.strips[0].strip_y());
-    let end_y = path_1.strips[path_1.strips.len() - 1]
-        .strip_y()
-        .min(path_2.strips[path_2.strips.len() - 1].strip_y());
-
-    let mut path_1_idx = 0;
-    let mut path_2_idx = 0;
-    let mut strip_state = None;
-
-    // Iterate over each strip row and handle them.
-    while cur_y <= end_y {
-        // For each row, we create two iterators that alternatingly yield the strips and fill
-        // regions in that row, until the last strip has been reached.
-        let mut p1_iter = RowIterator::new(path_1, &mut path_1_idx, cur_y);
-        let mut p2_iter = RowIterator::new(path_2, &mut path_2_idx, cur_y);
-
-        let mut p1_region = p1_iter.next();
-        let mut p2_region = p2_iter.next();
-
-        // If at least one region is none, it means that we reached the end of the row
-        // for that path, meaning that we exceeded the bounding box of that path and no
-        // additional strips should be generated for that row, even if the other path might
-        // still have more strips left. They will all be clipped away. So only consider it
-        // if both paths have a region left.
-        while let (Some(region_1), Some(region_2)) = (p1_region, p2_region) {
-            match region_1.overlap_relationship(&region_2) {
-                // This means there is no overlap between the regions, so we need to advance
-                // the iterator of the region that is further behind.
-                OverlapRelationship::Advance(advance) => {
-                    match advance {
-                        Advance::Left => p1_region = p1_iter.next(),
-                        Advance::Right => p2_region = p2_iter.next(),
-                    };
-
-                    continue;
-                }
-                // We have an overlap!
-                OverlapRelationship::Overlap(overlap) => {
-                    match (region_1, region_2) {
-                        // Both regions are a fill. Flush the current strip and start a new
-                        // one at the end of the overlap region setting `fill_gap` to true,
-                        // so that the whole area before that will be filled with a sparse
-                        // fill.
-                        (Region::Fill(_), Region::Fill(_)) => {
-                            flush_strip(&mut strip_state, &mut target.strips, cur_y);
-                            start_strip(&mut strip_state, &target.alphas, overlap.end, true);
-                        }
-                        // One fill one strip, so we simply use the alpha mask from the strip region.
-                        (Region::Strip(s), Region::Fill(_))
-                        | (Region::Fill(_), Region::Strip(s)) => {
-                            // If possible, don't create a new strip but just extend the current one.
-                            if should_create_new_strip(&strip_state, &target.alphas, overlap.start)
-                            {
-                                flush_strip(&mut strip_state, &mut target.strips, cur_y);
-                                start_strip(&mut strip_state, &target.alphas, overlap.start, false);
-                            }
+    simd.vectorize(
+        #[inline(always)]
+        || {
+            // In case either path is empty, the clip path should be empty.
+            if path_1.strips.is_empty() || path_2.strips.is_empty() {
+                return;
+            }
 
-                            let s_alphas = &s.alphas[(overlap.start - s.start) as usize * 4..]
-                                [..overlap.width() as usize * 4];
-                            target.alphas.extend_from_slice(s_alphas);
+            // Ignore any y values that are outside the bounding box of either of the two paths, as
+            // those are guaranteed to have neither fill nor strip regions.
+            let mut cur_y = path_1.strips[0].strip_y().min(path_2.strips[0].strip_y());
+            let end_y = path_1.strips[path_1.strips.len() - 1]
+                .strip_y()
+                .min(path_2.strips[path_2.strips.len() - 1].strip_y());
+
+            let mut path_1_idx = 0;
+            let mut path_2_idx = 0;
+            let mut strip_state = None;
+
+            // Iterate over each strip row and handle them.
+            while cur_y <= end_y {
+                // For each row, we create two iterators that alternately yield the strips and fill
+                // regions in that row, until the last strip has been reached.
+                let mut p1_iter = RowIterator::new(path_1, &mut path_1_idx, cur_y);
+                let mut p2_iter = RowIterator::new(path_2, &mut path_2_idx, cur_y);
+
+                let mut p1_region = p1_iter.next();
+                let mut p2_region = p2_iter.next();
+
+                // If at least one region is none, it means that we reached the end of the row
+                // for that path, meaning that we exceeded the bounding box of that path and no
+                // additional strips should be generated for that row, even if the other path might
+                // still have more strips left. They will all be clipped away. So only consider it
+                // if both paths have a region left.
+                while let (Some(region_1), Some(region_2)) = (p1_region, p2_region) {
+                    match region_1.overlap_relationship(&region_2) {
+                        // This means there is no overlap between the regions, so we need to advance
+                        // the iterator of the region that is further behind.
+                        OverlapRelationship::Advance(advance) => {
+                            match advance {
+                                Advance::Left => p1_region = p1_iter.next(),
+                                Advance::Right => p2_region = p2_iter.next(),
+                            };
+
+                            continue;
                         }
-                        // Two strips, we need to multiply the opacity masks from both paths.
-                        (Region::Strip(s_region_1), Region::Strip(s_region_2)) => {
-                            // Once again, only create a new strip if we can't extend the current one.
-                            if should_create_new_strip(&strip_state, &target.alphas, overlap.start)
-                            {
-                                flush_strip(&mut strip_state, &mut target.strips, cur_y);
-                                start_strip(&mut strip_state, &target.alphas, overlap.start, false);
+                        // We have an overlap!
+                        OverlapRelationship::Overlap(overlap) => {
+                            match (region_1, region_2) {
+                                // Both regions are a fill. Flush the current strip and start a new
+                                // one at the end of the overlap region setting `fill_gap` to true,
+                                // so that the whole area before that will be filled with a sparse
+                                // fill.
+                                (Region::Fill(_), Region::Fill(_)) => {
+                                    flush_strip(&mut strip_state, &mut target.strips, cur_y);
+                                    start_strip(
+                                        &mut strip_state,
+                                        &target.alphas,
+                                        overlap.end,
+                                        true,
+                                    );
+                                }
+                                // One fill and one strip, so we simply use the alpha mask from the strip region.
+                                (Region::Strip(s), Region::Fill(_))
+                                | (Region::Fill(_), Region::Strip(s)) => {
+                                    // If possible, don't create a new strip but just extend the current one.
+                                    if should_create_new_strip(
+                                        &strip_state,
+                                        &target.alphas,
+                                        overlap.start,
+                                    ) {
+                                        flush_strip(&mut strip_state, &mut target.strips, cur_y);
+                                        start_strip(
+                                            &mut strip_state,
+                                            &target.alphas,
+                                            overlap.start,
+                                            false,
+                                        );
+                                    }
+
+                                    let s_alphas = &s.alphas
+                                        [(overlap.start - s.start) as usize * 4..]
+                                        [..overlap.width() as usize * 4];
+                                    target.alphas.extend_from_slice(s_alphas);
+                                }
+                                // Two strips, we need to multiply the opacity masks from both paths.
+                                (Region::Strip(s_region_1), Region::Strip(s_region_2)) => {
+                                    // Once again, only create a new strip if we can't extend the current one.
+                                    if should_create_new_strip(
+                                        &strip_state,
+                                        &target.alphas,
+                                        overlap.start,
+                                    ) {
+                                        flush_strip(&mut strip_state, &mut target.strips, cur_y);
+                                        start_strip(
+                                            &mut strip_state,
+                                            &target.alphas,
+                                            overlap.start,
+                                            false,
+                                        );
+                                    }
+
+                                    let num_blocks = overlap.width() / Tile::HEIGHT;
+
+                                    // Get the right alpha values for the specific position.
+                                    let s1_alphas = s_region_1.alphas
+                                        [(overlap.start - s_region_1.start) as usize * 4..]
+                                        .chunks_exact(16)
+                                        .take(num_blocks as usize);
+                                    let s2_alphas = s_region_2.alphas
+                                        [(overlap.start - s_region_2.start) as usize * 4..]
+                                        .chunks_exact(16)
+                                        .take(num_blocks as usize);
+
+                                    for (s1_alpha, s2_alpha) in s1_alphas.zip(s2_alphas) {
+                                        let s1 = u8x16::from_slice(simd, s1_alpha);
+                                        let s2 = u8x16::from_slice(simd, s2_alpha);
+
+                                        // Combine them.
+                                        let res = simd.narrow_u16x16(normalized_mul_u8x16(s1, s2));
+                                        target.alphas.extend(res.as_slice());
+                                    }
+                                }
                             }
 
-                            let num_blocks = overlap.width() / Tile::HEIGHT;
-
-                            // Get the right alpha values for the specific position.
-                            let s1_alphas = s_region_1.alphas
-                                [(overlap.start - s_region_1.start) as usize * 4..]
-                                .chunks_exact(16)
-                                .take(num_blocks as usize);
-                            let s2_alphas = s_region_2.alphas
-                                [(overlap.start - s_region_2.start) as usize * 4..]
-                                .chunks_exact(16)
-                                .take(num_blocks as usize);
-
-                            for (s1_alpha, s2_alpha) in s1_alphas.zip(s2_alphas) {
-                                let s1 = u8x16::from_slice(simd, s1_alpha);
-                                let s2 = u8x16::from_slice(simd, s2_alpha);
-
-                                // Combine them.
-                                let res = simd.narrow_u16x16(normalized_mul_u8x16(s1, s2));
-                                target.alphas.extend(res.as_slice());
-                            }
+                            // Advance the iterator of the path whose region's end is further behind.
+                            match overlap.advance {
+                                Advance::Left => p1_region = p1_iter.next(),
+                                Advance::Right => p2_region = p2_iter.next(),
+                            };
                         }
                     }
-
-                    // Advance the iterator of the path whose region's end is further behind.
-                    match overlap.advance {
-                        Advance::Left => p1_region = p1_iter.next(),
-                        Advance::Right => p2_region = p2_iter.next(),
-                    };
                 }
-            }
-        }
 
-        // Flush the strip before advancing to the next strip row.
-        flush_strip(&mut strip_state, &mut target.strips, cur_y);
-        cur_y += 1;
-    }
+                // Flush the strip before advancing to the next strip row.
+                flush_strip(&mut strip_state, &mut target.strips, cur_y);
+                cur_y += 1;
+            }
 
-    // Push the sentinel strip, if one wasn't already pushed.
-    if !target.strips.last().is_some_and(Strip::is_sentinel) {
-        target.strips.push(Strip::new(
-            u16::MAX,
-            end_y * Tile::HEIGHT,
-            target.alphas.len() as u32,
-            false,
-        ));
-    }
+            // Push the sentinel strip, if one wasn't already pushed.
+            if !target.strips.last().is_some_and(Strip::is_sentinel) {
+                target.strips.push(Strip::new(
+                    u16::MAX,
+                    end_y * Tile::HEIGHT,
+                    target.alphas.len() as u32,
+                    false,
+                ));
+            }
+        },
+    );
 }
 
 /// An overlap between two regions.
diff --git a/sparse_strips/vello_common/src/encode.rs b/sparse_strips/vello_common/src/encode.rs
index 5161fe021a..a309050d2a 100644
--- a/sparse_strips/vello_common/src/encode.rs
+++ b/sparse_strips/vello_common/src/encode.rs
@@ -778,15 +778,27 @@ pub struct EncodedGradient {
 
 impl EncodedGradient {
     /// Get the lookup table for sampling u8-based gradient values.
+    #[inline(always)]
     pub fn u8_lut<S: Simd>(&self, simd: S) -> &GradientLut<u8> {
-        self.u8_lut
-            .get_or_init(|| GradientLut::new(simd, &self.ranges))
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                self.u8_lut
+                    .get_or_init(|| GradientLut::new(simd, &self.ranges))
+            },
+        )
     }
 
     /// Get the lookup table for sampling f32-based gradient values.
+    #[inline(always)]
     pub fn f32_lut<S: Simd>(&self, simd: S) -> &GradientLut<f32> {
-        self.f32_lut
-            .get_or_init(|| GradientLut::new(simd, &self.ranges))
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                self.f32_lut
+                    .get_or_init(|| GradientLut::new(simd, &self.ranges))
+            },
+        )
     }
 }
 
@@ -985,24 +997,34 @@ pub trait FromF32Color: Sized + Debug + Copy + Clone {
 impl FromF32Color for f32 {
     const ZERO: Self = 0.0;
 
+    #[inline(always)]
     fn from_f32<S: Simd>(color: f32x4<S>) -> [Self; 4] {
-        color.into()
+        color.simd.vectorize(
+            #[inline(always)]
+            || color.into(),
+        )
     }
 }
 
 impl FromF32Color for u8 {
     const ZERO: Self = 0;
 
+    #[inline(always)]
     fn from_f32<S: Simd>(mut color: f32x4<S>) -> [Self; 4] {
-        let simd = color.simd;
-        color = color.mul_add(f32x4::splat(simd, 255.0), f32x4::splat(simd, 0.5));
-
-        [
-            color[0] as Self,
-            color[1] as Self,
-            color[2] as Self,
-            color[3] as Self,
-        ]
+        color.simd.vectorize(
+            #[inline(always)]
+            || {
+                let simd = color.simd;
+                color = color.mul_add(f32x4::splat(simd, 255.0), f32x4::splat(simd, 0.5));
+
+                [
+                    color[0] as Self,
+                    color[1] as Self,
+                    color[2] as Self,
+                    color[3] as Self,
+                ]
+            },
+        )
     }
 }
 
@@ -1015,68 +1037,79 @@ pub struct GradientLut<T: FromF32Color> {
 
 impl<T: FromF32Color> GradientLut<T> {
     /// Create a new lookup table.
+    #[inline(always)]
     fn new<S: Simd>(simd: S, ranges: &[GradientRange]) -> Self {
-        let lut_size = determine_lut_size(ranges);
-        let mut lut = vec![[T::ZERO; 4]; lut_size];
-
-        // Calculate how many indices are covered by each range.
-        let ramps = {
-            let mut ramps = Vec::with_capacity(ranges.len());
-            let mut prev_idx = 0;
-
-            for range in ranges {
-                let max_idx = (range.x1 * lut_size as f32) as usize;
-
-                ramps.push((prev_idx..max_idx, range));
-                prev_idx = max_idx;
-            }
-
-            ramps
-        };
-
-        let scale = lut_size as f32 - 1.0;
-
-        let inv_lut_scale = f32x4::splat(simd, 1.0 / scale);
-        let add_factor = f32x4::from_slice(simd, &[0.0, 1.0, 2.0, 3.0]) * inv_lut_scale;
-
-        for (ramp_range, range) in ramps {
-            let biases = f32x16::block_splat(f32x4::from_slice(simd, &range.bias));
-            let scales = f32x16::block_splat(f32x4::from_slice(simd, &range.scale));
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let lut_size = determine_lut_size(ranges);
+                let mut lut = vec![[T::ZERO; 4]; lut_size];
+
+                // Calculate how many indices are covered by each range.
+                let ramps = {
+                    let mut ramps = Vec::with_capacity(ranges.len());
+                    let mut prev_idx = 0;
+
+                    for range in ranges {
+                        let max_idx = (range.x1 * lut_size as f32) as usize;
+
+                        ramps.push((prev_idx..max_idx, range));
+                        prev_idx = max_idx;
+                    }
 
-            ramp_range.clone().step_by(4).for_each(|idx| {
-                let t_vals = f32x4::splat(simd, idx as f32).mul_add(inv_lut_scale, add_factor);
+                    ramps
+                };
 
-                let t_vals = element_wise_splat(simd, t_vals);
+                let scale = lut_size as f32 - 1.0;
+
+                let inv_lut_scale = f32x4::splat(simd, 1.0 / scale);
+                let add_factor = f32x4::from_slice(simd, &[0.0, 1.0, 2.0, 3.0]) * inv_lut_scale;
+
+                for (ramp_range, range) in ramps {
+                    let biases = f32x16::block_splat(f32x4::from_slice(simd, &range.bias));
+                    let scales = f32x16::block_splat(f32x4::from_slice(simd, &range.scale));
+
+                    ramp_range.clone().step_by(4).for_each(|idx| {
+                        let t_vals =
+                            f32x4::splat(simd, idx as f32).mul_add(inv_lut_scale, add_factor);
+
+                        let t_vals = element_wise_splat(simd, t_vals);
+
+                        let mut result = scales.mul_add(t_vals, biases);
+                        let alphas = result.splat_4th();
+                        // Premultiply colors, since we did interpolation in unpremultiplied space.
+                        if range.interpolation_alpha_space
+                            == InterpolationAlphaSpace::Unpremultiplied
+                        {
+                            result = {
+                                let mask = mask32x16::block_splat(mask32x4::from_slice(
+                                    simd,
+                                    &[-1, -1, -1, 0],
+                                ));
+                                simd.select_f32x16(mask, result * alphas, alphas)
+                            };
+                        }
 
-                let mut result = scales.mul_add(t_vals, biases);
-                let alphas = result.splat_4th();
-                // Premultiply colors, since we did interpolation in unpremultiplied space.
-                if range.interpolation_alpha_space == InterpolationAlphaSpace::Unpremultiplied {
-                    result = {
-                        let mask =
-                            mask32x16::block_splat(mask32x4::from_slice(simd, &[-1, -1, -1, 0]));
-                        simd.select_f32x16(mask, result * alphas, alphas)
-                    };
+                        // Due to floating-point imprecision, it can happen that
+                        // values either become greater than 1 or the RGB channels
+                        // become greater than the alpha channel. To prevent overflows
+                        // in later parts of the pipeline, we need to take the minimum here.
+                        result = result.min(1.0).min(alphas);
+                        let (im1, im2) = simd.split_f32x16(result);
+                        let (r1, r2) = simd.split_f32x8(im1);
+                        let (r3, r4) = simd.split_f32x8(im2);
+                        let rs = [r1, r2, r3, r4].map(T::from_f32);
+
+                        // We always compute 4 samples at a time, but a gradient ramp does not necessarily
+                        // start at a multiple of 4, therefore we might have to truncate.
+                        let lut = &mut lut[idx..(idx + 4).min(lut_size)];
+                        lut.copy_from_slice(&rs[..lut.len()]);
+                    });
                 }
 
-                // Due to floating-point impreciseness, it can happen that
-                // values either become greater than 1 or the RGB channels
-                // become greater than the alpha channel. To prevent overflows
-                // in later parts of the pipeline, we need to take the minimum here.
-                result = result.min(1.0).min(alphas);
-                let (im1, im2) = simd.split_f32x16(result);
-                let (r1, r2) = simd.split_f32x8(im1);
-                let (r3, r4) = simd.split_f32x8(im2);
-                let rs = [r1, r2, r3, r4].map(T::from_f32);
-
-                // We always compute 4 samples at a time, but a gradient ramp does not necessarily
-                // start at a multiple of 4, therefore we might have to truncate.
-                let lut = &mut lut[idx..(idx + 4).min(lut_size)];
-                lut.copy_from_slice(&rs[..lut.len()]);
-            });
-        }
-
-        Self { lut, scale }
+                Self { lut, scale }
+            },
+        )
     }
 
     /// Get the sample value at a specific index.
diff --git a/sparse_strips/vello_common/src/rect.rs b/sparse_strips/vello_common/src/rect.rs
index 55f98b7160..8513072d12 100644
--- a/sparse_strips/vello_common/src/rect.rs
+++ b/sparse_strips/vello_common/src/rect.rs
@@ -40,96 +40,102 @@ pub fn render(level: Level, rect: Rect, strip_buf: &mut Vec<Strip>, alpha_buf: &
 ///
 /// The x-alpha masks for the left/right edge tiles are y-independent, so they
 /// are precomputed once and reused across all interior rows.
+#[inline(always)]
 fn render_impl<S: Simd>(s: S, rect: Rect, strip_buf: &mut Vec<Strip>, alpha_buf: &mut Vec<u8>) {
-    if rect.is_zero_area() {
-        return;
-    }
-
-    let rect_x0 = rect.x0 as f32;
-    let rect_y0 = rect.y0 as f32;
-    let rect_x1 = rect.x1 as f32;
-    let rect_y1 = rect.y1 as f32;
-
-    // Integer pixel bounds.
-    let px_x0 = rect_x0.floor() as u16;
-    let px_y0 = rect_y0.floor() as u16;
-    let px_y1 = rect_y1.ceil() as u16;
-
-    let left_tile_x = (px_x0 / Tile::WIDTH) * Tile::WIDTH;
-    // Inclusive, so don't use `ceil` here but just `rect_x1` directly.
-    let right_tile_x = (rect_x1 as u16 / Tile::WIDTH) * Tile::WIDTH;
-
-    let y0 = (px_y0 / Tile::HEIGHT) * Tile::HEIGHT;
-    // Note: y1 is exclusive, but it's gonna break for the very last tile if we have a height of u16::MAX.
-    let y1 = (px_y1.saturating_add(Tile::HEIGHT - 1) / Tile::HEIGHT) * Tile::HEIGHT;
-    // Include one tile past the right edge so the right-edge tile column is
-    // covered by the edge-row wide-strip loop.
-    let x_end = right_tile_x.saturating_add(Tile::WIDTH);
-
-    if x_end <= left_tile_x || y1 <= y0 {
-        return;
-    }
+    s.vectorize(
+        #[inline(always)]
+        || {
+            if rect.is_zero_area() {
+                return;
+            }
 
-    let tile_start_y = y0 / Tile::HEIGHT;
-    let tile_end_y = y1 / Tile::HEIGHT;
-
-    // A right strip is only needed when the rect spans more than one tile column.
-    let needs_right_strip = right_tile_x > left_tile_x;
-
-    let left_x_cov = coverage(left_tile_x, rect_x0, rect_x1);
-    let right_x_cov = coverage(right_tile_x, rect_x0, rect_x1);
-    let left_x_mask = alpha_mask_from_x_coverage(s, &left_x_cov);
-    let right_x_mask = alpha_mask_from_x_coverage(s, &right_x_cov);
-
-    for tile_y in tile_start_y..tile_end_y {
-        let strip_y = tile_y * Tile::HEIGHT;
-        let strip_y_f = strip_y as f32;
-        let strip_y_end_f = strip_y as f32 + Tile::HEIGHT as f32;
-
-        // A row is an "edge" if the rect's top or bottom boundary falls
-        // *inside* it (i.e. partial vertical coverage).
-        let is_top_edge = strip_y_f < rect_y0 && rect_y0 < strip_y_end_f;
-        let is_bottom_edge = strip_y_f < rect_y1 && rect_y1 < strip_y_end_f;
-
-        if is_top_edge || is_bottom_edge {
-            let alpha_start = alpha_buf.len() as u32;
-
-            let y_cov = coverage(strip_y, rect_y0, rect_y1);
-            let mut col = left_tile_x;
-            // TODO: Can this result in an infinite loop in case x_end == u16::MAX?
-            while col + Tile::WIDTH <= x_end {
-                // TODO: We could optimize this so this is only computed for the left-most and right-most
-                // tile of the edge, all intermediate tiles have full horizontal coverage.
-                let x_cov = coverage(col, rect_x0, rect_x1);
-                let combined = combined_tile_alpha(s, &x_cov, &y_cov);
-                alpha_buf.extend_from_slice(combined.as_slice());
-                col += Tile::WIDTH;
+            let rect_x0 = rect.x0 as f32;
+            let rect_y0 = rect.y0 as f32;
+            let rect_x1 = rect.x1 as f32;
+            let rect_y1 = rect.y1 as f32;
+
+            // Integer pixel bounds.
+            let px_x0 = rect_x0.floor() as u16;
+            let px_y0 = rect_y0.floor() as u16;
+            let px_y1 = rect_y1.ceil() as u16;
+
+            let left_tile_x = (px_x0 / Tile::WIDTH) * Tile::WIDTH;
+            // Inclusive, so don't use `ceil` here but just `rect_x1` directly.
+            let right_tile_x = (rect_x1 as u16 / Tile::WIDTH) * Tile::WIDTH;
+
+            let y0 = (px_y0 / Tile::HEIGHT) * Tile::HEIGHT;
+            // Note: y1 is exclusive, but this breaks for the very last tile if we have a height of `u16::MAX`.
+            let y1 = (px_y1.saturating_add(Tile::HEIGHT - 1) / Tile::HEIGHT) * Tile::HEIGHT;
+            // Include one tile past the right edge so the right-edge tile column is
+            // covered by the edge-row wide-strip loop.
+            let x_end = right_tile_x.saturating_add(Tile::WIDTH);
+
+            if x_end <= left_tile_x || y1 <= y0 {
+                return;
             }
 
-            strip_buf.push(Strip::new(left_tile_x, strip_y, alpha_start, false));
-        } else {
-            let alpha_start = alpha_buf.len() as u32;
-            alpha_buf.extend_from_slice(left_x_mask.as_slice());
-            strip_buf.push(Strip::new(left_tile_x, strip_y, alpha_start, false));
-
-            if needs_right_strip {
-                // `fill_gap = true` tells the renderer to fill solid 0xFF
-                // between the previous strip's end and this strip's start.
-                let alpha_start = alpha_buf.len() as u32;
-                alpha_buf.extend_from_slice(right_x_mask.as_slice());
-                strip_buf.push(Strip::new(right_tile_x, strip_y, alpha_start, true));
+            let tile_start_y = y0 / Tile::HEIGHT;
+            let tile_end_y = y1 / Tile::HEIGHT;
+
+            // A right strip is only needed when the rect spans more than one tile column.
+            let needs_right_strip = right_tile_x > left_tile_x;
+
+            let left_x_cov = coverage(left_tile_x, rect_x0, rect_x1);
+            let right_x_cov = coverage(right_tile_x, rect_x0, rect_x1);
+            let left_x_mask = alpha_mask_from_x_coverage(s, &left_x_cov);
+            let right_x_mask = alpha_mask_from_x_coverage(s, &right_x_cov);
+
+            for tile_y in tile_start_y..tile_end_y {
+                let strip_y = tile_y * Tile::HEIGHT;
+                let strip_y_f = strip_y as f32;
+                let strip_y_end_f = strip_y as f32 + Tile::HEIGHT as f32;
+
+                // A row is an "edge" if the rect's top or bottom boundary falls
+                // *inside* it (i.e. partial vertical coverage).
+                let is_top_edge = strip_y_f < rect_y0 && rect_y0 < strip_y_end_f;
+                let is_bottom_edge = strip_y_f < rect_y1 && rect_y1 < strip_y_end_f;
+
+                if is_top_edge || is_bottom_edge {
+                    let alpha_start = alpha_buf.len() as u32;
+
+                    let y_cov = coverage(strip_y, rect_y0, rect_y1);
+                    let mut col = left_tile_x;
+                    // TODO: Can this result in an infinite loop in case x_end == u16::MAX?
+                    while col + Tile::WIDTH <= x_end {
+                        // TODO: We could optimize this so this is only computed for the left-most and right-most
+                        // tile of the edge, all intermediate tiles have full horizontal coverage.
+                        let x_cov = coverage(col, rect_x0, rect_x1);
+                        let combined = combined_tile_alpha(s, &x_cov, &y_cov);
+                        alpha_buf.extend_from_slice(combined.as_slice());
+                        col += Tile::WIDTH;
+                    }
+
+                    strip_buf.push(Strip::new(left_tile_x, strip_y, alpha_start, false));
+                } else {
+                    let alpha_start = alpha_buf.len() as u32;
+                    alpha_buf.extend_from_slice(left_x_mask.as_slice());
+                    strip_buf.push(Strip::new(left_tile_x, strip_y, alpha_start, false));
+
+                    if needs_right_strip {
+                        // `fill_gap = true` tells the renderer to fill solid 0xFF
+                        // between the previous strip's end and this strip's start.
+                        let alpha_start = alpha_buf.len() as u32;
+                        alpha_buf.extend_from_slice(right_x_mask.as_slice());
+                        strip_buf.push(Strip::new(right_tile_x, strip_y, alpha_start, true));
+                    }
+                }
             }
-        }
-    }
 
-    // Sentinel strip: marks the end of the strip list for this shape.
-    let last_strip_y = (tile_end_y - 1) * Tile::HEIGHT;
-    strip_buf.push(Strip::new(
-        u16::MAX,
-        last_strip_y,
-        alpha_buf.len() as u32,
-        false,
-    ));
+            // Sentinel strip: marks the end of the strip list for this shape.
+            let last_strip_y = (tile_end_y - 1) * Tile::HEIGHT;
+            strip_buf.push(Strip::new(
+                u16::MAX,
+                last_strip_y,
+                alpha_buf.len() as u32,
+                false,
+            ));
+        },
+    );
 }
 
 /// Compute fractional pixel coverage for `N` consecutive pixels starting at `start`.
diff --git a/sparse_strips/vello_common/src/tile.rs b/sparse_strips/vello_common/src/tile.rs
index b64e2d7024..ef3dcb078f 100644
--- a/sparse_strips/vello_common/src/tile.rs
+++ b/sparse_strips/vello_common/src/tile.rs
@@ -503,6 +503,7 @@ impl Tiles {
         ))
     }
 
+    #[inline(always)]
     fn make_tiles_analytic_aa_impl<S: Simd>(
         &mut self,
         s: S,
@@ -510,408 +511,423 @@ impl Tiles {
         width: u16,
         height: u16,
     ) -> bool {
-        self.reset();
+        s.vectorize(
+            #[inline(always)]
+            || {
+                self.reset();
 
-        if width == 0 || height == 0 {
-            return self.windings.culled;
-        }
+                if width == 0 || height == 0 {
+                    return self.windings.culled;
+                }
 
-        debug_assert!(
-            lines.len() <= MAX_LINES_PER_PATH as usize,
-            "Max. number of lines per path exceeded. Max is {}, got {}.",
-            MAX_LINES_PER_PATH,
-            lines.len()
-        );
+                debug_assert!(
+                    lines.len() <= MAX_LINES_PER_PATH as usize,
+                    "Max. number of lines per path exceeded. Max is {}, got {}.",
+                    MAX_LINES_PER_PATH,
+                    lines.len()
+                );
 
-        let tile_columns = width.div_ceil(Tile::WIDTH);
-        let tile_rows = height.div_ceil(Tile::HEIGHT);
+                let tile_columns = width.div_ceil(Tile::WIDTH);
+                let tile_rows = height.div_ceil(Tile::HEIGHT);
 
-        let px_top = f32x4::from_slice(s, &[0.0, 1.0, 2.0, 3.0]);
-        let px_bottom = px_top + f32x4::splat(s, 1.0);
-        let simd_zero = f32x4::splat(s, 0.0);
-        let tile_height_f32 = Tile::HEIGHT as f32;
+                let px_top = f32x4::from_slice(s, &[0.0, 1.0, 2.0, 3.0]);
+                let px_bottom = px_top + f32x4::splat(s, 1.0);
+                let simd_zero = f32x4::splat(s, 0.0);
+                let tile_height_f32 = Tile::HEIGHT as f32;
 
-        for (line_idx, line) in lines.iter().take(MAX_LINES_PER_PATH as usize).enumerate() {
-            let line_idx = line_idx as u32;
+                for (line_idx, line) in lines.iter().take(MAX_LINES_PER_PATH as usize).enumerate() {
+                    let line_idx = line_idx as u32;
 
-            let p0_x = line.p0.x / f32::from(Tile::WIDTH);
-            let p0_y = line.p0.y / f32::from(Tile::HEIGHT);
-            let p1_x = line.p1.x / f32::from(Tile::WIDTH);
-            let p1_y = line.p1.y / f32::from(Tile::HEIGHT);
+                    let p0_x = line.p0.x / f32::from(Tile::WIDTH);
+                    let p0_y = line.p0.y / f32::from(Tile::HEIGHT);
+                    let p1_x = line.p1.x / f32::from(Tile::WIDTH);
+                    let p1_y = line.p1.y / f32::from(Tile::HEIGHT);
 
-            let (line_left_x, line_right_x) = if p0_x < p1_x {
-                (p0_x, p1_x)
-            } else {
-                (p1_x, p0_x)
-            };
+                    let (line_left_x, line_right_x) = if p0_x < p1_x {
+                        (p0_x, p1_x)
+                    } else {
+                        (p1_x, p0_x)
+                    };
 
-            // Lines whose left-most endpoint exceed the right edge of the viewport are culled
-            if line_left_x > tile_columns as f32 {
-                continue;
-            }
+                    // Lines whose left-most endpoint exceeds the right edge of the viewport are culled.
+                    if line_left_x > tile_columns as f32 {
+                        continue;
+                    }
 
-            let (line_top_y, line_top_x, line_bottom_y, line_bottom_x) = if p0_y < p1_y {
-                (p0_y, p0_x, p1_y, p1_x)
-            } else {
-                (p1_y, p1_x, p0_y, p0_x)
-            };
+                    let (line_top_y, line_top_x, line_bottom_y, line_bottom_x) = if p0_y < p1_y {
+                        (p0_y, p0_x, p1_y, p1_x)
+                    } else {
+                        (p1_y, p1_x, p0_y, p0_x)
+                    };
 
-            // The `as u16` casts here intentionally clamp negative coordinates to 0.
-            let y_top_tiles = (line_top_y as u16).min(tile_rows);
-            let line_bottom_y_ceil = line_bottom_y.ceil();
-            let y_bottom_tiles = (line_bottom_y_ceil as u16).min(tile_rows);
+                    // The `as u16` casts here intentionally clamp negative coordinates to 0.
+                    let y_top_tiles = (line_top_y as u16).min(tile_rows);
+                    let line_bottom_y_ceil = line_bottom_y.ceil();
+                    let y_bottom_tiles = (line_bottom_y_ceil as u16).min(tile_rows);
+
+                    // If y_top_tiles == y_bottom_tiles, then the line is either completely above or below
+                    // the viewport OR it is perfectly horizontal and aligned to the tile grid, contributing
+                    // no winding. In either case, it should be culled.
+                    if y_top_tiles >= y_bottom_tiles {
+                        // Technically, the `>` part of the `>=` is unnecessary due to clamping, but it
+                        // makes the intent of the check more explicit.
+                        continue;
+                    }
 
-            // If y_top_tiles == y_bottom_tiles, then the line is either completely above or below
-            // the viewport OR it is perfectly horizontal and aligned to the tile grid, contributing
-            // no winding. In either case, it should be culled.
-            if y_top_tiles >= y_bottom_tiles {
-                // Technically, the `>` part of the `>=` is unnecessary due to clamping, but this
-                // gives stronger signal
-                continue;
-            }
+                    let dir = if p0_y >= p1_y { 1 } else { -1 };
+                    let f_dir = dir as f32;
+                    let f_dir_v = f32x4::splat(s, f_dir);
 
-            let dir = if p0_y >= p1_y { 1 } else { -1 };
-            let f_dir = dir as f32;
-            let f_dir_v = f32x4::splat(s, f_dir);
+                    macro_rules! calc_fractional_coverage {
+                        ($y_idx:expr, $segment_top_y:expr, $segment_bottom_y:expr) => {{
+                            let y_idx_f32 = f32::from($y_idx);
+                            let local_y_start = ($segment_top_y - y_idx_f32) * tile_height_f32;
+                            let local_y_end = ($segment_bottom_y - y_idx_f32) * tile_height_f32;
 
-            macro_rules! calc_fractional_coverage {
-                ($y_idx:expr, $segment_top_y:expr, $segment_bottom_y:expr) => {{
-                    let y_idx_f32 = f32::from($y_idx);
-                    let local_y_start = ($segment_top_y - y_idx_f32) * tile_height_f32;
-                    let local_y_end = ($segment_bottom_y - y_idx_f32) * tile_height_f32;
+                            let start_v = f32x4::splat(s, local_y_start);
+                            let end_v = f32x4::splat(s, local_y_end);
 
-                    let start_v = f32x4::splat(s, local_y_start);
-                    let end_v = f32x4::splat(s, local_y_end);
+                            (px_bottom.min(end_v) - px_top.max(start_v)).max(simd_zero)
+                        }};
+                    }
 
-                    (px_bottom.min(end_v) - px_top.max(start_v)).max(simd_zero)
-                }};
-            }
+                    // Lines fully to the left of the viewport are not visible but still produce winding
+                    // which we record here and forward to the rendering stage.
+                    if line_right_x < 0.0 {
+                        let is_start_culled = line_top_y < 0.0;
 
-            // Lines fully to the left of the viewport are not visible but still produce winding
-            // which we record here and forward to the rendering stage.
-            if line_right_x < 0.0 {
-                let is_start_culled = line_top_y < 0.0;
-
-                // This branch is for handling the "start" of the line. In case
-                // the line reaches above the viewport, we are already in the
-                // middle so we can skip that part.
-                if !is_start_culled {
-                    self.windings.mark_row_active(y_top_tiles as usize);
-
-                    // Note: In theory, == should be enough, but just as
-                    // additional safety against numerical precision errors we
-                    // use <=.
-                    let at_top_of_tile = line_top_y <= f32::from(y_top_tiles);
-                    if at_top_of_tile {
-                        self.windings.coarse[y_top_tiles as usize] += dir;
-                    }
+                        // This branch is for handling the "start" of the line. In case
+                        // the line reaches above the viewport, we are already in the
+                        // middle so we can skip that part.
+                        if !is_start_culled {
+                            self.windings.mark_row_active(y_top_tiles as usize);
+
+                            // Note: In theory, == should be enough, but just as
+                            // additional safety against numerical precision errors we
+                            // use <=.
+                            let at_top_of_tile = line_top_y <= f32::from(y_top_tiles);
+                            if at_top_of_tile {
+                                self.windings.coarse[y_top_tiles as usize] += dir;
+                            }
 
-                    let fractional_coverage =
-                        calc_fractional_coverage!(y_top_tiles, line_top_y, line_bottom_y);
-                    let target_row = &mut self.windings.partial[y_top_tiles as usize];
-                    let current = f32x4::from_slice(s, target_row);
+                            let fractional_coverage =
+                                calc_fractional_coverage!(y_top_tiles, line_top_y, line_bottom_y);
+                            let target_row = &mut self.windings.partial[y_top_tiles as usize];
+                            let current = f32x4::from_slice(s, target_row);
 
-                    // See comment below on the double counting risk!
-                    let double_count = if at_top_of_tile {
-                        f_dir_v
-                    } else {
-                        f32x4::splat(s, 0.0)
-                    };
-                    let next = fractional_coverage.mul_add(f_dir_v, current - double_count);
-                    next.store_slice(target_row);
-                }
+                            // See comment below on the double counting risk!
+                            let double_count = if at_top_of_tile {
+                                f_dir_v
+                            } else {
+                                f32x4::splat(s, 0.0)
+                            };
+                            let next = fractional_coverage.mul_add(f_dir_v, current - double_count);
+                            next.store_slice(target_row);
+                        }
 
-                let y_start_middle = if is_start_culled {
-                    y_top_tiles
-                } else {
-                    y_top_tiles + 1
-                };
-                let line_bottom_floor = line_bottom_y.floor();
-                let y_end_middle = (line_bottom_floor as u16).min(tile_rows);
+                        let y_start_middle = if is_start_culled {
+                            y_top_tiles
+                        } else {
+                            y_top_tiles + 1
+                        };
+                        let line_bottom_floor = line_bottom_y.floor();
+                        let y_end_middle = (line_bottom_floor as u16).min(tile_rows);
 
-                for y_idx in y_start_middle..y_end_middle {
-                    self.windings.coarse[y_idx as usize] += dir;
-                }
-                self.windings
-                    .mark_row_range_active(y_start_middle as usize, y_end_middle as usize);
+                        for y_idx in y_start_middle..y_end_middle {
+                            self.windings.coarse[y_idx as usize] += dir;
+                        }
+                        self.windings
+                            .mark_row_range_active(y_start_middle as usize, y_end_middle as usize);
 
-                if line_bottom_y != line_bottom_floor
+                        if line_bottom_y != line_bottom_floor
                     && y_end_middle < tile_rows
                     // Prevent double-processing, unless the start was off-screen and hasn't been
                     // handled yet.
                     && (is_start_culled || y_end_middle != y_top_tiles)
-                {
-                    self.windings.mark_row_active(y_end_middle as usize);
-                    // Ends implicitly cross the top.
-                    self.windings.coarse[y_end_middle as usize] += dir;
-                    let fractional_coverage =
-                        calc_fractional_coverage!(y_end_middle, line_top_y, line_bottom_y);
-                    let target_row = &mut self.windings.partial[y_end_middle as usize];
-                    let current = f32x4::from_slice(s, target_row);
-                    // Subtract the inverse direction to avoid double counting with the coarse winding.
-                    let next = fractional_coverage.mul_add(f_dir_v, current - f_dir_v);
-                    next.store_slice(target_row);
-                }
-
-                self.windings.culled = true;
-                continue;
-            }
-
-            // Get tile coordinates for start/end points, use i32 to preserve negative coordinates.
-            let p0_tile_x = line_top_x.floor() as i32;
-            let p0_tile_y = line_top_y.floor() as i32;
-            let p1_tile_x = line_bottom_x.floor() as i32;
-            let p1_tile_y = line_bottom_y.floor() as i32;
-
-            // Special-case out lines which are fully contained within a tile.
-            let not_same_tile = p0_tile_y != p1_tile_y || p0_tile_x != p1_tile_x;
-            if not_same_tile {
-                // Case vertical lines: By definition, these cannot be horizontally crossing, and
-                // thus require no additional left-edge culling handling.
-                if line_left_x == line_right_x {
-                    let x = (line_left_x as u16).min(tile_columns.saturating_sub(1));
-
-                    // Row Start, not culled.
-                    let is_start_culled = line_top_y < 0.0;
-                    if !is_start_culled {
-                        let winding =
-                            ((f32::from(y_top_tiles) >= line_top_y) as u32) << WINDING_SHIFT;
-                        let tile = Tile::new_clamped(x, y_top_tiles, line_idx, winding);
-                        self.tile_buf.push(tile);
-                    }
-
-                    // Middle
-                    // If the start was culled, the first tile inside the viewport is a middle.
-                    let y_start = if is_start_culled {
-                        y_top_tiles
-                    } else {
-                        y_top_tiles + 1
-                    };
+                        {
+                            self.windings.mark_row_active(y_end_middle as usize);
+                            // Ends implicitly cross the top.
+                            self.windings.coarse[y_end_middle as usize] += dir;
+                            let fractional_coverage =
+                                calc_fractional_coverage!(y_end_middle, line_top_y, line_bottom_y);
+                            let target_row = &mut self.windings.partial[y_end_middle as usize];
+                            let current = f32x4::from_slice(s, target_row);
+                            // Subtract the direction to avoid double counting with the coarse winding added above.
+                            let next = fractional_coverage.mul_add(f_dir_v, current - f_dir_v);
+                            next.store_slice(target_row);
+                        }
 
-                    for y_idx in y_start..y_bottom_tiles {
-                        let tile = Tile::new_clamped(x, y_idx, line_idx, W);
-                        self.tile_buf.push(tile);
+                        self.windings.culled = true;
+                        continue;
                     }
-                } else {
-                    // General case, any line which crosses more than one tile and is not vertical.
-                    let dx = p1_x - p0_x;
-                    let dy = p1_y - p0_y;
-                    let x_slope = dx / dy;
-                    let dx_dir = (line_bottom_x >= line_top_x) as u32;
-                    let not_dx_dir = dx_dir ^ 1;
 
-                    let w_start_base = dx_dir << WINDING_SHIFT;
-                    let w_end_base = not_dx_dir << WINDING_SHIFT;
-
-                    let push_row_extents = {
-                        #[inline(always)]
-                        |tile_buf: &mut Vec<Tile>,
-                         y_idx: u16,
-                         row_left_x: f32,
-                         row_right_x: f32,
-                         w_start: u32,
-                         w_end: u32,
-                         w_single: u32| {
-                            let x_start = row_left_x as u16;
-                            let x_end = (row_right_x as u16).min(tile_columns - 1);
-
-                            if x_start <= x_end {
-                                let winding = if x_start == x_end { w_single } else { w_start };
-
-                                tile_buf.push(Tile::new(x_start, y_idx, line_idx, winding));
+                    // Get tile coordinates for start/end points, use i32 to preserve negative coordinates.
+                    let p0_tile_x = line_top_x.floor() as i32;
+                    let p0_tile_y = line_top_y.floor() as i32;
+                    let p1_tile_x = line_bottom_x.floor() as i32;
+                    let p1_tile_y = line_bottom_y.floor() as i32;
+
+                    // Special-case out lines which are fully contained within a tile.
+                    let not_same_tile = p0_tile_y != p1_tile_y || p0_tile_x != p1_tile_x;
+                    if not_same_tile {
+                        // Case vertical lines: By definition, these cannot be horizontally crossing, and
+                        // thus require no additional left-edge culling handling.
+                        if line_left_x == line_right_x {
+                            let x = (line_left_x as u16).min(tile_columns.saturating_sub(1));
+
+                            // Row Start, not culled.
+                            let is_start_culled = line_top_y < 0.0;
+                            if !is_start_culled {
+                                let winding = ((f32::from(y_top_tiles) >= line_top_y) as u32)
+                                    << WINDING_SHIFT;
+                                let tile = Tile::new_clamped(x, y_top_tiles, line_idx, winding);
+                                self.tile_buf.push(tile);
+                            }
 
-                                for x_idx in x_start.saturating_add(1)..x_end {
-                                    tile_buf.push(Tile::new(x_idx, y_idx, line_idx, 0));
-                                }
+                            // Middle
+                            // If the start was culled, the first tile inside the viewport is a middle.
+                            let y_start = if is_start_culled {
+                                y_top_tiles
+                            } else {
+                                y_top_tiles + 1
+                            };
 
-                                if x_start < x_end {
-                                    tile_buf.push(Tile::new(x_end, y_idx, line_idx, w_end));
-                                }
+                            for y_idx in y_start..y_bottom_tiles {
+                                let tile = Tile::new_clamped(x, y_idx, line_idx, W);
+                                self.tile_buf.push(tile);
                             }
-                        }
-                    };
+                        } else {
+                            // General case, any line which crosses more than one tile and is not vertical.
+                            let dx = p1_x - p0_x;
+                            let dy = p1_y - p0_y;
+                            let x_slope = dx / dy;
+                            let dx_dir = (line_bottom_x >= line_top_x) as u32;
+                            let not_dx_dir = dx_dir ^ 1;
+
+                            let w_start_base = dx_dir << WINDING_SHIFT;
+                            let w_end_base = not_dx_dir << WINDING_SHIFT;
+
+                            let push_row_extents = {
+                                #[inline(always)]
+                                |tile_buf: &mut Vec<Tile>,
+                                 y_idx: u16,
+                                 row_left_x: f32,
+                                 row_right_x: f32,
+                                 w_start: u32,
+                                 w_end: u32,
+                                 w_single: u32| {
+                                    let x_start = row_left_x as u16;
+                                    let x_end = (row_right_x as u16).min(tile_columns - 1);
+
+                                    if x_start <= x_end {
+                                        let winding =
+                                            if x_start == x_end { w_single } else { w_start };
+
+                                        tile_buf.push(Tile::new(x_start, y_idx, line_idx, winding));
+
+                                        for x_idx in x_start.saturating_add(1)..x_end {
+                                            tile_buf.push(Tile::new(x_idx, y_idx, line_idx, 0));
+                                        }
+
+                                        if x_start < x_end {
+                                            tile_buf.push(Tile::new(x_end, y_idx, line_idx, w_end));
+                                        }
+                                    }
+                                }
+                            };
 
-                    let mut push_row = {
-                        #[inline(always)]
-                        |y_idx: u16,
-                         row_top_y: f32,
-                         row_bottom_y: f32,
-                         w_start: u32,
-                         w_end: u32,
-                         w_single: u32| {
-                            let row_top_x = p0_x + (row_top_y - p0_y) * x_slope;
-                            let row_bottom_x = p0_x + (row_bottom_y - p0_y) * x_slope;
-
-                            // TODO: Evaluate whether we need the second max/min.
-                            let row_left_x = f32::min(row_top_x, row_bottom_x).max(line_left_x);
-                            let row_right_x = f32::max(row_top_x, row_bottom_x).min(line_right_x);
-
-                            if row_left_x < 0.0 {
-                                self.windings.culled = true;
-
-                                if row_right_x < 0.0 {
-                                    // Although the line may cross the left edge, the rightmost point in
-                                    // this row may still be fully left of the viewport. In this case,
-                                    // record the winding and emit no tiles.
-                                    self.windings.mark_row_active(y_idx as usize);
-
-                                    let crosses_top = (w_single & W) != 0;
-                                    if crosses_top {
-                                        self.windings.coarse[y_idx as usize] += dir;
+                            let mut push_row = {
+                                #[inline(always)]
+                                |y_idx: u16,
+                                 row_top_y: f32,
+                                 row_bottom_y: f32,
+                                 w_start: u32,
+                                 w_end: u32,
+                                 w_single: u32| {
+                                    let row_top_x = p0_x + (row_top_y - p0_y) * x_slope;
+                                    let row_bottom_x = p0_x + (row_bottom_y - p0_y) * x_slope;
+
+                                    // TODO: Evaluate whether we need the second max/min.
+                                    let row_left_x =
+                                        f32::min(row_top_x, row_bottom_x).max(line_left_x);
+                                    let row_right_x =
+                                        f32::max(row_top_x, row_bottom_x).min(line_right_x);
+
+                                    if row_left_x < 0.0 {
+                                        self.windings.culled = true;
+
+                                        if row_right_x < 0.0 {
+                                            // Although the line may cross the left edge, the rightmost point in
+                                            // this row may still be fully left of the viewport. In this case,
+                                            // record the winding and emit no tiles.
+                                            self.windings.mark_row_active(y_idx as usize);
+
+                                            let crosses_top = (w_single & W) != 0;
+                                            if crosses_top {
+                                                self.windings.coarse[y_idx as usize] += dir;
+                                            }
+
+                                            let fractional_coverage = calc_fractional_coverage!(
+                                                y_idx,
+                                                row_top_y,
+                                                row_bottom_y
+                                            );
+                                            let target_row =
+                                                &mut self.windings.partial[y_idx as usize];
+                                            let current = f32x4::from_slice(s, target_row);
+
+                                            let double_count = if crosses_top {
+                                                f_dir_v
+                                            } else {
+                                                f32x4::splat(s, 0.0)
+                                            };
+                                            let next = fractional_coverage
+                                                .mul_add(f_dir_v, current - double_count);
+                                            next.store_slice(target_row);
+
+                                            return;
+                                        } else {
+                                            // The line crosses into the viewport in this row. Record only the
+                                            // fractional portion of the winding, as the coarse winding will
+                                            // naturally get included by the clamped tile logic!
+                                            let y_slope = dy / dx;
+                                            let y_intersect = row_top_y - (row_top_x * y_slope);
+
+                                            let (off_screen_top_y, off_screen_bottom_y) =
+                                                if row_top_x < 0.0 {
+                                                    (row_top_y, f32::min(row_bottom_y, y_intersect))
+                                                } else {
+                                                    (f32::max(row_top_y, y_intersect), row_bottom_y)
+                                                };
+
+                                            if off_screen_top_y < off_screen_bottom_y {
+                                                self.windings.mark_row_active(y_idx as usize);
+                                                let fractional_coverage = calc_fractional_coverage!(
+                                                    y_idx,
+                                                    off_screen_top_y,
+                                                    off_screen_bottom_y
+                                                );
+                                                let target_row =
+                                                    &mut self.windings.partial[y_idx as usize];
+                                                let current = f32x4::from_slice(s, target_row);
+                                                let next =
+                                                    fractional_coverage.mul_add(f_dir_v, current);
+                                                next.store_slice(target_row);
+                                            }
+                                        }
                                     }
 
-                                    let fractional_coverage =
-                                        calc_fractional_coverage!(y_idx, row_top_y, row_bottom_y);
-                                    let target_row = &mut self.windings.partial[y_idx as usize];
-                                    let current = f32x4::from_slice(s, target_row);
+                                    push_row_extents(
+                                        &mut self.tile_buf,
+                                        y_idx,
+                                        row_left_x,
+                                        row_right_x,
+                                        w_start,
+                                        w_end,
+                                        w_single,
+                                    );
+                                }
+                            };
 
-                                    let double_count = if crosses_top {
-                                        f_dir_v
+                            let is_start_culled = line_top_y < 0.0;
+                            // This branch is taken in case the line is completely inside
+                            // the viewport, allowing us to save many calculations that
+                            // would otherwise be needed to make viewport culling work.
+                            if line_left_x >= 0.0 && line_right_x < tile_columns as f32 {
+                                if !is_start_culled {
+                                    let y = f32::from(y_top_tiles);
+                                    let row_bottom_y = (y + 1.0).min(line_bottom_y);
+                                    let row_bottom_x = if row_bottom_y == line_bottom_y {
+                                        line_bottom_x
                                     } else {
-                                        f32x4::splat(s, 0.0)
+                                        p0_x + (row_bottom_y - p0_y) * x_slope
                                     };
-                                    let next = fractional_coverage
-                                        .mul_add(f_dir_v, current - double_count);
-                                    next.store_slice(target_row);
+                                    let mask = ((y >= line_top_y) as u32) << WINDING_SHIFT;
+                                    push_row_extents(
+                                        &mut self.tile_buf,
+                                        y_top_tiles,
+                                        f32::min(line_top_x, row_bottom_x),
+                                        f32::max(line_top_x, row_bottom_x),
+                                        w_start_base & mask,
+                                        w_end_base & mask,
+                                        W & mask,
+                                    );
+                                }
 
-                                    return;
+                                let y_start = if is_start_culled {
+                                    y_top_tiles
                                 } else {
-                                    // The line crosses into the viewport in this row. Record only the
-                                    // fractional portion of the winding, as the coarse winding will
-                                    // naturally get included by the clamped tile logic!
-                                    let y_slope = dy / dx;
-                                    let y_intersect = row_top_y - (row_top_x * y_slope);
-
-                                    let (off_screen_top_y, off_screen_bottom_y) = if row_top_x < 0.0
-                                    {
-                                        (row_top_y, f32::min(row_bottom_y, y_intersect))
-                                    } else {
-                                        (f32::max(row_top_y, y_intersect), row_bottom_y)
-                                    };
+                                    y_top_tiles + 1
+                                };
 
-                                    if off_screen_top_y < off_screen_bottom_y {
-                                        self.windings.mark_row_active(y_idx as usize);
-                                        let fractional_coverage = calc_fractional_coverage!(
+                                if y_start < y_bottom_tiles {
+                                    let mut row_top_x =
+                                        p0_x + (f32::from(y_start) - p0_y) * x_slope;
+                                    for y_idx in y_start..y_bottom_tiles {
+                                        let y = f32::from(y_idx);
+                                        // Note: We purposefully don't precompute it once
+                                        // and just increment by `x_slope` after every iteration
+                                        // to avoid errors due to floating point inaccuracies.
+                                        let row_bottom_x = if line_bottom_y < y + 1.0 {
+                                            line_bottom_x
+                                        } else {
+                                            p0_x + (y + 1.0 - p0_y) * x_slope
+                                        };
+                                        push_row_extents(
+                                            &mut self.tile_buf,
                                             y_idx,
-                                            off_screen_top_y,
-                                            off_screen_bottom_y
+                                            f32::min(row_top_x, row_bottom_x),
+                                            f32::max(row_top_x, row_bottom_x),
+                                            w_start_base,
+                                            w_end_base,
+                                            W,
                                         );
-                                        let target_row = &mut self.windings.partial[y_idx as usize];
-                                        let current = f32x4::from_slice(s, target_row);
-                                        let next = fractional_coverage.mul_add(f_dir_v, current);
-                                        next.store_slice(target_row);
+                                        row_top_x = row_bottom_x;
                                     }
                                 }
-                            }
-
-                            push_row_extents(
-                                &mut self.tile_buf,
-                                y_idx,
-                                row_left_x,
-                                row_right_x,
-                                w_start,
-                                w_end,
-                                w_single,
-                            );
-                        }
-                    };
-
-                    let is_start_culled = line_top_y < 0.0;
-                    // This branch is taken in case the line is completely inside
-                    // the viewport, allowing us to save many calculations that
-                    // otherwise would need to be made viewport culling work.
-                    if line_left_x >= 0.0 && line_right_x < tile_columns as f32 {
-                        if !is_start_culled {
-                            let y = f32::from(y_top_tiles);
-                            let row_bottom_y = (y + 1.0).min(line_bottom_y);
-                            let row_bottom_x = if row_bottom_y == line_bottom_y {
-                                line_bottom_x
                             } else {
-                                p0_x + (row_bottom_y - p0_y) * x_slope
-                            };
-                            let mask = ((y >= line_top_y) as u32) << WINDING_SHIFT;
-                            push_row_extents(
-                                &mut self.tile_buf,
-                                y_top_tiles,
-                                f32::min(line_top_x, row_bottom_x),
-                                f32::max(line_top_x, row_bottom_x),
-                                w_start_base & mask,
-                                w_end_base & mask,
-                                W & mask,
-                            );
-                        }
-
-                        let y_start = if is_start_culled {
-                            y_top_tiles
-                        } else {
-                            y_top_tiles + 1
-                        };
+                                if !is_start_culled {
+                                    let y = f32::from(y_top_tiles);
+                                    let row_bottom_y = (y + 1.0).min(line_bottom_y);
+                                    let mask = ((y >= line_top_y) as u32) << WINDING_SHIFT;
+                                    push_row(
+                                        y_top_tiles,
+                                        line_top_y,
+                                        row_bottom_y,
+                                        w_start_base & mask,
+                                        w_end_base & mask,
+                                        W & mask,
+                                    );
+                                }
 
-                        if y_start < y_bottom_tiles {
-                            let mut row_top_x = p0_x + (f32::from(y_start) - p0_y) * x_slope;
-                            for y_idx in y_start..y_bottom_tiles {
-                                let y = f32::from(y_idx);
-                                // Note: We purposefully don't precompute it once
-                                // and just increment by `x_slope` after every iteration
-                                // to avoid errors due to floating point inaccuracies.
-                                let row_bottom_x = if line_bottom_y < y + 1.0 {
-                                    line_bottom_x
+                                let y_start = if is_start_culled {
+                                    y_top_tiles
                                 } else {
-                                    p0_x + (y + 1.0 - p0_y) * x_slope
+                                    y_top_tiles + 1
                                 };
-                                push_row_extents(
-                                    &mut self.tile_buf,
-                                    y_idx,
-                                    f32::min(row_top_x, row_bottom_x),
-                                    f32::max(row_top_x, row_bottom_x),
-                                    w_start_base,
-                                    w_end_base,
-                                    W,
-                                );
-                                row_top_x = row_bottom_x;
+
+                                for y_idx in y_start..y_bottom_tiles {
+                                    let y = f32::from(y_idx);
+                                    let row_bottom_y = (y + 1.0).min(line_bottom_y);
+                                    push_row(y_idx, y, row_bottom_y, w_start_base, w_end_base, W);
+                                }
                             }
                         }
                     } else {
-                        if !is_start_culled {
-                            let y = f32::from(y_top_tiles);
-                            let row_bottom_y = (y + 1.0).min(line_bottom_y);
-                            let mask = ((y >= line_top_y) as u32) << WINDING_SHIFT;
-                            push_row(
-                                y_top_tiles,
-                                line_top_y,
-                                row_bottom_y,
-                                w_start_base & mask,
-                                w_end_base & mask,
-                                W & mask,
-                            );
-                        }
-
-                        let y_start = if is_start_culled {
-                            y_top_tiles
-                        } else {
-                            y_top_tiles + 1
-                        };
-
-                        for y_idx in y_start..y_bottom_tiles {
-                            let y = f32::from(y_idx);
-                            let row_bottom_y = (y + 1.0).min(line_bottom_y);
-                            push_row(y_idx, y, row_bottom_y, w_start_base, w_end_base, W);
-                        }
+                        // Case line is fully contained within a single tile: These also cannot cross edges!
+                        let tile = Tile::new_clamped(
+                            (line_left_x as u16).min(tile_columns + 1),
+                            y_top_tiles,
+                            line_idx,
+                            ((f32::from(y_top_tiles) >= line_top_y) as u32) << WINDING_SHIFT,
+                        );
+                        self.tile_buf.push(tile);
                     }
                 }
-            } else {
-                // Case line is fully contained within a single tile: These also cannot cross edges!
-                let tile = Tile::new_clamped(
-                    (line_left_x as u16).min(tile_columns + 1),
-                    y_top_tiles,
-                    line_idx,
-                    ((f32::from(y_top_tiles) >= line_top_y) as u32) << WINDING_SHIFT,
-                );
-                self.tile_buf.push(tile);
-            }
-        }
 
-        self.windings.culled
+                self.windings.culled
+            },
+        )
     }
 
     /// Generates tile commands for MSAA (Multisample Anti-Aliasing) rasterization.
diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
index 22fb125f90..16fbb2ff35 100644
--- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs
@@ -358,6 +358,7 @@ impl MultiThreadedDispatcher {
         }
     }
 
+    #[inline(always)]
     fn rasterize_with<S: Simd, F: FineKernel<S>>(
         &self,
         simd: S,
@@ -367,44 +368,53 @@ impl MultiThreadedDispatcher {
         encoded_paints: &[EncodedPaint],
         image_resolver: &dyn ImageResolver,
     ) {
-        let mut buffer = Regions::new(width, height, buffer);
-        let fines = ThreadLocal::new();
-        let wide = &self.wide;
-        let alpha_slots = self.alpha_storage.take();
-
-        self.thread_pool.install(|| {
-            buffer.update_regions_par(|region| {
-                let x = region.x;
-                let y = region.y;
-
-                let mut fine = fines
-                    .get_or(|| RefCell::new(Fine::<S, F>::new(simd)))
-                    .borrow_mut();
-
-                let wtile = wide.get(x, y);
-                fine.set_coords(x, y);
-
-                fine.clear(wtile.bg);
-                for cmd in &wtile.cmds {
-                    let thread_idx = match cmd {
-                        Cmd::AlphaFill(a) => Some(wide.attrs.fill[a.attrs_idx as usize].thread_idx),
-                        Cmd::ClipStrip(a) => Some(wide.attrs.clip[a.attrs_idx as usize].thread_idx),
-                        _ => None,
-                    };
-
-                    let alphas = thread_idx
-                        .map(|i| alpha_slots[i as usize].as_slice())
-                        .unwrap_or(&[]);
-                    fine.run_cmd(cmd, alphas, encoded_paints, image_resolver, &wide.attrs);
-                }
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let mut buffer = Regions::new(width, height, buffer);
+                let fines = ThreadLocal::new();
+                let wide = &self.wide;
+                let alpha_slots = self.alpha_storage.take();
+
+                self.thread_pool.install(|| {
+                    buffer.update_regions_par(|region| {
+                        let x = region.x;
+                        let y = region.y;
+
+                        let mut fine = fines
+                            .get_or(|| RefCell::new(Fine::<S, F>::new(simd)))
+                            .borrow_mut();
+
+                        let wtile = wide.get(x, y);
+                        fine.set_coords(x, y);
+
+                        fine.clear(wtile.bg);
+                        for cmd in &wtile.cmds {
+                            let thread_idx = match cmd {
+                                Cmd::AlphaFill(a) => {
+                                    Some(wide.attrs.fill[a.attrs_idx as usize].thread_idx)
+                                }
+                                Cmd::ClipStrip(a) => {
+                                    Some(wide.attrs.clip[a.attrs_idx as usize].thread_idx)
+                                }
+                                _ => None,
+                            };
+
+                            let alphas = thread_idx
+                                .map(|i| alpha_slots[i as usize].as_slice())
+                                .unwrap_or(&[]);
+                            fine.run_cmd(cmd, alphas, encoded_paints, image_resolver, &wide.attrs);
+                        }
 
-                fine.pack(region);
-            });
-        });
+                        fine.pack(region);
+                    });
+                });
 
-        // Don't forget to put back the alpha buffers, so that they can be re-used in
-        // the next path rendering iteration!
-        self.alpha_storage.init(alpha_slots);
+                // Don't forget to put back the alpha buffers, so that they can be re-used in
+                // the next path rendering iteration!
+                self.alpha_storage.init(alpha_slots);
+            },
+        );
     }
 }
 
diff --git a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
index 2a08d4b1b2..e76a8eeba1 100644
--- a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
+++ b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs
@@ -126,6 +126,7 @@ impl SingleThreadedDispatcher {
     ///
     /// If the scene contains filter effects, uses the filter-aware path which maintains
     /// intermediate layer buffers. Otherwise, uses the simpler direct rasterization path.
+    #[inline(always)]
     fn rasterize_with<S: Simd, F: FineKernel<S>>(
         &self,
         simd: S,
@@ -135,30 +136,35 @@ impl SingleThreadedDispatcher {
         encoded_paints: &[EncodedPaint],
         image_resolver: &dyn ImageResolver,
     ) {
-        let mut layer_manager = LayerManager::new();
-
-        if self.has_filters() {
-            // Use filter-aware path that maintains layer buffers for filter effects.
-            self.rasterize_with_filters::<S, F>(
-                simd,
-                buffer,
-                width,
-                height,
-                encoded_paints,
-                image_resolver,
-                &mut layer_manager,
-            );
-        } else {
-            // Use simple direct rasterization for scenes without filters.
-            self.rasterize_simple::<S, F>(
-                simd,
-                buffer,
-                width,
-                height,
-                encoded_paints,
-                image_resolver,
-            );
-        }
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let mut layer_manager = LayerManager::new();
+
+                if self.has_filters() {
+                    // Use filter-aware path that maintains layer buffers for filter effects.
+                    self.rasterize_with_filters::<S, F>(
+                        simd,
+                        buffer,
+                        width,
+                        height,
+                        encoded_paints,
+                        image_resolver,
+                        &mut layer_manager,
+                    );
+                } else {
+                    // Use simple direct rasterization for scenes without filters.
+                    self.rasterize_simple::<S, F>(
+                        simd,
+                        buffer,
+                        width,
+                        height,
+                        encoded_paints,
+                        image_resolver,
+                    );
+                }
+            },
+        );
     }
 
     /// Rasterizes a scene with filter effects using dependency-ordered execution.
@@ -171,6 +177,7 @@ impl SingleThreadedDispatcher {
     /// # Render Graph Execution
     /// - `FilterLayer` nodes: Render to intermediate buffer, apply filter, store result.
     /// - `RootLayer` node: Final composition to output buffer.
+    #[inline(always)]
     fn rasterize_with_filters<S: Simd, F: FineKernel<S>>(
         &self,
         simd: S,
@@ -181,30 +188,36 @@ impl SingleThreadedDispatcher {
         image_resolver: &dyn ImageResolver,
         layer_manager: &mut LayerManager,
     ) {
-        let mut fine = Fine::<S, F>::new(simd);
-
-        // Process nodes in dependency order (filtered layers before their consumers).
-        for node_id in self.render_graph.execution_order() {
-            let node = &self.render_graph.nodes[node_id];
-
-            match &node.kind {
-                RenderNodeKind::FilterLayer {
-                    layer_id,
-                    filter,
-                    wtile_bbox,
-                    transform,
-                } => {
-                    // Allocate intermediate buffer for this filtered layer.
-                    let bbox_width = wtile_bbox.width_px();
-                    let bbox_height = wtile_bbox.height_px();
-                    let mut pixmap = Pixmap::new(bbox_width, bbox_height);
-                    // TODO: Re-use this allocation by adding a .configure() or similar method
-                    // to avoid allocating the internal Vec<Region> on every filtered layer.
-                    let mut regions =
-                        Regions::new(bbox_width, bbox_height, pixmap.data_as_u8_slice_mut());
-
-                    // Render each tile in the layer's bounding box.
-                    regions.update_regions(|region| {
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let mut fine = Fine::<S, F>::new(simd);
+
+                // Process nodes in dependency order (filtered layers before their consumers).
+                for node_id in self.render_graph.execution_order() {
+                    let node = &self.render_graph.nodes[node_id];
+
+                    match &node.kind {
+                        RenderNodeKind::FilterLayer {
+                            layer_id,
+                            filter,
+                            wtile_bbox,
+                            transform,
+                        } => {
+                            // Allocate intermediate buffer for this filtered layer.
+                            let bbox_width = wtile_bbox.width_px();
+                            let bbox_height = wtile_bbox.height_px();
+                            let mut pixmap = Pixmap::new(bbox_width, bbox_height);
+                            // TODO: Re-use this allocation by adding a .configure() or similar method
+                            // to avoid allocating the internal Vec<Region> on every filtered layer.
+                            let mut regions = Regions::new(
+                                bbox_width,
+                                bbox_height,
+                                pixmap.data_as_u8_slice_mut(),
+                            );
+
+                            // Render each tile in the layer's bounding box.
+                            regions.update_regions(|region| {
                         // Convert region-local coords to global wtile coords.
                         let x = wtile_bbox.x0() + region.x;
                         let y = wtile_bbox.y0() + region.y;
@@ -229,23 +242,23 @@ impl SingleThreadedDispatcher {
                         fine.pack(region);
                     });
 
-                    // Apply the filter effect to the completed layer.
-                    fine.filter_layer(&mut pixmap, filter, layer_manager, *transform);
+                            // Apply the filter effect to the completed layer.
+                            fine.filter_layer(&mut pixmap, filter, layer_manager, *transform);
 
-                    // Save the filtered pixmap to disk for debugging.
-                    // #[cfg(all(debug_assertions, feature = "std", feature = "png"))]
-                    // save_filtered_layer_debug(&pixmap, *layer_id);
+                            // Save the filtered pixmap to disk for debugging.
+                            // #[cfg(all(debug_assertions, feature = "std", feature = "png"))]
+                            // save_filtered_layer_debug(&pixmap, *layer_id);
 
-                    // Store the filtered result for use by dependent layers.
-                    layer_manager.register_layer(*layer_id, *wtile_bbox, pixmap);
-                }
-                RenderNodeKind::RootLayer {
-                    layer_id,
-                    wtile_bbox: _,
-                } => {
-                    // Final composition directly to output buffer.
-                    let mut regions = Regions::new(width, height, buffer);
-                    regions.update_regions(|region| {
+                            // Store the filtered result for use by dependent layers.
+                            layer_manager.register_layer(*layer_id, *wtile_bbox, pixmap);
+                        }
+                        RenderNodeKind::RootLayer {
+                            layer_id,
+                            wtile_bbox: _,
+                        } => {
+                            // Final composition directly to output buffer.
+                            let mut regions = Regions::new(width, height, buffer);
+                            regions.update_regions(|region| {
                         // Use the background color from the wide tile.
                         let bg = self.wide.get(region.x, region.y).bg;
                         self.process_layer_tile(
@@ -267,9 +280,11 @@ impl SingleThreadedDispatcher {
 
                         fine.pack(region);
                     });
+                        }
+                    }
                 }
-            }
-        }
+            },
+        );
     }
 
     /// Processes all rendering commands for a single layer within a specific tile.
@@ -288,6 +303,7 @@ impl SingleThreadedDispatcher {
     /// * `layer_manager` - Storage for filtered layer buffers.
     /// * `encoded_paints` - Paint definitions for the scene.
     /// * `image_resolver` - Resolver for looking up opaque image IDs.
+    #[inline(always)]
     fn process_layer_tile<S: Simd, F: FineKernel<S>>(
         &self,
         fine: &mut Fine<S, F>,
@@ -299,89 +315,96 @@ impl SingleThreadedDispatcher {
         encoded_paints: &[EncodedPaint],
         image_resolver: &dyn ImageResolver,
     ) {
-        let wtile = &self.wide.get(x, y);
-        fine.set_coords(x, y);
-        fine.clear(clear_color);
-
-        // Process all commands in this layer's render range.
-        // It can happen that the layer has no associated ranges in this wide tile in
-        // case they have been cleared by setting a new wide tile background, for example
-        // when filling a full-tile opaque solid color.
-        let Some(ranges) = wtile.layer_cmd_ranges.get(&layer_id) else {
-            return;
-        };
-
-        let mut cmd_idx = ranges.render_range.start;
-        while cmd_idx < ranges.render_range.end {
-            let cmd: &Cmd = &wtile.cmds[cmd_idx];
-
-            fine.run_cmd(
-                cmd,
-                &self.strip_storage.alphas,
-                encoded_paints,
-                image_resolver,
-                &self.wide.attrs,
-            );
-
-            // Special handling for filtered layer composition.
-            // Filtered layers have already been rendered and stored in layer_manager.
-            // Here we composite them into the current buffer, with special handling for clipping.
-            if let Cmd::PushBuf(LayerKind::Filtered(child_layer_id), _) = cmd {
-                // Unlike above, the unwrap is safe here because as long as the filtered layer
-                // is referenced in the wide tile, it must have associated layer ranges.
-                let filtered_ranges = wtile.layer_cmd_ranges.get(child_layer_id).unwrap();
-
-                // Check what comes after the filtered layer push to determine clipping state
-                match wtile.cmds.get(cmd_idx + 1) {
-                    // Zero-clip region: tile is completely outside the clip path.
-                    // The layer was already rendered for filtering, but we skip compositing
-                    // since this tile is entirely clipped out.
-                    // (PushZeroClip only appears for clipped filter layers)
-                    // See https://github.com/linebender/vello/pull/1541/ for why we
-                    // add the ID check.
-                    Some(Cmd::PushZeroClip(id)) if *id == *child_layer_id => {
-                        // If we have a zero-clip, it means that the whole layer should not be drawn.
-                        // Therefore, we want to skip to the very end so that only `PopBuf` will
-                        // be run. Therefore, we jump to `filtered_ranges.full_range.end - 1`.
-                        cmd_idx = filtered_ranges.full_range.end - 1;
-                        continue;
-                    }
-
-                    // Partial clip: push the clip buffer, then composite the filtered layer
-                    Some(Cmd::PushBuf(LayerKind::Clip(id), _)) if *id == *child_layer_id => {
-                        fine.run_cmd(
-                            &wtile.cmds[cmd_idx + 1],
-                            &self.strip_storage.alphas,
-                            encoded_paints,
-                            image_resolver,
-                            &self.wide.attrs,
-                        );
-                        cmd_idx += 1;
-
-                        if let Some(mut region) =
-                            layer_manager.layer_tile_region_mut(*child_layer_id, x, y)
-                        {
-                            fine.unpack(&mut region);
+        fine.simd.vectorize(
+            #[inline(always)]
+            || {
+                let wtile = &self.wide.get(x, y);
+                fine.set_coords(x, y);
+                fine.clear(clear_color);
+
+                // Process all commands in this layer's render range.
+                // It can happen that the layer has no associated ranges in this wide tile in
+                // case they have been cleared by setting a new wide tile background, for example
+                // when filling a full-tile opaque solid color.
+                let Some(ranges) = wtile.layer_cmd_ranges.get(&layer_id) else {
+                    return;
+                };
+
+                let mut cmd_idx = ranges.render_range.start;
+                while cmd_idx < ranges.render_range.end {
+                    let cmd: &Cmd = &wtile.cmds[cmd_idx];
+
+                    fine.run_cmd(
+                        cmd,
+                        &self.strip_storage.alphas,
+                        encoded_paints,
+                        image_resolver,
+                        &self.wide.attrs,
+                    );
+
+                    // Special handling for filtered layer composition.
+                    // Filtered layers have already been rendered and stored in layer_manager.
+                    // Here we composite them into the current buffer, with special handling for clipping.
+                    if let Cmd::PushBuf(LayerKind::Filtered(child_layer_id), _) = cmd {
+                        // Unlike above, the unwrap is safe here because as long as the filtered layer
+                        // is referenced in the wide tile, it must have associated layer ranges.
+                        let filtered_ranges = wtile.layer_cmd_ranges.get(child_layer_id).unwrap();
+
+                        // Check what comes after the filtered layer push to determine clipping state
+                        match wtile.cmds.get(cmd_idx + 1) {
+                            // Zero-clip region: tile is completely outside the clip path.
+                            // The layer was already rendered for filtering, but we skip compositing
+                            // since this tile is entirely clipped out.
+                            // (PushZeroClip only appears for clipped filter layers)
+                            // See https://github.com/linebender/vello/pull/1541/ for why we
+                            // add the ID check.
+                            Some(Cmd::PushZeroClip(id)) if *id == *child_layer_id => {
+                                // A zero-clip means that the whole layer should not be drawn.
+                                // We therefore skip to the very end so that only `PopBuf` will
+                                // be run, i.e. we jump to `filtered_ranges.full_range.end - 1`.
+                                cmd_idx = filtered_ranges.full_range.end - 1;
+                                continue;
+                            }
+
+                            // Partial clip: push the clip buffer, then composite the filtered layer
+                            Some(Cmd::PushBuf(LayerKind::Clip(id), _))
+                                if *id == *child_layer_id =>
+                            {
+                                fine.run_cmd(
+                                    &wtile.cmds[cmd_idx + 1],
+                                    &self.strip_storage.alphas,
+                                    encoded_paints,
+                                    image_resolver,
+                                    &self.wide.attrs,
+                                );
+                                cmd_idx += 1;
+
+                                if let Some(mut region) =
+                                    layer_manager.layer_tile_region_mut(*child_layer_id, x, y)
+                                {
+                                    fine.unpack(&mut region);
+                                }
+                            }
+
+                            // No clip or fully inside clip: composite the filtered layer directly
+                            _ => {
+                                if let Some(mut region) =
+                                    layer_manager.layer_tile_region_mut(*child_layer_id, x, y)
+                                {
+                                    fine.unpack(&mut region);
+                                }
+                            }
                         }
-                    }
 
-                    // No clip or fully inside clip: composite the filtered layer directly
-                    _ => {
-                        if let Some(mut region) =
-                            layer_manager.layer_tile_region_mut(*child_layer_id, x, y)
-                        {
-                            fine.unpack(&mut region);
-                        }
+                        // Skip past the filtered layer's internal commands, as they were already
+                        // rendered when the FilterLayer node was processed earlier.
+                        cmd_idx = filtered_ranges.render_range.end.max(cmd_idx + 1);
+                    } else {
+                        cmd_idx += 1;
                     }
                 }
-
-                // Skip past the filtered layer's internal commands, as they were already
-                // rendered when the FilterLayer node was processed earlier.
-                cmd_idx = filtered_ranges.render_range.end.max(cmd_idx + 1);
-            } else {
-                cmd_idx += 1;
-            }
-        }
+            },
+        );
     }
 
     /// Simple rasterization path for scenes without filter effects.
@@ -389,6 +412,7 @@ impl SingleThreadedDispatcher {
     /// This directly processes each tile's commands without maintaining intermediate
     /// layer buffers. All rendering happens in a single pass directly to the output buffer.
     /// This is more efficient than the filter-aware path when no filters are present.
+    #[inline(always)]
     fn rasterize_simple<S: Simd, F: FineKernel<S>>(
         &self,
         simd: S,
@@ -398,30 +422,35 @@ impl SingleThreadedDispatcher {
         encoded_paints: &[EncodedPaint],
         image_resolver: &dyn ImageResolver,
     ) {
-        let mut regions = Regions::new(width, height, buffer);
-        let mut fine = Fine::<S, F>::new(simd);
-
-        regions.update_regions(|region| {
-            let x = region.x;
-            let y = region.y;
-
-            let wtile = self.wide.get(x, y);
-            fine.set_coords(x, y);
-
-            // Clear to background and process all commands in order.
-            fine.clear(wtile.bg);
-            for cmd in &wtile.cmds {
-                fine.run_cmd(
-                    cmd,
-                    &self.strip_storage.alphas,
-                    encoded_paints,
-                    image_resolver,
-                    &self.wide.attrs,
-                );
-            }
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let mut regions = Regions::new(width, height, buffer);
+                let mut fine = Fine::<S, F>::new(simd);
+
+                regions.update_regions(|region| {
+                    let x = region.x;
+                    let y = region.y;
+
+                    let wtile = self.wide.get(x, y);
+                    fine.set_coords(x, y);
+
+                    // Clear to background and process all commands in order.
+                    fine.clear(wtile.bg);
+                    for cmd in &wtile.cmds {
+                        fine.run_cmd(
+                            cmd,
+                            &self.strip_storage.alphas,
+                            encoded_paints,
+                            image_resolver,
+                            &self.wide.attrs,
+                        );
+                    }
 
-            fine.pack(region);
-        });
+                    fine.pack(region);
+                });
+            },
+        );
     }
 
     /// Returns true if the scene contains any filter effects.
@@ -475,6 +504,7 @@ impl SingleThreadedDispatcher {
     ///
     /// Composites tiles sequentially, writing directly to the destination buffer
     /// at the specified offset.
+    #[inline(always)]
     fn composite_at_offset_with<S: Simd, F: FineKernel<S>>(
         &self,
         simd: S,
@@ -488,38 +518,43 @@ impl SingleThreadedDispatcher {
         encoded_paints: &[EncodedPaint],
         image_resolver: &dyn ImageResolver,
     ) {
-        let mut regions = Regions::new_at_offset(
-            width,
-            height,
-            dst_x,
-            dst_y,
-            dst_buffer_width,
-            dst_buffer_height,
-            buffer,
-        );
-        let mut fine = Fine::<S, F>::new(simd);
-
-        regions.update_regions(|region| {
-            let x = region.x;
-            let y = region.y;
-
-            let wtile = self.wide.get(x, y);
-            fine.set_coords(x, y);
-
-            // Unpack existing pixel data from the region instead of clearing,
-            // so that rendering composites onto the existing pixmap contents.
-            fine.unpack(region);
-            for cmd in &wtile.cmds {
-                fine.run_cmd(
-                    cmd,
-                    &self.strip_storage.alphas,
-                    encoded_paints,
-                    image_resolver,
-                    &self.wide.attrs,
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let mut regions = Regions::new_at_offset(
+                    width,
+                    height,
+                    dst_x,
+                    dst_y,
+                    dst_buffer_width,
+                    dst_buffer_height,
+                    buffer,
                 );
-            }
-            fine.pack(region);
-        });
+                let mut fine = Fine::<S, F>::new(simd);
+
+                regions.update_regions(|region| {
+                    let x = region.x;
+                    let y = region.y;
+
+                    let wtile = self.wide.get(x, y);
+                    fine.set_coords(x, y);
+
+                    // Unpack existing pixel data from the region instead of clearing,
+                    // so that rendering composites onto the existing pixmap contents.
+                    fine.unpack(region);
+                    for cmd in &wtile.cmds {
+                        fine.run_cmd(
+                            cmd,
+                            &self.strip_storage.alphas,
+                            encoded_paints,
+                            image_resolver,
+                            &self.wide.attrs,
+                        );
+                    }
+                    fine.pack(region);
+                });
+            },
+        );
     }
 }
 
diff --git a/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs b/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs
index 60cdbf8518..4450295e78 100644
--- a/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs
@@ -14,6 +14,7 @@ pub(crate) mod sweep;
 
 const GRADIENT_INVALID_POS: u32 = u32::MAX;
 
+#[inline(always)]
 pub(crate) fn calculate_t_vals<S: Simd, U: SimdGradientKind<S>>(
     simd: S,
     kind: U,
diff --git a/sparse_strips/vello_cpu/src/fine/common/image.rs b/sparse_strips/vello_cpu/src/fine/common/image.rs
index dad257b071..a9fbb9dd64 100644
--- a/sparse_strips/vello_cpu/src/fine/common/image.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/image.rs
@@ -457,6 +457,7 @@ pub(crate) fn extend<S: Simd>(
 }
 
 /// Calculate the weights for a single fractional value.
+#[inline(always)]
 fn weights<S: Simd>(simd: S, fract: f32x4<S>) -> [f32x4<S>; 4] {
     simd.vectorize(
         #[inline(always)]
diff --git a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs
index dc84f22a00..d07293998d 100644
--- a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs
+++ b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs
@@ -210,40 +210,52 @@ impl<S: Simd> SimdRoundedBlurredRect<S> {
 trait FloatExt<S: Simd> {
     // See https://raphlinus.github.io/audio/2018/09/05/sigmoid.html for a little
     // explanation of this approximation to the erf function.
-    // Doing `inline(always)` seems to reduce performance for some reason.
+    // Keep an explicit `vectorize` cut point in the implementation; forcing this whole body to
+    // inline regresses performance.
     /// Approximate the erf function.
     fn compute_erf7(simd: S, x: Self) -> Self;
     fn powf(self, x: f32) -> Self;
 }
 
 impl<S: Simd> FloatExt<S> for f32x8<S> {
+    #[inline(always)]
     fn compute_erf7(simd: S, x: Self) -> Self {
-        // Clamp `x`, because for large `x` the terms here become `inf`, causing the result to be 0 or
-        // `NaN`. This clamping doesn't lose any information, because `erf(±10) ≈ 1` well within `f64`
-        // machine precision, let alone `f32`.
-        let x = x.max(Self::splat(simd, -10.0)).min(Self::splat(simd, 10.0));
-        let x = x * Self::splat(simd, core::f32::consts::FRAC_2_SQRT_PI);
-        let xx = x * x;
-        let p1 = Self::splat(simd, 0.0104).mul_add(xx, Self::splat(simd, 0.03395));
-        let p2 = p1.mul_add(xx, Self::splat(simd, 0.24295));
-        let p3 = x * xx;
-        let x = p2.mul_add(p3, x);
-        let denom = x.mul_add(x, Self::splat(simd, 1.0)).sqrt();
-        x / denom
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                // Clamp `x`, because for large `x` the terms here become `inf`, causing the result to be 0 or
+                // `NaN`. This clamping doesn't lose any information, because `erf(±10) ≈ ±1` well within `f64`
+                // machine precision, let alone `f32`.
+                let x = x.max(Self::splat(simd, -10.0)).min(Self::splat(simd, 10.0));
+                let x = x * Self::splat(simd, core::f32::consts::FRAC_2_SQRT_PI);
+                let xx = x * x;
+                let p1 = Self::splat(simd, 0.0104).mul_add(xx, Self::splat(simd, 0.03395));
+                let p2 = p1.mul_add(xx, Self::splat(simd, 0.24295));
+                let p3 = x * xx;
+                let x = p2.mul_add(p3, x);
+                let denom = x.mul_add(x, Self::splat(simd, 1.0)).sqrt();
+                x / denom
+            },
+        )
     }
 
-    #[inline]
+    #[inline(always)]
     fn powf(mut self, x: f32) -> Self {
-        // TODO: SIMD
-        self[0] = self[0].powf(x);
-        self[1] = self[1].powf(x);
-        self[2] = self[2].powf(x);
-        self[3] = self[3].powf(x);
-        self[4] = self[4].powf(x);
-        self[5] = self[5].powf(x);
-        self[6] = self[6].powf(x);
-        self[7] = self[7].powf(x);
-
-        self
+        self.simd.vectorize(
+            #[inline(always)]
+            || {
+                // TODO: SIMD
+                self[0] = self[0].powf(x);
+                self[1] = self[1].powf(x);
+                self[2] = self[2].powf(x);
+                self[3] = self[3].powf(x);
+                self[4] = self[4].powf(x);
+                self[5] = self[5].powf(x);
+                self[6] = self[6].powf(x);
+                self[7] = self[7].powf(x);
+
+                self
+            },
+        )
     }
 }
diff --git a/sparse_strips/vello_cpu/src/fine/highp/blend.rs b/sparse_strips/vello_cpu/src/fine/highp/blend.rs
index cb2174da11..eb3a252e5e 100644
--- a/sparse_strips/vello_cpu/src/fine/highp/blend.rs
+++ b/sparse_strips/vello_cpu/src/fine/highp/blend.rs
@@ -25,55 +25,62 @@ impl<S: Simd> Channels<S> {
 
 // TODO: blending is still extremely slow, investigate whether there is something obvious we are
 // missing that other renderers do.
+#[inline(always)]
 pub(crate) fn mix<S: Simd>(src_c: f32x16<S>, bg: f32x16<S>, blend_mode: BlendMode) -> f32x16<S> {
-    if matches!(blend_mode.mix, Mix::Normal) {
-        return src_c;
-    }
-    // See https://www.w3.org/TR/compositing-1/#blending
-    let simd = src_c.simd;
-
-    let split = |input: f32x16<S>| {
-        let mut storage = [0.0; 16];
-        simd.store_interleaved_128_f32x16(input, &mut storage);
-        let input_v = f32x16::from_slice(simd, &storage);
-
-        let p1 = simd.split_f32x16(input_v);
-        let (r, g) = simd.split_f32x8(p1.0);
-        let (b, a) = simd.split_f32x8(p1.1);
-
-        (Channels { r, g, b }, a)
-    };
-
-    let (bg_channels, bg_a) = split(bg);
-    let (src_channels, src_a) = split(src_c);
-
-    let unpremultiplied_bg = bg_channels.unpremultiply(bg_a);
-    let unpremultiplied_src = src_channels.unpremultiply(src_a);
-
-    let mut res_bg = unpremultiplied_bg;
-    let mix_src = blend_mode.mix(unpremultiplied_src, unpremultiplied_bg);
-
-    let apply_alpha = |unpremultiplied_src_channel: f32x4<S>,
-                       mix_src_channel: f32x4<S>,
-                       dest_channel: &mut f32x4<S>| {
-        let p1 = (1.0 - bg_a) * unpremultiplied_src_channel;
-        let p2 = bg_a * mix_src_channel;
-
-        *dest_channel = (p1 + p2).premultiply(src_a);
-    };
-
-    apply_alpha(unpremultiplied_src.r, mix_src.r, &mut res_bg.r);
-    apply_alpha(unpremultiplied_src.g, mix_src.g, &mut res_bg.g);
-    apply_alpha(unpremultiplied_src.b, mix_src.b, &mut res_bg.b);
-
-    let combined = simd.combine_f32x8(
-        simd.combine_f32x4(res_bg.r, res_bg.g),
-        simd.combine_f32x4(res_bg.b, src_a),
-    );
-
-    let mut storage = [0.0; 16];
-    simd.store_interleaved_128_f32x16(combined, &mut storage);
-    f32x16::from_slice(simd, &storage)
+    src_c.simd.vectorize(
+        #[inline(always)]
+        || {
+            if matches!(blend_mode.mix, Mix::Normal) {
+                src_c
+            } else {
+                // See https://www.w3.org/TR/compositing-1/#blending
+                let simd = src_c.simd;
+
+                let split = |input: f32x16<S>| {
+                    let mut storage = [0.0; 16];
+                    simd.store_interleaved_128_f32x16(input, &mut storage);
+                    let input_v = f32x16::from_slice(simd, &storage);
+
+                    let p1 = simd.split_f32x16(input_v);
+                    let (r, g) = simd.split_f32x8(p1.0);
+                    let (b, a) = simd.split_f32x8(p1.1);
+
+                    (Channels { r, g, b }, a)
+                };
+
+                let (bg_channels, bg_a) = split(bg);
+                let (src_channels, src_a) = split(src_c);
+
+                let unpremultiplied_bg = bg_channels.unpremultiply(bg_a);
+                let unpremultiplied_src = src_channels.unpremultiply(src_a);
+
+                let mut res_bg = unpremultiplied_bg;
+                let mix_src = blend_mode.mix(unpremultiplied_src, unpremultiplied_bg);
+
+                let apply_alpha = |unpremultiplied_src_channel: f32x4<S>,
+                                   mix_src_channel: f32x4<S>,
+                                   dest_channel: &mut f32x4<S>| {
+                    let p1 = (1.0 - bg_a) * unpremultiplied_src_channel;
+                    let p2 = bg_a * mix_src_channel;
+
+                    *dest_channel = (p1 + p2).premultiply(src_a);
+                };
+
+                apply_alpha(unpremultiplied_src.r, mix_src.r, &mut res_bg.r);
+                apply_alpha(unpremultiplied_src.g, mix_src.g, &mut res_bg.g);
+                apply_alpha(unpremultiplied_src.b, mix_src.b, &mut res_bg.b);
+
+                let combined = simd.combine_f32x8(
+                    simd.combine_f32x4(res_bg.r, res_bg.g),
+                    simd.combine_f32x4(res_bg.b, src_a),
+                );
+
+                let mut storage = [0.0; 16];
+                simd.store_interleaved_128_f32x16(combined, &mut storage);
+                f32x16::from_slice(simd, &storage)
+            }
+        },
+    )
 }
 
 trait MixExt {
@@ -81,25 +88,29 @@ trait MixExt {
 }
 
 impl MixExt for BlendMode {
+    #[inline(always)]
     fn mix<S: Simd>(&self, src: Channels<S>, bg: Channels<S>) -> Channels<S> {
-        match self.mix {
-            Mix::Normal => src,
-            Mix::Multiply => Multiply::mix(src, bg),
-            Mix::Screen => Screen::mix(src, bg),
-            Mix::Overlay => Overlay::mix(src, bg),
-            Mix::Darken => Darken::mix(src, bg),
-            Mix::Lighten => Lighten::mix(src, bg),
-            Mix::ColorDodge => ColorDodge::mix(src, bg),
-            Mix::ColorBurn => ColorBurn::mix(src, bg),
-            Mix::HardLight => HardLight::mix(src, bg),
-            Mix::SoftLight => SoftLight::mix(src, bg),
-            Mix::Difference => Difference::mix(src, bg),
-            Mix::Exclusion => Exclusion::mix(src, bg),
-            Mix::Luminosity => Luminosity::mix(src, bg),
-            Mix::Color => Color::mix(src, bg),
-            Mix::Hue => Hue::mix(src, bg),
-            Mix::Saturation => Saturation::mix(src, bg),
-        }
+        src.r.simd.vectorize(
+            #[inline(always)]
+            || match self.mix {
+                Mix::Normal => src,
+                Mix::Multiply => Multiply::mix(src, bg),
+                Mix::Screen => Screen::mix(src, bg),
+                Mix::Overlay => Overlay::mix(src, bg),
+                Mix::Darken => Darken::mix(src, bg),
+                Mix::Lighten => Lighten::mix(src, bg),
+                Mix::ColorDodge => ColorDodge::mix(src, bg),
+                Mix::ColorBurn => ColorBurn::mix(src, bg),
+                Mix::HardLight => HardLight::mix(src, bg),
+                Mix::SoftLight => SoftLight::mix(src, bg),
+                Mix::Difference => Difference::mix(src, bg),
+                Mix::Exclusion => Exclusion::mix(src, bg),
+                Mix::Luminosity => Luminosity::mix(src, bg),
+                Mix::Color => Color::mix(src, bg),
+                Mix::Hue => Hue::mix(src, bg),
+                Mix::Saturation => Saturation::mix(src, bg),
+            },
+        )
     }
 }
 
@@ -118,14 +129,20 @@ impl Screen {
 }
 
 impl HardLight {
+    #[inline(always)]
     fn single<S: Simd>(src: f32x4<S>, bg: f32x4<S>) -> f32x4<S> {
-        let two = f32x4::splat(src.simd, 2.0);
+        src.simd.vectorize(
+            #[inline(always)]
+            || {
+                let two = f32x4::splat(src.simd, 2.0);
 
-        let mask = src.simd.simd_le_f32x4(src, f32x4::splat(src.simd, 0.5));
-        let opt1 = Multiply::single(bg, src * two);
-        let opt2 = Screen::single(bg, two * src - 1.0);
+                let mask = src.simd.simd_le_f32x4(src, f32x4::splat(src.simd, 0.5));
+                let opt1 = Multiply::single(bg, src * two);
+                let opt2 = Screen::single(bg, two * src - 1.0);
 
-        src.simd.select_f32x4(mask, opt1, opt2)
+                src.simd.select_f32x4(mask, opt1, opt2)
+            },
+        )
     }
 }
 
@@ -254,57 +271,84 @@ non_separable_mix!(Luminosity, |cs: &mut Channels<S>, cb: &mut Channels<S>| {
     *cb
 });
 
+#[inline(always)]
 fn lum<S: Simd>(r: f32x4<S>, g: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
-    0.3 * r + 0.59 * g + 0.11 * b
+    r.simd.vectorize(
+        #[inline(always)]
+        || 0.3 * r + 0.59 * g + 0.11 * b,
+    )
 }
 
+#[inline(always)]
 fn sat<S: Simd>(r: f32x4<S>, g: f32x4<S>, b: f32x4<S>) -> f32x4<S> {
-    r.max(g).max(b) - r.min(g).min(b)
+    r.simd.vectorize(
+        #[inline(always)]
+        || r.max(g).max(b) - r.min(g).min(b),
+    )
 }
 
+#[inline(always)]
 fn clip_color<S: Simd>(r: &mut f32x4<S>, g: &mut f32x4<S>, b: &mut f32x4<S>) {
-    let simd = r.simd;
-
-    let l = lum(*r, *g, *b);
-    let n = r.min(g.min(*b));
-    let x = r.max(g.max(*b));
-
-    for c in [r, g, b] {
-        *c = simd.select_f32x4(
-            simd.simd_lt_f32x4(n, f32x4::splat(simd, 0.0)),
-            l + (((*c - l) * l) / (l - n)),
-            *c,
-        );
-
-        *c = simd.select_f32x4(
-            simd.simd_gt_f32x4(x, f32x4::splat(simd, 1.0)),
-            l + (((*c - l) * (1.0 - l)) / (x - l)),
-            *c,
-        );
-    }
+    r.simd.vectorize(
+        #[inline(always)]
+        || {
+            let simd = r.simd;
+
+            let l = lum(*r, *g, *b);
+            let n = r.min(g.min(*b));
+            let x = r.max(g.max(*b));
+
+            for c in [r, g, b] {
+                *c = simd.select_f32x4(
+                    simd.simd_lt_f32x4(n, f32x4::splat(simd, 0.0)),
+                    l + (((*c - l) * l) / (l - n)),
+                    *c,
+                );
+
+                *c = simd.select_f32x4(
+                    simd.simd_gt_f32x4(x, f32x4::splat(simd, 1.0)),
+                    l + (((*c - l) * (1.0 - l)) / (x - l)),
+                    *c,
+                );
+            }
+        },
+    );
 }
 
+#[inline(always)]
 fn set_lum<S: Simd>(r: &mut f32x4<S>, g: &mut f32x4<S>, b: &mut f32x4<S>, l: f32x4<S>) {
-    let d = l - lum(*r, *g, *b);
-    *r += d;
-    *g += d;
-    *b += d;
-
-    clip_color(r, g, b);
+    r.simd.vectorize(
+        #[inline(always)]
+        || {
+            let d = l - lum(*r, *g, *b);
+            *r += d;
+            *g += d;
+            *b += d;
+
+            clip_color(r, g, b);
+        },
+    );
 }
 
 // Adapted from tiny-skia
+#[inline(always)]
 fn set_sat<S: Simd>(r: &mut f32x4<S>, g: &mut f32x4<S>, b: &mut f32x4<S>, s: f32x4<S>) {
-    let simd = r.simd;
-    let zero = f32x4::splat(simd, 0.0);
-    let mn = r.min(g.min(*b));
-    let mx = r.max(g.max(*b));
-    let sat = mx - mn;
-
-    // Map min channel to 0, max channel to s, and scale the middle proportionally.
-    let scale = |c| simd.select_f32x4(simd.simd_eq_f32x4(sat, zero), zero, (c - mn) * s / sat);
-
-    *r = scale(*r);
-    *g = scale(*g);
-    *b = scale(*b);
+    r.simd.vectorize(
+        #[inline(always)]
+        || {
+            let simd = r.simd;
+            let zero = f32x4::splat(simd, 0.0);
+            let mn = r.min(g.min(*b));
+            let mx = r.max(g.max(*b));
+            let sat = mx - mn;
+
+            // Map min channel to 0, max channel to s, and scale the middle proportionally.
+            let scale =
+                |c| simd.select_f32x4(simd.simd_eq_f32x4(sat, zero), zero, (c - mn) * s / sat);
+
+            *r = scale(*r);
+            *g = scale(*g);
+            *b = scale(*b);
+        },
+    );
 }
diff --git a/sparse_strips/vello_cpu/src/fine/highp/compose.rs b/sparse_strips/vello_cpu/src/fine/highp/compose.rs
index 32983c9abe..fa9c0f00bf 100644
--- a/sparse_strips/vello_cpu/src/fine/highp/compose.rs
+++ b/sparse_strips/vello_cpu/src/fine/highp/compose.rs
@@ -16,6 +16,7 @@ pub(crate) trait ComposeExt {
 }
 
 impl ComposeExt for BlendMode {
+    #[inline(always)]
     fn compose<S: Simd>(
         &self,
         simd: S,
@@ -23,47 +24,52 @@ impl ComposeExt for BlendMode {
         bg_c: f32x16<S>,
         alpha_mask: Option<f32x16<S>>,
     ) -> f32x16<S> {
-        // There some non-obvious subtleties worth highlighting here.
-        // We support two kinds of blending (in this case, we focus on compositing specifically):
-        // - Isolated blending, where layers as a whole are blended together with their backdrop.
-        //   If we are currently performing this kind of blending, `alpha_mask` will always be `None`.
-        //   After all, there is no concrete shape opacity associated with a layer. Instead, we are
-        //   just compositing the RGBA values at _all_ positions of the source layer with the backdrop
-        //   layer. For example, if the backdrop contains a green rectangle and source layer is just
-        //   empty, if we perform blending with `Compose::Clear`, then _everything_ will be cleared,
-        //   because we are compositing the whole source layer with the whole backdrop, and not
-        //   just the parts of the source layer that have actually be drawn on.
-        // - Non-isolated blending, where a single path is blended with the backdrop. In this case,
-        //   `alpha_mask` _might_ be `Some` and contain the alpha values of the strips we are currently
-        //   compositing. Remember that strips always have a fixed height of 4, because of this, the
-        //   strips might cover areas that aren't actually covered by the path (and just have an alpha
-        //   value of 0, or a value between 0-254 for anti-aliased parts). Because of this, for
-        //   non-isolated blending, we need to lerp the result with the backdrop using `alpha_mask`.
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                // There are some non-obvious subtleties worth highlighting here.
+                // We support two kinds of blending (in this case, we focus on compositing specifically):
+                // - Isolated blending, where layers as a whole are blended together with their backdrop.
+                //   If we are currently performing this kind of blending, `alpha_mask` will always be `None`.
+                //   After all, there is no concrete shape opacity associated with a layer. Instead, we are
+                //   just compositing the RGBA values at _all_ positions of the source layer with the backdrop
+                //   layer. For example, if the backdrop contains a green rectangle and source layer is just
+                //   empty, if we perform blending with `Compose::Clear`, then _everything_ will be cleared,
+                //   because we are compositing the whole source layer with the whole backdrop, and not
+                //   just the parts of the source layer that have actually been drawn on.
+                // - Non-isolated blending, where a single path is blended with the backdrop. In this case,
+                //   `alpha_mask` _might_ be `Some` and contain the alpha values of the strips we are currently
+                //   compositing. Remember that strips always have a fixed height of 4; because of this, the
+                //   strips might cover areas that aren't actually covered by the path (and just have an alpha
+                //   value of 0, or a value between 0-254 for anti-aliased parts). Because of this, for
+                //   non-isolated blending, we need to lerp the result with the backdrop using `alpha_mask`.
 
-        let mut res = match self.compose {
-            Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c),
-            Compose::Clear => Clear::compose(simd, src_c, bg_c),
-            Compose::Copy => Copy::compose(simd, src_c, bg_c),
-            Compose::DestOver => DestOver::compose(simd, src_c, bg_c),
-            Compose::Dest => Dest::compose(simd, src_c, bg_c),
-            Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c),
-            Compose::DestIn => DestIn::compose(simd, src_c, bg_c),
-            Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c),
-            Compose::DestOut => DestOut::compose(simd, src_c, bg_c),
-            Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c),
-            Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c),
-            Compose::Xor => Xor::compose(simd, src_c, bg_c),
-            Compose::Plus => Plus::compose(simd, src_c, bg_c),
-            // Have not been able to find a formula for this, so just fallback to Plus.
-            Compose::PlusLighter => Plus::compose(simd, src_c, bg_c),
-        };
+                let mut res = match self.compose {
+                    Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c),
+                    Compose::Clear => Clear::compose(simd, src_c, bg_c),
+                    Compose::Copy => Copy::compose(simd, src_c, bg_c),
+                    Compose::DestOver => DestOver::compose(simd, src_c, bg_c),
+                    Compose::Dest => Dest::compose(simd, src_c, bg_c),
+                    Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c),
+                    Compose::DestIn => DestIn::compose(simd, src_c, bg_c),
+                    Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c),
+                    Compose::DestOut => DestOut::compose(simd, src_c, bg_c),
+                    Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c),
+                    Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c),
+                    Compose::Xor => Xor::compose(simd, src_c, bg_c),
+                    Compose::Plus => Plus::compose(simd, src_c, bg_c),
+                    // Have not been able to find a formula for this, so just fall back to Plus.
+                    Compose::PlusLighter => Plus::compose(simd, src_c, bg_c),
+                };
 
-        if let Some(alpha_mask) = alpha_mask {
-            let alpha_mask_inv = 1.0 - alpha_mask;
-            res = alpha_mask * res + alpha_mask_inv * bg_c;
-        }
+                if let Some(alpha_mask) = alpha_mask {
+                    let alpha_mask_inv = 1.0 - alpha_mask;
+                    res = alpha_mask * res + alpha_mask_inv * bg_c;
+                }
 
-        res
+                res
+            },
+        )
     }
 }
 
@@ -72,20 +78,26 @@ macro_rules! compose {
         struct $name;
 
         impl $name {
+            #[inline(always)]
             fn compose<S: Simd>(simd: S, src_c: f32x16<S>, bg_c: f32x16<S>) -> f32x16<S> {
-                let al_b = bg_c.splat_4th();
-                let al_s = src_c.splat_4th();
+                simd.vectorize(
+                    #[inline(always)]
+                    || {
+                        let al_b = bg_c.splat_4th();
+                        let al_s = src_c.splat_4th();
 
-                let fa = $fa(simd, al_s, al_b);
-                let fb = $fb(simd, al_s, al_b);
+                        let fa = $fa(simd, al_s, al_b);
+                        let fb = $fb(simd, al_s, al_b);
 
-                if $sat {
-                    (src_c * fa + fb * bg_c)
-                        .min(f32x16::splat(simd, 1.0))
-                        .max(f32x16::splat(simd, 0.0))
-                } else {
-                    src_c * fa + fb * bg_c
-                }
+                        if $sat {
+                            (src_c * fa + fb * bg_c)
+                                .min(f32x16::splat(simd, 1.0))
+                                .max(f32x16::splat(simd, 0.0))
+                        } else {
+                            src_c * fa + fb * bg_c
+                        }
+                    },
+                )
             }
         }
     };
diff --git a/sparse_strips/vello_cpu/src/fine/highp/mod.rs b/sparse_strips/vello_cpu/src/fine/highp/mod.rs
index 0948001bee..c7294573a7 100644
--- a/sparse_strips/vello_cpu/src/fine/highp/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/highp/mod.rs
@@ -379,18 +379,24 @@ mod fill {
     }
 
     /// Applies blend mode compositing to a buffer without per-pixel masks.
+    #[inline(always)]
     pub(super) fn blend<S: Simd, T: Iterator<Item = f32x16<S>>>(
         simd: S,
         dest: &mut [f32],
         src: T,
         blend_mode: BlendMode,
     ) {
-        for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) {
-            let bg_v = f32x16::from_slice(simd, next_dest);
-            let src_c = blend::mix(next_src, bg_v, blend_mode);
-            let res = blend_mode.compose(simd, src_c, bg_v, None);
-            res.store_slice(next_dest);
-        }
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) {
+                    let bg_v = f32x16::from_slice(simd, next_dest);
+                    let src_c = blend::mix(next_src, bg_v, blend_mode);
+                    let res = blend_mode.compose(simd, src_c, bg_v, None);
+                    res.store_slice(next_dest);
+                }
+            },
+        );
     }
 
     /// Performs the core alpha compositing calculation.
@@ -449,6 +455,7 @@ mod alpha_fill {
     /// Composites a buffer of colors with per-pixel alpha masks.
     ///
     /// Each pixel's source alpha is modulated by its corresponding mask value.
+    #[inline(always)]
     pub(super) fn alpha_composite_arbitrary<S: Simd, T: Iterator<Item = f32x16<S>>>(
         simd: S,
         dest: &mut [f32],
@@ -471,6 +478,7 @@ mod alpha_fill {
     }
 
     /// Applies blend mode compositing with per-pixel alpha masks.
+    #[inline(always)]
     pub(super) fn blend<S: Simd, T: Iterator<Item = f32x16<S>>>(
         simd: S,
         dest: &mut [f32],
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/blend.rs b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs
index 7b4a252acf..7ef974751a 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/blend.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs
@@ -7,65 +7,79 @@ use vello_common::fearless_simd::*;
 use vello_common::util::{Div255Ext, f32_to_u8, normalized_mul_u8x32};
 
 // TODO: Make sure this vectorizes properly (also the f32 pipeline) by inlining if needed.
+#[inline(always)]
 pub(crate) fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
-    if let Some(res) = try_u8_mix(blend_mode, src_c, bg_c) {
-        return res;
-    }
-
-    // Fallback for blend modes that aren't supported in u8.
-
-    let to_f32 = |val: u8x32<S>| {
-        let (a, b) = src_c.simd.split_u8x32(val);
-        let mut a = u8_to_f32(a);
-        let mut b = u8_to_f32(b);
-        a *= f32x16::splat(src_c.simd, 1.0 / 255.0);
-        b *= f32x16::splat(src_c.simd, 1.0 / 255.0);
-        (a, b)
-    };
-
-    let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| {
-        let val1 =
-            f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5)));
-        let val2 =
-            f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5)));
-
-        val1.simd.combine_u8x16(val1, val2)
-    };
-
-    let (mut src_1, mut src_2) = to_f32(src_c);
-    let (bg_1, bg_2) = to_f32(bg_c);
-
-    src_1 = highp::blend::mix(src_1, bg_1, blend_mode);
-    src_2 = highp::blend::mix(src_2, bg_2, blend_mode);
+    src_c.simd.vectorize(
+        #[inline(always)]
+        || {
+            if let Some(res) = try_u8_mix(blend_mode, src_c, bg_c) {
+                return res;
+            }
 
-    to_u8(src_1, src_2)
+            // Fallback for blend modes that aren't supported in u8.
+
+            let to_f32 = |val: u8x32<S>| {
+                let (a, b) = src_c.simd.split_u8x32(val);
+                let mut a = u8_to_f32(a);
+                let mut b = u8_to_f32(b);
+                a *= f32x16::splat(src_c.simd, 1.0 / 255.0);
+                b *= f32x16::splat(src_c.simd, 1.0 / 255.0);
+                (a, b)
+            };
+
+            let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| {
+                let val1 = f32_to_u8(
+                    f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5)),
+                );
+                let val2 = f32_to_u8(
+                    f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5)),
+                );
+
+                val1.simd.combine_u8x16(val1, val2)
+            };
+
+            let (mut src_1, mut src_2) = to_f32(src_c);
+            let (bg_1, bg_2) = to_f32(bg_c);
+
+            src_1 = highp::blend::mix(src_1, bg_1, blend_mode);
+            src_2 = highp::blend::mix(src_2, bg_2, blend_mode);
+
+            to_u8(src_1, src_2)
+        },
+    )
 }
 
+#[inline(always)]
 fn try_u8_mix<S: Simd>(blend_mode: BlendMode, src_c: u8x32<S>, bg_c: u8x32<S>) -> Option<u8x32<S>> {
-    // We implement the u8 fast path for blend modes that
-    // 1) are separable.
-    // 2) don't have too many divisions, since integer normalization is
-    // relatively expensive.
-    // In the future, it's possible to do further experimentation to see whether
-    // some more blend modes are worth doing in integer space.
-    Some(match blend_mode.mix {
-        Mix::Normal => src_c,
-        Mix::Multiply => Multiply::mix(src_c, bg_c),
-        Mix::Screen => Screen::mix(src_c, bg_c),
-        Mix::Overlay => Overlay::mix(src_c, bg_c),
-        Mix::Darken => Darken::mix(src_c, bg_c),
-        Mix::Lighten => Lighten::mix(src_c, bg_c),
-        Mix::HardLight => HardLight::mix(src_c, bg_c),
-        Mix::Difference => Difference::mix(src_c, bg_c),
-        Mix::Exclusion => Exclusion::mix(src_c, bg_c),
-        Mix::ColorDodge
-        | Mix::ColorBurn
-        | Mix::SoftLight
-        | Mix::Luminosity
-        | Mix::Color
-        | Mix::Hue
-        | Mix::Saturation => return None,
-    })
+    src_c.simd.vectorize(
+        #[inline(always)]
+        || {
+            // We implement the u8 fast path for blend modes that
+            // 1) are separable.
+            // 2) don't have too many divisions, since integer normalization is
+            // relatively expensive.
+            // In the future, it's possible to do further experimentation to see whether
+            // some more blend modes are worth doing in integer space.
+            match blend_mode.mix {
+                Mix::Normal => Some(src_c),
+                Mix::Multiply => Some(Multiply::mix(src_c, bg_c)),
+                Mix::Screen => Some(Screen::mix(src_c, bg_c)),
+                Mix::Overlay => Some(Overlay::mix(src_c, bg_c)),
+                Mix::Darken => Some(Darken::mix(src_c, bg_c)),
+                Mix::Lighten => Some(Lighten::mix(src_c, bg_c)),
+                Mix::HardLight => Some(HardLight::mix(src_c, bg_c)),
+                Mix::Difference => Some(Difference::mix(src_c, bg_c)),
+                Mix::Exclusion => Some(Exclusion::mix(src_c, bg_c)),
+                Mix::ColorDodge
+                | Mix::ColorBurn
+                | Mix::SoftLight
+                | Mix::Luminosity
+                | Mix::Color
+                | Mix::Hue
+                | Mix::Saturation => None,
+            }
+        },
+    )
 }
 
 macro_rules! u8_mix {
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/compose.rs b/sparse_strips/vello_cpu/src/fine/lowp/compose.rs
index c03d43bd4a..e5e360f29c 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/compose.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/compose.rs
@@ -18,6 +18,7 @@ pub(crate) trait ComposeExt {
 }
 
 impl ComposeExt for BlendMode {
+    #[inline(always)]
     fn compose<S: Simd>(
         &self,
         simd: S,
@@ -25,32 +26,37 @@ impl ComposeExt for BlendMode {
         bg_c: u8x32<S>,
         alpha_mask: Option<u8x32<S>>,
     ) -> u8x32<S> {
-        let mut res = match self.compose {
-            Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c),
-            Compose::Clear => Clear::compose(simd, src_c, bg_c),
-            Compose::Copy => Copy::compose(simd, src_c, bg_c),
-            Compose::DestOver => DestOver::compose(simd, src_c, bg_c),
-            Compose::Dest => Dest::compose(simd, src_c, bg_c),
-            Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c),
-            Compose::DestIn => DestIn::compose(simd, src_c, bg_c),
-            Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c),
-            Compose::DestOut => DestOut::compose(simd, src_c, bg_c),
-            Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c),
-            Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c),
-            Compose::Xor => Xor::compose(simd, src_c, bg_c),
-            Compose::Plus => Plus::compose(simd, src_c, bg_c),
-            // Have not been able to find a formula for this, so just fallback to Plus.
-            Compose::PlusLighter => Plus::compose(simd, src_c, bg_c),
-        };
+        simd.vectorize(
+            #[inline(always)]
+            || {
+                let mut res = match self.compose {
+                    Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c),
+                    Compose::Clear => Clear::compose(simd, src_c, bg_c),
+                    Compose::Copy => Copy::compose(simd, src_c, bg_c),
+                    Compose::DestOver => DestOver::compose(simd, src_c, bg_c),
+                    Compose::Dest => Dest::compose(simd, src_c, bg_c),
+                    Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c),
+                    Compose::DestIn => DestIn::compose(simd, src_c, bg_c),
+                    Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c),
+                    Compose::DestOut => DestOut::compose(simd, src_c, bg_c),
+                    Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c),
+                    Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c),
+                    Compose::Xor => Xor::compose(simd, src_c, bg_c),
+                    Compose::Plus => Plus::compose(simd, src_c, bg_c),
+                    // Have not been able to find a formula for this, so just fall back to Plus.
+                    Compose::PlusLighter => Plus::compose(simd, src_c, bg_c),
+                };
 
-        if let Some(alpha_mask) = alpha_mask {
-            let alpha_mask_inv = 255 - alpha_mask;
-            let p1 = simd.widen_u8x32(alpha_mask) * simd.widen_u8x32(res);
-            let p2 = simd.widen_u8x32(alpha_mask_inv) * simd.widen_u8x32(bg_c);
-            res = simd.narrow_u16x32((p1 + p2).div_255());
-        }
+                if let Some(alpha_mask) = alpha_mask {
+                    let alpha_mask_inv = 255 - alpha_mask;
+                    let p1 = simd.widen_u8x32(alpha_mask) * simd.widen_u8x32(res);
+                    let p2 = simd.widen_u8x32(alpha_mask_inv) * simd.widen_u8x32(bg_c);
+                    res = simd.narrow_u16x32((p1 + p2).div_255());
+                }
 
-        res
+                res
+            },
+        )
     }
 }
 
@@ -59,23 +65,29 @@ macro_rules! compose {
         struct $name;
 
         impl $name {
+            #[inline(always)]
             fn compose<S: Simd>(simd: S, src_c: u8x32<S>, bg_c: u8x32<S>) -> u8x32<S> {
-                let al_b = bg_c.splat_4th();
-                let al_s = src_c.splat_4th();
+                simd.vectorize(
+                    #[inline(always)]
+                    || {
+                        let al_b = bg_c.splat_4th();
+                        let al_s = src_c.splat_4th();
 
-                let fa = $fa(simd, al_s, al_b);
-                let fb = $fb(simd, al_s, al_b);
+                        let fa = $fa(simd, al_s, al_b);
+                        let fb = $fb(simd, al_s, al_b);
 
-                if $sat {
-                    simd.narrow_u16x32(
-                        (simd.widen_u8x32(src_c.normalized_mul(fa))
-                            + simd.widen_u8x32(fb.normalized_mul(bg_c)))
-                        .min(u16x32::splat(simd, 255))
-                        .max(u16x32::splat(simd, 0)),
-                    )
-                } else {
-                    src_c.normalized_mul(fa) + fb.normalized_mul(bg_c)
-                }
+                        if $sat {
+                            simd.narrow_u16x32(
+                                (simd.widen_u8x32(src_c.normalized_mul(fa))
+                                    + simd.widen_u8x32(fb.normalized_mul(bg_c)))
+                                .min(u16x32::splat(simd, 255))
+                                .max(u16x32::splat(simd, 0)),
+                            )
+                        } else {
+                            src_c.normalized_mul(fa) + fb.normalized_mul(bg_c)
+                        }
+                    },
+                )
             }
         }
     };
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
index 3e9e88b79f..029e4e7e24 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
@@ -358,6 +358,7 @@ mod fill {
     use vello_common::util::normalized_mul_u8x32;
 
     /// Applies blend mode compositing to a buffer without per-pixel masks.
+    #[inline(always)]
     pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
         simd: S,
         dest: &mut [u8],
@@ -385,6 +386,7 @@ mod fill {
     /// Composites a solid color onto a buffer using alpha blending.
     ///
     /// Uses the "over" operator: `result = src + bg * (1 - src_alpha)`
+    #[inline(always)]
     pub(super) fn alpha_composite_solid<S: Simd>(s: S, dest: &mut [u8], src: [u8; 4]) {
         s.vectorize(
             #[inline(always)]
@@ -409,6 +411,7 @@ mod fill {
     /// Composites a buffer of colors onto another buffer using alpha blending.
     ///
     /// Each source pixel is composited individually based on its alpha channel.
+    #[inline(always)]
     pub(super) fn alpha_composite<S: Simd, T: Iterator<Item = u8x32<S>>>(
         simd: S,
         dest: &mut [u8],
@@ -456,6 +459,7 @@ mod alpha_fill {
     use vello_common::util::{Div255Ext, normalized_mul_u8x32};
 
     /// Applies blend mode compositing with per-pixel alpha masks.
+    #[inline(always)]
     pub(super) fn blend<S: Simd, T: Iterator<Item = u8x32<S>>>(
         simd: S,
         dest: &mut [u8],