From 46065505e6e591fd8730c3f38f700ae095dcd621 Mon Sep 17 00:00:00 2001 From: Laurenz Stampfl Date: Sat, 30 May 2026 11:16:16 +0200 Subject: [PATCH 1/7] Always inline three kernel methods --- sparse_strips/vello_common/src/clip.rs | 1 + sparse_strips/vello_common/src/rect.rs | 1 + sparse_strips/vello_common/src/tile.rs | 1 + 3 files changed, 3 insertions(+) diff --git a/sparse_strips/vello_common/src/clip.rs b/sparse_strips/vello_common/src/clip.rs index a305e76f84..3988090b6d 100644 --- a/sparse_strips/vello_common/src/clip.rs +++ b/sparse_strips/vello_common/src/clip.rs @@ -230,6 +230,7 @@ pub fn intersect( /// /// This is all that this method does. It just looks more complicated as the logic for iterating /// in lock step is a bit tricky. +#[inline(always)] fn intersect_impl( simd: S, path_1: PathDataRef<'_>, diff --git a/sparse_strips/vello_common/src/rect.rs b/sparse_strips/vello_common/src/rect.rs index 55f98b7160..f4dfaeb33b 100644 --- a/sparse_strips/vello_common/src/rect.rs +++ b/sparse_strips/vello_common/src/rect.rs @@ -40,6 +40,7 @@ pub fn render(level: Level, rect: Rect, strip_buf: &mut Vec, alpha_buf: & /// /// The x-alpha masks for the left/right edge tiles are y-independent, so they /// are precomputed once and reused across all interior rows. 
+#[inline(always)] fn render_impl(s: S, rect: Rect, strip_buf: &mut Vec, alpha_buf: &mut Vec) { if rect.is_zero_area() { return; diff --git a/sparse_strips/vello_common/src/tile.rs b/sparse_strips/vello_common/src/tile.rs index 59f8885ecb..9c32a8943e 100644 --- a/sparse_strips/vello_common/src/tile.rs +++ b/sparse_strips/vello_common/src/tile.rs @@ -503,6 +503,7 @@ impl Tiles { )) } + #[inline(always)] fn make_tiles_analytic_aa_impl( &mut self, s: S, From a06ea8a3b365f5438740aeb1a04b8777b5b65410 Mon Sep 17 00:00:00 2001 From: Laurenz Stampfl Date: Sat, 30 May 2026 11:28:35 +0200 Subject: [PATCH 2/7] Vectorize `new` methods --- .../vello_cpu/src/fine/common/gradient/mod.rs | 27 +++-- .../src/fine/common/gradient/radial.rs | 43 ++++--- .../src/fine/common/gradient/sweep.rs | 13 +- .../vello_cpu/src/fine/common/image.rs | 114 ++++++++++-------- .../src/fine/common/rounded_blurred_rect.rs | 99 ++++++++------- .../vello_cpu/src/fine/lowp/gradient.rs | 23 ++-- .../vello_cpu/src/fine/lowp/image.rs | 103 ++++++++-------- 7 files changed, 235 insertions(+), 187 deletions(-) diff --git a/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs b/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs index 60cdbf8518..0386b17eff 100644 --- a/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs +++ b/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs @@ -53,17 +53,22 @@ pub(crate) struct GradientPainter<'a, S: Simd> { impl<'a, S: Simd> GradientPainter<'a, S> { pub(crate) fn new(simd: S, gradient: &'a EncodedGradient, t_vals: &'a [f32]) -> Self { - let lut = gradient.f32_lut(simd); - let scale_factor: f32x8 = f32x8::splat(simd, lut.scale_factor()); - - Self { - gradient, - scale_factor, - lut, - t_vals: t_vals.chunks_exact(8), - has_undefined: gradient.has_undefined, - simd, - } + simd.vectorize( + #[inline(always)] + || { + let lut = gradient.f32_lut(simd); + let scale_factor: f32x8 = f32x8::splat(simd, lut.scale_factor()); + + Self { + gradient, + scale_factor, + lut, 
+ t_vals: t_vals.chunks_exact(8), + has_undefined: gradient.has_undefined, + simd, + } + }, + ) } } diff --git a/sparse_strips/vello_cpu/src/fine/common/gradient/radial.rs b/sparse_strips/vello_cpu/src/fine/common/gradient/radial.rs index 78ce5fb648..b7db4d30e4 100644 --- a/sparse_strips/vello_cpu/src/fine/common/gradient/radial.rs +++ b/sparse_strips/vello_cpu/src/fine/common/gradient/radial.rs @@ -26,26 +26,31 @@ pub(crate) struct SimdRadialKind { impl SimdRadialKind { pub(crate) fn new(simd: S, kind: &RadialKind) -> Self { - let inner = match kind { - RadialKind::Radial { bias, scale } => SimdRadialKindInner::Radial { - bias: f32x8::splat(simd, *bias), - scale: f32x8::splat(simd, *scale), - }, - RadialKind::Strip { scaled_r0_squared } => SimdRadialKindInner::Strip { - scaled_r0_squared: f32x8::splat(simd, *scaled_r0_squared), - }, - RadialKind::Focal { - focal_data, - fp0, - fp1, - } => SimdRadialKindInner::Focal { - fp0: f32x8::splat(simd, *fp0), - fp1: f32x8::splat(simd, *fp1), - focal_data: *focal_data, - }, - }; + simd.vectorize( + #[inline(always)] + || { + let inner = match kind { + RadialKind::Radial { bias, scale } => SimdRadialKindInner::Radial { + bias: f32x8::splat(simd, *bias), + scale: f32x8::splat(simd, *scale), + }, + RadialKind::Strip { scaled_r0_squared } => SimdRadialKindInner::Strip { + scaled_r0_squared: f32x8::splat(simd, *scaled_r0_squared), + }, + RadialKind::Focal { + focal_data, + fp0, + fp1, + } => SimdRadialKindInner::Focal { + fp0: f32x8::splat(simd, *fp0), + fp1: f32x8::splat(simd, *fp1), + focal_data: *focal_data, + }, + }; - Self { inner } + Self { inner } + }, + ) } } diff --git a/sparse_strips/vello_cpu/src/fine/common/gradient/sweep.rs b/sparse_strips/vello_cpu/src/fine/common/gradient/sweep.rs index 02a0a401ff..7b38029e55 100644 --- a/sparse_strips/vello_cpu/src/fine/common/gradient/sweep.rs +++ b/sparse_strips/vello_cpu/src/fine/common/gradient/sweep.rs @@ -15,11 +15,14 @@ pub(crate) struct SimdSweepKind { impl SimdSweepKind { 
pub(crate) fn new(simd: S, kind: &SweepKind) -> Self { - Self { - start_angle: f32x8::splat(simd, kind.start_angle), - inv_angle_delta: f32x8::splat(simd, kind.inv_angle_delta), - simd, - } + simd.vectorize( + #[inline(always)] + || Self { + start_angle: f32x8::splat(simd, kind.start_angle), + inv_angle_delta: f32x8::splat(simd, kind.inv_angle_delta), + simd, + }, + ) } } diff --git a/sparse_strips/vello_cpu/src/fine/common/image.rs b/sparse_strips/vello_cpu/src/fine/common/image.rs index dad257b071..671f9061cb 100644 --- a/sparse_strips/vello_cpu/src/fine/common/image.rs +++ b/sparse_strips/vello_cpu/src/fine/common/image.rs @@ -29,33 +29,38 @@ impl<'a, S: Simd> PlainNNImagePainter<'a, S> { ) -> Self { let data = ImagePainterData::new(simd, image, pixmap, start_x, start_y); - let y_positions = extend( - simd, - f32x4::splat_pos( - simd, - data.cur_pos.y as f32, - data.x_advances.1, - data.y_advances.1, - ), - image.sampler.y_extend, - data.height, - data.height_inv, - ); - - let cur_x_pos = f32x4::splat_pos( - simd, - data.cur_pos.x as f32, - data.x_advances.0, - data.y_advances.0, - ); - - Self { - data, - advance: image.x_advance.x as f32, - y_positions, - cur_x_pos, - simd, - } + simd.vectorize( + #[inline(always)] + || { + let y_positions = extend( + simd, + f32x4::splat_pos( + simd, + data.cur_pos.y as f32, + data.x_advances.1, + data.y_advances.1, + ), + image.sampler.y_extend, + data.height, + data.height_inv, + ); + + let cur_x_pos = f32x4::splat_pos( + simd, + data.cur_pos.x as f32, + data.x_advances.0, + data.y_advances.0, + ); + + Self { + data, + advance: image.x_advance.x as f32, + y_positions, + cur_x_pos, + simd, + } + }, + ) } } @@ -366,31 +371,36 @@ impl<'a, S: Simd> ImagePainterData<'a, S> { start_x: f64, start_y: f64, ) -> Self { - let width = pixmap.width() as f32; - let height = pixmap.height() as f32; - let start_pos = image.transform * Point::new(start_x, start_y); - - let width_inv = f32x4::splat(simd, 1.0 / width); - let height_inv = 
f32x4::splat(simd, 1.0 / height); - let width = f32x4::splat(simd, width); - let width_u32 = u32x4::splat(simd, pixmap.width() as u32); - let height = f32x4::splat(simd, height); - - let x_advances = (image.x_advance.x as f32, image.x_advance.y as f32); - let y_advances = (image.y_advance.x as f32, image.y_advance.y as f32); - - Self { - cur_pos: start_pos, - pixmap, - x_advances, - y_advances, - image, - width, - height, - width_u32, - width_inv, - height_inv, - } + simd.vectorize( + #[inline(always)] + || { + let width = pixmap.width() as f32; + let height = pixmap.height() as f32; + let start_pos = image.transform * Point::new(start_x, start_y); + + let width_inv = f32x4::splat(simd, 1.0 / width); + let height_inv = f32x4::splat(simd, 1.0 / height); + let width = f32x4::splat(simd, width); + let width_u32 = u32x4::splat(simd, pixmap.width() as u32); + let height = f32x4::splat(simd, height); + + let x_advances = (image.x_advance.x as f32, image.x_advance.y as f32); + let y_advances = (image.y_advance.x as f32, image.y_advance.y as f32); + + Self { + cur_pos: start_pos, + pixmap, + x_advances, + y_advances, + image, + width, + height, + width_u32, + width_inv, + height_inv, + } + }, + ) } } diff --git a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs index dc84f22a00..5dfc93a292 100644 --- a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs +++ b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs @@ -29,23 +29,33 @@ impl BlurredRoundedRectFiller { start_x: f64, start_y: f64, ) -> Self { - let start_pos = rect.transform * Point::new(start_x, start_y); - let color_components = rect.color.as_premul_f32().components; - let r = f32x8::splat(simd, color_components[0]); - let g = f32x8::splat(simd, color_components[1]); - let b = f32x8::splat(simd, color_components[2]); - let a = f32x8::splat(simd, color_components[3]); - let simd_rect = 
SimdRoundedBlurredRect::new(rect, simd); - let alpha_calculator = - AlphaCalculator::new(start_pos, rect.x_advance, rect.y_advance, simd_rect, simd); + simd.vectorize( + #[inline(always)] + || { + let start_pos = rect.transform * Point::new(start_x, start_y); + let color_components = rect.color.as_premul_f32().components; + let r = f32x8::splat(simd, color_components[0]); + let g = f32x8::splat(simd, color_components[1]); + let b = f32x8::splat(simd, color_components[2]); + let a = f32x8::splat(simd, color_components[3]); + let simd_rect = SimdRoundedBlurredRect::new(rect, simd); + let alpha_calculator = AlphaCalculator::new( + start_pos, + rect.x_advance, + rect.y_advance, + simd_rect, + simd, + ); - Self { - alpha_calculator, - r, - g, - b, - a, - } + Self { + alpha_calculator, + r, + g, + b, + a, + } + }, + ) } } @@ -177,33 +187,38 @@ struct SimdRoundedBlurredRect { impl SimdRoundedBlurredRect { fn new(encoded: &EncodedBlurredRoundedRectangle, s: S) -> Self { - let h = f32x8::splat(s, encoded.h); - let w = f32x8::splat(s, encoded.w); - let width = f32x8::splat(s, encoded.width); - let height = f32x8::splat(s, encoded.height); - let r1 = f32x8::splat(s, encoded.r1); - let exponent = encoded.exponent; - let recip_exponent = encoded.recip_exponent; - let scale = f32x8::splat(s, encoded.scale); - let min_edge = f32x8::splat(s, encoded.min_edge); - let std_dev_inv = f32x8::splat(s, encoded.std_dev_inv); - let v0 = f32x8::splat(s, 0.0); - let v1 = f32x8::splat(s, 0.5); + s.vectorize( + #[inline(always)] + || { + let h = f32x8::splat(s, encoded.h); + let w = f32x8::splat(s, encoded.w); + let width = f32x8::splat(s, encoded.width); + let height = f32x8::splat(s, encoded.height); + let r1 = f32x8::splat(s, encoded.r1); + let exponent = encoded.exponent; + let recip_exponent = encoded.recip_exponent; + let scale = f32x8::splat(s, encoded.scale); + let min_edge = f32x8::splat(s, encoded.min_edge); + let std_dev_inv = f32x8::splat(s, encoded.std_dev_inv); + let v0 = 
f32x8::splat(s, 0.0); + let v1 = f32x8::splat(s, 0.5); - Self { - exponent, - recip_exponent, - scale, - std_dev_inv, - min_edge, - w, - v0, - v1, - h, - width, - height, - r1, - } + Self { + exponent, + recip_exponent, + scale, + std_dev_inv, + min_edge, + w, + v0, + v1, + h, + width, + height, + r1, + } + }, + ) } } diff --git a/sparse_strips/vello_cpu/src/fine/lowp/gradient.rs b/sparse_strips/vello_cpu/src/fine/lowp/gradient.rs index 210818a524..3ff1fa599a 100644 --- a/sparse_strips/vello_cpu/src/fine/lowp/gradient.rs +++ b/sparse_strips/vello_cpu/src/fine/lowp/gradient.rs @@ -20,16 +20,21 @@ pub(crate) struct GradientPainter<'a, S: Simd> { impl<'a, S: Simd> GradientPainter<'a, S> { pub(crate) fn new(simd: S, gradient: &'a EncodedGradient, t_vals: &'a [f32]) -> Self { - let lut = gradient.u8_lut(simd); - let scale_factor = f32x16::splat(simd, lut.scale_factor()); + simd.vectorize( + #[inline(always)] + || { + let lut = gradient.u8_lut(simd); + let scale_factor = f32x16::splat(simd, lut.scale_factor()); - Self { - gradient, - scale_factor, - lut: lut.lut(), - t_vals: t_vals.chunks_exact(16), - simd, - } + Self { + gradient, + scale_factor, + lut: lut.lut(), + t_vals: t_vals.chunks_exact(16), + simd, + } + }, + ) } } diff --git a/sparse_strips/vello_cpu/src/fine/lowp/image.rs b/sparse_strips/vello_cpu/src/fine/lowp/image.rs index f68e2162cf..1b0226a62c 100644 --- a/sparse_strips/vello_cpu/src/fine/lowp/image.rs +++ b/sparse_strips/vello_cpu/src/fine/lowp/image.rs @@ -145,55 +145,60 @@ impl<'a, S: Simd> PlainBilinearImagePainter<'a, S> { ) -> Self { let data = ImagePainterData::new(simd, image, pixmap, start_x, start_y); - // For axis-aligned images, y doesn't change across the strip - let y_positions = f32x4::splat_pos( - simd, - data.cur_pos.y as f32, - data.x_advances.1, - data.y_advances.1, - ); - - // Pre-compute y extend positions - let y_pos1 = extend( - simd, - y_positions - 0.5, - image.sampler.y_extend, - data.height, - data.height_inv, - ); - let y_pos2 
= extend( - simd, - y_positions + 0.5, - image.sampler.y_extend, - data.height, - data.height_inv, - ); - - // Pre-compute y interpolation weights - let fy = f32_to_u8(element_wise_splat( - simd, - fract_floor(y_positions + 0.5).mul_add(255.0, 0.5), - )); - let fy = simd.widen_u8x16(fy); - let fy_inv = u16x16::splat(simd, 255) - fy; - - let cur_x_pos = f32x4::splat_pos( - simd, - data.cur_pos.x as f32, - data.x_advances.0, - data.y_advances.0, - ); - - Self { - data, - y_pos1, - y_pos2, - fy, - fy_inv, - cur_x_pos, - advance: image.x_advance.x as f32, - simd, - } + simd.vectorize( + #[inline(always)] + || { + // For axis-aligned images, y doesn't change across the strip + let y_positions = f32x4::splat_pos( + simd, + data.cur_pos.y as f32, + data.x_advances.1, + data.y_advances.1, + ); + + // Pre-compute y extend positions + let y_pos1 = extend( + simd, + y_positions - 0.5, + image.sampler.y_extend, + data.height, + data.height_inv, + ); + let y_pos2 = extend( + simd, + y_positions + 0.5, + image.sampler.y_extend, + data.height, + data.height_inv, + ); + + // Pre-compute y interpolation weights + let fy = f32_to_u8(element_wise_splat( + simd, + fract_floor(y_positions + 0.5).mul_add(255.0, 0.5), + )); + let fy = simd.widen_u8x16(fy); + let fy_inv = u16x16::splat(simd, 255) - fy; + + let cur_x_pos = f32x4::splat_pos( + simd, + data.cur_pos.x as f32, + data.x_advances.0, + data.y_advances.0, + ); + + Self { + data, + y_pos1, + y_pos2, + fy, + fy_inv, + cur_x_pos, + advance: image.x_advance.x as f32, + simd, + } + }, + ) } } From 797a30e13f7b8c7edec9ea7e4db78d39cc82439c Mon Sep 17 00:00:00 2001 From: Laurenz Stampfl Date: Sat, 30 May 2026 11:45:32 +0200 Subject: [PATCH 3/7] Always inline next methods --- .../vello_cpu/src/fine/common/image.rs | 2 + .../src/fine/common/rounded_blurred_rect.rs | 54 ++++++++++++------- .../vello_cpu/src/fine/lowp/image.rs | 1 + 3 files changed, 38 insertions(+), 19 deletions(-) diff --git 
a/sparse_strips/vello_cpu/src/fine/common/image.rs b/sparse_strips/vello_cpu/src/fine/common/image.rs index 671f9061cb..58bb151bdc 100644 --- a/sparse_strips/vello_cpu/src/fine/common/image.rs +++ b/sparse_strips/vello_cpu/src/fine/common/image.rs @@ -111,6 +111,7 @@ impl<'a, S: Simd> NNImagePainter<'a, S> { impl Iterator for NNImagePainter<'_, S> { type Item = u8x16; + #[inline(always)] fn next(&mut self) -> Option { let x_positions = extend( self.simd, @@ -180,6 +181,7 @@ impl<'a, S: Simd, const QUALITY: u8> FilteredImagePainter<'a, S, QUALITY> { impl Iterator for FilteredImagePainter<'_, S, QUALITY> { type Item = f32x16; + #[inline(always)] fn next(&mut self) -> Option { let x_positions = f32x4::splat_pos( self.simd, diff --git a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs index 5dfc93a292..fc78cf45e5 100644 --- a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs +++ b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs @@ -62,6 +62,7 @@ impl BlurredRoundedRectFiller { impl Iterator for BlurredRoundedRectFiller { type Item = ShaderResultF32; + #[inline(always)] fn next(&mut self) -> Option { let next = self.alpha_calculator.next().unwrap(); let r = self.r * next; @@ -75,30 +76,44 @@ impl Iterator for BlurredRoundedRectFiller { impl crate::fine::Painter for BlurredRoundedRectFiller { fn paint_u8(&mut self, buf: &mut [u8]) { - for chunk in buf.chunks_exact_mut(64) { - let first = self.next().unwrap(); - let simd = first.r.simd; - let second = self.next().unwrap(); + self.a.simd.vectorize( + #[inline(always)] + || { + for chunk in buf.chunks_exact_mut(64) { + let first = self.next().unwrap(); + let simd = first.r.simd; + let second = self.next().unwrap(); - let r = u8x16::from_f32(simd, simd.combine_f32x8(first.r, second.r)); - let g = u8x16::from_f32(simd, simd.combine_f32x8(first.g, second.g)); - let b = u8x16::from_f32(simd, simd.combine_f32x8(first.b, 
second.b)); - let a = u8x16::from_f32(simd, simd.combine_f32x8(first.a, second.a)); + let r = u8x16::from_f32(simd, simd.combine_f32x8(first.r, second.r)); + let g = u8x16::from_f32(simd, simd.combine_f32x8(first.g, second.g)); + let b = u8x16::from_f32(simd, simd.combine_f32x8(first.b, second.b)); + let a = u8x16::from_f32(simd, simd.combine_f32x8(first.a, second.a)); - let combined = simd.combine_u8x32(simd.combine_u8x16(r, g), simd.combine_u8x16(b, a)); + let combined = + simd.combine_u8x32(simd.combine_u8x16(r, g), simd.combine_u8x16(b, a)); - simd.store_interleaved_128_u8x64(combined, (&mut chunk[..]).try_into().unwrap()); - } + simd.store_interleaved_128_u8x64( + combined, + (&mut chunk[..]).try_into().unwrap(), + ); + } + }, + ); } fn paint_f32(&mut self, buf: &mut [f32]) { - for chunk in buf.chunks_exact_mut(32) { - let (c1, c2) = self.next().unwrap().get(); - c1.simd - .store_interleaved_128_f32x16(c1, (&mut chunk[..16]).try_into().unwrap()); - c2.simd - .store_interleaved_128_f32x16(c2, (&mut chunk[16..]).try_into().unwrap()); - } + self.a.simd.vectorize( + #[inline(always)] + || { + for chunk in buf.chunks_exact_mut(32) { + let (c1, c2) = self.next().unwrap().get(); + c1.simd + .store_interleaved_128_f32x16(c1, (&mut chunk[..16]).try_into().unwrap()); + c2.simd + .store_interleaved_128_f32x16(c2, (&mut chunk[16..]).try_into().unwrap()); + } + }, + ); } } @@ -132,6 +147,7 @@ impl AlphaCalculator { impl Iterator for AlphaCalculator { type Item = f32x8; + #[inline(always)] fn next(&mut self) -> Option { let i = f32x8::splat_pos( self.simd, @@ -225,13 +241,13 @@ impl SimdRoundedBlurredRect { trait FloatExt { // See https://raphlinus.github.io/audio/2018/09/05/sigmoid.html for a little // explanation of this approximation to the erf function. - // Doing `inline(always)` seems to reduce performance for some reason. /// Approximate the erf function. 
fn compute_erf7(simd: S, x: Self) -> Self; fn powf(self, x: f32) -> Self; } impl FloatExt for f32x8 { + #[inline(always)] fn compute_erf7(simd: S, x: Self) -> Self { // Clamp `x`, because for large `x` the terms here become `inf`, causing the result to be 0 or // `NaN`. This clamping doesn't lose any information, because `erf(±10) ≈ 1` well within `f64` diff --git a/sparse_strips/vello_cpu/src/fine/lowp/image.rs b/sparse_strips/vello_cpu/src/fine/lowp/image.rs index 1b0226a62c..c5bbfc7349 100644 --- a/sparse_strips/vello_cpu/src/fine/lowp/image.rs +++ b/sparse_strips/vello_cpu/src/fine/lowp/image.rs @@ -34,6 +34,7 @@ impl<'a, S: Simd> BilinearImagePainter<'a, S> { impl Iterator for BilinearImagePainter<'_, S> { type Item = u8x16; + #[inline(always)] fn next(&mut self) -> Option { let x_positions = f32x4::splat_pos( self.simd, From 9049df262eb99692d95a0fa857d1912970d10768 Mon Sep 17 00:00:00 2001 From: Laurenz Stampfl Date: Sat, 30 May 2026 11:48:31 +0200 Subject: [PATCH 4/7] Fix blending inlining --- .../vello_cpu/src/fine/highp/blend.rs | 92 ++++++++++++------- .../vello_cpu/src/fine/highp/compose.rs | 91 ++++++++++-------- sparse_strips/vello_cpu/src/fine/highp/mod.rs | 17 ++-- .../vello_cpu/src/fine/lowp/blend.rs | 49 ++++++---- .../vello_cpu/src/fine/lowp/compose.rs | 63 ++++++++----- 5 files changed, 191 insertions(+), 121 deletions(-) diff --git a/sparse_strips/vello_cpu/src/fine/highp/blend.rs b/sparse_strips/vello_cpu/src/fine/highp/blend.rs index cb2174da11..93b1f0107f 100644 --- a/sparse_strips/vello_cpu/src/fine/highp/blend.rs +++ b/sparse_strips/vello_cpu/src/fine/highp/blend.rs @@ -23,29 +23,23 @@ impl Channels { } } -// TODO: blending is still extremely slow, investigate whether there is something obvious we are -// missing that other renderers do. 
pub(crate) fn mix(src_c: f32x16, bg: f32x16, blend_mode: BlendMode) -> f32x16 { + src_c.simd.vectorize( + #[inline(always)] + || mix_inner(src_c, bg, blend_mode), + ) +} + +#[inline(always)] +fn mix_inner(src_c: f32x16, bg: f32x16, blend_mode: BlendMode) -> f32x16 { if matches!(blend_mode.mix, Mix::Normal) { return src_c; } // See https://www.w3.org/TR/compositing-1/#blending let simd = src_c.simd; - let split = |input: f32x16| { - let mut storage = [0.0; 16]; - simd.store_interleaved_128_f32x16(input, &mut storage); - let input_v = f32x16::from_slice(simd, &storage); - - let p1 = simd.split_f32x16(input_v); - let (r, g) = simd.split_f32x8(p1.0); - let (b, a) = simd.split_f32x8(p1.1); - - (Channels { r, g, b }, a) - }; - - let (bg_channels, bg_a) = split(bg); - let (src_channels, src_a) = split(src_c); + let (bg_channels, bg_a) = split(simd, bg); + let (src_channels, src_a) = split(simd, src_c); let unpremultiplied_bg = bg_channels.unpremultiply(bg_a); let unpremultiplied_src = src_channels.unpremultiply(src_a); @@ -53,18 +47,9 @@ pub(crate) fn mix(src_c: f32x16, bg: f32x16, blend_mode: BlendMod let mut res_bg = unpremultiplied_bg; let mix_src = blend_mode.mix(unpremultiplied_src, unpremultiplied_bg); - let apply_alpha = |unpremultiplied_src_channel: f32x4, - mix_src_channel: f32x4, - dest_channel: &mut f32x4| { - let p1 = (1.0 - bg_a) * unpremultiplied_src_channel; - let p2 = bg_a * mix_src_channel; - - *dest_channel = (p1 + p2).premultiply(src_a); - }; - - apply_alpha(unpremultiplied_src.r, mix_src.r, &mut res_bg.r); - apply_alpha(unpremultiplied_src.g, mix_src.g, &mut res_bg.g); - apply_alpha(unpremultiplied_src.b, mix_src.b, &mut res_bg.b); + res_bg.r = apply_alpha(bg_a, src_a, unpremultiplied_src.r, mix_src.r); + res_bg.g = apply_alpha(bg_a, src_a, unpremultiplied_src.g, mix_src.g); + res_bg.b = apply_alpha(bg_a, src_a, unpremultiplied_src.b, mix_src.b); let combined = simd.combine_f32x8( simd.combine_f32x4(res_bg.r, res_bg.g), @@ -76,11 +61,38 @@ pub(crate) 
fn mix(src_c: f32x16, bg: f32x16, blend_mode: BlendMod f32x16::from_slice(simd, &storage) } +#[inline(always)] +fn split(simd: S, input: f32x16) -> (Channels, f32x4) { + let mut storage = [0.0; 16]; + simd.store_interleaved_128_f32x16(input, &mut storage); + let input_v = f32x16::from_slice(simd, &storage); + + let p1 = simd.split_f32x16(input_v); + let (r, g) = simd.split_f32x8(p1.0); + let (b, a) = simd.split_f32x8(p1.1); + + (Channels { r, g, b }, a) +} + +#[inline(always)] +fn apply_alpha( + bg_a: f32x4, + src_a: f32x4, + unpremultiplied_src_channel: f32x4, + mix_src_channel: f32x4, +) -> f32x4 { + let p1 = (1.0 - bg_a) * unpremultiplied_src_channel; + let p2 = bg_a * mix_src_channel; + + (p1 + p2).premultiply(src_a) +} + trait MixExt { fn mix(&self, src: Channels, bg: Channels) -> Channels; } impl MixExt for BlendMode { + #[inline(always)] fn mix(&self, src: Channels, bg: Channels) -> Channels { match self.mix { Mix::Normal => src, @@ -118,6 +130,7 @@ impl Screen { } impl HardLight { + #[inline(always)] fn single(src: f32x4, bg: f32x4) -> f32x4 { let two = f32x4::splat(src.simd, 2.0); @@ -254,14 +267,17 @@ non_separable_mix!(Luminosity, |cs: &mut Channels, cb: &mut Channels| { *cb }); +#[inline(always)] fn lum(r: f32x4, g: f32x4, b: f32x4) -> f32x4 { 0.3 * r + 0.59 * g + 0.11 * b } +#[inline(always)] fn sat(r: f32x4, g: f32x4, b: f32x4) -> f32x4 { r.max(g).max(b) - r.min(g).min(b) } +#[inline(always)] fn clip_color(r: &mut f32x4, g: &mut f32x4, b: &mut f32x4) { let simd = r.simd; @@ -284,6 +300,7 @@ fn clip_color(r: &mut f32x4, g: &mut f32x4, b: &mut f32x4) { } } +#[inline(always)] fn set_lum(r: &mut f32x4, g: &mut f32x4, b: &mut f32x4, l: f32x4) { let d = l - lum(*r, *g, *b); *r += d; @@ -294,17 +311,24 @@ fn set_lum(r: &mut f32x4, g: &mut f32x4, b: &mut f32x4, l: f32 } // Adapted from tiny-skia +#[inline(always)] fn set_sat(r: &mut f32x4, g: &mut f32x4, b: &mut f32x4, s: f32x4) { - let simd = r.simd; - let zero = f32x4::splat(simd, 0.0); let mn = 
r.min(g.min(*b)); let mx = r.max(g.max(*b)); let sat = mx - mn; // Map min channel to 0, max channel to s, and scale the middle proportionally. - let scale = |c| simd.select_f32x4(simd.simd_eq_f32x4(sat, zero), zero, (c - mn) * s / sat); + *r = scale_sat_channel(*r, mn, sat, s); + *g = scale_sat_channel(*g, mn, sat, s); + *b = scale_sat_channel(*b, mn, sat, s); +} - *r = scale(*r); - *g = scale(*g); - *b = scale(*b); +#[inline(always)] +fn scale_sat_channel(c: f32x4, mn: f32x4, sat: f32x4, s: f32x4) -> f32x4 { + let simd = c.simd; + simd.select_f32x4( + simd.simd_eq_f32x4(sat, f32x4::splat(simd, 0.0)), + f32x4::splat(simd, 0.0), + (c - mn) * s / sat, + ) } diff --git a/sparse_strips/vello_cpu/src/fine/highp/compose.rs b/sparse_strips/vello_cpu/src/fine/highp/compose.rs index 32983c9abe..3e3b689059 100644 --- a/sparse_strips/vello_cpu/src/fine/highp/compose.rs +++ b/sparse_strips/vello_cpu/src/fine/highp/compose.rs @@ -23,48 +23,62 @@ impl ComposeExt for BlendMode { bg_c: f32x16, alpha_mask: Option>, ) -> f32x16 { - // There some non-obvious subtleties worth highlighting here. - // We support two kinds of blending (in this case, we focus on compositing specifically): - // - Isolated blending, where layers as a whole are blended together with their backdrop. - // If we are currently performing this kind of blending, `alpha_mask` will always be `None`. - // After all, there is no concrete shape opacity associated with a layer. Instead, we are - // just compositing the RGBA values at _all_ positions of the source layer with the backdrop - // layer. For example, if the backdrop contains a green rectangle and source layer is just - // empty, if we perform blending with `Compose::Clear`, then _everything_ will be cleared, - // because we are compositing the whole source layer with the whole backdrop, and not - // just the parts of the source layer that have actually be drawn on. - // - Non-isolated blending, where a single path is blended with the backdrop. 
In this case, - // `alpha_mask` _might_ be `Some` and contain the alpha values of the strips we are currently - // compositing. Remember that strips always have a fixed height of 4, because of this, the - // strips might cover areas that aren't actually covered by the path (and just have an alpha - // value of 0, or a value between 0-254 for anti-aliased parts). Because of this, for - // non-isolated blending, we need to lerp the result with the backdrop using `alpha_mask`. + simd.vectorize( + #[inline(always)] + || compose_inner(*self, simd, src_c, bg_c, alpha_mask), + ) + } +} - let mut res = match self.compose { - Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c), - Compose::Clear => Clear::compose(simd, src_c, bg_c), - Compose::Copy => Copy::compose(simd, src_c, bg_c), - Compose::DestOver => DestOver::compose(simd, src_c, bg_c), - Compose::Dest => Dest::compose(simd, src_c, bg_c), - Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c), - Compose::DestIn => DestIn::compose(simd, src_c, bg_c), - Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c), - Compose::DestOut => DestOut::compose(simd, src_c, bg_c), - Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c), - Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c), - Compose::Xor => Xor::compose(simd, src_c, bg_c), - Compose::Plus => Plus::compose(simd, src_c, bg_c), - // Have not been able to find a formula for this, so just fallback to Plus. - Compose::PlusLighter => Plus::compose(simd, src_c, bg_c), - }; +#[inline(always)] +fn compose_inner( + blend_mode: BlendMode, + simd: S, + src_c: f32x16, + bg_c: f32x16, + alpha_mask: Option>, +) -> f32x16 { + // There some non-obvious subtleties worth highlighting here. + // We support two kinds of blending (in this case, we focus on compositing specifically): + // - Isolated blending, where layers as a whole are blended together with their backdrop. + // If we are currently performing this kind of blending, `alpha_mask` will always be `None`. 
+ // After all, there is no concrete shape opacity associated with a layer. Instead, we are + // just compositing the RGBA values at _all_ positions of the source layer with the backdrop + // layer. For example, if the backdrop contains a green rectangle and source layer is just + // empty, if we perform blending with `Compose::Clear`, then _everything_ will be cleared, + // because we are compositing the whole source layer with the whole backdrop, and not + // just the parts of the source layer that have actually been drawn on. + // - Non-isolated blending, where a single path is blended with the backdrop. In this case, + // `alpha_mask` _might_ be `Some` and contain the alpha values of the strips we are currently + // compositing. Remember that strips always have a fixed height of 4; because of this, the + // strips might cover areas that aren't actually covered by the path (and just have an alpha + // value of 0, or a value between 0-254 for anti-aliased parts). Because of this, for + // non-isolated blending, we need to lerp the result with the backdrop using `alpha_mask`.
- if let Some(alpha_mask) = alpha_mask { - let alpha_mask_inv = 1.0 - alpha_mask; - res = alpha_mask * res + alpha_mask_inv * bg_c; - } + let mut res = match blend_mode.compose { + Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c), + Compose::Clear => Clear::compose(simd, src_c, bg_c), + Compose::Copy => Copy::compose(simd, src_c, bg_c), + Compose::DestOver => DestOver::compose(simd, src_c, bg_c), + Compose::Dest => Dest::compose(simd, src_c, bg_c), + Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c), + Compose::DestIn => DestIn::compose(simd, src_c, bg_c), + Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c), + Compose::DestOut => DestOut::compose(simd, src_c, bg_c), + Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c), + Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c), + Compose::Xor => Xor::compose(simd, src_c, bg_c), + Compose::Plus => Plus::compose(simd, src_c, bg_c), + // Have not been able to find a formula for this, so just fallback to Plus. + Compose::PlusLighter => Plus::compose(simd, src_c, bg_c), + }; - res + if let Some(alpha_mask) = alpha_mask { + let alpha_mask_inv = 1.0 - alpha_mask; + res = alpha_mask * res + alpha_mask_inv * bg_c; } + + res } macro_rules! compose { @@ -72,6 +86,7 @@ macro_rules! 
compose { struct $name; impl $name { + #[inline(always)] fn compose(simd: S, src_c: f32x16, bg_c: f32x16) -> f32x16 { let al_b = bg_c.splat_4th(); let al_s = src_c.splat_4th(); diff --git a/sparse_strips/vello_cpu/src/fine/highp/mod.rs b/sparse_strips/vello_cpu/src/fine/highp/mod.rs index 0948001bee..eb2236af43 100644 --- a/sparse_strips/vello_cpu/src/fine/highp/mod.rs +++ b/sparse_strips/vello_cpu/src/fine/highp/mod.rs @@ -385,12 +385,17 @@ mod fill { src: T, blend_mode: BlendMode, ) { - for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) { - let bg_v = f32x16::from_slice(simd, next_dest); - let src_c = blend::mix(next_src, bg_v, blend_mode); - let res = blend_mode.compose(simd, src_c, bg_v, None); - res.store_slice(next_dest); - } + simd.vectorize( + #[inline(always)] + || { + for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) { + let bg_v = f32x16::from_slice(simd, next_dest); + let src_c = blend::mix(next_src, bg_v, blend_mode); + let res = blend_mode.compose(simd, src_c, bg_v, None); + res.store_slice(next_dest); + } + }, + ); } /// Performs the core alpha compositing calculation. diff --git a/sparse_strips/vello_cpu/src/fine/lowp/blend.rs b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs index 318b8d0a0a..aa7dbdc9bb 100644 --- a/sparse_strips/vello_cpu/src/fine/lowp/blend.rs +++ b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs @@ -6,32 +6,21 @@ use crate::peniko::{BlendMode, Mix}; use vello_common::fearless_simd::*; use vello_common::util::{Div255Ext, f32_to_u8, normalized_mul_u8x32}; -// TODO: Make sure this vectorizes properly (also the f32 pipeline) by inlining if needed. 
pub(crate) fn mix(src_c: u8x32, bg_c: u8x32, blend_mode: BlendMode) -> u8x32 { + src_c.simd.vectorize( + #[inline(always)] + || mix_inner(src_c, bg_c, blend_mode), + ) +} + +#[inline(always)] +fn mix_inner(src_c: u8x32, bg_c: u8x32, blend_mode: BlendMode) -> u8x32 { if let Some(res) = try_u8_mix(blend_mode, src_c, bg_c) { return res; } // Fallback for blend modes that aren't supported in u8. - let to_f32 = |val: u8x32| { - let (a, b) = src_c.simd.split_u8x32(val); - let mut a = u8_to_f32(a); - let mut b = u8_to_f32(b); - a *= f32x16::splat(src_c.simd, 1.0 / 255.0); - b *= f32x16::splat(src_c.simd, 1.0 / 255.0); - (a, b) - }; - - let to_u8 = |val1: f32x16, val2: f32x16| { - let val1 = - f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5))); - let val2 = - f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5))); - - val1.simd.combine_u8x16(val1, val2) - }; - let (mut src_1, mut src_2) = to_f32(src_c); let (bg_1, bg_2) = to_f32(bg_c); @@ -41,6 +30,28 @@ pub(crate) fn mix(src_c: u8x32, bg_c: u8x32, blend_mode: BlendMod to_u8(src_1, src_2) } +#[inline(always)] +fn to_f32(val: u8x32) -> (f32x16, f32x16) { + let simd = val.simd; + let (a, b) = simd.split_u8x32(val); + let mut a = u8_to_f32(a); + let mut b = u8_to_f32(b); + a *= f32x16::splat(simd, 1.0 / 255.0); + b *= f32x16::splat(simd, 1.0 / 255.0); + (a, b) +} + +#[inline(always)] +fn to_u8(val1: f32x16, val2: f32x16) -> u8x32 { + let val1 = + f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5))); + let val2 = + f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5))); + + val1.simd.combine_u8x16(val1, val2) +} + +#[inline(always)] fn try_u8_mix(blend_mode: BlendMode, src_c: u8x32, bg_c: u8x32) -> Option> { // We implement the u8 fast path for blend modes that // 1) are separable. 
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/compose.rs b/sparse_strips/vello_cpu/src/fine/lowp/compose.rs index c03d43bd4a..c44de0dbb3 100644 --- a/sparse_strips/vello_cpu/src/fine/lowp/compose.rs +++ b/sparse_strips/vello_cpu/src/fine/lowp/compose.rs @@ -25,33 +25,47 @@ impl ComposeExt for BlendMode { bg_c: u8x32, alpha_mask: Option>, ) -> u8x32 { - let mut res = match self.compose { - Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c), - Compose::Clear => Clear::compose(simd, src_c, bg_c), - Compose::Copy => Copy::compose(simd, src_c, bg_c), - Compose::DestOver => DestOver::compose(simd, src_c, bg_c), - Compose::Dest => Dest::compose(simd, src_c, bg_c), - Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c), - Compose::DestIn => DestIn::compose(simd, src_c, bg_c), - Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c), - Compose::DestOut => DestOut::compose(simd, src_c, bg_c), - Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c), - Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c), - Compose::Xor => Xor::compose(simd, src_c, bg_c), - Compose::Plus => Plus::compose(simd, src_c, bg_c), - // Have not been able to find a formula for this, so just fallback to Plus. 
- Compose::PlusLighter => Plus::compose(simd, src_c, bg_c), - }; + simd.vectorize( + #[inline(always)] + || compose_inner(*self, simd, src_c, bg_c, alpha_mask), + ) + } +} - if let Some(alpha_mask) = alpha_mask { - let alpha_mask_inv = 255 - alpha_mask; - let p1 = simd.widen_u8x32(alpha_mask) * simd.widen_u8x32(res); - let p2 = simd.widen_u8x32(alpha_mask_inv) * simd.widen_u8x32(bg_c); - res = simd.narrow_u16x32((p1 + p2).div_255()); - } +#[inline(always)] +fn compose_inner( + blend_mode: BlendMode, + simd: S, + src_c: u8x32, + bg_c: u8x32, + alpha_mask: Option>, +) -> u8x32 { + let mut res = match blend_mode.compose { + Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c), + Compose::Clear => Clear::compose(simd, src_c, bg_c), + Compose::Copy => Copy::compose(simd, src_c, bg_c), + Compose::DestOver => DestOver::compose(simd, src_c, bg_c), + Compose::Dest => Dest::compose(simd, src_c, bg_c), + Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c), + Compose::DestIn => DestIn::compose(simd, src_c, bg_c), + Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c), + Compose::DestOut => DestOut::compose(simd, src_c, bg_c), + Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c), + Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c), + Compose::Xor => Xor::compose(simd, src_c, bg_c), + Compose::Plus => Plus::compose(simd, src_c, bg_c), + // Have not been able to find a formula for this, so just fallback to Plus. + Compose::PlusLighter => Plus::compose(simd, src_c, bg_c), + }; - res + if let Some(alpha_mask) = alpha_mask { + let alpha_mask_inv = 255 - alpha_mask; + let p1 = simd.widen_u8x32(alpha_mask) * simd.widen_u8x32(res); + let p2 = simd.widen_u8x32(alpha_mask_inv) * simd.widen_u8x32(bg_c); + res = simd.narrow_u16x32((p1 + p2).div_255()); } + + res } macro_rules! compose { @@ -59,6 +73,7 @@ macro_rules! 
compose { struct $name; impl $name { + #[inline(always)] fn compose(simd: S, src_c: u8x32, bg_c: u8x32) -> u8x32 { let al_b = bg_c.splat_4th(); let al_s = src_c.splat_4th(); From 97bd00a6ee38a0a233be41783a717e57419662b8 Mon Sep 17 00:00:00 2001 From: Laurenz Stampfl Date: Sat, 30 May 2026 12:21:41 +0200 Subject: [PATCH 5/7] More tweaks --- sparse_strips/vello_common/src/encode.rs | 10 ++++++ sparse_strips/vello_cpu/src/fine/highp/mod.rs | 32 ++++++++++--------- sparse_strips/vello_cpu/src/fine/lowp/mod.rs | 31 ++++++++++-------- sparse_strips/vello_cpu/src/fine/mod.rs | 19 +++++------ 4 files changed, 54 insertions(+), 38 deletions(-) diff --git a/sparse_strips/vello_common/src/encode.rs b/sparse_strips/vello_common/src/encode.rs index 5161fe021a..d9160ce2c3 100644 --- a/sparse_strips/vello_common/src/encode.rs +++ b/sparse_strips/vello_common/src/encode.rs @@ -985,6 +985,7 @@ pub trait FromF32Color: Sized + Debug + Copy + Clone { impl FromF32Color for f32 { const ZERO: Self = 0.0; + #[inline(always)] fn from_f32(color: f32x4) -> [Self; 4] { color.into() } @@ -993,6 +994,7 @@ impl FromF32Color for f32 { impl FromF32Color for u8 { const ZERO: Self = 0; + #[inline(always)] fn from_f32(mut color: f32x4) -> [Self; 4] { let simd = color.simd; color = color.mul_add(f32x4::splat(simd, 255.0), f32x4::splat(simd, 0.5)); @@ -1016,6 +1018,14 @@ pub struct GradientLut { impl GradientLut { /// Create a new lookup table. 
fn new(simd: S, ranges: &[GradientRange]) -> Self { + simd.vectorize( + #[inline(always)] + || Self::new_inner(simd, ranges), + ) + } + + #[inline(always)] + fn new_inner(simd: S, ranges: &[GradientRange]) -> Self { let lut_size = determine_lut_size(ranges); let mut lut = vec![[T::ZERO; 4]; lut_size]; diff --git a/sparse_strips/vello_cpu/src/fine/highp/mod.rs b/sparse_strips/vello_cpu/src/fine/highp/mod.rs index eb2236af43..eac28ca36c 100644 --- a/sparse_strips/vello_cpu/src/fine/highp/mod.rs +++ b/sparse_strips/vello_cpu/src/fine/highp/mod.rs @@ -162,28 +162,30 @@ impl FineKernel for F32Kernel { painter.paint_f32(dest); } - #[inline(always)] fn apply_tint(simd: S, dest: &mut [Self::Numeric], tint: &Tint) { let premul = tint.color.premultiply(); let [r, g, b, a] = premul.components; - let tint_v = f32x16::block_splat(f32x4::from_slice(simd, &[r, g, b, a])); simd.vectorize( #[inline(always)] - || match tint.mode { - TintMode::AlphaMask => { - for chunk in dest.chunks_exact_mut(16) { - let pixel = f32x16::from_slice(simd, chunk); - let alphas = pixel.splat_4th(); - let tinted = tint_v * alphas; - tinted.store_slice(chunk); + || { + let tint_v = f32x16::block_splat(f32x4::from_slice(simd, &[r, g, b, a])); + + match tint.mode { + TintMode::AlphaMask => { + for chunk in dest.chunks_exact_mut(16) { + let pixel = f32x16::from_slice(simd, chunk); + let alphas = pixel.splat_4th(); + let tinted = tint_v * alphas; + tinted.store_slice(chunk); + } } - } - TintMode::Multiply => { - for chunk in dest.chunks_exact_mut(16) { - let pixel = f32x16::from_slice(simd, chunk); - let tinted = pixel * tint_v; - tinted.store_slice(chunk); + TintMode::Multiply => { + for chunk in dest.chunks_exact_mut(16) { + let pixel = f32x16::from_slice(simd, chunk); + let tinted = pixel * tint_v; + tinted.store_slice(chunk); + } } } }, diff --git a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs index 3e9e88b79f..e6e8981684 100644 --- 
a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs +++ b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs @@ -203,24 +203,27 @@ impl FineKernel for U8Kernel { let [r, g, b, a] = premul.components; let to_u8 = |v: f32| (v * 255.0 + 0.5) as u8; let color = u32::from_ne_bytes([to_u8(r), to_u8(g), to_u8(b), to_u8(a)]); - let tint_v = u32x8::block_splat(u32x4::splat(simd, color)).to_bytes(); simd.vectorize( #[inline(always)] - || match tint.mode { - TintMode::AlphaMask => { - for chunk in dest.chunks_exact_mut(32) { - let pixel = u8x32::from_slice(simd, chunk); - let alphas = pixel.splat_4th(); - let tinted = tint_v.normalized_mul(alphas); - tinted.store_slice(chunk); + || { + let tint_v = u32x8::block_splat(u32x4::splat(simd, color)).to_bytes(); + + match tint.mode { + TintMode::AlphaMask => { + for chunk in dest.chunks_exact_mut(32) { + let pixel = u8x32::from_slice(simd, chunk); + let alphas = pixel.splat_4th(); + let tinted = tint_v.normalized_mul(alphas); + tinted.store_slice(chunk); + } } - } - TintMode::Multiply => { - for chunk in dest.chunks_exact_mut(32) { - let pixel = u8x32::from_slice(simd, chunk); - let tinted = pixel.normalized_mul(tint_v); - tinted.store_slice(chunk); + TintMode::Multiply => { + for chunk in dest.chunks_exact_mut(32) { + let pixel = u8x32::from_slice(simd, chunk); + let tinted = pixel.normalized_mul(tint_v); + tinted.store_slice(chunk); + } } } }, diff --git a/sparse_strips/vello_cpu/src/fine/mod.rs b/sparse_strips/vello_cpu/src/fine/mod.rs index c16c8282df..0a87abf014 100644 --- a/sparse_strips/vello_cpu/src/fine/mod.rs +++ b/sparse_strips/vello_cpu/src/fine/mod.rs @@ -641,15 +641,12 @@ impl> Fine { Cmd::Opacity(o) => { if *o != 1.0 { let blend_buf = self.blend_buf.last_mut().unwrap(); - - T::apply_mask( - self.simd, - blend_buf, - iter::repeat(T::NumericVec::from_f32( - self.simd, - f32x16::splat(self.simd, *o), - )), + let opacity = self.simd.vectorize( + #[inline(always)] + || T::NumericVec::from_f32(self.simd, f32x16::splat(self.simd, *o)), 
); + + T::apply_mask(self.simd, blend_buf, iter::repeat(opacity)); } } Cmd::PushZeroClip(_) | Cmd::PopZeroClip => { @@ -704,13 +701,17 @@ impl> Fine { } else { let start_x = self.wide_coords.0 * WideTile::WIDTH + x as u16; let start_y = self.wide_coords.1 * Tile::HEIGHT; + let src = self.simd.vectorize( + #[inline(always)] + || T::Composite::from_color(self.simd, color), + ); T::blend( self.simd, blend_buf, start_x, start_y, - iter::repeat(T::Composite::from_color(self.simd, color)), + iter::repeat(src), blend_mode, alphas, mask, From ec4d32098b2c7008a45b79828507f04d7fb5a03d Mon Sep 17 00:00:00 2001 From: Laurenz Stampfl Date: Sat, 30 May 2026 14:04:49 +0200 Subject: [PATCH 6/7] Add comments --- sparse_strips/vello_common/src/encode.rs | 2 ++ sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs | 2 ++ sparse_strips/vello_cpu/src/dispatch/single_threaded.rs | 4 ++++ 3 files changed, 8 insertions(+) diff --git a/sparse_strips/vello_common/src/encode.rs b/sparse_strips/vello_common/src/encode.rs index d9160ce2c3..8805dee87a 100644 --- a/sparse_strips/vello_common/src/encode.rs +++ b/sparse_strips/vello_common/src/encode.rs @@ -778,12 +778,14 @@ pub struct EncodedGradient { impl EncodedGradient { /// Get the lookup table for sampling u8-based gradient values. + // No need to vectorize here, as vectorization happens in the constructor. pub fn u8_lut(&self, simd: S) -> &GradientLut { self.u8_lut .get_or_init(|| GradientLut::new(simd, &self.ranges)) } /// Get the lookup table for sampling f32-based gradient values. + // No need to vectorize here, as vectorization happens in the constructor. 
pub fn f32_lut(&self, simd: S) -> &GradientLut { self.f32_lut .get_or_init(|| GradientLut::new(simd, &self.ranges)) diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs index 22fb125f90..e27b8a3749 100644 --- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs +++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs @@ -358,6 +358,8 @@ impl MultiThreadedDispatcher { } } + // No need to vectorize here, as vectorization happens in each of the + // functions that are called within. fn rasterize_with>( &self, simd: S, diff --git a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs index 2a08d4b1b2..6f11c95683 100644 --- a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs +++ b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs @@ -118,6 +118,10 @@ impl SingleThreadedDispatcher { dispatch!(self.level, simd => self.rasterize_with::<_, U8Kernel>(simd, buffer, width, height, encoded_paints, image_resolver)); } + // Note: We purposefully don't add `vectorize` to each of the functions + // like `rasterize_with`, `composite_at_offset`, etc. since vectorization + // instead is applied wherever necessary in child functions. + /// Core rasterization dispatcher that chooses between simple and filter-aware paths. 
/// /// # Type Parameters From f204c08bcbc2ba7c53f801dc15253a6e6a7d0973 Mon Sep 17 00:00:00 2001 From: Laurenz Stampfl Date: Sat, 30 May 2026 14:09:16 +0200 Subject: [PATCH 7/7] Reformat --- sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs | 2 +- sparse_strips/vello_cpu/src/dispatch/single_threaded.rs | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs index e27b8a3749..a587225e05 100644 --- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs +++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs @@ -358,7 +358,7 @@ impl MultiThreadedDispatcher { } } - // No need to vectorize here, as vectorization happens in each of the + // No need to vectorize here, as vectorization happens in each of the // functions that are called within. fn rasterize_with>( &self, diff --git a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs index 6f11c95683..b0053cead1 100644 --- a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs +++ b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs @@ -121,7 +121,7 @@ impl SingleThreadedDispatcher { // Note: We purposefully don't add `vectorize` to each of the functions // like `rasterize_with`, `composite_at_offset`, etc. since vectorization // instead is applied wherever necessary in child functions. - + /// Core rasterization dispatcher that chooses between simple and filter-aware paths. /// /// # Type Parameters