From bc699bb5795ff1d796b53b6ef6ce6f4f75816535 Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 27 May 2026 15:14:15 +0100 Subject: [PATCH 1/2] Fix benchmark proc macro to actually create a #[target_feature(enable = ...)] context --- sparse_strips/vello_dev_macros/src/bench.rs | 23 ++++++++++++++++----- 1 file changed, 18 insertions(+), 5 deletions(-) diff --git a/sparse_strips/vello_dev_macros/src/bench.rs b/sparse_strips/vello_dev_macros/src/bench.rs index 67e12cf31a..e30abdd412 100644 --- a/sparse_strips/vello_dev_macros/src/bench.rs +++ b/sparse_strips/vello_dev_macros/src/bench.rs @@ -4,7 +4,7 @@ use proc_macro::TokenStream; use proc_macro2::Ident; use quote::quote; -use syn::{ItemFn, parse_macro_input}; +use syn::{ItemFn, parse_macro_input, parse_quote}; pub(crate) fn vello_bench_inner(_: TokenStream, item: TokenStream) -> TokenStream { let mut input_fn = parse_macro_input!(item as ItemFn); @@ -14,6 +14,7 @@ pub(crate) fn vello_bench_inner(_: TokenStream, item: TokenStream) -> TokenStrea let inner_fn_name = Ident::new(&format!("{input_fn_name}_inner"), input_fn_name.span()); input_fn.sig.ident = inner_fn_name.clone(); + input_fn.attrs.push(parse_quote!(#[inline(always)])); let expanded = quote! { #input_fn @@ -37,14 +38,26 @@ pub(crate) fn vello_bench_inner(_: TokenStream, item: TokenStream) -> TokenStrea format!("{}/{}_{}", module, suffix1, suffix2) } + #[inline(always)] fn run_integer(b: &mut Bencher, simd: S) { - let mut fine = Fine::::new(simd); - #inner_fn_name(b, &mut fine); + simd.vectorize( + #[inline(always)] + || { + let mut fine = Fine::::new(simd); + #inner_fn_name(b, &mut fine); + }, + ); } + #[inline(always)] fn run_float(b: &mut Bencher, simd: S) { - let mut fine = Fine::::new(simd); - #inner_fn_name(b, &mut fine); + simd.vectorize( + #[inline(always)] + || { + let mut fine = Fine::::new(simd); + #inner_fn_name(b, &mut fine); + }, + ); } // Uncomment this to enable u8_scalar benchmarks. 
From 34c840dd767104fc0ee8243882694e335927a45b Mon Sep 17 00:00:00 2001 From: "Sergey \"Shnatsel\" Davidoff" Date: Wed, 27 May 2026 17:37:46 +0100 Subject: [PATCH 2/2] Insert vectorize() into all places that should be #[inline(always)] for feature propagation to work, but currently aren't --- sparse_strips/vello_common/src/clip.rs | 256 ++++--- sparse_strips/vello_common/src/encode.rs | 173 +++-- sparse_strips/vello_common/src/rect.rs | 174 ++--- sparse_strips/vello_common/src/tile.rs | 712 +++++++++--------- .../vello_cpu/src/dispatch/multi_threaded.rs | 82 +- .../vello_cpu/src/dispatch/single_threaded.rs | 431 ++++++----- .../vello_cpu/src/fine/common/gradient/mod.rs | 1 + .../vello_cpu/src/fine/common/image.rs | 1 + .../src/fine/common/rounded_blurred_rect.rs | 62 +- .../vello_cpu/src/fine/highp/blend.rs | 264 ++++--- .../vello_cpu/src/fine/highp/compose.rs | 110 +-- sparse_strips/vello_cpu/src/fine/highp/mod.rs | 20 +- .../vello_cpu/src/fine/lowp/blend.rs | 122 +-- .../vello_cpu/src/fine/lowp/compose.rs | 88 ++- sparse_strips/vello_cpu/src/fine/lowp/mod.rs | 4 + 15 files changed, 1368 insertions(+), 1132 deletions(-) diff --git a/sparse_strips/vello_common/src/clip.rs b/sparse_strips/vello_common/src/clip.rs index a305e76f84..c68f1489c9 100644 --- a/sparse_strips/vello_common/src/clip.rs +++ b/sparse_strips/vello_common/src/clip.rs @@ -230,135 +230,163 @@ pub fn intersect( /// /// This is all that this method does. It just looks more complicated as the logic for iterating /// in lock step is a bit tricky. +#[inline(always)] fn intersect_impl( simd: S, path_1: PathDataRef<'_>, path_2: PathDataRef<'_>, target: &mut StripStorage, ) { - // In case either path is empty, the clip path should be empty. - if path_1.strips.is_empty() || path_2.strips.is_empty() { - return; - } - - // Ignore any y values that are outside the bounding box of either of the two paths, as - // those are guaranteed to have neither fill nor strip regions. 
- let mut cur_y = path_1.strips[0].strip_y().min(path_2.strips[0].strip_y()); - let end_y = path_1.strips[path_1.strips.len() - 1] - .strip_y() - .min(path_2.strips[path_2.strips.len() - 1].strip_y()); - - let mut path_1_idx = 0; - let mut path_2_idx = 0; - let mut strip_state = None; - - // Iterate over each strip row and handle them. - while cur_y <= end_y { - // For each row, we create two iterators that alternatingly yield the strips and fill - // regions in that row, until the last strip has been reached. - let mut p1_iter = RowIterator::new(path_1, &mut path_1_idx, cur_y); - let mut p2_iter = RowIterator::new(path_2, &mut path_2_idx, cur_y); - - let mut p1_region = p1_iter.next(); - let mut p2_region = p2_iter.next(); - - // If at least one region is none, it means that we reached the end of the row - // for that path, meaning that we exceeded the bounding box of that path and no - // additional strips should be generated for that row, even if the other path might - // still have more strips left. They will all be clipped away. So only consider it - // if both paths have a region left. - while let (Some(region_1), Some(region_2)) = (p1_region, p2_region) { - match region_1.overlap_relationship(&region_2) { - // This means there is no overlap between the regions, so we need to advance - // the iterator of the region that is further behind. - OverlapRelationship::Advance(advance) => { - match advance { - Advance::Left => p1_region = p1_iter.next(), - Advance::Right => p2_region = p2_iter.next(), - }; - - continue; - } - // We have an overlap! - OverlapRelationship::Overlap(overlap) => { - match (region_1, region_2) { - // Both regions are a fill. Flush the current strip and start a new - // one at the end of the overlap region setting `fill_gap` to true, - // so that the whole area before that will be filled with a sparse - // fill. 
- (Region::Fill(_), Region::Fill(_)) => { - flush_strip(&mut strip_state, &mut target.strips, cur_y); - start_strip(&mut strip_state, &target.alphas, overlap.end, true); - } - // One fill one strip, so we simply use the alpha mask from the strip region. - (Region::Strip(s), Region::Fill(_)) - | (Region::Fill(_), Region::Strip(s)) => { - // If possible, don't create a new strip but just extend the current one. - if should_create_new_strip(&strip_state, &target.alphas, overlap.start) - { - flush_strip(&mut strip_state, &mut target.strips, cur_y); - start_strip(&mut strip_state, &target.alphas, overlap.start, false); - } + simd.vectorize( + #[inline(always)] + || { + // In case either path is empty, the clip path should be empty. + if path_1.strips.is_empty() || path_2.strips.is_empty() { + return; + } - let s_alphas = &s.alphas[(overlap.start - s.start) as usize * 4..] - [..overlap.width() as usize * 4]; - target.alphas.extend_from_slice(s_alphas); + // Ignore any y values that are outside the bounding box of either of the two paths, as + // those are guaranteed to have neither fill nor strip regions. + let mut cur_y = path_1.strips[0].strip_y().min(path_2.strips[0].strip_y()); + let end_y = path_1.strips[path_1.strips.len() - 1] + .strip_y() + .min(path_2.strips[path_2.strips.len() - 1].strip_y()); + + let mut path_1_idx = 0; + let mut path_2_idx = 0; + let mut strip_state = None; + + // Iterate over each strip row and handle them. + while cur_y <= end_y { + // For each row, we create two iterators that alternatingly yield the strips and fill + // regions in that row, until the last strip has been reached. 
+ let mut p1_iter = RowIterator::new(path_1, &mut path_1_idx, cur_y); + let mut p2_iter = RowIterator::new(path_2, &mut path_2_idx, cur_y); + + let mut p1_region = p1_iter.next(); + let mut p2_region = p2_iter.next(); + + // If at least one region is none, it means that we reached the end of the row + // for that path, meaning that we exceeded the bounding box of that path and no + // additional strips should be generated for that row, even if the other path might + // still have more strips left. They will all be clipped away. So only consider it + // if both paths have a region left. + while let (Some(region_1), Some(region_2)) = (p1_region, p2_region) { + match region_1.overlap_relationship(&region_2) { + // This means there is no overlap between the regions, so we need to advance + // the iterator of the region that is further behind. + OverlapRelationship::Advance(advance) => { + match advance { + Advance::Left => p1_region = p1_iter.next(), + Advance::Right => p2_region = p2_iter.next(), + }; + + continue; } - // Two strips, we need to multiply the opacity masks from both paths. - (Region::Strip(s_region_1), Region::Strip(s_region_2)) => { - // Once again, only create a new strip if we can't extend the current one. - if should_create_new_strip(&strip_state, &target.alphas, overlap.start) - { - flush_strip(&mut strip_state, &mut target.strips, cur_y); - start_strip(&mut strip_state, &target.alphas, overlap.start, false); + // We have an overlap! + OverlapRelationship::Overlap(overlap) => { + match (region_1, region_2) { + // Both regions are a fill. Flush the current strip and start a new + // one at the end of the overlap region setting `fill_gap` to true, + // so that the whole area before that will be filled with a sparse + // fill. 
+ (Region::Fill(_), Region::Fill(_)) => { + flush_strip(&mut strip_state, &mut target.strips, cur_y); + start_strip( + &mut strip_state, + &target.alphas, + overlap.end, + true, + ); + } + // One fill one strip, so we simply use the alpha mask from the strip region. + (Region::Strip(s), Region::Fill(_)) + | (Region::Fill(_), Region::Strip(s)) => { + // If possible, don't create a new strip but just extend the current one. + if should_create_new_strip( + &strip_state, + &target.alphas, + overlap.start, + ) { + flush_strip(&mut strip_state, &mut target.strips, cur_y); + start_strip( + &mut strip_state, + &target.alphas, + overlap.start, + false, + ); + } + + let s_alphas = &s.alphas + [(overlap.start - s.start) as usize * 4..] + [..overlap.width() as usize * 4]; + target.alphas.extend_from_slice(s_alphas); + } + // Two strips, we need to multiply the opacity masks from both paths. + (Region::Strip(s_region_1), Region::Strip(s_region_2)) => { + // Once again, only create a new strip if we can't extend the current one. + if should_create_new_strip( + &strip_state, + &target.alphas, + overlap.start, + ) { + flush_strip(&mut strip_state, &mut target.strips, cur_y); + start_strip( + &mut strip_state, + &target.alphas, + overlap.start, + false, + ); + } + + let num_blocks = overlap.width() / Tile::HEIGHT; + + // Get the right alpha values for the specific position. + let s1_alphas = s_region_1.alphas + [(overlap.start - s_region_1.start) as usize * 4..] + .chunks_exact(16) + .take(num_blocks as usize); + let s2_alphas = s_region_2.alphas + [(overlap.start - s_region_2.start) as usize * 4..] + .chunks_exact(16) + .take(num_blocks as usize); + + for (s1_alpha, s2_alpha) in s1_alphas.zip(s2_alphas) { + let s1 = u8x16::from_slice(simd, s1_alpha); + let s2 = u8x16::from_slice(simd, s2_alpha); + + // Combine them. 
+ let res = simd.narrow_u16x16(normalized_mul_u8x16(s1, s2)); + target.alphas.extend(res.as_slice()); + } + } } - let num_blocks = overlap.width() / Tile::HEIGHT; - - // Get the right alpha values for the specific position. - let s1_alphas = s_region_1.alphas - [(overlap.start - s_region_1.start) as usize * 4..] - .chunks_exact(16) - .take(num_blocks as usize); - let s2_alphas = s_region_2.alphas - [(overlap.start - s_region_2.start) as usize * 4..] - .chunks_exact(16) - .take(num_blocks as usize); - - for (s1_alpha, s2_alpha) in s1_alphas.zip(s2_alphas) { - let s1 = u8x16::from_slice(simd, s1_alpha); - let s2 = u8x16::from_slice(simd, s2_alpha); - - // Combine them. - let res = simd.narrow_u16x16(normalized_mul_u8x16(s1, s2)); - target.alphas.extend(res.as_slice()); - } + // Advance the iterator of the path whose region's end is further behind. + match overlap.advance { + Advance::Left => p1_region = p1_iter.next(), + Advance::Right => p2_region = p2_iter.next(), + }; } } - - // Advance the iterator of the path whose region's end is further behind. - match overlap.advance { - Advance::Left => p1_region = p1_iter.next(), - Advance::Right => p2_region = p2_iter.next(), - }; } - } - } - // Flush the strip before advancing to the next strip row. - flush_strip(&mut strip_state, &mut target.strips, cur_y); - cur_y += 1; - } + // Flush the strip before advancing to the next strip row. + flush_strip(&mut strip_state, &mut target.strips, cur_y); + cur_y += 1; + } - // Push the sentinel strip, if one wasn't already pushed. - if !target.strips.last().is_some_and(Strip::is_sentinel) { - target.strips.push(Strip::new( - u16::MAX, - end_y * Tile::HEIGHT, - target.alphas.len() as u32, - false, - )); - } + // Push the sentinel strip, if one wasn't already pushed. 
+ if !target.strips.last().is_some_and(Strip::is_sentinel) { + target.strips.push(Strip::new( + u16::MAX, + end_y * Tile::HEIGHT, + target.alphas.len() as u32, + false, + )); + } + }, + ); } /// An overlap between two regions. diff --git a/sparse_strips/vello_common/src/encode.rs b/sparse_strips/vello_common/src/encode.rs index 5161fe021a..a309050d2a 100644 --- a/sparse_strips/vello_common/src/encode.rs +++ b/sparse_strips/vello_common/src/encode.rs @@ -778,15 +778,27 @@ pub struct EncodedGradient { impl EncodedGradient { /// Get the lookup table for sampling u8-based gradient values. + #[inline(always)] pub fn u8_lut(&self, simd: S) -> &GradientLut { - self.u8_lut - .get_or_init(|| GradientLut::new(simd, &self.ranges)) + simd.vectorize( + #[inline(always)] + || { + self.u8_lut + .get_or_init(|| GradientLut::new(simd, &self.ranges)) + }, + ) } /// Get the lookup table for sampling f32-based gradient values. + #[inline(always)] pub fn f32_lut(&self, simd: S) -> &GradientLut { - self.f32_lut - .get_or_init(|| GradientLut::new(simd, &self.ranges)) + simd.vectorize( + #[inline(always)] + || { + self.f32_lut + .get_or_init(|| GradientLut::new(simd, &self.ranges)) + }, + ) } } @@ -985,24 +997,34 @@ pub trait FromF32Color: Sized + Debug + Copy + Clone { impl FromF32Color for f32 { const ZERO: Self = 0.0; + #[inline(always)] fn from_f32(color: f32x4) -> [Self; 4] { - color.into() + color.simd.vectorize( + #[inline(always)] + || color.into(), + ) } } impl FromF32Color for u8 { const ZERO: Self = 0; + #[inline(always)] fn from_f32(mut color: f32x4) -> [Self; 4] { - let simd = color.simd; - color = color.mul_add(f32x4::splat(simd, 255.0), f32x4::splat(simd, 0.5)); - - [ - color[0] as Self, - color[1] as Self, - color[2] as Self, - color[3] as Self, - ] + color.simd.vectorize( + #[inline(always)] + || { + let simd = color.simd; + color = color.mul_add(f32x4::splat(simd, 255.0), f32x4::splat(simd, 0.5)); + + [ + color[0] as Self, + color[1] as Self, + color[2] as Self, + 
color[3] as Self, + ] + }, + ) } } @@ -1015,68 +1037,79 @@ pub struct GradientLut { impl GradientLut { /// Create a new lookup table. + #[inline(always)] fn new(simd: S, ranges: &[GradientRange]) -> Self { - let lut_size = determine_lut_size(ranges); - let mut lut = vec![[T::ZERO; 4]; lut_size]; - - // Calculate how many indices are covered by each range. - let ramps = { - let mut ramps = Vec::with_capacity(ranges.len()); - let mut prev_idx = 0; - - for range in ranges { - let max_idx = (range.x1 * lut_size as f32) as usize; - - ramps.push((prev_idx..max_idx, range)); - prev_idx = max_idx; - } - - ramps - }; - - let scale = lut_size as f32 - 1.0; - - let inv_lut_scale = f32x4::splat(simd, 1.0 / scale); - let add_factor = f32x4::from_slice(simd, &[0.0, 1.0, 2.0, 3.0]) * inv_lut_scale; - - for (ramp_range, range) in ramps { - let biases = f32x16::block_splat(f32x4::from_slice(simd, &range.bias)); - let scales = f32x16::block_splat(f32x4::from_slice(simd, &range.scale)); + simd.vectorize( + #[inline(always)] + || { + let lut_size = determine_lut_size(ranges); + let mut lut = vec![[T::ZERO; 4]; lut_size]; + + // Calculate how many indices are covered by each range. 
+ let ramps = { + let mut ramps = Vec::with_capacity(ranges.len()); + let mut prev_idx = 0; + + for range in ranges { + let max_idx = (range.x1 * lut_size as f32) as usize; + + ramps.push((prev_idx..max_idx, range)); + prev_idx = max_idx; + } - ramp_range.clone().step_by(4).for_each(|idx| { - let t_vals = f32x4::splat(simd, idx as f32).mul_add(inv_lut_scale, add_factor); + ramps + }; - let t_vals = element_wise_splat(simd, t_vals); + let scale = lut_size as f32 - 1.0; + + let inv_lut_scale = f32x4::splat(simd, 1.0 / scale); + let add_factor = f32x4::from_slice(simd, &[0.0, 1.0, 2.0, 3.0]) * inv_lut_scale; + + for (ramp_range, range) in ramps { + let biases = f32x16::block_splat(f32x4::from_slice(simd, &range.bias)); + let scales = f32x16::block_splat(f32x4::from_slice(simd, &range.scale)); + + ramp_range.clone().step_by(4).for_each(|idx| { + let t_vals = + f32x4::splat(simd, idx as f32).mul_add(inv_lut_scale, add_factor); + + let t_vals = element_wise_splat(simd, t_vals); + + let mut result = scales.mul_add(t_vals, biases); + let alphas = result.splat_4th(); + // Premultiply colors, since we did interpolation in unpremultiplied space. + if range.interpolation_alpha_space + == InterpolationAlphaSpace::Unpremultiplied + { + result = { + let mask = mask32x16::block_splat(mask32x4::from_slice( + simd, + &[-1, -1, -1, 0], + )); + simd.select_f32x16(mask, result * alphas, alphas) + }; + } - let mut result = scales.mul_add(t_vals, biases); - let alphas = result.splat_4th(); - // Premultiply colors, since we did interpolation in unpremultiplied space. - if range.interpolation_alpha_space == InterpolationAlphaSpace::Unpremultiplied { - result = { - let mask = - mask32x16::block_splat(mask32x4::from_slice(simd, &[-1, -1, -1, 0])); - simd.select_f32x16(mask, result * alphas, alphas) - }; + // Due to floating-point impreciseness, it can happen that + // values either become greater than 1 or the RGB channels + // become greater than the alpha channel. 
To prevent overflows + // in later parts of the pipeline, we need to take the minimum here. + result = result.min(1.0).min(alphas); + let (im1, im2) = simd.split_f32x16(result); + let (r1, r2) = simd.split_f32x8(im1); + let (r3, r4) = simd.split_f32x8(im2); + let rs = [r1, r2, r3, r4].map(T::from_f32); + + // We always compute 4 samples at a time, but a gradient ramp does not necessarily + // start at a multiple of 4, therefore we might have to truncate. + let lut = &mut lut[idx..(idx + 4).min(lut_size)]; + lut.copy_from_slice(&rs[..lut.len()]); + }); } - // Due to floating-point impreciseness, it can happen that - // values either become greater than 1 or the RGB channels - // become greater than the alpha channel. To prevent overflows - // in later parts of the pipeline, we need to take the minimum here. - result = result.min(1.0).min(alphas); - let (im1, im2) = simd.split_f32x16(result); - let (r1, r2) = simd.split_f32x8(im1); - let (r3, r4) = simd.split_f32x8(im2); - let rs = [r1, r2, r3, r4].map(T::from_f32); - - // We always compute 4 samples at a time, but a gradient ramp does not necessarily - // start at a multiple of 4, therefore we might have to truncate. - let lut = &mut lut[idx..(idx + 4).min(lut_size)]; - lut.copy_from_slice(&rs[..lut.len()]); - }); - } - - Self { lut, scale } + Self { lut, scale } + }, + ) } /// Get the sample value at a specific index. diff --git a/sparse_strips/vello_common/src/rect.rs b/sparse_strips/vello_common/src/rect.rs index 55f98b7160..8513072d12 100644 --- a/sparse_strips/vello_common/src/rect.rs +++ b/sparse_strips/vello_common/src/rect.rs @@ -40,96 +40,102 @@ pub fn render(level: Level, rect: Rect, strip_buf: &mut Vec, alpha_buf: & /// /// The x-alpha masks for the left/right edge tiles are y-independent, so they /// are precomputed once and reused across all interior rows. 
+#[inline(always)] fn render_impl(s: S, rect: Rect, strip_buf: &mut Vec, alpha_buf: &mut Vec) { - if rect.is_zero_area() { - return; - } - - let rect_x0 = rect.x0 as f32; - let rect_y0 = rect.y0 as f32; - let rect_x1 = rect.x1 as f32; - let rect_y1 = rect.y1 as f32; - - // Integer pixel bounds. - let px_x0 = rect_x0.floor() as u16; - let px_y0 = rect_y0.floor() as u16; - let px_y1 = rect_y1.ceil() as u16; - - let left_tile_x = (px_x0 / Tile::WIDTH) * Tile::WIDTH; - // Inclusive, so don't use `ceil` here but just `rect_x1` directly. - let right_tile_x = (rect_x1 as u16 / Tile::WIDTH) * Tile::WIDTH; - - let y0 = (px_y0 / Tile::HEIGHT) * Tile::HEIGHT; - // Note: y1 is exclusive, but it's gonna break for the very last tile if we have a height of u16::MAX. - let y1 = (px_y1.saturating_add(Tile::HEIGHT - 1) / Tile::HEIGHT) * Tile::HEIGHT; - // Include one tile past the right edge so the right-edge tile column is - // covered by the edge-row wide-strip loop. - let x_end = right_tile_x.saturating_add(Tile::WIDTH); - - if x_end <= left_tile_x || y1 <= y0 { - return; - } + s.vectorize( + #[inline(always)] + || { + if rect.is_zero_area() { + return; + } - let tile_start_y = y0 / Tile::HEIGHT; - let tile_end_y = y1 / Tile::HEIGHT; - - // A right strip is only needed when the rect spans more than one tile column. - let needs_right_strip = right_tile_x > left_tile_x; - - let left_x_cov = coverage(left_tile_x, rect_x0, rect_x1); - let right_x_cov = coverage(right_tile_x, rect_x0, rect_x1); - let left_x_mask = alpha_mask_from_x_coverage(s, &left_x_cov); - let right_x_mask = alpha_mask_from_x_coverage(s, &right_x_cov); - - for tile_y in tile_start_y..tile_end_y { - let strip_y = tile_y * Tile::HEIGHT; - let strip_y_f = strip_y as f32; - let strip_y_end_f = strip_y as f32 + Tile::HEIGHT as f32; - - // A row is an "edge" if the rect's top or bottom boundary falls - // *inside* it (i.e. partial vertical coverage). 
- let is_top_edge = strip_y_f < rect_y0 && rect_y0 < strip_y_end_f; - let is_bottom_edge = strip_y_f < rect_y1 && rect_y1 < strip_y_end_f; - - if is_top_edge || is_bottom_edge { - let alpha_start = alpha_buf.len() as u32; - - let y_cov = coverage(strip_y, rect_y0, rect_y1); - let mut col = left_tile_x; - // TODO: Can this result in an infinite loop in case x_end == u16::MAX? - while col + Tile::WIDTH <= x_end { - // TODO: We could optimize this so this is only computed for the left-most and right-most - // tile of the edge, all intermediate tiles have full horizontal coverage. - let x_cov = coverage(col, rect_x0, rect_x1); - let combined = combined_tile_alpha(s, &x_cov, &y_cov); - alpha_buf.extend_from_slice(combined.as_slice()); - col += Tile::WIDTH; + let rect_x0 = rect.x0 as f32; + let rect_y0 = rect.y0 as f32; + let rect_x1 = rect.x1 as f32; + let rect_y1 = rect.y1 as f32; + + // Integer pixel bounds. + let px_x0 = rect_x0.floor() as u16; + let px_y0 = rect_y0.floor() as u16; + let px_y1 = rect_y1.ceil() as u16; + + let left_tile_x = (px_x0 / Tile::WIDTH) * Tile::WIDTH; + // Inclusive, so don't use `ceil` here but just `rect_x1` directly. + let right_tile_x = (rect_x1 as u16 / Tile::WIDTH) * Tile::WIDTH; + + let y0 = (px_y0 / Tile::HEIGHT) * Tile::HEIGHT; + // Note: y1 is exclusive, but it's gonna break for the very last tile if we have a height of u16::MAX. + let y1 = (px_y1.saturating_add(Tile::HEIGHT - 1) / Tile::HEIGHT) * Tile::HEIGHT; + // Include one tile past the right edge so the right-edge tile column is + // covered by the edge-row wide-strip loop. 
+ let x_end = right_tile_x.saturating_add(Tile::WIDTH); + + if x_end <= left_tile_x || y1 <= y0 { + return; } - strip_buf.push(Strip::new(left_tile_x, strip_y, alpha_start, false)); - } else { - let alpha_start = alpha_buf.len() as u32; - alpha_buf.extend_from_slice(left_x_mask.as_slice()); - strip_buf.push(Strip::new(left_tile_x, strip_y, alpha_start, false)); - - if needs_right_strip { - // `fill_gap = true` tells the renderer to fill solid 0xFF - // between the previous strip's end and this strip's start. - let alpha_start = alpha_buf.len() as u32; - alpha_buf.extend_from_slice(right_x_mask.as_slice()); - strip_buf.push(Strip::new(right_tile_x, strip_y, alpha_start, true)); + let tile_start_y = y0 / Tile::HEIGHT; + let tile_end_y = y1 / Tile::HEIGHT; + + // A right strip is only needed when the rect spans more than one tile column. + let needs_right_strip = right_tile_x > left_tile_x; + + let left_x_cov = coverage(left_tile_x, rect_x0, rect_x1); + let right_x_cov = coverage(right_tile_x, rect_x0, rect_x1); + let left_x_mask = alpha_mask_from_x_coverage(s, &left_x_cov); + let right_x_mask = alpha_mask_from_x_coverage(s, &right_x_cov); + + for tile_y in tile_start_y..tile_end_y { + let strip_y = tile_y * Tile::HEIGHT; + let strip_y_f = strip_y as f32; + let strip_y_end_f = strip_y as f32 + Tile::HEIGHT as f32; + + // A row is an "edge" if the rect's top or bottom boundary falls + // *inside* it (i.e. partial vertical coverage). + let is_top_edge = strip_y_f < rect_y0 && rect_y0 < strip_y_end_f; + let is_bottom_edge = strip_y_f < rect_y1 && rect_y1 < strip_y_end_f; + + if is_top_edge || is_bottom_edge { + let alpha_start = alpha_buf.len() as u32; + + let y_cov = coverage(strip_y, rect_y0, rect_y1); + let mut col = left_tile_x; + // TODO: Can this result in an infinite loop in case x_end == u16::MAX? 
+ while col + Tile::WIDTH <= x_end { + // TODO: We could optimize this so this is only computed for the left-most and right-most + // tile of the edge, all intermediate tiles have full horizontal coverage. + let x_cov = coverage(col, rect_x0, rect_x1); + let combined = combined_tile_alpha(s, &x_cov, &y_cov); + alpha_buf.extend_from_slice(combined.as_slice()); + col += Tile::WIDTH; + } + + strip_buf.push(Strip::new(left_tile_x, strip_y, alpha_start, false)); + } else { + let alpha_start = alpha_buf.len() as u32; + alpha_buf.extend_from_slice(left_x_mask.as_slice()); + strip_buf.push(Strip::new(left_tile_x, strip_y, alpha_start, false)); + + if needs_right_strip { + // `fill_gap = true` tells the renderer to fill solid 0xFF + // between the previous strip's end and this strip's start. + let alpha_start = alpha_buf.len() as u32; + alpha_buf.extend_from_slice(right_x_mask.as_slice()); + strip_buf.push(Strip::new(right_tile_x, strip_y, alpha_start, true)); + } + } } - } - } - // Sentinel strip: marks the end of the strip list for this shape. - let last_strip_y = (tile_end_y - 1) * Tile::HEIGHT; - strip_buf.push(Strip::new( - u16::MAX, - last_strip_y, - alpha_buf.len() as u32, - false, - )); + // Sentinel strip: marks the end of the strip list for this shape. + let last_strip_y = (tile_end_y - 1) * Tile::HEIGHT; + strip_buf.push(Strip::new( + u16::MAX, + last_strip_y, + alpha_buf.len() as u32, + false, + )); + }, + ); } /// Compute fractional pixel coverage for `N` consecutive pixels starting at `start`. 
diff --git a/sparse_strips/vello_common/src/tile.rs b/sparse_strips/vello_common/src/tile.rs index b64e2d7024..ef3dcb078f 100644 --- a/sparse_strips/vello_common/src/tile.rs +++ b/sparse_strips/vello_common/src/tile.rs @@ -503,6 +503,7 @@ impl Tiles { )) } + #[inline(always)] fn make_tiles_analytic_aa_impl( &mut self, s: S, @@ -510,408 +511,423 @@ impl Tiles { width: u16, height: u16, ) -> bool { - self.reset(); + s.vectorize( + #[inline(always)] + || { + self.reset(); - if width == 0 || height == 0 { - return self.windings.culled; - } + if width == 0 || height == 0 { + return self.windings.culled; + } - debug_assert!( - lines.len() <= MAX_LINES_PER_PATH as usize, - "Max. number of lines per path exceeded. Max is {}, got {}.", - MAX_LINES_PER_PATH, - lines.len() - ); + debug_assert!( + lines.len() <= MAX_LINES_PER_PATH as usize, + "Max. number of lines per path exceeded. Max is {}, got {}.", + MAX_LINES_PER_PATH, + lines.len() + ); - let tile_columns = width.div_ceil(Tile::WIDTH); - let tile_rows = height.div_ceil(Tile::HEIGHT); + let tile_columns = width.div_ceil(Tile::WIDTH); + let tile_rows = height.div_ceil(Tile::HEIGHT); - let px_top = f32x4::from_slice(s, &[0.0, 1.0, 2.0, 3.0]); - let px_bottom = px_top + f32x4::splat(s, 1.0); - let simd_zero = f32x4::splat(s, 0.0); - let tile_height_f32 = Tile::HEIGHT as f32; + let px_top = f32x4::from_slice(s, &[0.0, 1.0, 2.0, 3.0]); + let px_bottom = px_top + f32x4::splat(s, 1.0); + let simd_zero = f32x4::splat(s, 0.0); + let tile_height_f32 = Tile::HEIGHT as f32; - for (line_idx, line) in lines.iter().take(MAX_LINES_PER_PATH as usize).enumerate() { - let line_idx = line_idx as u32; + for (line_idx, line) in lines.iter().take(MAX_LINES_PER_PATH as usize).enumerate() { + let line_idx = line_idx as u32; - let p0_x = line.p0.x / f32::from(Tile::WIDTH); - let p0_y = line.p0.y / f32::from(Tile::HEIGHT); - let p1_x = line.p1.x / f32::from(Tile::WIDTH); - let p1_y = line.p1.y / f32::from(Tile::HEIGHT); + let p0_x = line.p0.x / 
f32::from(Tile::WIDTH); + let p0_y = line.p0.y / f32::from(Tile::HEIGHT); + let p1_x = line.p1.x / f32::from(Tile::WIDTH); + let p1_y = line.p1.y / f32::from(Tile::HEIGHT); - let (line_left_x, line_right_x) = if p0_x < p1_x { - (p0_x, p1_x) - } else { - (p1_x, p0_x) - }; + let (line_left_x, line_right_x) = if p0_x < p1_x { + (p0_x, p1_x) + } else { + (p1_x, p0_x) + }; - // Lines whose left-most endpoint exceed the right edge of the viewport are culled - if line_left_x > tile_columns as f32 { - continue; - } + // Lines whose left-most endpoint exceed the right edge of the viewport are culled + if line_left_x > tile_columns as f32 { + continue; + } - let (line_top_y, line_top_x, line_bottom_y, line_bottom_x) = if p0_y < p1_y { - (p0_y, p0_x, p1_y, p1_x) - } else { - (p1_y, p1_x, p0_y, p0_x) - }; + let (line_top_y, line_top_x, line_bottom_y, line_bottom_x) = if p0_y < p1_y { + (p0_y, p0_x, p1_y, p1_x) + } else { + (p1_y, p1_x, p0_y, p0_x) + }; - // The `as u16` casts here intentionally clamp negative coordinates to 0. - let y_top_tiles = (line_top_y as u16).min(tile_rows); - let line_bottom_y_ceil = line_bottom_y.ceil(); - let y_bottom_tiles = (line_bottom_y_ceil as u16).min(tile_rows); + // The `as u16` casts here intentionally clamp negative coordinates to 0. + let y_top_tiles = (line_top_y as u16).min(tile_rows); + let line_bottom_y_ceil = line_bottom_y.ceil(); + let y_bottom_tiles = (line_bottom_y_ceil as u16).min(tile_rows); + + // If y_top_tiles == y_bottom_tiles, then the line is either completely above or below + // the viewport OR it is perfectly horizontal and aligned to the tile grid, contributing + // no winding. In either case, it should be culled. 
+ if y_top_tiles >= y_bottom_tiles { + // Technically, the `>` part of the `>=` is unnecessary due to clamping, but this + // gives stronger signal + continue; + } - // If y_top_tiles == y_bottom_tiles, then the line is either completely above or below - // the viewport OR it is perfectly horizontal and aligned to the tile grid, contributing - // no winding. In either case, it should be culled. - if y_top_tiles >= y_bottom_tiles { - // Technically, the `>` part of the `>=` is unnecessary due to clamping, but this - // gives stronger signal - continue; - } + let dir = if p0_y >= p1_y { 1 } else { -1 }; + let f_dir = dir as f32; + let f_dir_v = f32x4::splat(s, f_dir); - let dir = if p0_y >= p1_y { 1 } else { -1 }; - let f_dir = dir as f32; - let f_dir_v = f32x4::splat(s, f_dir); + macro_rules! calc_fractional_coverage { + ($y_idx:expr, $segment_top_y:expr, $segment_bottom_y:expr) => {{ + let y_idx_f32 = f32::from($y_idx); + let local_y_start = ($segment_top_y - y_idx_f32) * tile_height_f32; + let local_y_end = ($segment_bottom_y - y_idx_f32) * tile_height_f32; - macro_rules! calc_fractional_coverage { - ($y_idx:expr, $segment_top_y:expr, $segment_bottom_y:expr) => {{ - let y_idx_f32 = f32::from($y_idx); - let local_y_start = ($segment_top_y - y_idx_f32) * tile_height_f32; - let local_y_end = ($segment_bottom_y - y_idx_f32) * tile_height_f32; + let start_v = f32x4::splat(s, local_y_start); + let end_v = f32x4::splat(s, local_y_end); - let start_v = f32x4::splat(s, local_y_start); - let end_v = f32x4::splat(s, local_y_end); + (px_bottom.min(end_v) - px_top.max(start_v)).max(simd_zero) + }}; + } - (px_bottom.min(end_v) - px_top.max(start_v)).max(simd_zero) - }}; - } + // Lines fully to the left of the viewport are not visible but still produce winding + // which we record here and forward to the rendering stage. 
+ if line_right_x < 0.0 { + let is_start_culled = line_top_y < 0.0; - // Lines fully to the left of the viewport are not visible but still produce winding - // which we record here and forward to the rendering stage. - if line_right_x < 0.0 { - let is_start_culled = line_top_y < 0.0; - - // This branch is for handling the "start" of the line. In case - // the line reaches above the viewport, we are already in the - // middle so we can skip that part. - if !is_start_culled { - self.windings.mark_row_active(y_top_tiles as usize); - - // Note: In theory, == should be enough, but just as - // additional safety against numerical precision errors we - // use <=. - let at_top_of_tile = line_top_y <= f32::from(y_top_tiles); - if at_top_of_tile { - self.windings.coarse[y_top_tiles as usize] += dir; - } + // This branch is for handling the "start" of the line. In case + // the line reaches above the viewport, we are already in the + // middle so we can skip that part. + if !is_start_culled { + self.windings.mark_row_active(y_top_tiles as usize); + + // Note: In theory, == should be enough, but just as + // additional safety against numerical precision errors we + // use <=. + let at_top_of_tile = line_top_y <= f32::from(y_top_tiles); + if at_top_of_tile { + self.windings.coarse[y_top_tiles as usize] += dir; + } - let fractional_coverage = - calc_fractional_coverage!(y_top_tiles, line_top_y, line_bottom_y); - let target_row = &mut self.windings.partial[y_top_tiles as usize]; - let current = f32x4::from_slice(s, target_row); + let fractional_coverage = + calc_fractional_coverage!(y_top_tiles, line_top_y, line_bottom_y); + let target_row = &mut self.windings.partial[y_top_tiles as usize]; + let current = f32x4::from_slice(s, target_row); - // See comment below on the double counting risk! 
- let double_count = if at_top_of_tile { - f_dir_v - } else { - f32x4::splat(s, 0.0) - }; - let next = fractional_coverage.mul_add(f_dir_v, current - double_count); - next.store_slice(target_row); - } + // See comment below on the double counting risk! + let double_count = if at_top_of_tile { + f_dir_v + } else { + f32x4::splat(s, 0.0) + }; + let next = fractional_coverage.mul_add(f_dir_v, current - double_count); + next.store_slice(target_row); + } - let y_start_middle = if is_start_culled { - y_top_tiles - } else { - y_top_tiles + 1 - }; - let line_bottom_floor = line_bottom_y.floor(); - let y_end_middle = (line_bottom_floor as u16).min(tile_rows); + let y_start_middle = if is_start_culled { + y_top_tiles + } else { + y_top_tiles + 1 + }; + let line_bottom_floor = line_bottom_y.floor(); + let y_end_middle = (line_bottom_floor as u16).min(tile_rows); - for y_idx in y_start_middle..y_end_middle { - self.windings.coarse[y_idx as usize] += dir; - } - self.windings - .mark_row_range_active(y_start_middle as usize, y_end_middle as usize); + for y_idx in y_start_middle..y_end_middle { + self.windings.coarse[y_idx as usize] += dir; + } + self.windings + .mark_row_range_active(y_start_middle as usize, y_end_middle as usize); - if line_bottom_y != line_bottom_floor + if line_bottom_y != line_bottom_floor && y_end_middle < tile_rows // Prevent double-processing, unless the start was off-screen and hasn't been // handled yet. && (is_start_culled || y_end_middle != y_top_tiles) - { - self.windings.mark_row_active(y_end_middle as usize); - // Ends implicitly cross the top. - self.windings.coarse[y_end_middle as usize] += dir; - let fractional_coverage = - calc_fractional_coverage!(y_end_middle, line_top_y, line_bottom_y); - let target_row = &mut self.windings.partial[y_end_middle as usize]; - let current = f32x4::from_slice(s, target_row); - // Subtract the inverse direction to avoid double counting with the coarse winding. 
- let next = fractional_coverage.mul_add(f_dir_v, current - f_dir_v); - next.store_slice(target_row); - } - - self.windings.culled = true; - continue; - } - - // Get tile coordinates for start/end points, use i32 to preserve negative coordinates. - let p0_tile_x = line_top_x.floor() as i32; - let p0_tile_y = line_top_y.floor() as i32; - let p1_tile_x = line_bottom_x.floor() as i32; - let p1_tile_y = line_bottom_y.floor() as i32; - - // Special-case out lines which are fully contained within a tile. - let not_same_tile = p0_tile_y != p1_tile_y || p0_tile_x != p1_tile_x; - if not_same_tile { - // Case vertical lines: By definition, these cannot be horizontally crossing, and - // thus require no additional left-edge culling handling. - if line_left_x == line_right_x { - let x = (line_left_x as u16).min(tile_columns.saturating_sub(1)); - - // Row Start, not culled. - let is_start_culled = line_top_y < 0.0; - if !is_start_culled { - let winding = - ((f32::from(y_top_tiles) >= line_top_y) as u32) << WINDING_SHIFT; - let tile = Tile::new_clamped(x, y_top_tiles, line_idx, winding); - self.tile_buf.push(tile); - } - - // Middle - // If the start was culled, the first tile inside the viewport is a middle. - let y_start = if is_start_culled { - y_top_tiles - } else { - y_top_tiles + 1 - }; + { + self.windings.mark_row_active(y_end_middle as usize); + // Ends implicitly cross the top. + self.windings.coarse[y_end_middle as usize] += dir; + let fractional_coverage = + calc_fractional_coverage!(y_end_middle, line_top_y, line_bottom_y); + let target_row = &mut self.windings.partial[y_end_middle as usize]; + let current = f32x4::from_slice(s, target_row); + // Subtract the inverse direction to avoid double counting with the coarse winding. 
+ let next = fractional_coverage.mul_add(f_dir_v, current - f_dir_v); + next.store_slice(target_row); + } - for y_idx in y_start..y_bottom_tiles { - let tile = Tile::new_clamped(x, y_idx, line_idx, W); - self.tile_buf.push(tile); + self.windings.culled = true; + continue; } - } else { - // General case, any line which crosses more than one tile and is not vertical. - let dx = p1_x - p0_x; - let dy = p1_y - p0_y; - let x_slope = dx / dy; - let dx_dir = (line_bottom_x >= line_top_x) as u32; - let not_dx_dir = dx_dir ^ 1; - let w_start_base = dx_dir << WINDING_SHIFT; - let w_end_base = not_dx_dir << WINDING_SHIFT; - - let push_row_extents = { - #[inline(always)] - |tile_buf: &mut Vec, - y_idx: u16, - row_left_x: f32, - row_right_x: f32, - w_start: u32, - w_end: u32, - w_single: u32| { - let x_start = row_left_x as u16; - let x_end = (row_right_x as u16).min(tile_columns - 1); - - if x_start <= x_end { - let winding = if x_start == x_end { w_single } else { w_start }; - - tile_buf.push(Tile::new(x_start, y_idx, line_idx, winding)); + // Get tile coordinates for start/end points, use i32 to preserve negative coordinates. + let p0_tile_x = line_top_x.floor() as i32; + let p0_tile_y = line_top_y.floor() as i32; + let p1_tile_x = line_bottom_x.floor() as i32; + let p1_tile_y = line_bottom_y.floor() as i32; + + // Special-case out lines which are fully contained within a tile. + let not_same_tile = p0_tile_y != p1_tile_y || p0_tile_x != p1_tile_x; + if not_same_tile { + // Case vertical lines: By definition, these cannot be horizontally crossing, and + // thus require no additional left-edge culling handling. + if line_left_x == line_right_x { + let x = (line_left_x as u16).min(tile_columns.saturating_sub(1)); + + // Row Start, not culled. 
+ let is_start_culled = line_top_y < 0.0; + if !is_start_culled { + let winding = ((f32::from(y_top_tiles) >= line_top_y) as u32) + << WINDING_SHIFT; + let tile = Tile::new_clamped(x, y_top_tiles, line_idx, winding); + self.tile_buf.push(tile); + } - for x_idx in x_start.saturating_add(1)..x_end { - tile_buf.push(Tile::new(x_idx, y_idx, line_idx, 0)); - } + // Middle + // If the start was culled, the first tile inside the viewport is a middle. + let y_start = if is_start_culled { + y_top_tiles + } else { + y_top_tiles + 1 + }; - if x_start < x_end { - tile_buf.push(Tile::new(x_end, y_idx, line_idx, w_end)); - } + for y_idx in y_start..y_bottom_tiles { + let tile = Tile::new_clamped(x, y_idx, line_idx, W); + self.tile_buf.push(tile); } - } - }; + } else { + // General case, any line which crosses more than one tile and is not vertical. + let dx = p1_x - p0_x; + let dy = p1_y - p0_y; + let x_slope = dx / dy; + let dx_dir = (line_bottom_x >= line_top_x) as u32; + let not_dx_dir = dx_dir ^ 1; + + let w_start_base = dx_dir << WINDING_SHIFT; + let w_end_base = not_dx_dir << WINDING_SHIFT; + + let push_row_extents = { + #[inline(always)] + |tile_buf: &mut Vec, + y_idx: u16, + row_left_x: f32, + row_right_x: f32, + w_start: u32, + w_end: u32, + w_single: u32| { + let x_start = row_left_x as u16; + let x_end = (row_right_x as u16).min(tile_columns - 1); + + if x_start <= x_end { + let winding = + if x_start == x_end { w_single } else { w_start }; + + tile_buf.push(Tile::new(x_start, y_idx, line_idx, winding)); + + for x_idx in x_start.saturating_add(1)..x_end { + tile_buf.push(Tile::new(x_idx, y_idx, line_idx, 0)); + } + + if x_start < x_end { + tile_buf.push(Tile::new(x_end, y_idx, line_idx, w_end)); + } + } + } + }; - let mut push_row = { - #[inline(always)] - |y_idx: u16, - row_top_y: f32, - row_bottom_y: f32, - w_start: u32, - w_end: u32, - w_single: u32| { - let row_top_x = p0_x + (row_top_y - p0_y) * x_slope; - let row_bottom_x = p0_x + (row_bottom_y - p0_y) * 
x_slope; - - // TODO: Evaluate whether we need the second max/min. - let row_left_x = f32::min(row_top_x, row_bottom_x).max(line_left_x); - let row_right_x = f32::max(row_top_x, row_bottom_x).min(line_right_x); - - if row_left_x < 0.0 { - self.windings.culled = true; - - if row_right_x < 0.0 { - // Although the line may cross the left edge, the rightmost point in - // this row may still be fully left of the viewport. In this case, - // record the winding and emit no tiles. - self.windings.mark_row_active(y_idx as usize); - - let crosses_top = (w_single & W) != 0; - if crosses_top { - self.windings.coarse[y_idx as usize] += dir; + let mut push_row = { + #[inline(always)] + |y_idx: u16, + row_top_y: f32, + row_bottom_y: f32, + w_start: u32, + w_end: u32, + w_single: u32| { + let row_top_x = p0_x + (row_top_y - p0_y) * x_slope; + let row_bottom_x = p0_x + (row_bottom_y - p0_y) * x_slope; + + // TODO: Evaluate whether we need the second max/min. + let row_left_x = + f32::min(row_top_x, row_bottom_x).max(line_left_x); + let row_right_x = + f32::max(row_top_x, row_bottom_x).min(line_right_x); + + if row_left_x < 0.0 { + self.windings.culled = true; + + if row_right_x < 0.0 { + // Although the line may cross the left edge, the rightmost point in + // this row may still be fully left of the viewport. In this case, + // record the winding and emit no tiles. 
+ self.windings.mark_row_active(y_idx as usize); + + let crosses_top = (w_single & W) != 0; + if crosses_top { + self.windings.coarse[y_idx as usize] += dir; + } + + let fractional_coverage = calc_fractional_coverage!( + y_idx, + row_top_y, + row_bottom_y + ); + let target_row = + &mut self.windings.partial[y_idx as usize]; + let current = f32x4::from_slice(s, target_row); + + let double_count = if crosses_top { + f_dir_v + } else { + f32x4::splat(s, 0.0) + }; + let next = fractional_coverage + .mul_add(f_dir_v, current - double_count); + next.store_slice(target_row); + + return; + } else { + // The line crosses into the viewport in this row. Record only the + // fractional portion of the winding, as the coarse winding will + // naturally get included by the clamped tile logic! + let y_slope = dy / dx; + let y_intersect = row_top_y - (row_top_x * y_slope); + + let (off_screen_top_y, off_screen_bottom_y) = + if row_top_x < 0.0 { + (row_top_y, f32::min(row_bottom_y, y_intersect)) + } else { + (f32::max(row_top_y, y_intersect), row_bottom_y) + }; + + if off_screen_top_y < off_screen_bottom_y { + self.windings.mark_row_active(y_idx as usize); + let fractional_coverage = calc_fractional_coverage!( + y_idx, + off_screen_top_y, + off_screen_bottom_y + ); + let target_row = + &mut self.windings.partial[y_idx as usize]; + let current = f32x4::from_slice(s, target_row); + let next = + fractional_coverage.mul_add(f_dir_v, current); + next.store_slice(target_row); + } + } } - let fractional_coverage = - calc_fractional_coverage!(y_idx, row_top_y, row_bottom_y); - let target_row = &mut self.windings.partial[y_idx as usize]; - let current = f32x4::from_slice(s, target_row); + push_row_extents( + &mut self.tile_buf, + y_idx, + row_left_x, + row_right_x, + w_start, + w_end, + w_single, + ); + } + }; - let double_count = if crosses_top { - f_dir_v + let is_start_culled = line_top_y < 0.0; + // This branch is taken in case the line is completely inside + // the viewport, allowing us 
to save many calculations that + // otherwise would need to be made viewport culling work. + if line_left_x >= 0.0 && line_right_x < tile_columns as f32 { + if !is_start_culled { + let y = f32::from(y_top_tiles); + let row_bottom_y = (y + 1.0).min(line_bottom_y); + let row_bottom_x = if row_bottom_y == line_bottom_y { + line_bottom_x } else { - f32x4::splat(s, 0.0) + p0_x + (row_bottom_y - p0_y) * x_slope }; - let next = fractional_coverage - .mul_add(f_dir_v, current - double_count); - next.store_slice(target_row); + let mask = ((y >= line_top_y) as u32) << WINDING_SHIFT; + push_row_extents( + &mut self.tile_buf, + y_top_tiles, + f32::min(line_top_x, row_bottom_x), + f32::max(line_top_x, row_bottom_x), + w_start_base & mask, + w_end_base & mask, + W & mask, + ); + } - return; + let y_start = if is_start_culled { + y_top_tiles } else { - // The line crosses into the viewport in this row. Record only the - // fractional portion of the winding, as the coarse winding will - // naturally get included by the clamped tile logic! - let y_slope = dy / dx; - let y_intersect = row_top_y - (row_top_x * y_slope); - - let (off_screen_top_y, off_screen_bottom_y) = if row_top_x < 0.0 - { - (row_top_y, f32::min(row_bottom_y, y_intersect)) - } else { - (f32::max(row_top_y, y_intersect), row_bottom_y) - }; + y_top_tiles + 1 + }; - if off_screen_top_y < off_screen_bottom_y { - self.windings.mark_row_active(y_idx as usize); - let fractional_coverage = calc_fractional_coverage!( + if y_start < y_bottom_tiles { + let mut row_top_x = + p0_x + (f32::from(y_start) - p0_y) * x_slope; + for y_idx in y_start..y_bottom_tiles { + let y = f32::from(y_idx); + // Note: We purposefully don't precompute it once + // and just increment by `x_slope` after every iteration + // to avoid errors due to floating point inaccuracies. 
+ let row_bottom_x = if line_bottom_y < y + 1.0 { + line_bottom_x + } else { + p0_x + (y + 1.0 - p0_y) * x_slope + }; + push_row_extents( + &mut self.tile_buf, y_idx, - off_screen_top_y, - off_screen_bottom_y + f32::min(row_top_x, row_bottom_x), + f32::max(row_top_x, row_bottom_x), + w_start_base, + w_end_base, + W, ); - let target_row = &mut self.windings.partial[y_idx as usize]; - let current = f32x4::from_slice(s, target_row); - let next = fractional_coverage.mul_add(f_dir_v, current); - next.store_slice(target_row); + row_top_x = row_bottom_x; } } - } - - push_row_extents( - &mut self.tile_buf, - y_idx, - row_left_x, - row_right_x, - w_start, - w_end, - w_single, - ); - } - }; - - let is_start_culled = line_top_y < 0.0; - // This branch is taken in case the line is completely inside - // the viewport, allowing us to save many calculations that - // otherwise would need to be made viewport culling work. - if line_left_x >= 0.0 && line_right_x < tile_columns as f32 { - if !is_start_culled { - let y = f32::from(y_top_tiles); - let row_bottom_y = (y + 1.0).min(line_bottom_y); - let row_bottom_x = if row_bottom_y == line_bottom_y { - line_bottom_x } else { - p0_x + (row_bottom_y - p0_y) * x_slope - }; - let mask = ((y >= line_top_y) as u32) << WINDING_SHIFT; - push_row_extents( - &mut self.tile_buf, - y_top_tiles, - f32::min(line_top_x, row_bottom_x), - f32::max(line_top_x, row_bottom_x), - w_start_base & mask, - w_end_base & mask, - W & mask, - ); - } - - let y_start = if is_start_culled { - y_top_tiles - } else { - y_top_tiles + 1 - }; + if !is_start_culled { + let y = f32::from(y_top_tiles); + let row_bottom_y = (y + 1.0).min(line_bottom_y); + let mask = ((y >= line_top_y) as u32) << WINDING_SHIFT; + push_row( + y_top_tiles, + line_top_y, + row_bottom_y, + w_start_base & mask, + w_end_base & mask, + W & mask, + ); + } - if y_start < y_bottom_tiles { - let mut row_top_x = p0_x + (f32::from(y_start) - p0_y) * x_slope; - for y_idx in y_start..y_bottom_tiles { - let 
y = f32::from(y_idx); - // Note: We purposefully don't precompute it once - // and just increment by `x_slope` after every iteration - // to avoid errors due to floating point inaccuracies. - let row_bottom_x = if line_bottom_y < y + 1.0 { - line_bottom_x + let y_start = if is_start_culled { + y_top_tiles } else { - p0_x + (y + 1.0 - p0_y) * x_slope + y_top_tiles + 1 }; - push_row_extents( - &mut self.tile_buf, - y_idx, - f32::min(row_top_x, row_bottom_x), - f32::max(row_top_x, row_bottom_x), - w_start_base, - w_end_base, - W, - ); - row_top_x = row_bottom_x; + + for y_idx in y_start..y_bottom_tiles { + let y = f32::from(y_idx); + let row_bottom_y = (y + 1.0).min(line_bottom_y); + push_row(y_idx, y, row_bottom_y, w_start_base, w_end_base, W); + } } } } else { - if !is_start_culled { - let y = f32::from(y_top_tiles); - let row_bottom_y = (y + 1.0).min(line_bottom_y); - let mask = ((y >= line_top_y) as u32) << WINDING_SHIFT; - push_row( - y_top_tiles, - line_top_y, - row_bottom_y, - w_start_base & mask, - w_end_base & mask, - W & mask, - ); - } - - let y_start = if is_start_culled { - y_top_tiles - } else { - y_top_tiles + 1 - }; - - for y_idx in y_start..y_bottom_tiles { - let y = f32::from(y_idx); - let row_bottom_y = (y + 1.0).min(line_bottom_y); - push_row(y_idx, y, row_bottom_y, w_start_base, w_end_base, W); - } + // Case line is fully contained within a single tile: These also cannot cross edges! + let tile = Tile::new_clamped( + (line_left_x as u16).min(tile_columns + 1), + y_top_tiles, + line_idx, + ((f32::from(y_top_tiles) >= line_top_y) as u32) << WINDING_SHIFT, + ); + self.tile_buf.push(tile); } } - } else { - // Case line is fully contained within a single tile: These also cannot cross edges! 
- let tile = Tile::new_clamped( - (line_left_x as u16).min(tile_columns + 1), - y_top_tiles, - line_idx, - ((f32::from(y_top_tiles) >= line_top_y) as u32) << WINDING_SHIFT, - ); - self.tile_buf.push(tile); - } - } - self.windings.culled + self.windings.culled + }, + ) } /// Generates tile commands for MSAA (Multisample Anti-Aliasing) rasterization. diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs index 22fb125f90..16fbb2ff35 100644 --- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs +++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs @@ -358,6 +358,7 @@ impl MultiThreadedDispatcher { } } + #[inline(always)] fn rasterize_with>( &self, simd: S, @@ -367,44 +368,53 @@ impl MultiThreadedDispatcher { encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { - let mut buffer = Regions::new(width, height, buffer); - let fines = ThreadLocal::new(); - let wide = &self.wide; - let alpha_slots = self.alpha_storage.take(); - - self.thread_pool.install(|| { - buffer.update_regions_par(|region| { - let x = region.x; - let y = region.y; - - let mut fine = fines - .get_or(|| RefCell::new(Fine::::new(simd))) - .borrow_mut(); - - let wtile = wide.get(x, y); - fine.set_coords(x, y); - - fine.clear(wtile.bg); - for cmd in &wtile.cmds { - let thread_idx = match cmd { - Cmd::AlphaFill(a) => Some(wide.attrs.fill[a.attrs_idx as usize].thread_idx), - Cmd::ClipStrip(a) => Some(wide.attrs.clip[a.attrs_idx as usize].thread_idx), - _ => None, - }; - - let alphas = thread_idx - .map(|i| alpha_slots[i as usize].as_slice()) - .unwrap_or(&[]); - fine.run_cmd(cmd, alphas, encoded_paints, image_resolver, &wide.attrs); - } + simd.vectorize( + #[inline(always)] + || { + let mut buffer = Regions::new(width, height, buffer); + let fines = ThreadLocal::new(); + let wide = &self.wide; + let alpha_slots = self.alpha_storage.take(); + + self.thread_pool.install(|| { + 
buffer.update_regions_par(|region| { + let x = region.x; + let y = region.y; + + let mut fine = fines + .get_or(|| RefCell::new(Fine::::new(simd))) + .borrow_mut(); + + let wtile = wide.get(x, y); + fine.set_coords(x, y); + + fine.clear(wtile.bg); + for cmd in &wtile.cmds { + let thread_idx = match cmd { + Cmd::AlphaFill(a) => { + Some(wide.attrs.fill[a.attrs_idx as usize].thread_idx) + } + Cmd::ClipStrip(a) => { + Some(wide.attrs.clip[a.attrs_idx as usize].thread_idx) + } + _ => None, + }; + + let alphas = thread_idx + .map(|i| alpha_slots[i as usize].as_slice()) + .unwrap_or(&[]); + fine.run_cmd(cmd, alphas, encoded_paints, image_resolver, &wide.attrs); + } - fine.pack(region); - }); - }); + fine.pack(region); + }); + }); - // Don't forget to put back the alpha buffers, so that they can be re-used in - // the next path rendering iteration! - self.alpha_storage.init(alpha_slots); + // Don't forget to put back the alpha buffers, so that they can be re-used in + // the next path rendering iteration! + self.alpha_storage.init(alpha_slots); + }, + ); } } diff --git a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs index 2a08d4b1b2..e76a8eeba1 100644 --- a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs +++ b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs @@ -126,6 +126,7 @@ impl SingleThreadedDispatcher { /// /// If the scene contains filter effects, uses the filter-aware path which maintains /// intermediate layer buffers. Otherwise, uses the simpler direct rasterization path. + #[inline(always)] fn rasterize_with>( &self, simd: S, @@ -135,30 +136,35 @@ impl SingleThreadedDispatcher { encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { - let mut layer_manager = LayerManager::new(); - - if self.has_filters() { - // Use filter-aware path that maintains layer buffers for filter effects. 
- self.rasterize_with_filters::( - simd, - buffer, - width, - height, - encoded_paints, - image_resolver, - &mut layer_manager, - ); - } else { - // Use simple direct rasterization for scenes without filters. - self.rasterize_simple::( - simd, - buffer, - width, - height, - encoded_paints, - image_resolver, - ); - } + simd.vectorize( + #[inline(always)] + || { + let mut layer_manager = LayerManager::new(); + + if self.has_filters() { + // Use filter-aware path that maintains layer buffers for filter effects. + self.rasterize_with_filters::( + simd, + buffer, + width, + height, + encoded_paints, + image_resolver, + &mut layer_manager, + ); + } else { + // Use simple direct rasterization for scenes without filters. + self.rasterize_simple::( + simd, + buffer, + width, + height, + encoded_paints, + image_resolver, + ); + } + }, + ); } /// Rasterizes a scene with filter effects using dependency-ordered execution. @@ -171,6 +177,7 @@ impl SingleThreadedDispatcher { /// # Render Graph Execution /// - `FilterLayer` nodes: Render to intermediate buffer, apply filter, store result. /// - `RootLayer` node: Final composition to output buffer. + #[inline(always)] fn rasterize_with_filters>( &self, simd: S, @@ -181,30 +188,36 @@ impl SingleThreadedDispatcher { image_resolver: &dyn ImageResolver, layer_manager: &mut LayerManager, ) { - let mut fine = Fine::::new(simd); - - // Process nodes in dependency order (filtered layers before their consumers). - for node_id in self.render_graph.execution_order() { - let node = &self.render_graph.nodes[node_id]; - - match &node.kind { - RenderNodeKind::FilterLayer { - layer_id, - filter, - wtile_bbox, - transform, - } => { - // Allocate intermediate buffer for this filtered layer. 
- let bbox_width = wtile_bbox.width_px(); - let bbox_height = wtile_bbox.height_px(); - let mut pixmap = Pixmap::new(bbox_width, bbox_height); - // TODO: Re-use this allocation by adding a .configure() or similar method - // to avoid allocating the internal Vec on every filtered layer. - let mut regions = - Regions::new(bbox_width, bbox_height, pixmap.data_as_u8_slice_mut()); - - // Render each tile in the layer's bounding box. - regions.update_regions(|region| { + simd.vectorize( + #[inline(always)] + || { + let mut fine = Fine::::new(simd); + + // Process nodes in dependency order (filtered layers before their consumers). + for node_id in self.render_graph.execution_order() { + let node = &self.render_graph.nodes[node_id]; + + match &node.kind { + RenderNodeKind::FilterLayer { + layer_id, + filter, + wtile_bbox, + transform, + } => { + // Allocate intermediate buffer for this filtered layer. + let bbox_width = wtile_bbox.width_px(); + let bbox_height = wtile_bbox.height_px(); + let mut pixmap = Pixmap::new(bbox_width, bbox_height); + // TODO: Re-use this allocation by adding a .configure() or similar method + // to avoid allocating the internal Vec on every filtered layer. + let mut regions = Regions::new( + bbox_width, + bbox_height, + pixmap.data_as_u8_slice_mut(), + ); + + // Render each tile in the layer's bounding box. + regions.update_regions(|region| { // Convert region-local coords to global wtile coords. let x = wtile_bbox.x0() + region.x; let y = wtile_bbox.y0() + region.y; @@ -229,23 +242,23 @@ impl SingleThreadedDispatcher { fine.pack(region); }); - // Apply the filter effect to the completed layer. - fine.filter_layer(&mut pixmap, filter, layer_manager, *transform); + // Apply the filter effect to the completed layer. + fine.filter_layer(&mut pixmap, filter, layer_manager, *transform); - // Save the filtered pixmap to disk for debugging. 
- // #[cfg(all(debug_assertions, feature = "std", feature = "png"))] - // save_filtered_layer_debug(&pixmap, *layer_id); + // Save the filtered pixmap to disk for debugging. + // #[cfg(all(debug_assertions, feature = "std", feature = "png"))] + // save_filtered_layer_debug(&pixmap, *layer_id); - // Store the filtered result for use by dependent layers. - layer_manager.register_layer(*layer_id, *wtile_bbox, pixmap); - } - RenderNodeKind::RootLayer { - layer_id, - wtile_bbox: _, - } => { - // Final composition directly to output buffer. - let mut regions = Regions::new(width, height, buffer); - regions.update_regions(|region| { + // Store the filtered result for use by dependent layers. + layer_manager.register_layer(*layer_id, *wtile_bbox, pixmap); + } + RenderNodeKind::RootLayer { + layer_id, + wtile_bbox: _, + } => { + // Final composition directly to output buffer. + let mut regions = Regions::new(width, height, buffer); + regions.update_regions(|region| { // Use the background color from the wide tile. let bg = self.wide.get(region.x, region.y).bg; self.process_layer_tile( @@ -267,9 +280,11 @@ impl SingleThreadedDispatcher { fine.pack(region); }); + } + } } - } - } + }, + ); } /// Processes all rendering commands for a single layer within a specific tile. @@ -288,6 +303,7 @@ impl SingleThreadedDispatcher { /// * `layer_manager` - Storage for filtered layer buffers. /// * `encoded_paints` - Paint definitions for the scene. /// * `image_resolver` - Resolver for looking up opaque image IDs. + #[inline(always)] fn process_layer_tile>( &self, fine: &mut Fine, @@ -299,89 +315,96 @@ impl SingleThreadedDispatcher { encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { - let wtile = &self.wide.get(x, y); - fine.set_coords(x, y); - fine.clear(clear_color); - - // Process all commands in this layer's render range. 
- // It can happen that the layer has no associated ranges in this wide tile in - // case they have been cleared by setting a new wide tile background, for example - // when filling a full-tile opaque solid color. - let Some(ranges) = wtile.layer_cmd_ranges.get(&layer_id) else { - return; - }; - - let mut cmd_idx = ranges.render_range.start; - while cmd_idx < ranges.render_range.end { - let cmd: &Cmd = &wtile.cmds[cmd_idx]; - - fine.run_cmd( - cmd, - &self.strip_storage.alphas, - encoded_paints, - image_resolver, - &self.wide.attrs, - ); - - // Special handling for filtered layer composition. - // Filtered layers have already been rendered and stored in layer_manager. - // Here we composite them into the current buffer, with special handling for clipping. - if let Cmd::PushBuf(LayerKind::Filtered(child_layer_id), _) = cmd { - // Unlike above, the unwrap is safe here because as long as the filtered layer - // is referenced in the wide tile, it must have associated layer ranges. - let filtered_ranges = wtile.layer_cmd_ranges.get(child_layer_id).unwrap(); - - // Check what comes after the filtered layer push to determine clipping state - match wtile.cmds.get(cmd_idx + 1) { - // Zero-clip region: tile is completely outside the clip path. - // The layer was already rendered for filtering, but we skip compositing - // since this tile is entirely clipped out. - // (PushZeroClip only appears for clipped filter layers) - // See https://github.com/linebender/vello/pull/1541/ for why we - // add the ID check. - Some(Cmd::PushZeroClip(id)) if *id == *child_layer_id => { - // If we have a zero-clip, it means that the whole layer should not be drawn. - // Therefore, we want to skip to the very end so that only `PopBuf` will - // be run. Therefore, we jump to `filtered_ranges.full_range.end - 1`. 
- cmd_idx = filtered_ranges.full_range.end - 1; - continue; - } - - // Partial clip: push the clip buffer, then composite the filtered layer - Some(Cmd::PushBuf(LayerKind::Clip(id), _)) if *id == *child_layer_id => { - fine.run_cmd( - &wtile.cmds[cmd_idx + 1], - &self.strip_storage.alphas, - encoded_paints, - image_resolver, - &self.wide.attrs, - ); - cmd_idx += 1; - - if let Some(mut region) = - layer_manager.layer_tile_region_mut(*child_layer_id, x, y) - { - fine.unpack(&mut region); + fine.simd.vectorize( + #[inline(always)] + || { + let wtile = &self.wide.get(x, y); + fine.set_coords(x, y); + fine.clear(clear_color); + + // Process all commands in this layer's render range. + // It can happen that the layer has no associated ranges in this wide tile in + // case they have been cleared by setting a new wide tile background, for example + // when filling a full-tile opaque solid color. + let Some(ranges) = wtile.layer_cmd_ranges.get(&layer_id) else { + return; + }; + + let mut cmd_idx = ranges.render_range.start; + while cmd_idx < ranges.render_range.end { + let cmd: &Cmd = &wtile.cmds[cmd_idx]; + + fine.run_cmd( + cmd, + &self.strip_storage.alphas, + encoded_paints, + image_resolver, + &self.wide.attrs, + ); + + // Special handling for filtered layer composition. + // Filtered layers have already been rendered and stored in layer_manager. + // Here we composite them into the current buffer, with special handling for clipping. + if let Cmd::PushBuf(LayerKind::Filtered(child_layer_id), _) = cmd { + // Unlike above, the unwrap is safe here because as long as the filtered layer + // is referenced in the wide tile, it must have associated layer ranges. + let filtered_ranges = wtile.layer_cmd_ranges.get(child_layer_id).unwrap(); + + // Check what comes after the filtered layer push to determine clipping state + match wtile.cmds.get(cmd_idx + 1) { + // Zero-clip region: tile is completely outside the clip path. 
+ // The layer was already rendered for filtering, but we skip compositing + // since this tile is entirely clipped out. + // (PushZeroClip only appears for clipped filter layers) + // See https://github.com/linebender/vello/pull/1541/ for why we + // add the ID check. + Some(Cmd::PushZeroClip(id)) if *id == *child_layer_id => { + // If we have a zero-clip, it means that the whole layer should not be drawn. + // Therefore, we want to skip to the very end so that only `PopBuf` will + // be run. Therefore, we jump to `filtered_ranges.full_range.end - 1`. + cmd_idx = filtered_ranges.full_range.end - 1; + continue; + } + + // Partial clip: push the clip buffer, then composite the filtered layer + Some(Cmd::PushBuf(LayerKind::Clip(id), _)) + if *id == *child_layer_id => + { + fine.run_cmd( + &wtile.cmds[cmd_idx + 1], + &self.strip_storage.alphas, + encoded_paints, + image_resolver, + &self.wide.attrs, + ); + cmd_idx += 1; + + if let Some(mut region) = + layer_manager.layer_tile_region_mut(*child_layer_id, x, y) + { + fine.unpack(&mut region); + } + } + + // No clip or fully inside clip: composite the filtered layer directly + _ => { + if let Some(mut region) = + layer_manager.layer_tile_region_mut(*child_layer_id, x, y) + { + fine.unpack(&mut region); + } + } } - } - // No clip or fully inside clip: composite the filtered layer directly - _ => { - if let Some(mut region) = - layer_manager.layer_tile_region_mut(*child_layer_id, x, y) - { - fine.unpack(&mut region); - } + // Skip past the filtered layer's internal commands, as they were already + // rendered when the FilterLayer node was processed earlier. + cmd_idx = filtered_ranges.render_range.end.max(cmd_idx + 1); + } else { + cmd_idx += 1; } } - - // Skip past the filtered layer's internal commands, as they were already - // rendered when the FilterLayer node was processed earlier. 
- cmd_idx = filtered_ranges.render_range.end.max(cmd_idx + 1); - } else { - cmd_idx += 1; - } - } + }, + ); } /// Simple rasterization path for scenes without filter effects. @@ -389,6 +412,7 @@ impl SingleThreadedDispatcher { /// This directly processes each tile's commands without maintaining intermediate /// layer buffers. All rendering happens in a single pass directly to the output buffer. /// This is more efficient than the filter-aware path when no filters are present. + #[inline(always)] fn rasterize_simple>( &self, simd: S, @@ -398,30 +422,35 @@ impl SingleThreadedDispatcher { encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { - let mut regions = Regions::new(width, height, buffer); - let mut fine = Fine::::new(simd); - - regions.update_regions(|region| { - let x = region.x; - let y = region.y; - - let wtile = self.wide.get(x, y); - fine.set_coords(x, y); - - // Clear to background and process all commands in order. - fine.clear(wtile.bg); - for cmd in &wtile.cmds { - fine.run_cmd( - cmd, - &self.strip_storage.alphas, - encoded_paints, - image_resolver, - &self.wide.attrs, - ); - } + simd.vectorize( + #[inline(always)] + || { + let mut regions = Regions::new(width, height, buffer); + let mut fine = Fine::::new(simd); + + regions.update_regions(|region| { + let x = region.x; + let y = region.y; + + let wtile = self.wide.get(x, y); + fine.set_coords(x, y); + + // Clear to background and process all commands in order. + fine.clear(wtile.bg); + for cmd in &wtile.cmds { + fine.run_cmd( + cmd, + &self.strip_storage.alphas, + encoded_paints, + image_resolver, + &self.wide.attrs, + ); + } - fine.pack(region); - }); + fine.pack(region); + }); + }, + ); } /// Returns true if the scene contains any filter effects. @@ -475,6 +504,7 @@ impl SingleThreadedDispatcher { /// /// Composites tiles sequentially, writing directly to the destination buffer /// at the specified offset. 
+ #[inline(always)] fn composite_at_offset_with>( &self, simd: S, @@ -488,38 +518,43 @@ impl SingleThreadedDispatcher { encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { - let mut regions = Regions::new_at_offset( - width, - height, - dst_x, - dst_y, - dst_buffer_width, - dst_buffer_height, - buffer, - ); - let mut fine = Fine::::new(simd); - - regions.update_regions(|region| { - let x = region.x; - let y = region.y; - - let wtile = self.wide.get(x, y); - fine.set_coords(x, y); - - // Unpack existing pixel data from the region instead of clearing, - // so that rendering composites onto the existing pixmap contents. - fine.unpack(region); - for cmd in &wtile.cmds { - fine.run_cmd( - cmd, - &self.strip_storage.alphas, - encoded_paints, - image_resolver, - &self.wide.attrs, + simd.vectorize( + #[inline(always)] + || { + let mut regions = Regions::new_at_offset( + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + buffer, ); - } - fine.pack(region); - }); + let mut fine = Fine::::new(simd); + + regions.update_regions(|region| { + let x = region.x; + let y = region.y; + + let wtile = self.wide.get(x, y); + fine.set_coords(x, y); + + // Unpack existing pixel data from the region instead of clearing, + // so that rendering composites onto the existing pixmap contents. 
+ fine.unpack(region); + for cmd in &wtile.cmds { + fine.run_cmd( + cmd, + &self.strip_storage.alphas, + encoded_paints, + image_resolver, + &self.wide.attrs, + ); + } + fine.pack(region); + }); + }, + ); } } diff --git a/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs b/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs index 60cdbf8518..4450295e78 100644 --- a/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs +++ b/sparse_strips/vello_cpu/src/fine/common/gradient/mod.rs @@ -14,6 +14,7 @@ pub(crate) mod sweep; const GRADIENT_INVALID_POS: u32 = u32::MAX; +#[inline(always)] pub(crate) fn calculate_t_vals>( simd: S, kind: U, diff --git a/sparse_strips/vello_cpu/src/fine/common/image.rs b/sparse_strips/vello_cpu/src/fine/common/image.rs index dad257b071..a9fbb9dd64 100644 --- a/sparse_strips/vello_cpu/src/fine/common/image.rs +++ b/sparse_strips/vello_cpu/src/fine/common/image.rs @@ -457,6 +457,7 @@ pub(crate) fn extend( } /// Calculate the weights for a single fractional value. +#[inline(always)] fn weights(simd: S, fract: f32x4) -> [f32x4; 4] { simd.vectorize( #[inline(always)] diff --git a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs index dc84f22a00..d07293998d 100644 --- a/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs +++ b/sparse_strips/vello_cpu/src/fine/common/rounded_blurred_rect.rs @@ -210,40 +210,52 @@ impl SimdRoundedBlurredRect { trait FloatExt { // See https://raphlinus.github.io/audio/2018/09/05/sigmoid.html for a little // explanation of this approximation to the erf function. - // Doing `inline(always)` seems to reduce performance for some reason. + // Keep an explicit `vectorize` cut point in the implementation; forcing this whole body to + // inline regresses performance. /// Approximate the erf function. 
fn compute_erf7(simd: S, x: Self) -> Self; fn powf(self, x: f32) -> Self; } impl FloatExt for f32x8 { + #[inline(always)] fn compute_erf7(simd: S, x: Self) -> Self { - // Clamp `x`, because for large `x` the terms here become `inf`, causing the result to be 0 or - // `NaN`. This clamping doesn't lose any information, because `erf(±10) ≈ 1` well within `f64` - // machine precision, let alone `f32`. - let x = x.max(Self::splat(simd, -10.0)).min(Self::splat(simd, 10.0)); - let x = x * Self::splat(simd, core::f32::consts::FRAC_2_SQRT_PI); - let xx = x * x; - let p1 = Self::splat(simd, 0.0104).mul_add(xx, Self::splat(simd, 0.03395)); - let p2 = p1.mul_add(xx, Self::splat(simd, 0.24295)); - let p3 = x * xx; - let x = p2.mul_add(p3, x); - let denom = x.mul_add(x, Self::splat(simd, 1.0)).sqrt(); - x / denom + simd.vectorize( + #[inline(always)] + || { + // Clamp `x`, because for large `x` the terms here become `inf`, causing the result to be 0 or + // `NaN`. This clamping doesn't lose any information, because `erf(±10) ≈ 1` well within `f64` + // machine precision, let alone `f32`. 
+ let x = x.max(Self::splat(simd, -10.0)).min(Self::splat(simd, 10.0)); + let x = x * Self::splat(simd, core::f32::consts::FRAC_2_SQRT_PI); + let xx = x * x; + let p1 = Self::splat(simd, 0.0104).mul_add(xx, Self::splat(simd, 0.03395)); + let p2 = p1.mul_add(xx, Self::splat(simd, 0.24295)); + let p3 = x * xx; + let x = p2.mul_add(p3, x); + let denom = x.mul_add(x, Self::splat(simd, 1.0)).sqrt(); + x / denom + }, + ) } - #[inline] + #[inline(always)] fn powf(mut self, x: f32) -> Self { - // TODO: SIMD - self[0] = self[0].powf(x); - self[1] = self[1].powf(x); - self[2] = self[2].powf(x); - self[3] = self[3].powf(x); - self[4] = self[4].powf(x); - self[5] = self[5].powf(x); - self[6] = self[6].powf(x); - self[7] = self[7].powf(x); - - self + self.simd.vectorize( + #[inline(always)] + || { + // TODO: SIMD + self[0] = self[0].powf(x); + self[1] = self[1].powf(x); + self[2] = self[2].powf(x); + self[3] = self[3].powf(x); + self[4] = self[4].powf(x); + self[5] = self[5].powf(x); + self[6] = self[6].powf(x); + self[7] = self[7].powf(x); + + self + }, + ) } } diff --git a/sparse_strips/vello_cpu/src/fine/highp/blend.rs b/sparse_strips/vello_cpu/src/fine/highp/blend.rs index cb2174da11..eb3a252e5e 100644 --- a/sparse_strips/vello_cpu/src/fine/highp/blend.rs +++ b/sparse_strips/vello_cpu/src/fine/highp/blend.rs @@ -25,55 +25,62 @@ impl Channels { // TODO: blending is still extremely slow, investigate whether there is something obvious we are // missing that other renderers do. 
+#[inline(always)] pub(crate) fn mix(src_c: f32x16, bg: f32x16, blend_mode: BlendMode) -> f32x16 { - if matches!(blend_mode.mix, Mix::Normal) { - return src_c; - } - // See https://www.w3.org/TR/compositing-1/#blending - let simd = src_c.simd; - - let split = |input: f32x16| { - let mut storage = [0.0; 16]; - simd.store_interleaved_128_f32x16(input, &mut storage); - let input_v = f32x16::from_slice(simd, &storage); - - let p1 = simd.split_f32x16(input_v); - let (r, g) = simd.split_f32x8(p1.0); - let (b, a) = simd.split_f32x8(p1.1); - - (Channels { r, g, b }, a) - }; - - let (bg_channels, bg_a) = split(bg); - let (src_channels, src_a) = split(src_c); - - let unpremultiplied_bg = bg_channels.unpremultiply(bg_a); - let unpremultiplied_src = src_channels.unpremultiply(src_a); - - let mut res_bg = unpremultiplied_bg; - let mix_src = blend_mode.mix(unpremultiplied_src, unpremultiplied_bg); - - let apply_alpha = |unpremultiplied_src_channel: f32x4, - mix_src_channel: f32x4, - dest_channel: &mut f32x4| { - let p1 = (1.0 - bg_a) * unpremultiplied_src_channel; - let p2 = bg_a * mix_src_channel; - - *dest_channel = (p1 + p2).premultiply(src_a); - }; - - apply_alpha(unpremultiplied_src.r, mix_src.r, &mut res_bg.r); - apply_alpha(unpremultiplied_src.g, mix_src.g, &mut res_bg.g); - apply_alpha(unpremultiplied_src.b, mix_src.b, &mut res_bg.b); - - let combined = simd.combine_f32x8( - simd.combine_f32x4(res_bg.r, res_bg.g), - simd.combine_f32x4(res_bg.b, src_a), - ); - - let mut storage = [0.0; 16]; - simd.store_interleaved_128_f32x16(combined, &mut storage); - f32x16::from_slice(simd, &storage) + src_c.simd.vectorize( + #[inline(always)] + || { + if matches!(blend_mode.mix, Mix::Normal) { + src_c + } else { + // See https://www.w3.org/TR/compositing-1/#blending + let simd = src_c.simd; + + let split = |input: f32x16| { + let mut storage = [0.0; 16]; + simd.store_interleaved_128_f32x16(input, &mut storage); + let input_v = f32x16::from_slice(simd, &storage); + + let p1 = 
simd.split_f32x16(input_v); + let (r, g) = simd.split_f32x8(p1.0); + let (b, a) = simd.split_f32x8(p1.1); + + (Channels { r, g, b }, a) + }; + + let (bg_channels, bg_a) = split(bg); + let (src_channels, src_a) = split(src_c); + + let unpremultiplied_bg = bg_channels.unpremultiply(bg_a); + let unpremultiplied_src = src_channels.unpremultiply(src_a); + + let mut res_bg = unpremultiplied_bg; + let mix_src = blend_mode.mix(unpremultiplied_src, unpremultiplied_bg); + + let apply_alpha = |unpremultiplied_src_channel: f32x4, + mix_src_channel: f32x4, + dest_channel: &mut f32x4| { + let p1 = (1.0 - bg_a) * unpremultiplied_src_channel; + let p2 = bg_a * mix_src_channel; + + *dest_channel = (p1 + p2).premultiply(src_a); + }; + + apply_alpha(unpremultiplied_src.r, mix_src.r, &mut res_bg.r); + apply_alpha(unpremultiplied_src.g, mix_src.g, &mut res_bg.g); + apply_alpha(unpremultiplied_src.b, mix_src.b, &mut res_bg.b); + + let combined = simd.combine_f32x8( + simd.combine_f32x4(res_bg.r, res_bg.g), + simd.combine_f32x4(res_bg.b, src_a), + ); + + let mut storage = [0.0; 16]; + simd.store_interleaved_128_f32x16(combined, &mut storage); + f32x16::from_slice(simd, &storage) + } + }, + ) } trait MixExt { @@ -81,25 +88,29 @@ trait MixExt { } impl MixExt for BlendMode { + #[inline(always)] fn mix(&self, src: Channels, bg: Channels) -> Channels { - match self.mix { - Mix::Normal => src, - Mix::Multiply => Multiply::mix(src, bg), - Mix::Screen => Screen::mix(src, bg), - Mix::Overlay => Overlay::mix(src, bg), - Mix::Darken => Darken::mix(src, bg), - Mix::Lighten => Lighten::mix(src, bg), - Mix::ColorDodge => ColorDodge::mix(src, bg), - Mix::ColorBurn => ColorBurn::mix(src, bg), - Mix::HardLight => HardLight::mix(src, bg), - Mix::SoftLight => SoftLight::mix(src, bg), - Mix::Difference => Difference::mix(src, bg), - Mix::Exclusion => Exclusion::mix(src, bg), - Mix::Luminosity => Luminosity::mix(src, bg), - Mix::Color => Color::mix(src, bg), - Mix::Hue => Hue::mix(src, bg), - Mix::Saturation 
=> Saturation::mix(src, bg), - } + src.r.simd.vectorize( + #[inline(always)] + || match self.mix { + Mix::Normal => src, + Mix::Multiply => Multiply::mix(src, bg), + Mix::Screen => Screen::mix(src, bg), + Mix::Overlay => Overlay::mix(src, bg), + Mix::Darken => Darken::mix(src, bg), + Mix::Lighten => Lighten::mix(src, bg), + Mix::ColorDodge => ColorDodge::mix(src, bg), + Mix::ColorBurn => ColorBurn::mix(src, bg), + Mix::HardLight => HardLight::mix(src, bg), + Mix::SoftLight => SoftLight::mix(src, bg), + Mix::Difference => Difference::mix(src, bg), + Mix::Exclusion => Exclusion::mix(src, bg), + Mix::Luminosity => Luminosity::mix(src, bg), + Mix::Color => Color::mix(src, bg), + Mix::Hue => Hue::mix(src, bg), + Mix::Saturation => Saturation::mix(src, bg), + }, + ) } } @@ -118,14 +129,20 @@ impl Screen { } impl HardLight { + #[inline(always)] fn single(src: f32x4, bg: f32x4) -> f32x4 { - let two = f32x4::splat(src.simd, 2.0); + src.simd.vectorize( + #[inline(always)] + || { + let two = f32x4::splat(src.simd, 2.0); - let mask = src.simd.simd_le_f32x4(src, f32x4::splat(src.simd, 0.5)); - let opt1 = Multiply::single(bg, src * two); - let opt2 = Screen::single(bg, two * src - 1.0); + let mask = src.simd.simd_le_f32x4(src, f32x4::splat(src.simd, 0.5)); + let opt1 = Multiply::single(bg, src * two); + let opt2 = Screen::single(bg, two * src - 1.0); - src.simd.select_f32x4(mask, opt1, opt2) + src.simd.select_f32x4(mask, opt1, opt2) + }, + ) } } @@ -254,57 +271,84 @@ non_separable_mix!(Luminosity, |cs: &mut Channels, cb: &mut Channels| { *cb }); +#[inline(always)] fn lum(r: f32x4, g: f32x4, b: f32x4) -> f32x4 { - 0.3 * r + 0.59 * g + 0.11 * b + r.simd.vectorize( + #[inline(always)] + || 0.3 * r + 0.59 * g + 0.11 * b, + ) } +#[inline(always)] fn sat(r: f32x4, g: f32x4, b: f32x4) -> f32x4 { - r.max(g).max(b) - r.min(g).min(b) + r.simd.vectorize( + #[inline(always)] + || r.max(g).max(b) - r.min(g).min(b), + ) } +#[inline(always)] fn clip_color(r: &mut f32x4, g: &mut f32x4, b: &mut 
f32x4) { - let simd = r.simd; - - let l = lum(*r, *g, *b); - let n = r.min(g.min(*b)); - let x = r.max(g.max(*b)); - - for c in [r, g, b] { - *c = simd.select_f32x4( - simd.simd_lt_f32x4(n, f32x4::splat(simd, 0.0)), - l + (((*c - l) * l) / (l - n)), - *c, - ); - - *c = simd.select_f32x4( - simd.simd_gt_f32x4(x, f32x4::splat(simd, 1.0)), - l + (((*c - l) * (1.0 - l)) / (x - l)), - *c, - ); - } + r.simd.vectorize( + #[inline(always)] + || { + let simd = r.simd; + + let l = lum(*r, *g, *b); + let n = r.min(g.min(*b)); + let x = r.max(g.max(*b)); + + for c in [r, g, b] { + *c = simd.select_f32x4( + simd.simd_lt_f32x4(n, f32x4::splat(simd, 0.0)), + l + (((*c - l) * l) / (l - n)), + *c, + ); + + *c = simd.select_f32x4( + simd.simd_gt_f32x4(x, f32x4::splat(simd, 1.0)), + l + (((*c - l) * (1.0 - l)) / (x - l)), + *c, + ); + } + }, + ); } +#[inline(always)] fn set_lum(r: &mut f32x4, g: &mut f32x4, b: &mut f32x4, l: f32x4) { - let d = l - lum(*r, *g, *b); - *r += d; - *g += d; - *b += d; - - clip_color(r, g, b); + r.simd.vectorize( + #[inline(always)] + || { + let d = l - lum(*r, *g, *b); + *r += d; + *g += d; + *b += d; + + clip_color(r, g, b); + }, + ); } // Adapted from tiny-skia +#[inline(always)] fn set_sat(r: &mut f32x4, g: &mut f32x4, b: &mut f32x4, s: f32x4) { - let simd = r.simd; - let zero = f32x4::splat(simd, 0.0); - let mn = r.min(g.min(*b)); - let mx = r.max(g.max(*b)); - let sat = mx - mn; - - // Map min channel to 0, max channel to s, and scale the middle proportionally. - let scale = |c| simd.select_f32x4(simd.simd_eq_f32x4(sat, zero), zero, (c - mn) * s / sat); - - *r = scale(*r); - *g = scale(*g); - *b = scale(*b); + r.simd.vectorize( + #[inline(always)] + || { + let simd = r.simd; + let zero = f32x4::splat(simd, 0.0); + let mn = r.min(g.min(*b)); + let mx = r.max(g.max(*b)); + let sat = mx - mn; + + // Map min channel to 0, max channel to s, and scale the middle proportionally. 
+ let scale = + |c| simd.select_f32x4(simd.simd_eq_f32x4(sat, zero), zero, (c - mn) * s / sat); + + *r = scale(*r); + *g = scale(*g); + *b = scale(*b); + }, + ); } diff --git a/sparse_strips/vello_cpu/src/fine/highp/compose.rs b/sparse_strips/vello_cpu/src/fine/highp/compose.rs index 32983c9abe..fa9c0f00bf 100644 --- a/sparse_strips/vello_cpu/src/fine/highp/compose.rs +++ b/sparse_strips/vello_cpu/src/fine/highp/compose.rs @@ -16,6 +16,7 @@ pub(crate) trait ComposeExt { } impl ComposeExt for BlendMode { + #[inline(always)] fn compose( &self, simd: S, @@ -23,47 +24,52 @@ impl ComposeExt for BlendMode { bg_c: f32x16, alpha_mask: Option>, ) -> f32x16 { - // There some non-obvious subtleties worth highlighting here. - // We support two kinds of blending (in this case, we focus on compositing specifically): - // - Isolated blending, where layers as a whole are blended together with their backdrop. - // If we are currently performing this kind of blending, `alpha_mask` will always be `None`. - // After all, there is no concrete shape opacity associated with a layer. Instead, we are - // just compositing the RGBA values at _all_ positions of the source layer with the backdrop - // layer. For example, if the backdrop contains a green rectangle and source layer is just - // empty, if we perform blending with `Compose::Clear`, then _everything_ will be cleared, - // because we are compositing the whole source layer with the whole backdrop, and not - // just the parts of the source layer that have actually be drawn on. - // - Non-isolated blending, where a single path is blended with the backdrop. In this case, - // `alpha_mask` _might_ be `Some` and contain the alpha values of the strips we are currently - // compositing. Remember that strips always have a fixed height of 4, because of this, the - // strips might cover areas that aren't actually covered by the path (and just have an alpha - // value of 0, or a value between 0-254 for anti-aliased parts). 
Because of this, for - // non-isolated blending, we need to lerp the result with the backdrop using `alpha_mask`. + simd.vectorize( + #[inline(always)] + || { + // There are some non-obvious subtleties worth highlighting here. + // We support two kinds of blending (in this case, we focus on compositing specifically): + // - Isolated blending, where layers as a whole are blended together with their backdrop. + // If we are currently performing this kind of blending, `alpha_mask` will always be `None`. + // After all, there is no concrete shape opacity associated with a layer. Instead, we are + // just compositing the RGBA values at _all_ positions of the source layer with the backdrop + // layer. For example, if the backdrop contains a green rectangle and source layer is just + // empty, if we perform blending with `Compose::Clear`, then _everything_ will be cleared, + // because we are compositing the whole source layer with the whole backdrop, and not + // just the parts of the source layer that have actually been drawn on. + // - Non-isolated blending, where a single path is blended with the backdrop. In this case, + // `alpha_mask` _might_ be `Some` and contain the alpha values of the strips we are currently + // compositing. Remember that strips always have a fixed height of 4, because of this, the + // strips might cover areas that aren't actually covered by the path (and just have an alpha + // value of 0, or a value between 0-254 for anti-aliased parts). Because of this, for + // non-isolated blending, we need to lerp the result with the backdrop using `alpha_mask`.
- let mut res = match self.compose { - Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c), - Compose::Clear => Clear::compose(simd, src_c, bg_c), - Compose::Copy => Copy::compose(simd, src_c, bg_c), - Compose::DestOver => DestOver::compose(simd, src_c, bg_c), - Compose::Dest => Dest::compose(simd, src_c, bg_c), - Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c), - Compose::DestIn => DestIn::compose(simd, src_c, bg_c), - Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c), - Compose::DestOut => DestOut::compose(simd, src_c, bg_c), - Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c), - Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c), - Compose::Xor => Xor::compose(simd, src_c, bg_c), - Compose::Plus => Plus::compose(simd, src_c, bg_c), - // Have not been able to find a formula for this, so just fallback to Plus. - Compose::PlusLighter => Plus::compose(simd, src_c, bg_c), - }; + let mut res = match self.compose { + Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c), + Compose::Clear => Clear::compose(simd, src_c, bg_c), + Compose::Copy => Copy::compose(simd, src_c, bg_c), + Compose::DestOver => DestOver::compose(simd, src_c, bg_c), + Compose::Dest => Dest::compose(simd, src_c, bg_c), + Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c), + Compose::DestIn => DestIn::compose(simd, src_c, bg_c), + Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c), + Compose::DestOut => DestOut::compose(simd, src_c, bg_c), + Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c), + Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c), + Compose::Xor => Xor::compose(simd, src_c, bg_c), + Compose::Plus => Plus::compose(simd, src_c, bg_c), + // Have not been able to find a formula for this, so just fallback to Plus. 
+ Compose::PlusLighter => Plus::compose(simd, src_c, bg_c), + }; - if let Some(alpha_mask) = alpha_mask { - let alpha_mask_inv = 1.0 - alpha_mask; - res = alpha_mask * res + alpha_mask_inv * bg_c; - } + if let Some(alpha_mask) = alpha_mask { + let alpha_mask_inv = 1.0 - alpha_mask; + res = alpha_mask * res + alpha_mask_inv * bg_c; + } - res + res + }, + ) } } @@ -72,20 +78,26 @@ macro_rules! compose { struct $name; impl $name { + #[inline(always)] fn compose(simd: S, src_c: f32x16, bg_c: f32x16) -> f32x16 { - let al_b = bg_c.splat_4th(); - let al_s = src_c.splat_4th(); + simd.vectorize( + #[inline(always)] + || { + let al_b = bg_c.splat_4th(); + let al_s = src_c.splat_4th(); - let fa = $fa(simd, al_s, al_b); - let fb = $fb(simd, al_s, al_b); + let fa = $fa(simd, al_s, al_b); + let fb = $fb(simd, al_s, al_b); - if $sat { - (src_c * fa + fb * bg_c) - .min(f32x16::splat(simd, 1.0)) - .max(f32x16::splat(simd, 0.0)) - } else { - src_c * fa + fb * bg_c - } + if $sat { + (src_c * fa + fb * bg_c) + .min(f32x16::splat(simd, 1.0)) + .max(f32x16::splat(simd, 0.0)) + } else { + src_c * fa + fb * bg_c + } + }, + ) } } }; diff --git a/sparse_strips/vello_cpu/src/fine/highp/mod.rs b/sparse_strips/vello_cpu/src/fine/highp/mod.rs index 0948001bee..c7294573a7 100644 --- a/sparse_strips/vello_cpu/src/fine/highp/mod.rs +++ b/sparse_strips/vello_cpu/src/fine/highp/mod.rs @@ -379,18 +379,24 @@ mod fill { } /// Applies blend mode compositing to a buffer without per-pixel masks. 
+ #[inline(always)] pub(super) fn blend>>( simd: S, dest: &mut [f32], src: T, blend_mode: BlendMode, ) { - for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) { - let bg_v = f32x16::from_slice(simd, next_dest); - let src_c = blend::mix(next_src, bg_v, blend_mode); - let res = blend_mode.compose(simd, src_c, bg_v, None); - res.store_slice(next_dest); - } + simd.vectorize( + #[inline(always)] + || { + for (next_dest, next_src) in dest.chunks_exact_mut(16).zip(src) { + let bg_v = f32x16::from_slice(simd, next_dest); + let src_c = blend::mix(next_src, bg_v, blend_mode); + let res = blend_mode.compose(simd, src_c, bg_v, None); + res.store_slice(next_dest); + } + }, + ); } /// Performs the core alpha compositing calculation. @@ -449,6 +455,7 @@ mod alpha_fill { /// Composites a buffer of colors with per-pixel alpha masks. /// /// Each pixel's source alpha is modulated by its corresponding mask value. + #[inline(always)] pub(super) fn alpha_composite_arbitrary>>( simd: S, dest: &mut [f32], @@ -471,6 +478,7 @@ mod alpha_fill { } /// Applies blend mode compositing with per-pixel alpha masks. + #[inline(always)] pub(super) fn blend>>( simd: S, dest: &mut [f32], diff --git a/sparse_strips/vello_cpu/src/fine/lowp/blend.rs b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs index 7b4a252acf..7ef974751a 100644 --- a/sparse_strips/vello_cpu/src/fine/lowp/blend.rs +++ b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs @@ -7,65 +7,79 @@ use vello_common::fearless_simd::*; use vello_common::util::{Div255Ext, f32_to_u8, normalized_mul_u8x32}; // TODO: Make sure this vectorizes properly (also the f32 pipeline) by inlining if needed. +#[inline(always)] pub(crate) fn mix(src_c: u8x32, bg_c: u8x32, blend_mode: BlendMode) -> u8x32 { - if let Some(res) = try_u8_mix(blend_mode, src_c, bg_c) { - return res; - } - - // Fallback for blend modes that aren't supported in u8. 
- - let to_f32 = |val: u8x32| { - let (a, b) = src_c.simd.split_u8x32(val); - let mut a = u8_to_f32(a); - let mut b = u8_to_f32(b); - a *= f32x16::splat(src_c.simd, 1.0 / 255.0); - b *= f32x16::splat(src_c.simd, 1.0 / 255.0); - (a, b) - }; - - let to_u8 = |val1: f32x16, val2: f32x16| { - let val1 = - f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5))); - let val2 = - f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5))); - - val1.simd.combine_u8x16(val1, val2) - }; - - let (mut src_1, mut src_2) = to_f32(src_c); - let (bg_1, bg_2) = to_f32(bg_c); - - src_1 = highp::blend::mix(src_1, bg_1, blend_mode); - src_2 = highp::blend::mix(src_2, bg_2, blend_mode); + src_c.simd.vectorize( + #[inline(always)] + || { + if let Some(res) = try_u8_mix(blend_mode, src_c, bg_c) { + return res; + } - to_u8(src_1, src_2) + // Fallback for blend modes that aren't supported in u8. + + let to_f32 = |val: u8x32| { + let (a, b) = src_c.simd.split_u8x32(val); + let mut a = u8_to_f32(a); + let mut b = u8_to_f32(b); + a *= f32x16::splat(src_c.simd, 1.0 / 255.0); + b *= f32x16::splat(src_c.simd, 1.0 / 255.0); + (a, b) + }; + + let to_u8 = |val1: f32x16, val2: f32x16| { + let val1 = f32_to_u8( + f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5)), + ); + let val2 = f32_to_u8( + f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5)), + ); + + val1.simd.combine_u8x16(val1, val2) + }; + + let (mut src_1, mut src_2) = to_f32(src_c); + let (bg_1, bg_2) = to_f32(bg_c); + + src_1 = highp::blend::mix(src_1, bg_1, blend_mode); + src_2 = highp::blend::mix(src_2, bg_2, blend_mode); + + to_u8(src_1, src_2) + }, + ) } +#[inline(always)] fn try_u8_mix(blend_mode: BlendMode, src_c: u8x32, bg_c: u8x32) -> Option> { - // We implement the u8 fast path for blend modes that - // 1) are separable. - // 2) don't have too many divisions, since integer normalization is - // relatively expensive. 
- // In the future, it's possible to do further experimentation to see whether - // some more blend modes are worth doing in integer space. - Some(match blend_mode.mix { - Mix::Normal => src_c, - Mix::Multiply => Multiply::mix(src_c, bg_c), - Mix::Screen => Screen::mix(src_c, bg_c), - Mix::Overlay => Overlay::mix(src_c, bg_c), - Mix::Darken => Darken::mix(src_c, bg_c), - Mix::Lighten => Lighten::mix(src_c, bg_c), - Mix::HardLight => HardLight::mix(src_c, bg_c), - Mix::Difference => Difference::mix(src_c, bg_c), - Mix::Exclusion => Exclusion::mix(src_c, bg_c), - Mix::ColorDodge - | Mix::ColorBurn - | Mix::SoftLight - | Mix::Luminosity - | Mix::Color - | Mix::Hue - | Mix::Saturation => return None, - }) + src_c.simd.vectorize( + #[inline(always)] + || { + // We implement the u8 fast path for blend modes that + // 1) are separable. + // 2) don't have too many divisions, since integer normalization is + // relatively expensive. + // In the future, it's possible to do further experimentation to see whether + // some more blend modes are worth doing in integer space. + match blend_mode.mix { + Mix::Normal => Some(src_c), + Mix::Multiply => Some(Multiply::mix(src_c, bg_c)), + Mix::Screen => Some(Screen::mix(src_c, bg_c)), + Mix::Overlay => Some(Overlay::mix(src_c, bg_c)), + Mix::Darken => Some(Darken::mix(src_c, bg_c)), + Mix::Lighten => Some(Lighten::mix(src_c, bg_c)), + Mix::HardLight => Some(HardLight::mix(src_c, bg_c)), + Mix::Difference => Some(Difference::mix(src_c, bg_c)), + Mix::Exclusion => Some(Exclusion::mix(src_c, bg_c)), + Mix::ColorDodge + | Mix::ColorBurn + | Mix::SoftLight + | Mix::Luminosity + | Mix::Color + | Mix::Hue + | Mix::Saturation => None, + } + }, + ) } macro_rules! 
u8_mix { diff --git a/sparse_strips/vello_cpu/src/fine/lowp/compose.rs b/sparse_strips/vello_cpu/src/fine/lowp/compose.rs index c03d43bd4a..e5e360f29c 100644 --- a/sparse_strips/vello_cpu/src/fine/lowp/compose.rs +++ b/sparse_strips/vello_cpu/src/fine/lowp/compose.rs @@ -18,6 +18,7 @@ pub(crate) trait ComposeExt { } impl ComposeExt for BlendMode { + #[inline(always)] fn compose( &self, simd: S, @@ -25,32 +26,37 @@ impl ComposeExt for BlendMode { bg_c: u8x32, alpha_mask: Option>, ) -> u8x32 { - let mut res = match self.compose { - Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c), - Compose::Clear => Clear::compose(simd, src_c, bg_c), - Compose::Copy => Copy::compose(simd, src_c, bg_c), - Compose::DestOver => DestOver::compose(simd, src_c, bg_c), - Compose::Dest => Dest::compose(simd, src_c, bg_c), - Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c), - Compose::DestIn => DestIn::compose(simd, src_c, bg_c), - Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c), - Compose::DestOut => DestOut::compose(simd, src_c, bg_c), - Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c), - Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c), - Compose::Xor => Xor::compose(simd, src_c, bg_c), - Compose::Plus => Plus::compose(simd, src_c, bg_c), - // Have not been able to find a formula for this, so just fallback to Plus. 
- Compose::PlusLighter => Plus::compose(simd, src_c, bg_c), - }; + simd.vectorize( + #[inline(always)] + || { + let mut res = match self.compose { + Compose::SrcOver => SrcOver::compose(simd, src_c, bg_c), + Compose::Clear => Clear::compose(simd, src_c, bg_c), + Compose::Copy => Copy::compose(simd, src_c, bg_c), + Compose::DestOver => DestOver::compose(simd, src_c, bg_c), + Compose::Dest => Dest::compose(simd, src_c, bg_c), + Compose::SrcIn => SrcIn::compose(simd, src_c, bg_c), + Compose::DestIn => DestIn::compose(simd, src_c, bg_c), + Compose::SrcOut => SrcOut::compose(simd, src_c, bg_c), + Compose::DestOut => DestOut::compose(simd, src_c, bg_c), + Compose::SrcAtop => SrcAtop::compose(simd, src_c, bg_c), + Compose::DestAtop => DestAtop::compose(simd, src_c, bg_c), + Compose::Xor => Xor::compose(simd, src_c, bg_c), + Compose::Plus => Plus::compose(simd, src_c, bg_c), + // Have not been able to find a formula for this, so just fallback to Plus. + Compose::PlusLighter => Plus::compose(simd, src_c, bg_c), + }; - if let Some(alpha_mask) = alpha_mask { - let alpha_mask_inv = 255 - alpha_mask; - let p1 = simd.widen_u8x32(alpha_mask) * simd.widen_u8x32(res); - let p2 = simd.widen_u8x32(alpha_mask_inv) * simd.widen_u8x32(bg_c); - res = simd.narrow_u16x32((p1 + p2).div_255()); - } + if let Some(alpha_mask) = alpha_mask { + let alpha_mask_inv = 255 - alpha_mask; + let p1 = simd.widen_u8x32(alpha_mask) * simd.widen_u8x32(res); + let p2 = simd.widen_u8x32(alpha_mask_inv) * simd.widen_u8x32(bg_c); + res = simd.narrow_u16x32((p1 + p2).div_255()); + } - res + res + }, + ) } } @@ -59,23 +65,29 @@ macro_rules! 
compose { struct $name; impl $name { + #[inline(always)] fn compose(simd: S, src_c: u8x32, bg_c: u8x32) -> u8x32 { - let al_b = bg_c.splat_4th(); - let al_s = src_c.splat_4th(); + simd.vectorize( + #[inline(always)] + || { + let al_b = bg_c.splat_4th(); + let al_s = src_c.splat_4th(); - let fa = $fa(simd, al_s, al_b); - let fb = $fb(simd, al_s, al_b); + let fa = $fa(simd, al_s, al_b); + let fb = $fb(simd, al_s, al_b); - if $sat { - simd.narrow_u16x32( - (simd.widen_u8x32(src_c.normalized_mul(fa)) - + simd.widen_u8x32(fb.normalized_mul(bg_c))) - .min(u16x32::splat(simd, 255)) - .max(u16x32::splat(simd, 0)), - ) - } else { - src_c.normalized_mul(fa) + fb.normalized_mul(bg_c) - } + if $sat { + simd.narrow_u16x32( + (simd.widen_u8x32(src_c.normalized_mul(fa)) + + simd.widen_u8x32(fb.normalized_mul(bg_c))) + .min(u16x32::splat(simd, 255)) + .max(u16x32::splat(simd, 0)), + ) + } else { + src_c.normalized_mul(fa) + fb.normalized_mul(bg_c) + } + }, + ) } } }; diff --git a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs index 3e9e88b79f..029e4e7e24 100644 --- a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs +++ b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs @@ -358,6 +358,7 @@ mod fill { use vello_common::util::normalized_mul_u8x32; /// Applies blend mode compositing to a buffer without per-pixel masks. + #[inline(always)] pub(super) fn blend>>( simd: S, dest: &mut [u8], @@ -385,6 +386,7 @@ mod fill { /// Composites a solid color onto a buffer using alpha blending. /// /// Uses the "over" operator: `result = src + bg * (1 - src_alpha)` + #[inline(always)] pub(super) fn alpha_composite_solid(s: S, dest: &mut [u8], src: [u8; 4]) { s.vectorize( #[inline(always)] @@ -409,6 +411,7 @@ mod fill { /// Composites a buffer of colors onto another buffer using alpha blending. /// /// Each source pixel is composited individually based on its alpha channel. 
+ #[inline(always)] pub(super) fn alpha_composite>>( simd: S, dest: &mut [u8], @@ -456,6 +459,7 @@ mod alpha_fill { use vello_common::util::{Div255Ext, normalized_mul_u8x32}; /// Applies blend mode compositing with per-pixel alpha masks. + #[inline(always)] pub(super) fn blend>>( simd: S, dest: &mut [u8],