From 9a1d24feb199aadcc0c4acdd1fb88fcaca65eb67 Mon Sep 17 00:00:00 2001 From: Nico Burns Date: Wed, 22 Apr 2026 12:46:10 +0100 Subject: [PATCH] Vello CPU: Add public API to render with stride Signed-off-by: Nico Burns --- sparse_strips/vello_cpu/src/dispatch/mod.rs | 4 + .../vello_cpu/src/dispatch/multi_threaded.rs | 78 +++++++++++-- .../vello_cpu/src/dispatch/single_threaded.rs | 110 ++++++++++++++++-- sparse_strips/vello_cpu/src/render.rs | 37 +++++- 4 files changed, 211 insertions(+), 18 deletions(-) diff --git a/sparse_strips/vello_cpu/src/dispatch/mod.rs b/sparse_strips/vello_cpu/src/dispatch/mod.rs index 5032342290..72ee32b095 100644 --- a/sparse_strips/vello_cpu/src/dispatch/mod.rs +++ b/sparse_strips/vello_cpu/src/dispatch/mod.rs @@ -87,6 +87,10 @@ pub(crate) trait Dispatcher: Debug + Send + Sync { render_mode: RenderMode, width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ); diff --git a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs index e3708ba2ae..5acd504e48 100644 --- a/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs +++ b/sparse_strips/vello_cpu/src/dispatch/multi_threaded.rs @@ -170,11 +170,15 @@ impl MultiThreadedDispatcher { buffer: &mut [u8], width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { use crate::fine::F32Kernel; - dispatch!(self.level, simd => self.rasterize_with::<_, F32Kernel>(simd, buffer, width, height, encoded_paints, image_resolver)); + dispatch!(self.level, simd => self.rasterize_with::<_, F32Kernel>(simd, buffer, width, height, dst_x, dst_y, dst_buffer_width, dst_buffer_height, encoded_paints, image_resolver)); } #[cfg(feature = "u8_pipeline")] @@ -183,11 +187,15 @@ impl MultiThreadedDispatcher { buffer: &mut [u8], 
width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { use crate::fine::U8Kernel; - dispatch!(self.level, simd => self.rasterize_with::<_, U8Kernel>(simd, buffer, width, height, encoded_paints, image_resolver)); + dispatch!(self.level, simd => self.rasterize_with::<_, U8Kernel>(simd, buffer, width, height, dst_x, dst_y, dst_buffer_width, dst_buffer_height, encoded_paints, image_resolver)); } fn init(&mut self) { @@ -382,10 +390,22 @@ impl MultiThreadedDispatcher { buffer: &mut [u8], width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { - let mut buffer = Regions::new(width, height, buffer); + let mut buffer = Regions::new_at_offset( + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + buffer, + ); let fines = ThreadLocal::new(); let wide = &self.wide; let alpha_slots = self.alpha_storage.take(); @@ -612,6 +632,10 @@ impl Dispatcher for MultiThreadedDispatcher { render_mode: RenderMode, width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { @@ -621,23 +645,63 @@ impl Dispatcher for MultiThreadedDispatcher { #[cfg(all(feature = "u8_pipeline", not(feature = "f32_pipeline")))] { let _ = render_mode; - self.rasterize_u8(buffer, width, height, encoded_paints, image_resolver); + self.rasterize_u8( + buffer, + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + encoded_paints, + image_resolver, + ); } // Only f32 pipeline enabled #[cfg(all(feature = "f32_pipeline", not(feature = "u8_pipeline")))] { let _ = render_mode; - self.rasterize_f32(buffer, width, height, encoded_paints, image_resolver); + self.rasterize_f32( + buffer, + width, + height, + dst_x, + 
dst_y, + dst_buffer_width, + dst_buffer_height, + encoded_paints, + image_resolver, + ); } // Both pipelines enabled #[cfg(all(feature = "f32_pipeline", feature = "u8_pipeline"))] match render_mode { RenderMode::OptimizeSpeed => { - self.rasterize_u8(buffer, width, height, encoded_paints, image_resolver); + self.rasterize_u8( + buffer, + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + encoded_paints, + image_resolver, + ); } RenderMode::OptimizeQuality => { - self.rasterize_f32(buffer, width, height, encoded_paints, image_resolver); + self.rasterize_f32( + buffer, + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + encoded_paints, + image_resolver, + ); } } } diff --git a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs index 19a73e656e..14c74e0b25 100644 --- a/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs +++ b/sparse_strips/vello_cpu/src/dispatch/single_threaded.rs @@ -93,12 +93,19 @@ impl SingleThreadedDispatcher { buffer: &mut [u8], width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { use crate::fine::F32Kernel; use vello_common::fearless_simd::dispatch; - dispatch!(self.level, simd => self.rasterize_with::<_, F32Kernel>(simd, buffer, width, height, encoded_paints, image_resolver)); + dispatch!(self.level, simd => self.rasterize_with::<_, F32Kernel>(simd, buffer, width, height, dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height,encoded_paints, image_resolver)); } /// Rasterizes the scene using u8 precision (fast). 
@@ -111,12 +118,19 @@ impl SingleThreadedDispatcher { buffer: &mut [u8], width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { use crate::fine::U8Kernel; use vello_common::fearless_simd::dispatch; - dispatch!(self.level, simd => self.rasterize_with::<_, U8Kernel>(simd, buffer, width, height, encoded_paints, image_resolver)); + dispatch!(self.level, simd => self.rasterize_with::<_, U8Kernel>(simd, buffer, width, height,dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, encoded_paints, image_resolver)); } /// Core rasterization dispatcher that chooses between simple and filter-aware paths. @@ -133,6 +147,10 @@ impl SingleThreadedDispatcher { buffer: &mut [u8], width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { @@ -145,6 +163,10 @@ impl SingleThreadedDispatcher { buffer, width, height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, encoded_paints, image_resolver, &mut layer_manager, @@ -156,6 +178,10 @@ impl SingleThreadedDispatcher { buffer, width, height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, encoded_paints, image_resolver, ); @@ -178,6 +204,10 @@ impl SingleThreadedDispatcher { buffer: &mut [u8], width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, layer_manager: &mut LayerManager, @@ -245,7 +275,15 @@ impl SingleThreadedDispatcher { wtile_bbox: _, } => { // Final composition directly to output buffer. 
- let mut regions = Regions::new(width, height, buffer); + let mut regions = Regions::new_at_offset( + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + buffer, + ); regions.update_regions(|region| { // Use the background color from the wide tile. let bg = self.wide.get(region.x, region.y).bg; @@ -396,10 +434,22 @@ impl SingleThreadedDispatcher { buffer: &mut [u8], width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { - let mut regions = Regions::new(width, height, buffer); + let mut regions = Regions::new_at_offset( + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + buffer, + ); let mut fine = Fine::::new(simd); regions.update_regions(|region| { @@ -704,6 +754,10 @@ impl Dispatcher for SingleThreadedDispatcher { render_mode: RenderMode, width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, encoded_paints: &[EncodedPaint], image_resolver: &dyn ImageResolver, ) { @@ -711,14 +765,34 @@ impl Dispatcher for SingleThreadedDispatcher { #[cfg(all(feature = "u8_pipeline", not(feature = "f32_pipeline")))] { let _ = render_mode; - self.rasterize_u8(buffer, width, height, encoded_paints, image_resolver); + self.rasterize_u8( + buffer, + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + encoded_paints, + image_resolver, + ); } // If only the f32 pipeline is enabled, then use it #[cfg(all(feature = "f32_pipeline", not(feature = "u8_pipeline")))] { let _ = render_mode; - self.rasterize_f32(buffer, width, height, encoded_paints, image_resolver); + self.rasterize_f32( + buffer, + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + encoded_paints, + image_resolver, + ); } // If both pipelines are enabled, select precision based on render mode parameter. 
@@ -726,11 +800,31 @@ impl Dispatcher for SingleThreadedDispatcher { match render_mode { RenderMode::OptimizeSpeed => { // Use u8 precision for faster rendering. - self.rasterize_u8(buffer, width, height, encoded_paints, image_resolver); + self.rasterize_u8( + buffer, + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + encoded_paints, + image_resolver, + ); } RenderMode::OptimizeQuality => { // Use f32 precision for higher quality. - self.rasterize_f32(buffer, width, height, encoded_paints, image_resolver); + self.rasterize_f32( + buffer, + width, + height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, + encoded_paints, + image_resolver, + ); } } diff --git a/sparse_strips/vello_cpu/src/render.rs b/sparse_strips/vello_cpu/src/render.rs index 488e46837f..06752c05cd 100644 --- a/sparse_strips/vello_cpu/src/render.rs +++ b/sparse_strips/vello_cpu/src/render.rs @@ -627,12 +627,16 @@ impl RenderContext { /// Render the current context into a buffer. - /// The buffer is expected to be in premultiplied RGBA8 format with length `width * height * 4` - pub fn render_to_buffer( + /// The buffer is expected to be in premultiplied RGBA8 format with length `dst_buffer_width * dst_buffer_height * 4` + pub fn render_to_buffer_with_offset( &self, resources: &mut Resources, buffer: &mut [u8], width: u16, height: u16, + dst_x: u16, + dst_y: u16, + dst_buffer_width: u16, + dst_buffer_height: u16, render_mode: RenderMode, ) { // TODO: Maybe we should move those checks into the dispatcher.
@@ -640,8 +644,8 @@ impl RenderContext { assert!(!wide.has_layers(), "some layers haven't been popped yet"); assert_eq!( buffer.len(), - (width as usize) * (height as usize) * 4, - "provided width ({}) and height ({}) do not match buffer size ({})", - width, - height, + (dst_buffer_width as usize) * (dst_buffer_height as usize) * 4, + "provided dst_buffer_width ({}) and dst_buffer_height ({}) do not match buffer size ({})", + dst_buffer_width, + dst_buffer_height, buffer.len(), @@ -654,6 +658,10 @@ impl RenderContext { render_mode, width, height, + dst_x, + dst_y, + dst_buffer_width, + dst_buffer_height, &self.encoded_paints, &resources.image_registry, ); @@ -665,6 +673,31 @@ impl RenderContext { resources.after_render(); } + /// Render the current context into a buffer. + /// The buffer is expected to be in premultiplied RGBA8 format with length `width * height * 4` + /// + /// This forwards to `render_to_buffer_with_offset` with `dst_x = 0`, `dst_y = 0`, `dst_buffer_width = width` and `dst_buffer_height = height`. + pub fn render_to_buffer( + &self, + resources: &mut Resources, + buffer: &mut [u8], + width: u16, + height: u16, + render_mode: RenderMode, + ) { + self.render_to_buffer_with_offset( + resources, + buffer, + width, + height, + 0, + 0, + width, + height, + render_mode, + ); + } + /// Render the current context into a pixmap. pub fn render_to_pixmap(&self, resources: &mut Resources, pixmap: &mut Pixmap) { let width = pixmap.width();