-
Notifications
You must be signed in to change notification settings - Fork 260
vello_cpu: Add u8 fast path for some blend modes #1653
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,224 @@ | ||
| // Copyright 2025 the Vello Authors | ||
| // SPDX-License-Identifier: Apache-2.0 OR MIT | ||
|
|
||
| use crate::fine::{Splat4thExt, highp, u8_to_f32}; | ||
| use crate::peniko::{BlendMode, Mix}; | ||
| use vello_common::fearless_simd::*; | ||
| use vello_common::util::{Div255Ext, f32_to_u8, normalized_mul_u8x32}; | ||
|
|
||
| // TODO: Make sure this vectorizes properly (also the f32 pipeline) by inlining if needed. | ||
| pub(crate) fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> { | ||
| if let Some(res) = try_u8_mix(blend_mode, src_c, bg_c) { | ||
| return res; | ||
| } | ||
|
|
||
| // Fallback for blend modes that aren't supported in u8. | ||
|
|
||
| let to_f32 = |val: u8x32<S>| { | ||
| let (a, b) = src_c.simd.split_u8x32(val); | ||
| let mut a = u8_to_f32(a); | ||
| let mut b = u8_to_f32(b); | ||
| a *= f32x16::splat(src_c.simd, 1.0 / 255.0); | ||
| b *= f32x16::splat(src_c.simd, 1.0 / 255.0); | ||
| (a, b) | ||
| }; | ||
|
|
||
| let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| { | ||
| let val1 = | ||
| f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5))); | ||
| let val2 = | ||
| f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5))); | ||
|
|
||
| val1.simd.combine_u8x16(val1, val2) | ||
| }; | ||
|
|
||
| let (mut src_1, mut src_2) = to_f32(src_c); | ||
| let (bg_1, bg_2) = to_f32(bg_c); | ||
|
|
||
| src_1 = highp::blend::mix(src_1, bg_1, blend_mode); | ||
| src_2 = highp::blend::mix(src_2, bg_2, blend_mode); | ||
|
|
||
| to_u8(src_1, src_2) | ||
| } | ||
|
|
||
| fn try_u8_mix<S: Simd>(blend_mode: BlendMode, src_c: u8x32<S>, bg_c: u8x32<S>) -> Option<u8x32<S>> { | ||
| // We implement the u8 fast path for blend modes that | ||
| // 1) are separable. | ||
| // 2) don't have too many divisions, since integer normalization is | ||
| // relatively expensive. | ||
| // In the future, it's possible to do further experimentation to see whether | ||
| // some more blend modes are worth doing in integer space. | ||
| Some(match blend_mode.mix { | ||
| Mix::Normal => src_c, | ||
| Mix::Multiply => Multiply::mix(src_c, bg_c), | ||
| Mix::Screen => Screen::mix(src_c, bg_c), | ||
| Mix::Overlay => Overlay::mix(src_c, bg_c), | ||
| Mix::Darken => Darken::mix(src_c, bg_c), | ||
| Mix::Lighten => Lighten::mix(src_c, bg_c), | ||
| Mix::HardLight => HardLight::mix(src_c, bg_c), | ||
| Mix::Difference => Difference::mix(src_c, bg_c), | ||
| Mix::Exclusion => Exclusion::mix(src_c, bg_c), | ||
| Mix::ColorDodge | ||
| | Mix::ColorBurn | ||
| | Mix::SoftLight | ||
| | Mix::Luminosity | ||
| | Mix::Color | ||
| | Mix::Hue | ||
| | Mix::Saturation => return None, | ||
| }) | ||
| } | ||
|
|
||
/// Declares a unit struct `$name` with an associated `mix` function.
///
/// `$calc` is an expression that is called with `(src_c, bg_c)` and must
/// produce a `u8x32<S>`; it may refer to the generic parameter `S` of the
/// generated function. The generated `mix` passes that result through
/// `with_src_alpha` together with the original source before returning it.
macro_rules! u8_mix {
    ($name:ident, $calc:expr) => {
        struct $name;

        impl $name {
            #[inline(always)]
            fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>) -> u8x32<S> {
                let token = src_c.simd;
                let blended = $calc(src_c, bg_c);

                with_src_alpha(token, blended, src_c)
            }
        }
    };
}
|
|
||
| // Formula for blending is (see https://www.w3.org/TR/compositing-1/#generalformula): | ||
| // Cs' = (1 - Ab) * Cs + Ab * B(Cb, Cs) | ||
| // Since vello_cpu expects premultiplied colors, we need to return: | ||
| // M = As * Cs' | ||
| // = As * ((1 - Ab) * Cs + Ab * B(Cb, Cs)) | ||
| // = As * (1 - Ab) * Cs + As * Ab * B(Cb, Cs) | ||
| // = S * (1 - Ab) + As * Ab * B(Cb, Cs) | ||
| // where S = As * Cs and D = Ab * Cb (so just the premultiplied color). | ||
|
|
||
| // Multiply: | ||
| // B(Cb, Cs) = Cb * Cs | ||
| // M = S * (1 - Ab) + As * Ab * Cb * Cs | ||
| // = S * (1 - Ab) + S * D | ||
| u8_mix!(Multiply, |src_c: u8x32<S>, bg_c: u8x32<S>| { | ||
| let simd = src_c.simd; | ||
| let one_minus_bg_a = 255 - bg_c.splat_4th(); | ||
| let p1 = normalized_mul_u8x32(src_c, one_minus_bg_a); | ||
| let p2 = normalized_mul_u8x32(src_c, bg_c); | ||
|
|
||
| simd.narrow_u16x32(p1 + p2) | ||
| }); | ||
|
|
||
| // Screen: | ||
| // B(Cb, Cs) = Cb + Cs - Cb * Cs | ||
| // M = S * (1 - Ab) + As * D + S * Ab - S * D | ||
| // = S + As * D - S * D | ||
| u8_mix!(Screen, |src_c: u8x32<S>, bg_c: u8x32<S>| { | ||
| let simd = src_c.simd; | ||
| let p1 = normalized_mul_u8x32(src_c.splat_4th(), bg_c); | ||
| let p2 = normalized_mul_u8x32(src_c, bg_c); | ||
| let res = simd.widen_u8x32(src_c) + p1 - p2; | ||
|
|
||
| simd.narrow_u16x32(res) | ||
| }); | ||
|
|
||
| // Overlay is hard-light with source and backdrop swapped. | ||
| u8_mix!(Overlay, |src_c: u8x32<S>, bg_c: u8x32<S>| { | ||
| hard_light_inner(src_c, bg_c, bg_c) | ||
| }); | ||
|
|
||
| // Darken: | ||
| // B(Cb, Cs) = min(Cb, Cs) | ||
| // M = S * (1 - Ab) + min(S * Ab, D * As) | ||
| u8_mix!(Darken, |src_c: u8x32<S>, bg_c: u8x32<S>| { | ||
| let simd = src_c.simd; | ||
| let src_a = src_c.splat_4th(); | ||
| let bg_a = bg_c.splat_4th(); | ||
| let p1 = normalized_mul_u8x32(src_c, 255 - bg_a); | ||
| let p2 = normalized_mul_u8x32(src_c, bg_a).min(normalized_mul_u8x32(bg_c, src_a)); | ||
|
|
||
| simd.narrow_u16x32(p1 + p2) | ||
| }); | ||
|
|
||
| // Lighten: | ||
| // B(Cb, Cs) = max(Cb, Cs) | ||
| // M = S * (1 - Ab) + max(S * Ab, D * As) | ||
| u8_mix!(Lighten, |src_c: u8x32<S>, bg_c: u8x32<S>| { | ||
| let simd = src_c.simd; | ||
| let src_a = src_c.splat_4th(); | ||
| let bg_a = bg_c.splat_4th(); | ||
| let p1 = normalized_mul_u8x32(src_c, 255 - bg_a); | ||
| let p2 = normalized_mul_u8x32(src_c, bg_a).max(normalized_mul_u8x32(bg_c, src_a)); | ||
|
|
||
| simd.narrow_u16x32(p1 + p2) | ||
| }); | ||
|
|
||
| // Hard-light: | ||
| // if Cs <= 0.5: B(Cb, Cs) = 2 * Cb * Cs | ||
| // otherwise: B(Cb, Cs) = 1 - 2 * (1 - Cb) * (1 - Cs) | ||
| u8_mix!(HardLight, |src_c: u8x32<S>, bg_c: u8x32<S>| { | ||
| hard_light_inner(src_c, bg_c, src_c) | ||
| }); | ||
|
|
||
| // Difference: | ||
| // B(Cb, Cs) = abs(Cb - Cs) | ||
| // M = S * (1 - Ab) + abs(S * Ab - D * As) | ||
| u8_mix!(Difference, |src_c: u8x32<S>, bg_c: u8x32<S>| { | ||
| let simd = src_c.simd; | ||
| let src_a = src_c.splat_4th(); | ||
| let bg_a = bg_c.splat_4th(); | ||
| let p1 = normalized_mul_u8x32(src_c, 255 - bg_a); | ||
| let p2 = normalized_mul_u8x32(src_c, bg_a); | ||
| let p3 = normalized_mul_u8x32(bg_c, src_a); | ||
| let diff = p2.max(p3) - p2.min(p3); | ||
|
|
||
| simd.narrow_u16x32(p1 + diff) | ||
| }); | ||
|
|
||
| // Exclusion: | ||
| // B(Cb, Cs) = Cb + Cs - 2 * Cb * Cs | ||
| // M = S * (1 - Ab) + As * D + S * Ab - 2 * S * D | ||
| // = S + As * D - 2 * S * D | ||
| u8_mix!(Exclusion, |src_c: u8x32<S>, bg_c: u8x32<S>| { | ||
| let simd = src_c.simd; | ||
| let p1 = normalized_mul_u8x32(src_c.splat_4th(), bg_c); | ||
| let p2 = normalized_mul_u8x32(src_c, bg_c); | ||
| let res = simd.widen_u8x32(src_c) + p1; | ||
| let sub = p2 + p2; | ||
| let res = simd.select_u16x32(res.simd_ge(sub), res - sub, u16x32::splat(simd, 0)); | ||
|
|
||
| simd.narrow_u16x32(res) | ||
| }); | ||
|
|
||
| #[inline(always)] | ||
| fn hard_light_inner<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, condition: u8x32<S>) -> u8x32<S> { | ||
| let simd = src_c.simd; | ||
| let src = simd.widen_u8x32(src_c); | ||
| let bg = simd.widen_u8x32(bg_c); | ||
| let src_a = simd.widen_u8x32(src_c.splat_4th()); | ||
| let bg_a = simd.widen_u8x32(bg_c.splat_4th()); | ||
| let condition_a = simd.widen_u8x32(condition.splat_4th()); | ||
| let condition = simd.widen_u8x32(condition); | ||
|
|
||
| let base = src * (255 - bg_a); | ||
| // Multiply branch: As * Ab * 2 * Cb * Cs = 2 * S * D. | ||
| let multiply = 2 * src * bg; | ||
| // Screen branch: As * Ab * (1 - 2 * (1 - Cb) * (1 - Cs)) | ||
| // = As * Ab - 2 * (As - S) * (Ab - D). | ||
| let screen = src_a * bg_a - 2 * (src_a - src) * (bg_a - bg); | ||
| let blended = simd.select_u16x32( | ||
| // The spec condition is `Cs <= 0.5` but on unpremultiplied color. | ||
| // Since `Cs = S / As`, we avoid division by multiplying both sides | ||
| // by alpha: `Cs <= 0.5` => `S <= 0.5 * As` => `2 * S <= As`. | ||
| (condition + condition).simd_le(condition_a), | ||
| multiply, | ||
| screen, | ||
| ); | ||
| let res = (base + blended).div_255(); | ||
|
|
||
| simd.narrow_u16x32(res) | ||
| } | ||
|
|
||
| #[inline(always)] | ||
| fn with_src_alpha<S: Simd>(simd: S, rgb: u8x32<S>, src_c: u8x32<S>) -> u8x32<S> { | ||
| let alpha_mask = u32x8::splat(simd, u32::from_ne_bytes([0, 0, 0, 255])).to_bytes(); | ||
|
|
||
| (rgb & !alpha_mask) | (src_c & alpha_mask) | ||
| } | ||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -8,14 +8,15 @@ | |
| //! performance on many architectures compared to floating-point operations, while | ||
| //! maintaining sufficient precision for most rendering tasks. | ||
|
|
||
| pub(crate) mod blend; | ||
| mod compose; | ||
| mod gradient; | ||
| mod image; | ||
|
|
||
| use crate::filter::filter_lowp; | ||
| use crate::fine::FineKernel; | ||
| use crate::fine::lowp::image::{BilinearImagePainter, PlainBilinearImagePainter}; | ||
| use crate::fine::{COLOR_COMPONENTS, Painter, SCRATCH_BUF_SIZE, Splat4thExt}; | ||
| use crate::fine::{FineKernel, highp, u8_to_f32}; | ||
| use crate::layer_manager::LayerManager; | ||
| use crate::peniko::BlendMode; | ||
| use crate::region::Region; | ||
|
|
@@ -32,7 +33,7 @@ use vello_common::mask::Mask; | |
| use vello_common::paint::{PremulColor, Tint, TintMode}; | ||
| use vello_common::pixmap::Pixmap; | ||
| use vello_common::tile::Tile; | ||
| use vello_common::util::{Div255Ext, f32_to_u8}; | ||
| use vello_common::util::Div255Ext; | ||
|
|
||
| /// The kernel for doing rendering using u8/u16. | ||
| #[derive(Clone, Copy, Debug)] | ||
|
|
@@ -350,8 +351,8 @@ mod fill { | |
| //! using only the source alpha channel for compositing. | ||
|
|
||
| use crate::fine::Splat4thExt; | ||
| use crate::fine::lowp::blend; | ||
| use crate::fine::lowp::compose::ComposeExt; | ||
| use crate::fine::lowp::mix; | ||
| use crate::peniko::{BlendMode, Mix}; | ||
| use vello_common::fearless_simd::*; | ||
| use vello_common::util::normalized_mul_u8x32; | ||
|
|
@@ -372,7 +373,7 @@ mod fill { | |
| let src_v = if default_mix { | ||
| next_src | ||
| } else { | ||
| mix(next_src, bg_v, blend_mode) | ||
| blend::mix(next_src, bg_v, blend_mode) | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Just thinking from a performance perspective, would it make sense to combine `mix` and `compose` into a single step? One downside I can see is that you’d need implementations for every Mix * Compose combination. Still, it might make sense for a few commonly used subsets.
Collaborator
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. It might help a bit (especially for u8), but I don't think it carries its weight due to the large number of combinations you get (as you mentioned, not good for code size). And blending by itself is already pretty slow. I also don't think it's common at all to have a non-default blend mode + composition mode set. |
||
| }; | ||
| let res = blend_mode.compose(simd, src_v, bg_v, None); | ||
| res.store_slice(next_dest); | ||
|
|
@@ -449,7 +450,7 @@ mod alpha_fill { | |
|
|
||
| use crate::fine::Splat4thExt; | ||
| use crate::fine::lowp::compose::ComposeExt; | ||
| use crate::fine::lowp::{extract_masks, mix}; | ||
| use crate::fine::lowp::{blend, extract_masks}; | ||
| use crate::peniko::{BlendMode, Mix}; | ||
| use vello_common::fearless_simd::*; | ||
| use vello_common::util::{Div255Ext, normalized_mul_u8x32}; | ||
|
|
@@ -474,7 +475,7 @@ mod alpha_fill { | |
| let src_c = if default_mix { | ||
| next_src | ||
| } else { | ||
| mix(next_src, bg_v, blend_mode) | ||
| blend::mix(next_src, bg_v, blend_mode) | ||
| }; | ||
| let masks = extract_masks(simd, &next_mask); | ||
| let res = blend_mode.compose(simd, src_c, bg_v, Some(masks)); | ||
|
|
@@ -565,38 +566,6 @@ mod alpha_fill { | |
| } | ||
| } | ||
|
|
||
| /// Applies blend mode mixing by converting to f32, mixing, then converting back to u8. | ||
| /// | ||
| /// TODO: Add a proper lowp mix pipeline that operates entirely in integer space | ||
| /// for better performance (currently converts through f32 which is slower). | ||
| fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> { | ||
| let to_f32 = |val: u8x32<S>| { | ||
| let (a, b) = src_c.simd.split_u8x32(val); | ||
| let mut a = u8_to_f32(a); | ||
| let mut b = u8_to_f32(b); | ||
| a *= f32x16::splat(src_c.simd, 1.0 / 255.0); | ||
| b *= f32x16::splat(src_c.simd, 1.0 / 255.0); | ||
| (a, b) | ||
| }; | ||
|
|
||
| let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| { | ||
| let val1 = | ||
| f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5))); | ||
| let val2 = | ||
| f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5))); | ||
|
|
||
| val1.simd.combine_u8x16(val1, val2) | ||
| }; | ||
|
|
||
| let (mut src_1, mut src_2) = to_f32(src_c); | ||
| let (bg_1, bg_2) = to_f32(bg_c); | ||
|
|
||
| src_1 = highp::blend::mix(src_1, bg_1, blend_mode); | ||
| src_2 = highp::blend::mix(src_2, bg_2, blend_mode); | ||
|
|
||
| to_u8(src_1, src_2) | ||
| } | ||
|
|
||
| /// Expands 8 mask bytes into a 32-byte SIMD vector where each pixel's 4 components | ||
| /// share the same mask value (each of 8 mask values is repeated 4 times). | ||
| /// | ||
|
|
||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Would it make sense to add `#[inline(always)]` here?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Yes, but I will do this in a follow-up since we need to fix this up in a couple of places anyway (see #1579).
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Will add a TODO.