diff --git a/sparse_strips/vello_cpu/src/fine/lowp/blend.rs b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs
new file mode 100644
index 0000000000..7b4a252acf
--- /dev/null
+++ b/sparse_strips/vello_cpu/src/fine/lowp/blend.rs
@@ -0,0 +1,232 @@
+// Copyright 2025 the Vello Authors
+// SPDX-License-Identifier: Apache-2.0 OR MIT
+
+use crate::fine::{Splat4thExt, highp, u8_to_f32};
+use crate::peniko::{BlendMode, Mix};
+use vello_common::fearless_simd::*;
+use vello_common::util::{Div255Ext, f32_to_u8, normalized_mul_u8x32};
+
+// TODO: Make sure this vectorizes properly (also the f32 pipeline) by inlining if needed.
+/// Mixes the source color `src_c` with the backdrop `bg_c` according to `blend_mode.mix`.
+///
+/// Mix modes covered by `try_u8_mix` stay in integer space. Every other mode is normalized
+/// to `f32`, mixed with `highp::blend::mix` and scaled back to `u8`.
+pub(crate) fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
+    if let Some(res) = try_u8_mix(blend_mode, src_c, bg_c) {
+        return res;
+    }
+
+    // Fallback for blend modes that aren't supported in u8.
+
+    let to_f32 = |val: u8x32<S>| {
+        let (a, b) = src_c.simd.split_u8x32(val);
+        let mut a = u8_to_f32(a);
+        let mut b = u8_to_f32(b);
+        a *= f32x16::splat(src_c.simd, 1.0 / 255.0);
+        b *= f32x16::splat(src_c.simd, 1.0 / 255.0);
+        (a, b)
+    };
+
+    let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| {
+        let val1 =
+            f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5)));
+        let val2 =
+            f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5)));
+
+        val1.simd.combine_u8x16(val1, val2)
+    };
+
+    let (mut src_1, mut src_2) = to_f32(src_c);
+    let (bg_1, bg_2) = to_f32(bg_c);
+
+    src_1 = highp::blend::mix(src_1, bg_1, blend_mode);
+    src_2 = highp::blend::mix(src_2, bg_2, blend_mode);
+
+    to_u8(src_1, src_2)
+}
+
+/// Mixes in integer space for the modes that have a `u8` fast path, or returns `None` so that
+/// the caller can fall back to the `f32` implementation.
+fn try_u8_mix<S: Simd>(blend_mode: BlendMode, src_c: u8x32<S>, bg_c: u8x32<S>) -> Option<u8x32<S>> {
+    // We implement the u8 fast path for blend modes that
+    // 1) are separable.
+    // 2) don't have too many divisions, since integer normalization is
+    //    relatively expensive.
+    // In the future, it's possible to do further experimentation to see whether
+    // some more blend modes are worth doing in integer space.
+    Some(match blend_mode.mix {
+        Mix::Normal => src_c,
+        Mix::Multiply => Multiply::mix(src_c, bg_c),
+        Mix::Screen => Screen::mix(src_c, bg_c),
+        Mix::Overlay => Overlay::mix(src_c, bg_c),
+        Mix::Darken => Darken::mix(src_c, bg_c),
+        Mix::Lighten => Lighten::mix(src_c, bg_c),
+        Mix::HardLight => HardLight::mix(src_c, bg_c),
+        Mix::Difference => Difference::mix(src_c, bg_c),
+        Mix::Exclusion => Exclusion::mix(src_c, bg_c),
+        Mix::ColorDodge
+        | Mix::ColorBurn
+        | Mix::SoftLight
+        | Mix::Luminosity
+        | Mix::Color
+        | Mix::Hue
+        | Mix::Saturation => return None,
+    })
+}
+
+// Declares a unit struct `$name` whose `mix` evaluates `$calc` on the source and backdrop and
+// then takes the alpha channel from the source again via `with_src_alpha`.
+macro_rules! u8_mix {
+    ($name:ident, $calc:expr) => {
+        struct $name;
+
+        impl $name {
+            #[inline(always)]
+            fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>) -> u8x32<S> {
+                let simd = src_c.simd;
+                let res = $calc(src_c, bg_c);
+
+                with_src_alpha(simd, res, src_c)
+            }
+        }
+    };
+}
+
+// Formula for blending is (see https://www.w3.org/TR/compositing-1/#generalformula):
+// Cs' = (1 - Ab) * Cs + Ab * B(Cb, Cs)
+// Since vello_cpu expects premultiplied colors, we need to return:
+// M = As * Cs'
+//   = As * ((1 - Ab) * Cs + Ab * B(Cb, Cs))
+//   = As * (1 - Ab) * Cs + As * Ab * B(Cb, Cs)
+//   = S * (1 - Ab) + As * Ab * B(Cb, Cs)
+// where S = As * Cs and D = Ab * Cb (so just the premultiplied color).
+
+// Multiply:
+// B(Cb, Cs) = Cb * Cs
+// M = S * (1 - Ab) + As * Ab * Cb * Cs
+//   = S * (1 - Ab) + S * D
+u8_mix!(Multiply, |src_c: u8x32<S>, bg_c: u8x32<S>| {
+    let simd = src_c.simd;
+    let one_minus_bg_a = 255 - bg_c.splat_4th();
+    let p1 = normalized_mul_u8x32(src_c, one_minus_bg_a);
+    let p2 = normalized_mul_u8x32(src_c, bg_c);
+
+    simd.narrow_u16x32(p1 + p2)
+});
+
+// Screen:
+// B(Cb, Cs) = Cb + Cs - Cb * Cs
+// M = S * (1 - Ab) + As * D + S * Ab - S * D
+//   = S + As * D - S * D
+u8_mix!(Screen, |src_c: u8x32<S>, bg_c: u8x32<S>| {
+    let simd = src_c.simd;
+    let p1 = normalized_mul_u8x32(src_c.splat_4th(), bg_c);
+    let p2 = normalized_mul_u8x32(src_c, bg_c);
+    let res = simd.widen_u8x32(src_c) + p1 - p2;
+
+    simd.narrow_u16x32(res)
+});
+
+// Overlay is hard-light with source and backdrop swapped, so the condition tests the backdrop.
+u8_mix!(Overlay, |src_c: u8x32<S>, bg_c: u8x32<S>| {
+    hard_light_inner(src_c, bg_c, bg_c)
+});
+
+// Darken:
+// B(Cb, Cs) = min(Cb, Cs)
+// M = S * (1 - Ab) + min(S * Ab, D * As)
+u8_mix!(Darken, |src_c: u8x32<S>, bg_c: u8x32<S>| {
+    let simd = src_c.simd;
+    let src_a = src_c.splat_4th();
+    let bg_a = bg_c.splat_4th();
+    let p1 = normalized_mul_u8x32(src_c, 255 - bg_a);
+    let p2 = normalized_mul_u8x32(src_c, bg_a).min(normalized_mul_u8x32(bg_c, src_a));
+
+    simd.narrow_u16x32(p1 + p2)
+});
+
+// Lighten:
+// B(Cb, Cs) = max(Cb, Cs)
+// M = S * (1 - Ab) + max(S * Ab, D * As)
+u8_mix!(Lighten, |src_c: u8x32<S>, bg_c: u8x32<S>| {
+    let simd = src_c.simd;
+    let src_a = src_c.splat_4th();
+    let bg_a = bg_c.splat_4th();
+    let p1 = normalized_mul_u8x32(src_c, 255 - bg_a);
+    let p2 = normalized_mul_u8x32(src_c, bg_a).max(normalized_mul_u8x32(bg_c, src_a));
+
+    simd.narrow_u16x32(p1 + p2)
+});
+
+// Hard-light:
+// if Cs <= 0.5: B(Cb, Cs) = 2 * Cb * Cs
+// otherwise: B(Cb, Cs) = 1 - 2 * (1 - Cb) * (1 - Cs)
+u8_mix!(HardLight, |src_c: u8x32<S>, bg_c: u8x32<S>| {
+    hard_light_inner(src_c, bg_c, src_c)
+});
+
+// Difference:
+// B(Cb, Cs) = abs(Cb - Cs)
+// M = S * (1 - Ab) + abs(S * Ab - D * As)
+u8_mix!(Difference, |src_c: u8x32<S>, bg_c: u8x32<S>| {
+    let simd = src_c.simd;
+    let src_a = src_c.splat_4th();
+    let bg_a = bg_c.splat_4th();
+    let p1 = normalized_mul_u8x32(src_c, 255 - bg_a);
+    let p2 = normalized_mul_u8x32(src_c, bg_a);
+    let p3 = normalized_mul_u8x32(bg_c, src_a);
+    let diff = p2.max(p3) - p2.min(p3);
+
+    simd.narrow_u16x32(p1 + diff)
+});
+
+// Exclusion:
+// B(Cb, Cs) = Cb + Cs - 2 * Cb * Cs
+// M = S * (1 - Ab) + As * D + S * Ab - 2 * S * D
+//   = S + As * D - 2 * S * D
+u8_mix!(Exclusion, |src_c: u8x32<S>, bg_c: u8x32<S>| {
+    let simd = src_c.simd;
+    let p1 = normalized_mul_u8x32(src_c.splat_4th(), bg_c);
+    let p2 = normalized_mul_u8x32(src_c, bg_c);
+    let res = simd.widen_u8x32(src_c) + p1;
+    let sub = p2 + p2;
+    let res = simd.select_u16x32(res.simd_ge(sub), res - sub, u16x32::splat(simd, 0));
+
+    simd.narrow_u16x32(res)
+});
+
+#[inline(always)]
+fn hard_light_inner<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, condition: u8x32<S>) -> u8x32<S> {
+    let simd = src_c.simd;
+    let src = simd.widen_u8x32(src_c);
+    let bg = simd.widen_u8x32(bg_c);
+    let src_a = simd.widen_u8x32(src_c.splat_4th());
+    let bg_a = simd.widen_u8x32(bg_c.splat_4th());
+    let condition_a = simd.widen_u8x32(condition.splat_4th());
+    let condition = simd.widen_u8x32(condition);
+
+    let base = src * (255 - bg_a);
+    // Multiply branch: As * Ab * 2 * Cb * Cs = 2 * S * D.
+    let multiply = 2 * src * bg;
+    // Screen branch: As * Ab * (1 - 2 * (1 - Cb) * (1 - Cs))
+    // = As * Ab - 2 * (As - S) * (Ab - D).
+    let screen = src_a * bg_a - 2 * (src_a - src) * (bg_a - bg);
+    let blended = simd.select_u16x32(
+        // The spec condition is `Cs <= 0.5` but on unpremultiplied color.
+        // Since `Cs = S / As`, we avoid division by multiplying both sides
+        // by alpha: `Cs <= 0.5` => `S <= 0.5 * As` => `2 * S <= As`.
+        (condition + condition).simd_le(condition_a),
+        multiply,
+        screen,
+    );
+    let res = (base + blended).div_255();
+
+    simd.narrow_u16x32(res)
+}
+
+#[inline(always)]
+fn with_src_alpha<S: Simd>(simd: S, rgb: u8x32<S>, src_c: u8x32<S>) -> u8x32<S> {
+    let alpha_mask = u32x8::splat(simd, u32::from_ne_bytes([0, 0, 0, 255])).to_bytes();
+
+    (rgb & !alpha_mask) | (src_c & alpha_mask)
+}
diff --git a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
index c59d68122d..3e9e88b79f 100644
--- a/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
+++ b/sparse_strips/vello_cpu/src/fine/lowp/mod.rs
@@ -8,14 +8,15 @@
 //! performance on many architectures compared to floating-point operations, while
 //! maintaining sufficient precision for most rendering tasks.
 
+pub(crate) mod blend;
 mod compose;
 mod gradient;
 mod image;
 
 use crate::filter::filter_lowp;
+use crate::fine::FineKernel;
 use crate::fine::lowp::image::{BilinearImagePainter, PlainBilinearImagePainter};
 use crate::fine::{COLOR_COMPONENTS, Painter, SCRATCH_BUF_SIZE, Splat4thExt};
-use crate::fine::{FineKernel, highp, u8_to_f32};
 use crate::layer_manager::LayerManager;
 use crate::peniko::BlendMode;
 use crate::region::Region;
@@ -32,7 +33,7 @@ use vello_common::mask::Mask;
 use vello_common::paint::{PremulColor, Tint, TintMode};
 use vello_common::pixmap::Pixmap;
 use vello_common::tile::Tile;
-use vello_common::util::{Div255Ext, f32_to_u8};
+use vello_common::util::Div255Ext;
 
 /// The kernel for doing rendering using u8/u16.
 #[derive(Clone, Copy, Debug)]
@@ -350,8 +351,8 @@ mod fill {
     //! using only the source alpha channel for compositing.
use crate::fine::Splat4thExt; + use crate::fine::lowp::blend; use crate::fine::lowp::compose::ComposeExt; - use crate::fine::lowp::mix; use crate::peniko::{BlendMode, Mix}; use vello_common::fearless_simd::*; use vello_common::util::normalized_mul_u8x32; @@ -372,7 +373,7 @@ mod fill { let src_v = if default_mix { next_src } else { - mix(next_src, bg_v, blend_mode) + blend::mix(next_src, bg_v, blend_mode) }; let res = blend_mode.compose(simd, src_v, bg_v, None); res.store_slice(next_dest); @@ -449,7 +450,7 @@ mod alpha_fill { use crate::fine::Splat4thExt; use crate::fine::lowp::compose::ComposeExt; - use crate::fine::lowp::{extract_masks, mix}; + use crate::fine::lowp::{blend, extract_masks}; use crate::peniko::{BlendMode, Mix}; use vello_common::fearless_simd::*; use vello_common::util::{Div255Ext, normalized_mul_u8x32}; @@ -474,7 +475,7 @@ mod alpha_fill { let src_c = if default_mix { next_src } else { - mix(next_src, bg_v, blend_mode) + blend::mix(next_src, bg_v, blend_mode) }; let masks = extract_masks(simd, &next_mask); let res = blend_mode.compose(simd, src_c, bg_v, Some(masks)); @@ -565,38 +566,6 @@ mod alpha_fill { } } -/// Applies blend mode mixing by converting to f32, mixing, then converting back to u8. -/// -/// TODO: Add a proper lowp mix pipeline that operates entirely in integer space -/// for better performance (currently converts through f32 which is slower). 
-fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
-    let to_f32 = |val: u8x32<S>| {
-        let (a, b) = src_c.simd.split_u8x32(val);
-        let mut a = u8_to_f32(a);
-        let mut b = u8_to_f32(b);
-        a *= f32x16::splat(src_c.simd, 1.0 / 255.0);
-        b *= f32x16::splat(src_c.simd, 1.0 / 255.0);
-        (a, b)
-    };
-
-    let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| {
-        let val1 =
-            f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5)));
-        let val2 =
-            f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5)));
-
-        val1.simd.combine_u8x16(val1, val2)
-    };
-
-    let (mut src_1, mut src_2) = to_f32(src_c);
-    let (bg_1, bg_2) = to_f32(bg_c);
-
-    src_1 = highp::blend::mix(src_1, bg_1, blend_mode);
-    src_2 = highp::blend::mix(src_2, bg_2, blend_mode);
-
-    to_u8(src_1, src_2)
-}
-
 /// Expands 8 mask bytes into a 32-byte SIMD vector where each pixel's 4 components
 /// share the same mask value (each of 8 mask values is repeated 4 times).
 ///