Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
224 changes: 224 additions & 0 deletions sparse_strips/vello_cpu/src/fine/lowp/blend.rs
Original file line number Diff line number Diff line change
@@ -0,0 +1,224 @@
// Copyright 2025 the Vello Authors
// SPDX-License-Identifier: Apache-2.0 OR MIT

use crate::fine::{Splat4thExt, highp, u8_to_f32};
use crate::peniko::{BlendMode, Mix};
use vello_common::fearless_simd::*;
use vello_common::util::{Div255Ext, f32_to_u8, normalized_mul_u8x32};

/// Mixes the source `src_c` with the backdrop `bg_c` according to `blend_mode`.
///
/// The integer implementation (`try_u8_mix`) is tried first. If it declines the
/// blend mode, both inputs are split into two 16-lane halves, converted to
/// `f32`, scaled by `1 / 255`, mixed with `highp::blend::mix` and converted
/// back to `u8`.
// TODO: Make sure this vectorizes properly (also the f32 pipeline) by inlining if needed.
pub(crate) fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Would it make sense to add #[inline(always)] here?

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Yes, but I will do this in a follow-up since we need to fix this up in a couple of places anyway (see #1579).

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Will add a TODO.

// Fast path: blend modes with an integer implementation return right away.
if let Some(res) = try_u8_mix(blend_mode, src_c, bg_c) {
return res;
}

// Fallback for blend modes that aren't supported in u8.

// Splits a 32-lane u8 vector into two halves and converts each half to f32,
// scaled by 1 / 255.
let to_f32 = |val: u8x32<S>| {
let (a, b) = src_c.simd.split_u8x32(val);
let mut a = u8_to_f32(a);
let mut b = u8_to_f32(b);
a *= f32x16::splat(src_c.simd, 1.0 / 255.0);
b *= f32x16::splat(src_c.simd, 1.0 / 255.0);
(a, b)
};

// Inverse of `to_f32`: computes `255 * v + 0.5` per lane, converts each half
// to u8 and recombines the halves into one 32-lane vector.
// NOTE(review): the 0.5 bias presumably turns a truncating conversion into
// round-to-nearest; confirm against `f32_to_u8`.
let to_u8 = |val1: f32x16<S>, val2: f32x16<S>| {
let val1 =
f32_to_u8(f32x16::splat(val1.simd, 255.0).mul_add(val1, f32x16::splat(val1.simd, 0.5)));
let val2 =
f32_to_u8(f32x16::splat(val2.simd, 255.0).mul_add(val2, f32x16::splat(val2.simd, 0.5)));

val1.simd.combine_u8x16(val1, val2)
};

let (mut src_1, mut src_2) = to_f32(src_c);
let (bg_1, bg_2) = to_f32(bg_c);

// Each half is mixed independently by the f32 pipeline.
src_1 = highp::blend::mix(src_1, bg_1, blend_mode);
src_2 = highp::blend::mix(src_2, bg_2, blend_mode);

to_u8(src_1, src_2)
}

/// Attempts to mix `src_c` with `bg_c` entirely in `u8` space.
///
/// Returns `None` for blend modes without an integer implementation here, so
/// that the caller can fall back to another pipeline.
///
/// The integer fast path covers blend modes that
/// 1) are separable, and
/// 2) don't have too many divisions, since integer normalization is
///    relatively expensive.
///
/// Further experimentation may show that some more blend modes are worth
/// doing in integer space.
fn try_u8_mix<S: Simd>(blend_mode: BlendMode, src_c: u8x32<S>, bg_c: u8x32<S>) -> Option<u8x32<S>> {
    let mixed = match blend_mode.mix {
        // No integer implementation: let the caller handle these.
        Mix::ColorDodge
        | Mix::ColorBurn
        | Mix::SoftLight
        | Mix::Luminosity
        | Mix::Color
        | Mix::Hue
        | Mix::Saturation => return None,
        // Normal mixing leaves the source untouched.
        Mix::Normal => src_c,
        Mix::Multiply => Multiply::mix(src_c, bg_c),
        Mix::Screen => Screen::mix(src_c, bg_c),
        Mix::Overlay => Overlay::mix(src_c, bg_c),
        Mix::Darken => Darken::mix(src_c, bg_c),
        Mix::Lighten => Lighten::mix(src_c, bg_c),
        Mix::HardLight => HardLight::mix(src_c, bg_c),
        Mix::Difference => Difference::mix(src_c, bg_c),
        Mix::Exclusion => Exclusion::mix(src_c, bg_c),
    };

    Some(mixed)
}

// Declares a unit struct `$name` with an associated `mix` function.
//
// `$calc` is a callable taking `(src_c, bg_c)` and returning the blended
// `u8x32<S>`; it may refer to the generic parameter `S` of the generated
// function. The value it returns is passed through `with_src_alpha` together
// with the original source before being returned.
macro_rules! u8_mix {
    ($name:ident, $calc:expr) => {
        struct $name;

        impl $name {
            #[inline(always)]
            fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>) -> u8x32<S> {
                let blended = $calc(src_c, bg_c);

                with_src_alpha(src_c.simd, blended, src_c)
            }
        }
    };
}

// Formula for blending is (see https://www.w3.org/TR/compositing-1/#generalformula):
// Cs' = (1 - Ab) * Cs + Ab * B(Cb, Cs)
// Since vello_cpu expects premultiplied colors, we need to return:
// M = As * Cs'
// = As * ((1 - Ab) * Cs + Ab * B(Cb, Cs))
// = As * (1 - Ab) * Cs + As * Ab * B(Cb, Cs)
// = S * (1 - Ab) + As * Ab * B(Cb, Cs)
// where S = As * Cs and D = Ab * Cb (i.e. the premultiplied source and backdrop colors).

// Multiply:
// B(Cb, Cs) = Cb * Cs
// Substituting into the general formula gives
// M = S * (1 - Ab) + As * Ab * Cb * Cs
//   = S * (1 - Ab) + S * D
u8_mix!(Multiply, |src_c: u8x32<S>, bg_c: u8x32<S>| {
    // S * (1 - Ab)
    let inv_bg_alpha = 255 - bg_c.splat_4th();
    let uncovered = normalized_mul_u8x32(src_c, inv_bg_alpha);
    // S * D
    let product = normalized_mul_u8x32(src_c, bg_c);

    src_c.simd.narrow_u16x32(uncovered + product)
});

// Screen:
// B(Cb, Cs) = Cb + Cs - Cb * Cs
// Substituting into the general formula gives
// M = S * (1 - Ab) + As * D + S * Ab - S * D
//   = S + As * D - S * D
u8_mix!(Screen, |src_c: u8x32<S>, bg_c: u8x32<S>| {
    let simd = src_c.simd;
    // As * D
    let alpha_scaled_bg = normalized_mul_u8x32(src_c.splat_4th(), bg_c);
    // S * D
    let product = normalized_mul_u8x32(src_c, bg_c);
    let screened = simd.widen_u8x32(src_c) + alpha_scaled_bg - product;

    simd.narrow_u16x32(screened)
});

// Overlay:
// Hard-light with the roles of source and backdrop swapped, so the backdrop
// is what gets passed as the `condition` argument of `hard_light_inner`.
u8_mix!(Overlay, |source: u8x32<S>, backdrop: u8x32<S>| {
    hard_light_inner(source, backdrop, backdrop)
});

// Darken:
// B(Cb, Cs) = min(Cb, Cs)
// Distributing As * Ab into the `min` gives
// M = S * (1 - Ab) + min(S * Ab, D * As)
u8_mix!(Darken, |src_c: u8x32<S>, bg_c: u8x32<S>| {
    let src_alpha = src_c.splat_4th();
    let bg_alpha = bg_c.splat_4th();
    // S * (1 - Ab)
    let uncovered = normalized_mul_u8x32(src_c, 255 - bg_alpha);
    // S * Ab and D * As
    let src_term = normalized_mul_u8x32(src_c, bg_alpha);
    let bg_term = normalized_mul_u8x32(bg_c, src_alpha);
    let darker = src_term.min(bg_term);

    src_c.simd.narrow_u16x32(uncovered + darker)
});

// Lighten:
// B(Cb, Cs) = max(Cb, Cs)
// Distributing As * Ab into the `max` gives
// M = S * (1 - Ab) + max(S * Ab, D * As)
u8_mix!(Lighten, |src_c: u8x32<S>, bg_c: u8x32<S>| {
    let src_alpha = src_c.splat_4th();
    let bg_alpha = bg_c.splat_4th();
    // S * (1 - Ab)
    let uncovered = normalized_mul_u8x32(src_c, 255 - bg_alpha);
    // S * Ab and D * As
    let src_term = normalized_mul_u8x32(src_c, bg_alpha);
    let bg_term = normalized_mul_u8x32(bg_c, src_alpha);
    let lighter = src_term.max(bg_term);

    src_c.simd.narrow_u16x32(uncovered + lighter)
});

// Hard-light:
// B(Cb, Cs) = 2 * Cb * Cs                      if Cs <= 0.5
// B(Cb, Cs) = 1 - 2 * (1 - Cb) * (1 - Cs)      otherwise
// The branch depends on the source, so the source is what gets passed as the
// `condition` argument of `hard_light_inner`.
u8_mix!(HardLight, |source: u8x32<S>, backdrop: u8x32<S>| {
    hard_light_inner(source, backdrop, source)
});

// Difference:
// B(Cb, Cs) = abs(Cb - Cs)
// Distributing As * Ab into the `abs` gives
// M = S * (1 - Ab) + abs(S * Ab - D * As)
u8_mix!(Difference, |src_c: u8x32<S>, bg_c: u8x32<S>| {
    let src_alpha = src_c.splat_4th();
    let bg_alpha = bg_c.splat_4th();
    // S * (1 - Ab)
    let uncovered = normalized_mul_u8x32(src_c, 255 - bg_alpha);
    // S * Ab and D * As
    let src_term = normalized_mul_u8x32(src_c, bg_alpha);
    let bg_term = normalized_mul_u8x32(bg_c, src_alpha);
    // The absolute difference is formed as `max - min`, which never goes
    // below zero.
    let abs_diff = src_term.max(bg_term) - src_term.min(bg_term);

    src_c.simd.narrow_u16x32(uncovered + abs_diff)
});

// Exclusion:
// B(Cb, Cs) = Cb + Cs - 2 * Cb * Cs
// Substituting into the general formula gives
// M = S * (1 - Ab) + As * D + S * Ab - 2 * S * D
//   = S + As * D - 2 * S * D
u8_mix!(Exclusion, |src_c: u8x32<S>, bg_c: u8x32<S>| {
    let simd = src_c.simd;
    // As * D
    let alpha_scaled_bg = normalized_mul_u8x32(src_c.splat_4th(), bg_c);
    // S * D
    let product = normalized_mul_u8x32(src_c, bg_c);
    // S + As * D
    let positive = simd.widen_u8x32(src_c) + alpha_scaled_bg;
    // 2 * S * D
    let negative = product + product;
    // Lanes where the subtrahend exceeds the minuend are set to zero instead
    // of being subtracted.
    let zero = u16x32::splat(simd, 0);
    let clamped = simd.select_u16x32(positive.simd_ge(negative), positive - negative, zero);

    simd.narrow_u16x32(clamped)
});

/// Shared kernel for the hard-light family of blend modes.
///
/// `src_c` and `bg_c` are the source and backdrop. `condition` is the input
/// whose value selects, per lane, between the "multiply" and the "screen"
/// branch of the hard-light formula. Intermediate values are computed in
/// `u16` lanes and narrowed back to `u8` at the end.
#[inline(always)]
fn hard_light_inner<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, condition: u8x32<S>) -> u8x32<S> {
    let simd = src_c.simd;

    // Widen every operand to u16 lanes before doing any arithmetic.
    let s = simd.widen_u8x32(src_c);
    let s_alpha = simd.widen_u8x32(src_c.splat_4th());
    let d = simd.widen_u8x32(bg_c);
    let d_alpha = simd.widen_u8x32(bg_c.splat_4th());
    let selector = simd.widen_u8x32(condition);
    let selector_alpha = simd.widen_u8x32(condition.splat_4th());

    // The spec condition is `C <= 0.5` on the unpremultiplied color. With
    // `C = P / A` for a premultiplied value `P` and its alpha `A`, the
    // division is avoided by scaling both sides by alpha:
    // `C <= 0.5` => `P <= 0.5 * A` => `2 * P <= A`.
    let use_multiply = (selector + selector).simd_le(selector_alpha);

    // Multiply branch: As * Ab * 2 * Cb * Cs = 2 * S * D.
    let multiply_term = 2 * s * d;
    // Screen branch: As * Ab * (1 - 2 * (1 - Cb) * (1 - Cs))
    //              = As * Ab - 2 * (As - S) * (Ab - D).
    let screen_term = s_alpha * d_alpha - 2 * (s_alpha - s) * (d_alpha - d);
    let blend_term = simd.select_u16x32(use_multiply, multiply_term, screen_term);

    // S * (1 - Ab), still scaled by 255 like the blend term; the sum is
    // normalized in a single step.
    let uncovered = s * (255 - d_alpha);
    let normalized = (uncovered + blend_term).div_255();

    simd.narrow_u16x32(normalized)
}

/// Combines the first three bytes of every 4-byte group of `rgb` with the
/// fourth byte of the matching group of `src_c`.
///
/// The mask is built with `u32::from_ne_bytes`, so its in-memory byte order is
/// `[0, 0, 0, 255]` regardless of the target's endianness.
#[inline(always)]
fn with_src_alpha<S: Simd>(simd: S, rgb: u8x32<S>, src_c: u8x32<S>) -> u8x32<S> {
    let fourth_byte_mask = u32x8::splat(simd, u32::from_ne_bytes([0, 0, 0, 255])).to_bytes();

    let color_part = rgb & !fourth_byte_mask;
    let alpha_part = src_c & fourth_byte_mask;

    color_part | alpha_part
}
45 changes: 7 additions & 38 deletions sparse_strips/vello_cpu/src/fine/lowp/mod.rs
Original file line number Diff line number Diff line change
Expand Up @@ -8,14 +8,15 @@
//! performance on many architectures compared to floating-point operations, while
//! maintaining sufficient precision for most rendering tasks.

pub(crate) mod blend;
mod compose;
mod gradient;
mod image;

use crate::filter::filter_lowp;
use crate::fine::FineKernel;
use crate::fine::lowp::image::{BilinearImagePainter, PlainBilinearImagePainter};
use crate::fine::{COLOR_COMPONENTS, Painter, SCRATCH_BUF_SIZE, Splat4thExt};
use crate::fine::{FineKernel, highp, u8_to_f32};
use crate::layer_manager::LayerManager;
use crate::peniko::BlendMode;
use crate::region::Region;
Expand All @@ -32,7 +33,7 @@ use vello_common::mask::Mask;
use vello_common::paint::{PremulColor, Tint, TintMode};
use vello_common::pixmap::Pixmap;
use vello_common::tile::Tile;
use vello_common::util::{Div255Ext, f32_to_u8};
use vello_common::util::Div255Ext;

/// The kernel for doing rendering using u8/u16.
#[derive(Clone, Copy, Debug)]
Expand Down Expand Up @@ -350,8 +351,8 @@ mod fill {
//! using only the source alpha channel for compositing.

use crate::fine::Splat4thExt;
use crate::fine::lowp::blend;
use crate::fine::lowp::compose::ComposeExt;
use crate::fine::lowp::mix;
use crate::peniko::{BlendMode, Mix};
use vello_common::fearless_simd::*;
use vello_common::util::normalized_mul_u8x32;
Expand All @@ -372,7 +373,7 @@ mod fill {
let src_v = if default_mix {
next_src
} else {
mix(next_src, bg_v, blend_mode)
blend::mix(next_src, bg_v, blend_mode)

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Just thinking from a performance perspective, would it make sense to combine mix and compose into a single fused implementation? Could that improve performance even further?

One downside I can see is that you’d need implementations for every Mix * Compose combination. Still, it might make sense for a few commonly used subsets.

Copy link
Copy Markdown
Collaborator Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

It might help a bit (especially for u8), but I don't think it carries its weight due to the large number of combinations you get (as you mentioned, not good for code size). And blending by itself is already pretty slow. I also don't think it's common at all to have a non-default blend mode + composition mode set.

};
let res = blend_mode.compose(simd, src_v, bg_v, None);
res.store_slice(next_dest);
Expand Down Expand Up @@ -449,7 +450,7 @@ mod alpha_fill {

use crate::fine::Splat4thExt;
use crate::fine::lowp::compose::ComposeExt;
use crate::fine::lowp::{extract_masks, mix};
use crate::fine::lowp::{blend, extract_masks};
use crate::peniko::{BlendMode, Mix};
use vello_common::fearless_simd::*;
use vello_common::util::{Div255Ext, normalized_mul_u8x32};
Expand All @@ -474,7 +475,7 @@ mod alpha_fill {
let src_c = if default_mix {
next_src
} else {
mix(next_src, bg_v, blend_mode)
blend::mix(next_src, bg_v, blend_mode)
};
let masks = extract_masks(simd, &next_mask);
let res = blend_mode.compose(simd, src_c, bg_v, Some(masks));
Expand Down Expand Up @@ -565,38 +566,6 @@ mod alpha_fill {
}
}

/// Mixes `src_c` with `bg_c` for the given blend mode by round-tripping
/// through `f32`: both inputs are converted to `f32`, mixed with
/// `highp::blend::mix`, and the result is converted back to `u8`.
///
/// TODO: Add a proper lowp mix pipeline that operates entirely in integer space
/// for better performance (currently converts through f32 which is slower).
fn mix<S: Simd>(src_c: u8x32<S>, bg_c: u8x32<S>, blend_mode: BlendMode) -> u8x32<S> {
    let simd = src_c.simd;

    // Splits a 32-lane u8 vector into two 16-lane halves and converts each
    // half to f32, scaled by 1 / 255.
    let normalize = |packed: u8x32<S>| {
        let (lo, hi) = simd.split_u8x32(packed);
        let mut lo = u8_to_f32(lo);
        let mut hi = u8_to_f32(hi);
        lo *= f32x16::splat(simd, 1.0 / 255.0);
        hi *= f32x16::splat(simd, 1.0 / 255.0);
        (lo, hi)
    };

    // Computes `255 * v + 0.5` per lane and converts the result to u8.
    // NOTE(review): the 0.5 bias presumably turns a truncating conversion
    // into round-to-nearest; confirm against `f32_to_u8`.
    let quantize = |half: f32x16<S>| {
        f32_to_u8(f32x16::splat(simd, 255.0).mul_add(half, f32x16::splat(simd, 0.5)))
    };

    let (src_lo, src_hi) = normalize(src_c);
    let (bg_lo, bg_hi) = normalize(bg_c);

    // Each half is mixed independently by the f32 pipeline.
    let mixed_lo = highp::blend::mix(src_lo, bg_lo, blend_mode);
    let mixed_hi = highp::blend::mix(src_hi, bg_hi, blend_mode);

    let packed_lo = quantize(mixed_lo);
    let packed_hi = quantize(mixed_hi);

    simd.combine_u8x16(packed_lo, packed_hi)
}

/// Expands 8 mask bytes into a 32-byte SIMD vector where each pixel's 4 components
/// share the same mask value (each of 8 mask values is repeated 4 times).
///
Expand Down
Loading