Skip to content

Commit 895c1cb

Browse files
authored
perf(sparse_strips): Reuse flattened_cubics allocation (#1338)
By reusing this allocation, we see a 5% to 6.6% improvement in flattening performance. <img width="748" height="221" alt="image" src="https://github.com/user-attachments/assets/bfade7ee-9671-4400-965f-364462b9dcaf" />
1 parent a108895 commit 895c1cb

File tree

2 files changed

+22
-35
lines changed

2 files changed

+22
-35
lines changed

sparse_strips/vello_bench/src/flatten.rs

Lines changed: 8 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -18,17 +18,21 @@ pub fn flatten(c: &mut Criterion) {
1818
let expanded_strokes = $item.expanded_strokes();
1919

2020
g.bench_function($item.name.clone(), |b| {
21+
// Reuse allocations to better simulate real-world use.
22+
let mut line_buf: Vec<flatten::Line> = vec![];
23+
let mut temp_buf: Vec<flatten::Line> = vec![];
24+
let mut flatten_ctx = FlattenCtx::default();
25+
2126
b.iter(|| {
22-
let mut line_buf: Vec<flatten::Line> = vec![];
23-
let mut temp_buf: Vec<flatten::Line> = vec![];
27+
line_buf.clear();
2428

2529
for path in &$item.fills {
2630
flatten::fill(
2731
Level::new(),
2832
&path.path,
2933
path.transform,
3034
&mut temp_buf,
31-
&mut FlattenCtx::default(),
35+
&mut flatten_ctx,
3236
);
3337
line_buf.extend(&temp_buf);
3438
}
@@ -39,7 +43,7 @@ pub fn flatten(c: &mut Criterion) {
3943
stroke,
4044
Affine::IDENTITY,
4145
&mut temp_buf,
42-
&mut FlattenCtx::default(),
46+
&mut flatten_ctx,
4347
);
4448
line_buf.extend(&temp_buf);
4549
}

sparse_strips/vello_common/src/flatten_simd.rs

Lines changed: 14 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,6 @@ use crate::flatten::TOL_2;
99
#[cfg(not(feature = "std"))]
1010
use crate::kurbo::common::FloatFuncs as _;
1111
use crate::kurbo::{CubicBez, ParamCurve, PathEl, Point, QuadBez};
12-
use alloc::vec;
1312
use alloc::vec::Vec;
1413
use bytemuck::{Pod, Zeroable};
1514
use fearless_simd::*;
@@ -33,7 +32,7 @@ pub(crate) fn flatten<S: Simd>(
3332
callback: &mut impl Callback,
3433
flatten_ctx: &mut FlattenCtx,
3534
) {
36-
let mut flattened_cubics = vec![];
35+
flatten_ctx.flattened_cubics.clear();
3736

3837
let sqrt_tol = tolerance.sqrt();
3938
let mut last_pt = None;
@@ -112,15 +111,9 @@ pub(crate) fn flatten<S: Simd>(
112111
callback.callback(PathEl::LineTo(p3));
113112
} else {
114113
let c = CubicBez::new(p0, p1, p2, p3);
115-
let max = flatten_cubic_simd(
116-
simd,
117-
c,
118-
flatten_ctx,
119-
tolerance as f32,
120-
&mut flattened_cubics,
121-
);
122-
123-
for p in &flattened_cubics[1..max] {
114+
let max = flatten_cubic_simd(simd, c, flatten_ctx, tolerance as f32);
115+
116+
for p in &flatten_ctx.flattened_cubics[1..max] {
124117
callback.callback(PathEl::LineTo(Point::new(p.x as f64, p.y as f64)));
125118
}
126119
}
@@ -243,6 +236,8 @@ pub struct FlattenCtx {
243236
uscale: [f32; MAX_QUADS],
244237
val: [f32; MAX_QUADS],
245238
n_quads: usize,
239+
/// Reusable buffer for flattened cubic points.
240+
flattened_cubics: Vec<Point32>,
246241
}
247242

248243
#[inline(always)]
@@ -454,12 +449,12 @@ fn estimate_subdiv_simd<S: Simd>(simd: S, sqrt_tol: f32, ctx: &mut FlattenCtx) {
454449
#[inline(always)]
455450
fn output_lines_simd<S: Simd>(
456451
simd: S,
457-
ctx: &FlattenCtx,
452+
ctx: &mut FlattenCtx,
458453
i: usize,
459454
x0: f32,
460455
dx: f32,
461456
n: usize,
462-
out: &mut [f32],
457+
start_idx: usize,
463458
) {
464459
let p0 = pt_splat_simd(simd, ctx.even_pts[i]);
465460
let p1 = pt_splat_simd(simd, ctx.odd_pts[i]);
@@ -473,6 +468,8 @@ fn output_lines_simd<S: Simd>(
473468
let a_inc = 4.0 * dx * da;
474469
let uscale = f32x8::splat(simd, ctx.uscale[i]);
475470

471+
let out: &mut [f32] = bytemuck::cast_slice_mut(&mut ctx.flattened_cubics[start_idx..]);
472+
476473
for j in 0..n.div_ceil(4) {
477474
let u = approx_parabola_inv_integral_simd(a);
478475
let t = u.madd(uscale, -ctx.u0[i] * uscale);
@@ -488,21 +485,15 @@ fn output_lines_simd<S: Simd>(
488485
}
489486

490487
#[inline(always)]
491-
fn flatten_cubic_simd<S: Simd>(
492-
simd: S,
493-
c: CubicBez,
494-
ctx: &mut FlattenCtx,
495-
accuracy: f32,
496-
result: &mut Vec<Point32>,
497-
) -> usize {
488+
fn flatten_cubic_simd<S: Simd>(simd: S, c: CubicBez, ctx: &mut FlattenCtx, accuracy: f32) -> usize {
498489
let n_quads = estimate_num_quads(c, accuracy);
499490
eval_cubics_simd(simd, &c, n_quads, ctx);
500491
let tol = accuracy * (1.0 - TO_QUAD_TOL);
501492
let sqrt_tol = tol.sqrt();
502493
estimate_subdiv_simd(simd, sqrt_tol, ctx);
503494
let sum: f32 = ctx.val[..n_quads].iter().sum();
504495
let n = ((0.5 * sum / sqrt_tol).ceil() as usize).max(1);
505-
result.resize(n + 4, Point32::default());
496+
ctx.flattened_cubics.resize(n + 4, Point32::default());
506497

507498
let step = sum / (n as f32);
508499
let step_recip = 1.0 / step;
@@ -519,21 +510,13 @@ fn flatten_cubic_simd<S: Simd>(
519510
if dn > 0 {
520511
let dx = step / val;
521512
let x0 = x0base * dx;
522-
output_lines_simd(
523-
simd,
524-
ctx,
525-
i,
526-
x0,
527-
dx,
528-
dn,
529-
bytemuck::cast_slice_mut(&mut result[last_n..]),
530-
);
513+
output_lines_simd(simd, ctx, i, x0, dx, dn, last_n);
531514
}
532515
x0base = this_n_next - this_n;
533516
last_n = this_n_next as usize;
534517
}
535518

536-
result[n] = ctx.even_pts[n_quads];
519+
ctx.flattened_cubics[n] = ctx.even_pts[n_quads];
537520

538521
n + 1
539522
}

0 commit comments

Comments
 (0)