Skip to content

Commit 1cf36b9

Browse files
jonathanpwang authored and claude committed
perf(static-verifier): reduce WHIR verification cell count by 50%
Three optimizations to reduce Halo2 advice cells in WHIR verification:

1. Pre-reduce + fused `scalar_mul_add` in Horner polynomial evaluation: eliminate the redundant `reduce()` on Copy'd high-`max_bits` values every Horner step, and fuse `scalar_mul` + `add` into `mul_add` gates (-38%).
2. Select-then-compress Merkle paths: select the input order first, then compress once, instead of compressing both orderings and selecting after — halving Poseidon2 calls per Merkle level (-19%).
3. Pre-reduce `x_pow`/`x_inv_pow` in `binary_k_fold`: eliminate redundant reduces after squaring in the fold loop.

Total: 37.99M → 18.87M cells (50.3% reduction).

Co-Authored-By: Claude Opus 4.6 <[email protected]>
1 parent 17a3651 commit 1cf36b9

File tree

3 files changed

+65
-24
lines changed

3 files changed

+65
-24
lines changed

crates/static-verifier/src/field/baby_bear/extension.rs

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -141,6 +141,25 @@ impl BabyBearExt4Chip {
         )
     }

+    /// Fused `a * b + c` where `b` is a base-field scalar.
+    /// Uses `mul_add` gates to save cells vs separate `scalar_mul` + `add`.
+    pub fn scalar_mul_add(
+        &self,
+        ctx: &mut Context<Fr>,
+        a: BabyBearExt4Wire,
+        b: BabyBearWire,
+        c: BabyBearExt4Wire,
+    ) -> BabyBearExt4Wire {
+        BabyBearExt4Wire(
+            a.0.iter()
+                .zip(c.0.iter())
+                .map(|(ai, ci)| self.base.mul_add(ctx, *ai, b, *ci))
+                .collect_vec()
+                .try_into()
+                .unwrap(),
+        )
+    }
+
     pub fn select(
         &self,
         ctx: &mut Context<Fr>,

crates/static-verifier/src/stages/shared_math.rs

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -41,9 +41,12 @@ pub(crate) fn horner_eval_ext_poly_assigned(
     if coeffs.is_empty() {
         return ext_chip.zero(ctx);
     }
+    // Pre-reduce x so that ext_mul doesn't redundantly reduce the same
+    // high-max_bits components on every Horner step.
+    let x_reduced = ext_chip.reduce_max_bits(ctx, *x);
     let mut acc = *coeffs.last().unwrap();
     for coeff in coeffs.iter().rev().skip(1) {
-        acc = ext_chip.mul(ctx, acc, *x);
+        acc = ext_chip.mul(ctx, acc, x_reduced);
         acc = ext_chip.add(ctx, acc, *coeff);
     }
     acc
@@ -58,10 +61,12 @@ pub(crate) fn horner_eval_ext_poly_f_assigned(
     if coeffs.is_empty() {
         return ext_chip.zero(ctx);
     }
+    // Pre-reduce x so that each mul_add step inside the loop doesn't redundantly
+    // reduce the same high-max_bits value on every iteration.
+    let x_reduced = ext_chip.base().reduce_max_bits(ctx, *x);
     let mut acc = *coeffs.last().unwrap();
     for coeff in coeffs.iter().rev().skip(1) {
-        acc = ext_chip.scalar_mul(ctx, acc, *x);
-        acc = ext_chip.add(ctx, acc, *coeff);
+        acc = ext_chip.scalar_mul_add(ctx, acc, x_reduced, *coeff);
     }
     acc
 }

crates/static-verifier/src/stages/whir/mod.rs

Lines changed: 38 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -297,9 +297,9 @@ fn binary_k_fold_assigned(
     let inv_tw = omega_k_inv.powers().take(1usize << (k - 1)).collect();
     let half = RootF::ONE.halve();

-    let mut x_pow = x;
+    let mut x_pow = base_chip.reduce_max_bits(ctx, x);
     let x_inv = invert_base_assigned(ctx, base_chip, x);
-    let mut x_inv_pow = x_inv;
+    let mut x_inv_pow = base_chip.reduce_max_bits(ctx, x_inv);

     for (j, alpha) in alphas.iter().enumerate() {
         let m = n >> (j + 1);
@@ -318,7 +318,9 @@ fn binary_k_fold_assigned(
             values[i] = ext_chip.add(ctx, lo, fold);
         }
         x_pow = base_chip.square(ctx, x_pow);
+        x_pow = base_chip.reduce_max_bits(ctx, x_pow);
         x_inv_pow = base_chip.square(ctx, x_inv_pow);
+        x_inv_pow = base_chip.reduce_max_bits(ctx, x_inv_pow);
     }
     values[0]
 }
@@ -380,9 +382,11 @@ fn constrain_merkle_path(
     let query_bits = query_bits.to_vec();

     for (bit, &sibling) in query_bits.iter().zip(merkle_path.siblings.iter()) {
-        let left_right = compress_bn254_digests(ctx, ext_chip.range(), cur, sibling);
-        let right_left = compress_bn254_digests(ctx, ext_chip.range(), sibling, cur);
-        cur = gate.select(ctx, right_left, left_right, *bit);
+        // Select input order first, then compress once (instead of compressing
+        // both orderings and selecting after). Halves Poseidon2 calls per level.
+        let left = gate.select(ctx, sibling, cur, *bit);
+        let right = gate.select(ctx, cur, sibling, *bit);
+        cur = compress_bn254_digests(ctx, ext_chip.range(), left, right);
     }

     ctx.constrain_equal(&cur, &root_digest);
@@ -580,6 +584,7 @@ pub(crate) fn constrain_whir_verification(
                 mu_power_idx += 1;
             }
         }
+
         let codeword_vals = codeword_vals.into_iter().flatten().collect::<Vec<_>>();
         binary_k_fold_assigned(ctx, ext_chip, codeword_vals, &alphas_round, zi_root)
     } else {
@@ -591,6 +596,7 @@ pub(crate) fn constrain_whir_verification(
         merkle_path,
         codeword_commitment_roots[round_idx - 1],
     );
+
     let opened_values = merkle_path
         .leaf_values
         .iter()
@@ -650,19 +656,21 @@ pub(crate) fn constrain_whir_verification(
     let mut z0_pows = Vec::with_capacity(slc_len);
     z0_pows.push(*z0);
     for _ in 1..slc_len {
-        let next = ext_chip.mul(
-            ctx,
-            *z0_pows.last().expect("z0 power sequence is non-empty"),
-            *z0_pows.last().expect("z0 power sequence is non-empty"),
-        );
+        let prev = z0_pows.last().unwrap();
+        let next = ext_chip.square(ctx, *prev);
         z0_pows.push(next);
     }
-    let z0_max = *z0_pows.last().expect("z0 power sequence is non-empty");
+    let z0_max = *z0_pows.last().unwrap();
+    // Pre-reduce z0_pows so eq_mle doesn't redundantly reduce them.
+    let z0_pows_reduced: Vec<_> = z0_pows
+        .iter()
+        .map(|p| ext_chip.reduce_max_bits(ctx, *p))
+        .collect();
     let eq = eval_eq_mle_assigned(
         ctx,
         ext_chip,
         alpha_slc,
-        &z0_pows[..z0_pows.len().saturating_sub(1)],
+        &z0_pows_reduced[..z0_pows_reduced.len().saturating_sub(1)],
     );
     let poly_eval = horner_eval_ext_poly_assigned(ctx, ext_chip, final_poly, &z0_max);
     let term = ext_chip.mul(ctx, *gamma, eq);
@@ -673,25 +681,34 @@ pub(crate) fn constrain_whir_verification(

     profiler.push("query_point_evals", ctx.advice.len());
     let mut gamma_pow = ext_chip.mul(ctx, *gamma, *gamma);
-    for zi in &zs_per_round[round_idx] {
+    for zi in zs_per_round[round_idx].iter() {
         let mut zi_pows = Vec::with_capacity(slc_len);
         zi_pows.push(*zi);
         for _ in 1..slc_len {
-            let next = ext_chip.base().mul(
-                ctx,
-                *zi_pows.last().expect("zi power sequence is non-empty"),
-                *zi_pows.last().expect("zi power sequence is non-empty"),
-            );
+            let prev = zi_pows.last().unwrap();
+            let next = ext_chip.base().square(ctx, *prev);
             zi_pows.push(next);
         }
-        let zi_max = *zi_pows.last().expect("zi power sequence is non-empty");
+        // Pre-reduce zi_pows so eq_mle and poly eval don't redundantly reduce.
+        let zi_pows_reduced: Vec<_> = zi_pows
+            .iter()
+            .map(|p| ext_chip.base().reduce_max_bits(ctx, *p))
+            .collect();
+
         let eq = eval_eq_mle_ef_f_assigned(
             ctx,
             ext_chip,
             alpha_slc,
-            &zi_pows[..zi_pows.len().saturating_sub(1)],
+            &zi_pows_reduced[..zi_pows_reduced.len().saturating_sub(1)],
         );
-        let poly_eval = horner_eval_ext_poly_f_assigned(ctx, ext_chip, final_poly, &zi_max);
+
+        let poly_eval = horner_eval_ext_poly_f_assigned(
+            ctx,
+            ext_chip,
+            final_poly,
+            zi_pows_reduced.last().unwrap(),
+        );
+
         let term = ext_chip.mul(ctx, gamma_pow, eq);
         let term = ext_chip.mul(ctx, term, poly_eval);
         final_acc = ext_chip.add(ctx, final_acc, term);

0 commit comments

Comments
 (0)