Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
pippenger.bench.cpp
Go to the documentation of this file.
1
13
14#include <benchmark/benchmark.h>
15
17
18#include <cstdint>
19#include <limits>
20#include <vector>
21
22using namespace benchmark;
23
27
28namespace {
29
30class PippengerBench : public benchmark::Fixture {
31 public:
32 static constexpr size_t MAX_POINTS = 1 << 22;
34 std::vector<Fr> scalars;
36
37 void SetUp([[maybe_unused]] const ::benchmark::State& state) override
38 {
40 srs = bb::srs::get_crs_factory<Curve>()->get_crs(MAX_POINTS);
41
42 scalars.resize(MAX_POINTS);
43 for (auto& x : scalars) {
44 x = Fr::random_element(&engine);
45 }
46 }
47};
48
49// ===================== Single MSM =====================
50
51BENCHMARK_DEFINE_F(PippengerBench, PippengerUnsafe)(benchmark::State& state)
52{
53 const size_t num_points = static_cast<size_t>(state.range(0));
54 std::span<const G1> points = srs->get_monomial_points().subspan(0, num_points);
55 std::span<Fr> span(&scalars[0], num_points);
56 bb::PolynomialSpan<Fr> poly_scalars(0, span);
57
58 for (auto _ : state) {
60 bb::scalar_multiplication::pippenger_unsafe<Curve>(poly_scalars, points);
61 }
62}
63
64// ===== Round-parallel MSM (A) — window-partitioned, independent of pippenger_unsafe =====
65
66BENCHMARK_DEFINE_F(PippengerBench, PippengerRoundParallel)(benchmark::State& state)
67{
68 const size_t num_threads = static_cast<size_t>(state.range(0));
69 const size_t num_points = static_cast<size_t>(state.range(1));
70 std::span<const G1> points = srs->get_monomial_points().subspan(0, num_points);
71 std::span<Fr> span(&scalars[0], num_points);
72 bb::PolynomialSpan<Fr> poly_scalars(0, span);
73
74 const size_t original_concurrency = bb::get_num_cpus();
76
77 for (auto _ : state) {
79 bb::scalar_multiplication::pippenger_round_parallel<Curve>(poly_scalars, points);
80 }
81
82 bb::set_parallel_for_concurrency(original_concurrency);
83}
84
85BENCHMARK_DEFINE_F(PippengerBench, PippengerUnsafeThreads)(benchmark::State& state)
86{
87 const size_t num_threads = static_cast<size_t>(state.range(0));
88 const size_t num_points = static_cast<size_t>(state.range(1));
89 std::span<const G1> points = srs->get_monomial_points().subspan(0, num_points);
90 std::span<Fr> span(&scalars[0], num_points);
91 bb::PolynomialSpan<Fr> poly_scalars(0, span);
92
93 const size_t original_concurrency = bb::get_num_cpus();
95
96 for (auto _ : state) {
98 bb::scalar_multiplication::pippenger_unsafe<Curve>(poly_scalars, points);
99 }
100
101 bb::set_parallel_for_concurrency(original_concurrency);
102}
103
104// ===================== Batch MSM =====================
105
106BENCHMARK_DEFINE_F(PippengerBench, BatchMSM)(benchmark::State& state)
107{
108 const size_t num_polys = static_cast<size_t>(state.range(0));
109 const size_t poly_size = static_cast<size_t>(state.range(1));
110
111 std::vector<std::vector<Fr>> all_scalars(num_polys);
113 std::span<const G1> points = srs->get_monomial_points().subspan(0, poly_size);
114
115 for (size_t i = 0; i < num_polys; ++i) {
116 all_scalars[i].resize(poly_size);
117 for (auto& s : all_scalars[i]) {
119 }
120 scalar_spans.emplace_back(0, std::span<Fr>(all_scalars[i]));
121 }
122
123 for (auto _ : state) {
126 }
127}
128
129// ===================== Batched MSM — Chonk-representative workloads =====================
130//
131// Two paths are benchmarked side-by-side for each scenario:
132// - "Batched" calls MSM::batch_multi_scalar_mul (new multi-MSM Phases 1-6b pipeline).
133// - "PerMsm" calls pippenger_round_parallel(...) once per MSM (the legacy fallback —
134// equivalent to what MSM::batch_multi_scalar_mul did before the multi-MSM dispatcher).
135//
136// The Batched/PerMsm ratio is the metric of interest: it shows the lift from batching
137// the round-parallel scaffolding (GLV doubling, Constantine recoding, schedule build)
138// once per batch_commit instead of K times.
139//
140// Scenarios are picked to mirror the workloads observed in chonk profiles:
141// - TranslatorWires_2_17 / 2_14: K=4 dense BN254, the regression case from the
142// translator-wire batch_commit hot path.
143// - MegaOink_K11: simulates a full Mega oink wire commit (K=11, all same SRS prefix,
144// all dense) — the headline target for batching.
145// - ECCVMSparse_half: K=4 with ~50% zero scalars, the ECCVM-wire pattern that the
146// OLD MSM optimised via non-zero work-unit weighting.
147// - DatabusSparse_mostly0: K=8 short polys (n=16384) with ~75% zero scalars —
148// the databus-inverse pattern.
149
150namespace {
151struct BatchScenario {
152 const char* name;
153 size_t k; // number of MSMs
154 size_t n; // points per MSM (uniform within each scenario)
155 double zero_density; // probability of a scalar being zero (0.0 = dense)
156};
157
158std::vector<std::vector<Fr>> build_batch_scalars(
159 size_t k, size_t n, double zero_density, bb::numeric::RNG& engine, std::span<const Fr> scalar_pool)
160{
162 for (size_t m = 0; m < k; ++m) {
163 out[m].resize(n);
164 for (size_t i = 0; i < n; ++i) {
165 const bool zero = zero_density > 0.0 && (static_cast<double>(engine.get_random_uint32() & 0xFFFFFU) /
166 static_cast<double>(0x100000U)) < zero_density;
167 out[m][i] = zero ? Fr::zero() : scalar_pool[(m * n + i) % scalar_pool.size()];
168 }
169 }
170 return out;
171}
172} // namespace
173
174BENCHMARK_DEFINE_F(PippengerBench, BatchedChonk)(benchmark::State& state)
175{
176 const size_t scenario_idx = static_cast<size_t>(state.range(0));
177 // Production K + N values, derived from the actual prover call sites that drive
178 // `commitment_key.batch_commit` -> `MSM::batch_multi_scalar_mul`:
179 //
180 // Translator K=10 N=2^17 — execute_wire_and_sorted_constraints_commitments_round:
181 // 5 ConcatenatedPolynomials + 5 OrderedRangeConstraints,
182 // all at full circuit size (MINI_CIRCUIT * CONCAT = 2^13 * 16).
183 // Dense (no duplicates hint passed).
184 // MegaOink K=17 N=2^17 — OinkProver::commit_to_wires (Mega):
185 // 3 base wires (w_l/w_r/w_o, duplicates hint=true)
186 // + 4 ecc_op_wires (sparse, mostly populated only when ecc ops fire)
187 // + 10 databus polys (5 buses * 2; mostly zero outside the active bus).
188 // Approximated here as a dense single density; the per-poly
189 // heterogeneity is left to a follow-up (would need build_batch_scalars
190 // to accept per-poly densities/hints).
191 // DatabusOnly K=10 N=2^14 — isolates the databus sub-batch from Mega oink (mostly-zero
192 // wires at a smaller size — what the datbus-inverse pattern looks like).
193 //
194 // K=4 sub-batches that existed previously were not representative of any prover; removed.
195 static const std::array<BatchScenario, 5> scenarios{ {
196 { "Translator_K10_2_17", 10, 1U << 17, 0.0 },
197 { "MegaOink_K17_2_17", 17, 1U << 17, 0.0 },
198 { "DatabusOnly_K10_2_14_mostly0", 10, 1U << 14, 0.75 },
199 // ECCVM 85-wire batch split into its dense and sparse halves. ~60 wires
200 // (precompute point-table + msm-region + accumulators + shifted entities)
201 // are dense; ~25 transcript wires are populated only up to op-queue size
202 // (CONST_OP_QUEUE_LOG_SIZE = 2^12) in a 2^15 dyadic allocation, so ~87.5%
203 // zero. Sum of the two scenarios approximates the production ECCVM commit
204 // batch. BN254 used as a proxy for Grumpkin: at N=2^15 both curves sit
205 // above their native GLV threshold (2^13), so the dispatcher's only
206 // cross-MSM amortisation (the shared GLV-doubled prefix) is OFF either
207 // way and the batched/per-MSM ratio transfers.
208 { "ECCVM_dense_K60_2_15", 60, 1U << 15, 0.0 },
209 { "ECCVM_transcript_K25_2_15", 25, 1U << 15, 0.875 },
210 } };
211 const auto& sc = scenarios[scenario_idx];
212 state.SetLabel(sc.name);
213
214 auto all_scalars = build_batch_scalars(sc.k, sc.n, sc.zero_density, engine, scalars);
216 std::span<const G1> points = srs->get_monomial_points().subspan(0, sc.n);
217 for (size_t m = 0; m < sc.k; ++m) {
218 scalar_spans.emplace_back(0, std::span<Fr>(all_scalars[m]));
219 }
220
221 for (auto _ : state) {
224 }
225}
226
227BENCHMARK_DEFINE_F(PippengerBench, PerMsmChonk)(benchmark::State& state)
228{
229 const size_t scenario_idx = static_cast<size_t>(state.range(0));
230 // Production K + N values, derived from the actual prover call sites that drive
231 // `commitment_key.batch_commit` -> `MSM::batch_multi_scalar_mul`:
232 //
233 // Translator K=10 N=2^17 — execute_wire_and_sorted_constraints_commitments_round:
234 // 5 ConcatenatedPolynomials + 5 OrderedRangeConstraints,
235 // all at full circuit size (MINI_CIRCUIT * CONCAT = 2^13 * 16).
236 // Dense (no duplicates hint passed).
237 // MegaOink K=17 N=2^17 — OinkProver::commit_to_wires (Mega):
238 // 3 base wires (w_l/w_r/w_o, duplicates hint=true)
239 // + 4 ecc_op_wires (sparse, mostly populated only when ecc ops fire)
240 // + 10 databus polys (5 buses * 2; mostly zero outside the active bus).
241 // Approximated here as a dense single density; the per-poly
242 // heterogeneity is left to a follow-up (would need build_batch_scalars
243 // to accept per-poly densities/hints).
244 // DatabusOnly K=10 N=2^14 — isolates the databus sub-batch from Mega oink (mostly-zero
245 // wires at a smaller size — what the datbus-inverse pattern looks like).
246 //
247 // K=4 sub-batches that existed previously were not representative of any prover; removed.
248 static const std::array<BatchScenario, 5> scenarios{ {
249 { "Translator_K10_2_17", 10, 1U << 17, 0.0 },
250 { "MegaOink_K17_2_17", 17, 1U << 17, 0.0 },
251 { "DatabusOnly_K10_2_14_mostly0", 10, 1U << 14, 0.75 },
252 // ECCVM 85-wire batch split into its dense and sparse halves. ~60 wires
253 // (precompute point-table + msm-region + accumulators + shifted entities)
254 // are dense; ~25 transcript wires are populated only up to op-queue size
255 // (CONST_OP_QUEUE_LOG_SIZE = 2^12) in a 2^15 dyadic allocation, so ~87.5%
256 // zero. Sum of the two scenarios approximates the production ECCVM commit
257 // batch. BN254 used as a proxy for Grumpkin: at N=2^15 both curves sit
258 // above their native GLV threshold (2^13), so the dispatcher's only
259 // cross-MSM amortisation (the shared GLV-doubled prefix) is OFF either
260 // way and the batched/per-MSM ratio transfers.
261 { "ECCVM_dense_K60_2_15", 60, 1U << 15, 0.0 },
262 { "ECCVM_transcript_K25_2_15", 25, 1U << 15, 0.875 },
263 } };
264 const auto& sc = scenarios[scenario_idx];
265 state.SetLabel(sc.name);
266
267 auto all_scalars = build_batch_scalars(sc.k, sc.n, sc.zero_density, engine, scalars);
268 std::span<const G1> points = srs->get_monomial_points().subspan(0, sc.n);
269
270 for (auto _ : state) {
272 for (size_t m = 0; m < sc.k; ++m) {
273 bb::PolynomialSpan<const Fr> sp(0, std::span<const Fr>(all_scalars[m].data(), sc.n));
274 (void)bb::scalar_multiplication::pippenger_round_parallel<Curve>(sp, points);
275 }
276 }
277}
278
289BENCHMARK_DEFINE_F(PippengerBench, BatchMSM_1656)(benchmark::State& state)
290{
291 const size_t num_threads = static_cast<size_t>(state.range(0));
292 const size_t msm_size = static_cast<size_t>(state.range(1));
293
294 std::vector<Fr> msm_scalars(msm_size);
295 for (auto& s : msm_scalars) {
297 }
298
300 scalar_spans.emplace_back(0, std::span<Fr>(msm_scalars));
301 std::span<const G1> points = srs->get_monomial_points().subspan(0, msm_size);
302
303 // This is thread-local: restore after the benchmark so other cases in this binary are unaffected.
304 const size_t original_concurrency = bb::get_num_cpus();
306
307 for (auto _ : state) {
310 }
311
312 bb::set_parallel_for_concurrency(original_concurrency);
313}
314
315// ===================== Sparsity-profile single MSM =====================
316//
317// Single-MSM pippenger_round_parallel across dyadic sizes 2^15..2^19 under two scalar
318// distributions, to A/B the thread-pool backend (new generation-counter pool vs the
319// merge-train pool) at the workload shapes that stress the round-parallel scaffolding
320// and the dedup pre-pass.
321//
322// Dense80 — 80% uniformly-random nonzero scalars, 20% zero. Exercises the main
323// bucket-accumulation pipeline with light sparsity. dedup_hint=false.
324// DupHeavy — 50% unique random, 25% all equal to one random scalar A, 5% all equal
325// to another random scalar B, 20% zero. Heavy duplication drives the
326// Phase A dedup pre-pass (the most thread-intensive stage), so this is
327// the case most sensitive to pool dispatch / oversubscription behavior.
328// dedup_hint=true.
329//
330// Scalars are drawn from the fixture's deterministic debug RNG so the A/B runs on the
331// two pool backends see identical inputs.
332namespace {
333enum class SparsityProfile : uint8_t { Dense80 = 0, DupHeavy = 1 };
334
335[[nodiscard]] double uniform01(bb::numeric::RNG& engine) noexcept
336{
337 return static_cast<double>(engine.get_random_uint32()) / static_cast<double>(std::numeric_limits<uint32_t>::max());
338}
339
340std::vector<Fr> build_sparsity_scalars(SparsityProfile profile, size_t n, bb::numeric::RNG& engine)
341{
342 std::vector<Fr> out(n);
343 if (profile == SparsityProfile::Dense80) {
344 for (size_t i = 0; i < n; ++i) {
345 out[i] = (uniform01(engine) < 0.20) ? Fr::zero() : Fr::random_element(&engine);
346 }
347 } else {
348 const Fr dup_a = Fr::random_element(&engine);
349 const Fr dup_b = Fr::random_element(&engine);
350 for (size_t i = 0; i < n; ++i) {
351 const double r = uniform01(engine);
352 if (r < 0.20) {
353 out[i] = Fr::zero(); // 20% zero
354 } else if (r < 0.45) {
355 out[i] = dup_a; // 25% duplicate of A
356 } else if (r < 0.50) {
357 out[i] = dup_b; // 5% duplicate of B
358 } else {
359 out[i] = Fr::random_element(&engine); // 50% unique random
360 }
361 }
362 }
363 return out;
364}
365} // namespace
366
367BENCHMARK_DEFINE_F(PippengerBench, PippengerSparsity)(benchmark::State& state)
368{
369 const auto profile = static_cast<SparsityProfile>(state.range(0));
370 const size_t num_points = static_cast<size_t>(state.range(1));
371 const bool dedup_hint = (profile == SparsityProfile::DupHeavy);
372 state.SetLabel(profile == SparsityProfile::Dense80 ? "Dense80" : "DupHeavy");
373
374 // Build the scalar set from a fresh RNG re-seeded deterministically per (profile, size)
375 // rather than from the shared advancing engine. Two reasons:
376 // 1. Every benchmark repetition (--benchmark_repetitions) reuses the SAME scalars, so the
377 // measured variance reflects pool/scheduler noise only, not input variation.
378 // 2. The input is independent of benchmark execution order, so a filtered subset or a
379 // pool-toggled A/B run sees byte-identical scalars — the comparison is properly paired.
380 // The scalar build is outside the timed `for (auto _ : state)` loop regardless, so RNG cost
381 // never enters the measurement.
382 const std::uint_fast64_t case_seed =
383 0xC0FFEEULL + (static_cast<std::uint_fast64_t>(profile) << 32) + static_cast<std::uint_fast64_t>(num_points);
384 bb::numeric::RNG& case_engine = bb::numeric::get_debug_randomness(/*reset=*/true, /*seed=*/case_seed);
385
386 std::vector<Fr> msm_scalars = build_sparsity_scalars(profile, num_points, case_engine);
387 std::span<const G1> points = srs->get_monomial_points().subspan(0, num_points);
388 bb::PolynomialSpan<const Fr> poly_scalars(0, std::span<const Fr>(msm_scalars.data(), num_points));
389
390 for (auto _ : state) {
392 (void)bb::scalar_multiplication::pippenger_round_parallel<Curve>(poly_scalars, points, dedup_hint);
393 }
394}
395
396// ===================== Registration =====================
397
398// Single MSM: 2^14 to 2^20
399BENCHMARK_REGISTER_F(PippengerBench, PippengerUnsafe)
400 ->Unit(benchmark::kMillisecond)
401 ->RangeMultiplier(4)
402 ->Range(1 << 14, 1 << 20);
403
404// Sparsity-profile single MSM: {profile (0=Dense80, 1=DupHeavy), size}, sizes 2^15..2^19.
405BENCHMARK_REGISTER_F(PippengerBench, PippengerSparsity)
406 ->Unit(benchmark::kMillisecond)
407 ->ArgsProduct({ { 0, 1 }, { 1 << 15, 1 << 16, 1 << 17, 1 << 18, 1 << 19 } });
408
409// Batch MSM: {num_polynomials, polynomial_size}
410// AVM-like: 32 polys of size 2^21 (one batch from ~2618 wire polys committed in batches of 32)
411BENCHMARK_REGISTER_F(PippengerBench, BatchMSM)
412 ->Unit(benchmark::kMillisecond)
413 ->Args({ 32, 1 << 19 })
414 ->Args({ 32, 1 << 21 });
415
416// Issue #1656 target: {threads=256, msm_size}
417BENCHMARK_REGISTER_F(PippengerBench, BatchMSM_1656)
418 ->Unit(benchmark::kMillisecond)
419 ->Args({ 256, 1 << 16 })
420 ->Args({ 256, 1 << 20 });
421
422// Chonk-representative batched MSM workloads. Scenario index 0..6 indexes into the
423// scenario table above. Run "Batched" and "PerMsm" with matching indices and compare.
424BENCHMARK_REGISTER_F(PippengerBench, BatchedChonk)->Unit(benchmark::kMillisecond)->DenseRange(0, 4, 1);
425BENCHMARK_REGISTER_F(PippengerBench, PerMsmChonk)->Unit(benchmark::kMillisecond)->DenseRange(0, 4, 1);
426
427// Grid sweep for A vs B: {threads, size}. N covers 2^7..2^21 (with extra in-between
428// points around the GLV crossover).
429BENCHMARK_REGISTER_F(PippengerBench, PippengerRoundParallel)
430 ->Unit(benchmark::kMillisecond)
431 ->ArgsProduct({ { 1, 4, 8, 12, 16, 32, 64, 128 },
432 { 1 << 7,
433 1 << 8,
434 1 << 9,
435 1 << 10,
436 1 << 11,
437 1 << 12,
438 1 << 13,
439 1 << 14,
440 1 << 15,
441 3 << 14,
442 1 << 16,
443 3 << 15,
444 1 << 17,
445 3 << 16,
446 1 << 18,
447 1 << 19,
448 1 << 20,
449 1 << 21 } });
450
451BENCHMARK_REGISTER_F(PippengerBench, PippengerUnsafeThreads)
452 ->Unit(benchmark::kMillisecond)
453 ->ArgsProduct({ { 1, 4, 8, 12, 16, 32, 64, 128 },
454 { 1 << 9,
455 1 << 10,
456 1 << 11,
457 1 << 12,
458 1 << 13,
459 1 << 14,
460 1 << 15,
461 1 << 16,
462 1 << 17,
463 1 << 18,
464 1 << 19,
465 1 << 20 } });
466
467} // namespace
468
typename Group::affine_element AffineElement
Definition bn254.hpp:22
bb::fr ScalarField
Definition bn254.hpp:18
static std::vector< AffineElement > batch_multi_scalar_mul(std::span< const AffineElement > points, std::span< PolynomialSpan< ScalarField > > scalars, bool handle_edge_cases=true, std::span< const uint8_t > dedup_hints={}) noexcept
numeric::RNG & engine
#define GOOGLE_BB_BENCH_REPORTER(state)
RNG & get_debug_randomness(bool reset, std::uint_fast64_t seed)
Definition engine.cpp:245
std::filesystem::path bb_crs_path()
void init_file_crs_factory(const std::filesystem::path &path)
size_t get_num_cpus()
Definition thread.cpp:33
void set_parallel_for_concurrency(size_t num_cores)
Definition thread.cpp:23
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
BENCHMARK_MAIN()
Curve::AffineElement G1
std::byte * data
static field random_element(numeric::RNG *engine=nullptr) noexcept
static constexpr field zero()