Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
sumcheck_iteration.bench.cpp
Go to the documentation of this file.
23
24#include <benchmark/benchmark.h>
25
26#include <algorithm>
27#include <atomic>
28#include <numeric>
29#include <string_view>
30
31namespace {
32
33using namespace bb;
34
35struct ActiveBlock {
36 size_t start = 0;
37 size_t size = 0;
38};
39
40std::vector<ActiveBlock> make_active_blocks(const size_t round_size, const bool fragmented)
41{
42 if (!fragmented) {
43 return { ActiveBlock{ .start = 0, .size = round_size } };
44 }
45
46 // Translator-like round-0 shape seen in Chonk: roughly 65% of edge-pairs are active,
47 // split across a small number of ranges from concatenated mini-circuit wires plus tails.
48 constexpr size_t NUM_BLOCKS = 18;
49 constexpr size_t ACTIVE_PERCENT = 65;
50 const size_t total_pairs = round_size / 2;
51 const size_t active_pairs = std::max<size_t>(NUM_BLOCKS, total_pairs * ACTIVE_PERCENT / 100);
52 const size_t inactive_pairs = total_pairs - active_pairs;
53 const size_t base_block_pairs = active_pairs / NUM_BLOCKS;
54 const size_t extra_block_pairs = active_pairs % NUM_BLOCKS;
55 const size_t base_gap_pairs = inactive_pairs / NUM_BLOCKS;
56 const size_t extra_gap_pairs = inactive_pairs % NUM_BLOCKS;
57
59 blocks.reserve(NUM_BLOCKS);
60 size_t pair_cursor = 0;
61 for (size_t block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
62 const size_t gap = base_gap_pairs + (block_idx < extra_gap_pairs ? 1 : 0);
63 pair_cursor += gap;
64 const size_t block_pairs = base_block_pairs + (block_idx < extra_block_pairs ? 1 : 0);
65 blocks.push_back(ActiveBlock{ .start = pair_cursor * 2, .size = block_pairs * 2 });
66 pair_cursor += block_pairs;
67 }
68 return blocks;
69}
70
71template <typename FF> std::vector<FF> make_challenges(const size_t log_n)
72{
73 std::vector<FF> result;
74 result.reserve(log_n);
75 for (size_t idx = 0; idx < log_n; ++idx) {
76 result.emplace_back(static_cast<uint64_t>(idx + 7));
77 }
78 return result;
79}
80
81template <typename Flavor> class SyntheticPolynomials {
82 public:
83 using FF = typename Flavor::FF;
86
87 explicit SyntheticPolynomials(const size_t size)
88 {
89 storage.reserve(Flavor::NUM_ALL_ENTITIES);
90 for (size_t poly_idx = 0; poly_idx < Flavor::NUM_ALL_ENTITIES; ++poly_idx) {
91 storage.emplace_back(size);
92 const FF value = FF(static_cast<uint64_t>(poly_idx + 3));
93 for (auto& coeff : storage.back().coeffs()) {
94 coeff = value;
95 }
96 }
97 for (auto [prover_poly, stored_poly] : zip_view(polynomials.get_all(), storage)) {
98 prover_poly = stored_poly.share();
99 }
100 if constexpr (requires(ProverPolynomials& p) { p.row_skip_active_prefix_end = size_t{}; }) {
101 polynomials.row_skip_active_prefix_end = size;
102 }
103 }
104
105 ProverPolynomials polynomials;
106
107 private:
109};
110
111template <typename Flavor, typename Edges> void fill_extended_edges(Edges& edges)
112{
113 using FF = typename Flavor::FF;
114 size_t entity_idx = 0;
115 for (auto& edge : edges.get_all()) {
116 const FF value = FF(static_cast<uint64_t>(entity_idx + 11));
117 for (auto& evaluation : edge.evaluations) {
118 evaluation = value;
119 }
120 ++entity_idx;
121 }
122}
123
124template <typename Flavor> auto make_relation_parameters()
125{
126 using FF = typename Flavor::FF;
128 params.eta = FF(17);
129 params.eta_two = FF(19);
130 params.eta_three = FF(23);
131 params.beta = FF(29);
132 params.gamma = FF(31);
133 params.public_input_delta = FF(37);
134 params.eccvm_set_permutation_delta = FF(43);
135 return params;
136}
137
138template <typename Flavor> auto make_subrelation_separators()
139{
140 using FF = typename Flavor::FF;
142 for (size_t idx = 0; idx < alphas.size(); ++idx) {
143 alphas[idx] = FF(static_cast<uint64_t>(idx + 101));
144 }
145 return alphas;
146}
147
148template <typename Flavor>
149void accumulate_one_edge(SumcheckProverRound<Flavor>& round,
151 const auto& extended_edges,
152 const bb::RelationParameters<typename Flavor::FF>& relation_parameters,
153 const typename Flavor::FF& scaling_factor)
154{
155 round.accumulate_relation_univariates_public(accum, extended_edges, relation_parameters, scaling_factor);
156}
157
158template <typename Flavor> void bench_accumulate_relations_only(benchmark::State& state)
159{
160 using FF = typename Flavor::FF;
161 using Round = SumcheckProverRound<Flavor>;
162 typename Round::SumcheckTupleOfTuplesOfUnivariates accum{};
163 typename Round::ExtendedEdges edges;
164 fill_extended_edges<Flavor>(edges);
165 auto params = make_relation_parameters<Flavor>();
166 Round round(/*initial_round_size=*/2);
167 const FF scaling_factor = FF(5);
168
169 for (auto _ : state) {
170 accumulate_one_edge<Flavor>(round, accum, edges, params, scaling_factor);
171 benchmark::DoNotOptimize(accum);
172 }
173
174 state.counters["relations"] = static_cast<double>(Flavor::NUM_RELATIONS);
175 state.counters["subrelations"] = static_cast<double>(Flavor::NUM_SUBRELATIONS);
176}
177
178enum class Scheduler {
179 STATIC_BLOCKS,
180 CHUNK_STEALING,
181};
182
183template <typename Flavor>
184void bench_sumcheck_loop_shape(benchmark::State& state, const Scheduler scheduler, const bool fragmented)
185{
186 using FF = typename Flavor::FF;
187 using Round = SumcheckProverRound<Flavor>;
188 using Tuple = typename Round::SumcheckTupleOfTuplesOfUnivariates;
189
190 const size_t log_n = static_cast<size_t>(state.range(0));
191 const size_t round_size = size_t{ 1 } << log_n;
192 constexpr size_t ROWS_PER_CHUNK = 64;
193
194 SyntheticPolynomials<Flavor> synthetic_polynomials(round_size);
195 auto& polynomials = synthetic_polynomials.polynomials;
196 auto params = make_relation_parameters<Flavor>();
197 const auto blocks = make_active_blocks(round_size, fragmented);
198
199 for (auto _ : state) {
200 Round round(round_size);
201 std::vector<Tuple> thread_accumulators(get_num_cpus());
202
203 if (scheduler == Scheduler::STATIC_BLOCKS) {
204 parallel_for([&](ThreadChunk chunk) {
205 typename Round::ExtendedEdges extended_edges;
206 for (const auto& block : blocks) {
207 const size_t iterations = block.size / 2;
208 for (size_t i : chunk.range(iterations)) {
209 const size_t edge_idx = block.start + i * 2;
210 round.extend_edges(extended_edges, polynomials, edge_idx);
211 accumulate_one_edge<Flavor>(
212 round, thread_accumulators[chunk.thread_index], extended_edges, params, FF(7));
213 }
214 }
215 });
216 } else {
218 for (const auto& block : blocks) {
219 for (size_t start = block.start; start < block.start + block.size; start += ROWS_PER_CHUNK) {
220 chunks.push_back(ActiveBlock{
221 .start = start,
222 .size = std::min(ROWS_PER_CHUNK, block.start + block.size - start),
223 });
224 }
225 }
226
227 std::atomic<size_t> next_chunk{ 0 };
228 const size_t num_slots = std::min(get_num_cpus(), std::max<size_t>(chunks.size(), 1));
229 thread_accumulators.resize(num_slots);
230 parallel_for(num_slots, [&](size_t slot_idx) {
231 typename Round::ExtendedEdges extended_edges;
232 while (true) {
233 const size_t chunk_idx = next_chunk.fetch_add(1, std::memory_order_relaxed);
234 if (chunk_idx >= chunks.size()) {
235 break;
236 }
237 const auto& chunk = chunks[chunk_idx];
238 for (size_t edge_idx = chunk.start; edge_idx < chunk.start + chunk.size; edge_idx += 2) {
239 round.extend_edges(extended_edges, polynomials, edge_idx);
240 accumulate_one_edge<Flavor>(
241 round, thread_accumulators[slot_idx], extended_edges, params, FF(7));
242 }
243 }
244 });
245 }
246
247 Tuple total{};
248 for (const auto& accum : thread_accumulators) {
250 }
251 benchmark::DoNotOptimize(total);
252 }
253
254 size_t active_edges = 0;
255 for (const auto& block : blocks) {
256 active_edges += block.size;
257 }
258 state.counters["active_edge_pairs"] = static_cast<double>(active_edges / 2);
259 state.counters["active_pct"] = 100.0 * static_cast<double>(active_edges) / static_cast<double>(round_size);
260 state.counters["blocks"] = static_cast<double>(blocks.size());
261 state.counters["relations"] = static_cast<double>(Flavor::NUM_RELATIONS);
262 state.counters["subrelations"] = static_cast<double>(Flavor::NUM_SUBRELATIONS);
263 state.counters["threads"] = static_cast<double>(get_num_cpus());
264}
265
266struct NanoSpec {
267 size_t relations = 0;
268 size_t subrelations = 0;
269 size_t heavy_period = 0;
270};
271
272template <Scheduler scheduler> void bench_nano_scheduler(benchmark::State& state)
273{
274 using FF = bb::fr;
275 const size_t log_n = static_cast<size_t>(state.range(0));
276 const size_t rows = size_t{ 1 } << log_n;
277 const bool imbalanced = static_cast<bool>(state.range(1));
278 const auto blocks = make_active_blocks(rows, /*fragmented=*/true);
279 const NanoSpec spec{ .relations = 12, .subrelations = 36, .heavy_period = imbalanced ? 8UL : 1UL };
280 constexpr size_t ROWS_PER_CHUNK = 64;
281
282 auto do_row = [&](std::array<FF, 64>& accum, const size_t row) {
283 const bool heavy = ((row / 2) % spec.heavy_period) == 0;
284 const size_t active_relations = heavy ? spec.relations : 2;
285 FF x = FF(static_cast<uint64_t>((row & 255) + 3));
286 for (size_t relation_idx = 0; relation_idx < active_relations; ++relation_idx) {
287 for (size_t subrelation_idx = 0; subrelation_idx < spec.subrelations / spec.relations; ++subrelation_idx) {
288 x = x * FF(static_cast<uint64_t>(relation_idx + 5)) + FF(static_cast<uint64_t>(subrelation_idx + 7));
289 accum[relation_idx * 4 + subrelation_idx] += x;
290 }
291 }
292 };
293
294 for (auto _ : state) {
296 for (auto& accum : accumulators) {
297 std::fill(accum.begin(), accum.end(), FF::zero());
298 }
299
300 if constexpr (scheduler == Scheduler::STATIC_BLOCKS) {
301 parallel_for([&](ThreadChunk chunk) {
302 for (const auto& block : blocks) {
303 const size_t iterations = block.size / 2;
304 for (size_t i : chunk.range(iterations)) {
305 do_row(accumulators[chunk.thread_index], block.start + i * 2);
306 }
307 }
308 });
309 } else {
311 for (const auto& block : blocks) {
312 for (size_t start = block.start; start < block.start + block.size; start += ROWS_PER_CHUNK) {
313 chunks.push_back(ActiveBlock{
314 .start = start,
315 .size = std::min(ROWS_PER_CHUNK, block.start + block.size - start),
316 });
317 }
318 }
319
320 std::atomic<size_t> next_chunk{ 0 };
321 const size_t num_slots = std::min(get_num_cpus(), std::max<size_t>(chunks.size(), 1));
322 accumulators.resize(num_slots);
323 parallel_for(num_slots, [&](size_t slot_idx) {
324 while (true) {
325 const size_t chunk_idx = next_chunk.fetch_add(1, std::memory_order_relaxed);
326 if (chunk_idx >= chunks.size()) {
327 break;
328 }
329 const auto& chunk = chunks[chunk_idx];
330 for (size_t edge_idx = chunk.start; edge_idx < chunk.start + chunk.size; edge_idx += 2) {
331 do_row(accumulators[slot_idx], edge_idx);
332 }
333 }
334 });
335 }
336 benchmark::DoNotOptimize(accumulators);
337 }
338
339 state.counters["relations"] = static_cast<double>(spec.relations);
340 state.counters["subrelations"] = static_cast<double>(spec.subrelations);
341 state.counters["threads"] = static_cast<double>(get_num_cpus());
342 state.counters["imbalanced"] = imbalanced ? 1.0 : 0.0;
343}
344
345template <typename Flavor> void bench_compute_univariate_round0(benchmark::State& state)
346{
347 using FF = typename Flavor::FF;
348 using Round = SumcheckProverRound<Flavor>;
349 const size_t log_n = static_cast<size_t>(state.range(0));
350 const size_t round_size = size_t{ 1 } << log_n;
351
352 SyntheticPolynomials<Flavor> synthetic_polynomials(round_size);
353 auto params = make_relation_parameters<Flavor>();
354 auto alphas = make_subrelation_separators<Flavor>();
355 GateSeparatorPolynomial<FF> gate_separators(make_challenges<FF>(log_n), log_n);
356
357 for (auto _ : state) {
358 Round round(round_size);
359 auto result = round.compute_univariate(synthetic_polynomials.polynomials, params, gate_separators, alphas);
360 benchmark::DoNotOptimize(result);
361 }
362
363 state.counters["relations"] = static_cast<double>(Flavor::NUM_RELATIONS);
364 state.counters["subrelations"] = static_cast<double>(Flavor::NUM_SUBRELATIONS);
365 state.counters["threads"] = static_cast<double>(get_num_cpus());
366}
367
368template <typename Flavor> void register_flavor_benches(const std::string& name, const bool fragmented)
369{
370 benchmark::RegisterBenchmark((name + "/accumulate_relations_only").c_str(),
371 &bench_accumulate_relations_only<Flavor>)
372 ->UseRealTime();
373 benchmark::RegisterBenchmark((name + "/loop_static_blocks").c_str(),
374 &bench_sumcheck_loop_shape<Flavor>,
375 Scheduler::STATIC_BLOCKS,
376 fragmented)
377 ->Arg(17)
378 ->UseRealTime()
379 ->Unit(benchmark::kMillisecond);
380 benchmark::RegisterBenchmark((name + "/loop_chunk_stealing").c_str(),
381 &bench_sumcheck_loop_shape<Flavor>,
382 Scheduler::CHUNK_STEALING,
383 fragmented)
384 ->Arg(17)
385 ->UseRealTime()
386 ->Unit(benchmark::kMillisecond);
387 benchmark::RegisterBenchmark((name + "/compute_univariate_round0").c_str(),
388 &bench_compute_univariate_round0<Flavor>)
389 ->Arg(17)
390 ->UseRealTime()
391 ->Unit(benchmark::kMillisecond);
392}
393
394} // namespace
395
396int main(int argc, char** argv)
397{
398 register_flavor_benches<bb::MegaZKFlavor>("MegaZK", /*fragmented=*/false);
399 register_flavor_benches<bb::TranslatorShortMonomialFlavor>("TranslatorShort", /*fragmented=*/true);
400 register_flavor_benches<bb::ECCVMShortMonomialFlavor>("ECCVMShort", /*fragmented=*/false);
401 benchmark::RegisterBenchmark("Nano/static_blocks", &bench_nano_scheduler<Scheduler::STATIC_BLOCKS>)
402 ->Args({ 17, 0 })
403 ->Args({ 17, 1 })
404 ->UseRealTime()
405 ->Unit(benchmark::kMillisecond);
406 benchmark::RegisterBenchmark("Nano/chunk_stealing", &bench_nano_scheduler<Scheduler::CHUNK_STEALING>)
407 ->Args({ 17, 0 })
408 ->Args({ 17, 1 })
409 ->UseRealTime()
410 ->Unit(benchmark::kMillisecond);
411
412 benchmark::Initialize(&argc, argv);
413 benchmark::RunSpecifiedBenchmarks();
414 benchmark::Shutdown();
415 return 0;
416}
A container for the prover polynomials.
typename Curve::ScalarField FF
static constexpr size_t NUM_SUBRELATIONS
static constexpr size_t NUM_ALL_ENTITIES
static constexpr size_t NUM_RELATIONS
Structured polynomial class that represents the coefficients 'a' of a_0 + a_1 x .....
static constexpr void add_nested_tuples(Tuple &tuple_1, const Tuple &tuple_2)
Componentwise addition of nested tuples (tuples of tuples)
Definition utils.hpp:118
Imlementation of the Sumcheck prover round.
decltype(create_sumcheck_tuple_of_tuples_of_univariates< Relations >()) SumcheckTupleOfTuplesOfUnivariates
SumcheckRoundUnivariate compute_univariate(ProverPolynomialsOrPartiallyEvaluatedMultivariates &polynomials, const bb::RelationParameters< FF > &relation_parameters, const bb::GateSeparatorPolynomial< FF > &gate_separators, const SubrelationSeparators &alphas)
Return the evaluations of the univariate round polynomials. Toggles between chunked computation (desi...
void extend_edges(ExtendedEdges &extended_edges, const ProverPolynomialsOrPartiallyEvaluatedMultivariates &multivariates, const size_t edge_idx)
To compute the round univariate in Round , the prover first computes the values of Honk polynomials ...
std::array< FF, Flavor::NUM_SUBRELATIONS - 1 > SubrelationSeparators
void accumulate_relation_univariates_public(SumcheckTupleOfTuplesOfUnivariates &univariate_accumulators, const auto &extended_edges, const bb::RelationParameters< FF > &relation_parameters, const FF &scaling_factor)
typename ECCVMFlavor::ProverPolynomials ProverPolynomials
Entry point for Barretenberg command-line interface.
Definition api.hpp:5
size_t get_num_cpus()
Definition thread.cpp:33
field< Bn254FrParams > fr
Definition fr.hpp:155
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Implementation of the methods for the -polynomials used in in Sumcheck.
Container for parameters used by the grand product (permutation, lookup) Honk relations.
size_t thread_index
Definition thread.hpp:150