24#include <benchmark/benchmark.h>
43 return { ActiveBlock{ .start = 0, .size = round_size } };
48 constexpr size_t NUM_BLOCKS = 18;
49 constexpr size_t ACTIVE_PERCENT = 65;
50 const size_t total_pairs = round_size / 2;
51 const size_t active_pairs =
std::max<size_t>(NUM_BLOCKS, total_pairs * ACTIVE_PERCENT / 100);
52 const size_t inactive_pairs = total_pairs - active_pairs;
53 const size_t base_block_pairs = active_pairs / NUM_BLOCKS;
54 const size_t extra_block_pairs = active_pairs % NUM_BLOCKS;
55 const size_t base_gap_pairs = inactive_pairs / NUM_BLOCKS;
56 const size_t extra_gap_pairs = inactive_pairs % NUM_BLOCKS;
59 blocks.reserve(NUM_BLOCKS);
60 size_t pair_cursor = 0;
61 for (
size_t block_idx = 0; block_idx < NUM_BLOCKS; ++block_idx) {
62 const size_t gap = base_gap_pairs + (block_idx < extra_gap_pairs ? 1 : 0);
64 const size_t block_pairs = base_block_pairs + (block_idx < extra_block_pairs ? 1 : 0);
65 blocks.push_back(ActiveBlock{ .start = pair_cursor * 2, .size = block_pairs * 2 });
66 pair_cursor += block_pairs;
71template <
typename FF> std::vector<FF> make_challenges(
const size_t log_n)
73 std::vector<FF> result;
74 result.reserve(log_n);
75 for (
size_t idx = 0; idx < log_n; ++idx) {
76 result.emplace_back(
static_cast<uint64_t
>(idx + 7));
81template <
typename Flavor>
class SyntheticPolynomials {
87 explicit SyntheticPolynomials(
const size_t size)
91 storage.emplace_back(size);
92 const FF value =
FF(
static_cast<uint64_t
>(poly_idx + 3));
93 for (
auto& coeff : storage.back().coeffs()) {
97 for (
auto [prover_poly, stored_poly] :
zip_view(polynomials.get_all(), storage)) {
98 prover_poly = stored_poly.share();
100 if constexpr (
requires(
ProverPolynomials& p) { p.row_skip_active_prefix_end =
size_t{}; }) {
101 polynomials.row_skip_active_prefix_end = size;
111template <
typename Flavor,
typename Edges>
void fill_extended_edges(Edges& edges)
114 size_t entity_idx = 0;
115 for (
auto& edge : edges.get_all()) {
116 const FF value =
FF(
static_cast<uint64_t
>(entity_idx + 11));
117 for (
auto& evaluation : edge.evaluations) {
124template <
typename Flavor>
auto make_relation_parameters()
129 params.eta_two =
FF(19);
130 params.eta_three =
FF(23);
131 params.beta =
FF(29);
132 params.gamma =
FF(31);
133 params.public_input_delta =
FF(37);
134 params.eccvm_set_permutation_delta =
FF(43);
138template <
typename Flavor>
auto make_subrelation_separators()
142 for (
size_t idx = 0; idx < alphas.size(); ++idx) {
143 alphas[idx] =
FF(
static_cast<uint64_t
>(idx + 101));
148template <
typename Flavor>
151 const auto& extended_edges,
158template <
typename Flavor>
void bench_accumulate_relations_only(benchmark::State& state)
162 typename Round::SumcheckTupleOfTuplesOfUnivariates accum{};
163 typename Round::ExtendedEdges edges;
164 fill_extended_edges<Flavor>(edges);
165 auto params = make_relation_parameters<Flavor>();
167 const FF scaling_factor =
FF(5);
169 for (
auto _ : state) {
170 accumulate_one_edge<Flavor>(round, accum, edges, params, scaling_factor);
171 benchmark::DoNotOptimize(accum);
178enum class Scheduler {
183template <
typename Flavor>
184void bench_sumcheck_loop_shape(benchmark::State& state,
const Scheduler scheduler,
const bool fragmented)
188 using Tuple =
typename Round::SumcheckTupleOfTuplesOfUnivariates;
190 const size_t log_n =
static_cast<size_t>(state.range(0));
191 const size_t round_size =
size_t{ 1 } << log_n;
192 constexpr size_t ROWS_PER_CHUNK = 64;
194 SyntheticPolynomials<Flavor> synthetic_polynomials(round_size);
195 auto& polynomials = synthetic_polynomials.polynomials;
196 auto params = make_relation_parameters<Flavor>();
197 const auto blocks = make_active_blocks(round_size, fragmented);
199 for (
auto _ : state) {
200 Round round(round_size);
203 if (scheduler == Scheduler::STATIC_BLOCKS) {
205 typename Round::ExtendedEdges extended_edges;
206 for (
const auto& block : blocks) {
207 const size_t iterations = block.size / 2;
208 for (
size_t i : chunk.range(iterations)) {
209 const size_t edge_idx = block.start + i * 2;
210 round.
extend_edges(extended_edges, polynomials, edge_idx);
211 accumulate_one_edge<Flavor>(
212 round, thread_accumulators[chunk.
thread_index], extended_edges, params,
FF(7));
218 for (
const auto& block : blocks) {
219 for (
size_t start = block.start; start < block.start + block.size; start += ROWS_PER_CHUNK) {
220 chunks.push_back(ActiveBlock{
222 .size = std::min(ROWS_PER_CHUNK, block.start + block.size - start),
227 std::atomic<size_t> next_chunk{ 0 };
229 thread_accumulators.resize(num_slots);
231 typename Round::ExtendedEdges extended_edges;
234 if (chunk_idx >= chunks.size()) {
237 const auto& chunk = chunks[chunk_idx];
238 for (
size_t edge_idx = chunk.start; edge_idx < chunk.start + chunk.size; edge_idx += 2) {
239 round.
extend_edges(extended_edges, polynomials, edge_idx);
240 accumulate_one_edge<Flavor>(
241 round, thread_accumulators[slot_idx], extended_edges, params,
FF(7));
248 for (
const auto& accum : thread_accumulators) {
251 benchmark::DoNotOptimize(total);
254 size_t active_edges = 0;
255 for (
const auto& block : blocks) {
256 active_edges += block.size;
258 state.counters[
"active_edge_pairs"] =
static_cast<double>(active_edges / 2);
259 state.counters[
"active_pct"] = 100.0 *
static_cast<double>(active_edges) /
static_cast<double>(round_size);
260 state.counters[
"blocks"] =
static_cast<double>(blocks.size());
263 state.counters[
"threads"] =
static_cast<double>(
get_num_cpus());
267 size_t relations = 0;
268 size_t subrelations = 0;
269 size_t heavy_period = 0;
272template <Scheduler scheduler>
void bench_nano_scheduler(benchmark::State& state)
275 const size_t log_n =
static_cast<size_t>(state.range(0));
276 const size_t rows =
size_t{ 1 } << log_n;
277 const bool imbalanced =
static_cast<bool>(state.range(1));
278 const auto blocks = make_active_blocks(rows,
true);
279 const NanoSpec spec{ .relations = 12, .subrelations = 36, .heavy_period = imbalanced ? 8UL : 1UL };
280 constexpr size_t ROWS_PER_CHUNK = 64;
282 auto do_row = [&](std::array<FF, 64>& accum,
const size_t row) {
283 const bool heavy = ((row / 2) % spec.heavy_period) == 0;
284 const size_t active_relations = heavy ? spec.relations : 2;
285 FF x =
FF(
static_cast<uint64_t
>((row & 255) + 3));
286 for (
size_t relation_idx = 0; relation_idx < active_relations; ++relation_idx) {
287 for (
size_t subrelation_idx = 0; subrelation_idx < spec.subrelations / spec.relations; ++subrelation_idx) {
288 x = x *
FF(
static_cast<uint64_t
>(relation_idx + 5)) +
FF(
static_cast<uint64_t
>(subrelation_idx + 7));
289 accum[relation_idx * 4 + subrelation_idx] += x;
294 for (
auto _ : state) {
296 for (
auto& accum : accumulators) {
300 if constexpr (scheduler == Scheduler::STATIC_BLOCKS) {
302 for (
const auto& block : blocks) {
303 const size_t iterations = block.size / 2;
304 for (
size_t i : chunk.range(iterations)) {
305 do_row(accumulators[chunk.
thread_index], block.start + i * 2);
311 for (
const auto& block : blocks) {
312 for (
size_t start = block.start; start < block.start + block.size; start += ROWS_PER_CHUNK) {
313 chunks.push_back(ActiveBlock{
315 .size = std::min(ROWS_PER_CHUNK, block.start + block.size - start),
320 std::atomic<size_t> next_chunk{ 0 };
322 accumulators.resize(num_slots);
326 if (chunk_idx >= chunks.size()) {
329 const auto& chunk = chunks[chunk_idx];
330 for (
size_t edge_idx = chunk.start; edge_idx < chunk.start + chunk.size; edge_idx += 2) {
331 do_row(accumulators[slot_idx], edge_idx);
336 benchmark::DoNotOptimize(accumulators);
339 state.counters[
"relations"] =
static_cast<double>(spec.relations);
340 state.counters[
"subrelations"] =
static_cast<double>(spec.subrelations);
341 state.counters[
"threads"] =
static_cast<double>(
get_num_cpus());
342 state.counters[
"imbalanced"] = imbalanced ? 1.0 : 0.0;
345template <
typename Flavor>
void bench_compute_univariate_round0(benchmark::State& state)
349 const size_t log_n =
static_cast<size_t>(state.range(0));
350 const size_t round_size =
size_t{ 1 } << log_n;
352 SyntheticPolynomials<Flavor> synthetic_polynomials(round_size);
353 auto params = make_relation_parameters<Flavor>();
354 auto alphas = make_subrelation_separators<Flavor>();
357 for (
auto _ : state) {
358 Round round(round_size);
359 auto result = round.
compute_univariate(synthetic_polynomials.polynomials, params, gate_separators, alphas);
360 benchmark::DoNotOptimize(result);
365 state.counters[
"threads"] =
static_cast<double>(
get_num_cpus());
368template <
typename Flavor>
void register_flavor_benches(
const std::string& name,
const bool fragmented)
370 benchmark::RegisterBenchmark((name +
"/accumulate_relations_only").c_str(),
371 &bench_accumulate_relations_only<Flavor>)
373 benchmark::RegisterBenchmark((name +
"/loop_static_blocks").c_str(),
374 &bench_sumcheck_loop_shape<Flavor>,
375 Scheduler::STATIC_BLOCKS,
379 ->Unit(benchmark::kMillisecond);
380 benchmark::RegisterBenchmark((name +
"/loop_chunk_stealing").c_str(),
381 &bench_sumcheck_loop_shape<Flavor>,
382 Scheduler::CHUNK_STEALING,
386 ->Unit(benchmark::kMillisecond);
387 benchmark::RegisterBenchmark((name +
"/compute_univariate_round0").c_str(),
388 &bench_compute_univariate_round0<Flavor>)
391 ->Unit(benchmark::kMillisecond);
398 register_flavor_benches<bb::MegaZKFlavor>(
"MegaZK",
false);
399 register_flavor_benches<bb::TranslatorShortMonomialFlavor>(
"TranslatorShort",
true);
400 register_flavor_benches<bb::ECCVMShortMonomialFlavor>(
"ECCVMShort",
false);
401 benchmark::RegisterBenchmark(
"Nano/static_blocks", &bench_nano_scheduler<Scheduler::STATIC_BLOCKS>)
405 ->Unit(benchmark::kMillisecond);
406 benchmark::RegisterBenchmark(
"Nano/chunk_stealing", &bench_nano_scheduler<Scheduler::CHUNK_STEALING>)
410 ->Unit(benchmark::kMillisecond);
412 benchmark::Initialize(&argc, argv);
413 benchmark::RunSpecifiedBenchmarks();
414 benchmark::Shutdown();
A container for the prover polynomials.
typename Curve::ScalarField FF
static constexpr size_t NUM_SUBRELATIONS
static constexpr size_t NUM_ALL_ENTITIES
static constexpr size_t NUM_RELATIONS
Structured polynomial class that represents the coefficients 'a' of a_0 + a_1 x .....
static constexpr void add_nested_tuples(Tuple &tuple_1, const Tuple &tuple_2)
Componentwise addition of nested tuples (tuples of tuples)
Imlementation of the Sumcheck prover round.
decltype(create_sumcheck_tuple_of_tuples_of_univariates< Relations >()) SumcheckTupleOfTuplesOfUnivariates
SumcheckRoundUnivariate compute_univariate(ProverPolynomialsOrPartiallyEvaluatedMultivariates &polynomials, const bb::RelationParameters< FF > &relation_parameters, const bb::GateSeparatorPolynomial< FF > &gate_separators, const SubrelationSeparators &alphas)
Return the evaluations of the univariate round polynomials. Toggles between chunked computation (desi...
void extend_edges(ExtendedEdges &extended_edges, const ProverPolynomialsOrPartiallyEvaluatedMultivariates &multivariates, const size_t edge_idx)
To compute the round univariate in Round , the prover first computes the values of Honk polynomials ...
std::array< FF, Flavor::NUM_SUBRELATIONS - 1 > SubrelationSeparators
void accumulate_relation_univariates_public(SumcheckTupleOfTuplesOfUnivariates &univariate_accumulators, const auto &extended_edges, const bb::RelationParameters< FF > &relation_parameters, const FF &scaling_factor)
typename ECCVMFlavor::ProverPolynomials ProverPolynomials
Entry point for Barretenberg command-line interface.
field< Bn254FrParams > fr
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Implementation of the methods for the -polynomials used in in Sumcheck.
Container for parameters used by the grand product (permutation, lookup) Honk relations.
static constexpr field zero()