Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
pippenger_arena_layout.hpp
Go to the documentation of this file.
1// Per-worker arena layout for the round-parallel Pippenger MSM (Zone W slab).
2//
3// Canonical source of truth for the per-worker byte walk that was previously
4// duplicated across `compute_arena_bytes_for_msm`, the live allocator inside
5// `pippenger_round_parallel`, and `pippenger_bn254_arena_layout_fits_for_test`.
6// The historical arena drift bugs (cluster_offsets miscount, wasm
7// aligned_local overflow, NO_GLV abort, t1 abort) all traced to disagreements
8// between those copies; this struct removes that class by computing the layout
9// once.
10//
11// The constructor's layout walk mirrors the live allocator's `layout_add`
12// sequence exactly, including alignment slop. The sizer's previous
13// arithmetic-only formula did not honour per-allocation alignment, so it
14// systematically under-counted by a few bytes per slab; the struct fixes that
15// by construction.
16//
17// Phase A and Stage 6 fields overlay the same per-worker bytes because the
18// parallel_for invocations are disjoint (Phase A runs on the first window
19// batch, Stage 6 runs per batch thereafter, and never on the same worker
20// concurrently). `per_worker_union_bytes = max(ts_fixed, pa_layout)`.
21
22#pragma once
23
25
26#include <algorithm>
27#include <array>
28#include <bit>
29#include <cstddef>
30#include <cstdint>
31#include <utility>
32
34
35// ============================================================================
36// Round-parallel internals exposed to the test suite.
37//
38// `pippenger_bn254_arena_layout_fits_for_test` is a TU-local helper that walks
39// the actual Zone P / Zone W / Zone S allocator for representative inputs and
40// asserts the result fits in `compute_arena_bytes_for_msm`'s promise. Its body
41// lives in `scalar_multiplication.test.cpp`, which means the helpers it needs
42// (`choose_window_bits`, `build_var_window_schedule`, `ChunkOutput`,
43// `DEDUP_MAX_*`, `VAR_WINDOW_MAX_WINDOWS`, `compute_arena_bytes_for_msm`) need
44// header-visible declarations.
45// ============================================================================
46
47// Per-window count cap shared by `VariableWindowSchedule` arrays and the live
48// allocator's `window_sums_storage` slot.
49inline constexpr size_t VAR_WINDOW_MAX_WINDOWS = 128;
50
51// Dedup pre-pass caps. DEDUP_MAX_CLUSTERS bounds `extra_points` at ≤ 1 MB;
52// DEDUP_MAX_MEMBERS bounds the per-worker `cluster_members` slab.
53inline constexpr size_t DEDUP_MAX_CLUSTERS = 16384;
54inline constexpr size_t DEDUP_MAX_MEMBERS = 32768;
55
56// Uniform window schedule produced by `build_var_window_schedule`. Holds the
57// per-window `c` value and bucket count for downstream sizing/dispatch.
59 size_t num_windows = 0;
60 std::array<uint8_t, VAR_WINDOW_MAX_WINDOWS> window_bits_per_window{}; // window_bits_w for each w
61 std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS> bit_base{}; // B_w = Σ_{k<w} c_k, B_0 = 0
62 std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS> num_buckets{}; // 2^(window_bits_w - 1) + 1
63};
64
65// Per-chunk recursive-affine bucket-reduce output (Stage 6b output cell).
66template <typename Curve> struct ChunkOutput {
67 typename Curve::Element R{};
68 typename Curve::Element L{};
69 uint32_t lo = 0;
70 uint32_t hi = 0;
71 uint8_t empty = 1;
72};
73
74// Pick the optimal window size `c`. Native uses a cost model
75// `rounds * (n + 15 * buckets)`; WASM uses a closed-form `target_load` formula.
76[[nodiscard]] inline uint32_t choose_window_bits(size_t num_points,
77 size_t num_bits,
78 size_t n_input,
79 size_t num_logical_threads) noexcept
80{
81 constexpr uint32_t MAX_C = 20;
82 uint32_t best = 2;
83
84#ifdef __wasm__
85 static_cast<void>(num_bits);
86 const size_t target_load = (n_input > 4096) ? (num_logical_threads * 2 / 3) : (num_logical_threads / 3);
87 if (target_load == 0 || num_points <= target_load) {
88 best = 2;
89 } else {
90 const size_t ratio = num_points / target_load;
91 const uint32_t lg = static_cast<uint32_t>(numeric::get_msb(ratio));
92 best = lg + 1;
93 if (best < 2) {
94 best = 2;
95 } else if (best >= MAX_C) {
96 best = MAX_C - 1;
97 }
98 }
99#else
100 static_cast<void>(n_input);
101 static_cast<void>(num_logical_threads);
102 uint64_t best_cost = static_cast<uint64_t>(-1);
103 for (uint32_t window_bits = 2; window_bits < MAX_C; ++window_bits) {
104 const uint64_t rounds = (num_bits + 2 + window_bits - 1) / window_bits;
105 const uint64_t buckets = (uint64_t{ 1 } << (window_bits - 1)) + 1;
106 const uint64_t n = num_points;
107 constexpr uint64_t BUCKET_ACC_COST = 15;
108 const uint64_t cost = rounds * (n + (buckets * BUCKET_ACC_COST));
109 if (cost < best_cost) {
110 best_cost = cost;
111 best = window_bits;
112 }
113 }
114#endif
115
116 return best;
117}
118
119// Build a uniform window schedule for the given bit budget and chosen `c`. Every window
120// is `window_bits` wide except the final one, which takes the remaining bits. The +2 on
121// the bit budget accommodates the carry-less top bit of the Constantine recoder.
122inline VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t window_bits) noexcept
123{
125
126 size_t bits_remaining = num_bits + 2;
127 size_t bit_offset = 0;
128 size_t w = 0;
129 while (bits_remaining > 0 && w < VAR_WINDOW_MAX_WINDOWS) {
130 const size_t window_bits_w = std::min<size_t>(window_bits, bits_remaining);
131 sched.bit_base[w] = static_cast<uint16_t>(bit_offset);
132 sched.window_bits_per_window[w] = static_cast<uint8_t>(window_bits_w);
133 sched.num_buckets[w] = static_cast<uint16_t>((size_t{ 1 } << (window_bits_w - 1)) + 1);
134 bit_offset += window_bits_w;
135 bits_remaining -= window_bits_w;
136 ++w;
137 }
138 sched.num_windows = w;
139 return sched;
140}
141
142// Maximum number of independent additions batched per modular inversion in the
143// affine-arithmetic group ops (used by Stage 6a/6b). Sizes per-worker
144// `points_to_add`, `inversion_scratch`, and `pair_dest` arrays.
145inline constexpr size_t BATCH_CAPACITY = 256;
146
147// Phase A's chunked tree-reduce limit. Capped so the per-worker scratch slab
148// (chunk_pts + chunk_ids) stays under ~128 KB.
149inline constexpr size_t DEDUP_MAX_CHUNK_MEMBERS = 2048;
150
151inline constexpr size_t MIN_BATCH_CAPACITY = 32;
152inline constexpr size_t MIN_AFFINE_THREAD_RATIO = 2;
153inline constexpr size_t SUBCHUNK_ENTRIES_CAP = 2048;
154inline constexpr size_t BATCH_MEM_BUDGET = 32ULL * 1024ULL * 1024ULL;
155
156// Per-bucket-chunk metadata produced by Stage 6a, consumed by Stage 6b's
157// cross-thread reduce.
158// lo, hi — lowest / highest non-empty digit in the chunk (inclusive)
159// buckets_padded — next power of two ≥ (hi - lo + 1)
160// empty — 1 iff the chunk had no entries (Stage 6b skips it)
162 uint32_t lo = 0;
163 uint32_t hi = 0;
164 uint32_t buckets_padded = 0;
165 uint8_t empty = 1;
166};
167
168template <typename Curve> struct PerWorkerArenaLayout {
170 using BaseField = typename Curve::BaseField;
171
172 // Caps shared between sizer and allocator. Centralised here so the two
173 // sites can't diverge.
174 static constexpr size_t PHASE_A_DIRTY_SLOTS_CAP = 4096; // HT_SIZE
175 static constexpr size_t PHASE_A_BUCKET_REP_CAP = 256; // loose cap
176 static constexpr size_t PHASE_A_STAGED_CAP = 1024; // loose cap
178 static constexpr size_t WORKER_SLAB_ALIGN = alignof(AffineElement);
179
180 // Computed byte sizes (filled by constructor's layout walk).
181 size_t ts_fixed_layout = 0; // ThreadScratch wpb-independent fields, with align slop
182 size_t pa_layout = 0; // PhaseAScratch fields, with align slop
183 size_t per_worker_union_bytes = 0; // = align_up(max(ts_fixed_layout, pa_layout), WORKER_SLAB_ALIGN)
184 size_t per_worker_per_wpb_layout = 0; // Stage 6 wpb-dependent tail
185 size_t per_worker_bytes = 0; // = align_up(union + tail, WORKER_SLAB_ALIGN)
186
187 // Constructor performs the canonical layout walk. `windows_per_batch` and
188 // `dense_stride_est` may be zero — only the wpb-independent parts then
189 // have meaningful values, useful for the sizer's pre-wpb-solve step.
190 PerWorkerArenaLayout(size_t chunk_capacity,
191 size_t global_max_overflow_per_window,
192 bool dedup_active,
193 size_t phase_a_cluster_members_cap,
194 size_t phase_a_cluster_offsets_cap,
195 size_t windows_per_batch,
196 size_t dense_stride_est) noexcept
197 {
198 auto align_up = [](size_t off, size_t align) -> size_t { return (off + align - 1) & ~(align - 1); };
199 auto layout_add = [&](size_t& off, size_t bytes, size_t align) { off = align_up(off, align) + bytes; };
200
201 // ThreadScratch fixed (curr_pts / curr_buckets / points_to_add /
202 // inversion_scratch / pair_dest / overflow_slots / overflow_pts).
203 layout_add(ts_fixed_layout, sizeof(AffineElement) * chunk_capacity, alignof(AffineElement));
204 layout_add(ts_fixed_layout, sizeof(uint32_t) * chunk_capacity, alignof(uint32_t));
205 layout_add(ts_fixed_layout, sizeof(AffineElement) * 2 * BATCH_CAPACITY, alignof(AffineElement));
206 layout_add(ts_fixed_layout, sizeof(BaseField) * BATCH_CAPACITY, alignof(BaseField));
207 layout_add(ts_fixed_layout, sizeof(uint32_t) * BATCH_CAPACITY, alignof(uint32_t));
208 layout_add(ts_fixed_layout, sizeof(uint32_t) * global_max_overflow_per_window, alignof(uint32_t));
209 layout_add(ts_fixed_layout, sizeof(AffineElement) * global_max_overflow_per_window, alignof(AffineElement));
210
211 // PhaseA (cluster_members / cluster_offsets / dirty_slots / bucket_rep
212 // / staged / chunk_pts / chunk_ids). Only allocated when dedup_active.
213 if (dedup_active) {
214 layout_add(pa_layout, sizeof(uint32_t) * phase_a_cluster_members_cap, alignof(uint32_t));
215 layout_add(pa_layout, sizeof(uint32_t) * phase_a_cluster_offsets_cap, alignof(uint32_t));
216 layout_add(pa_layout, sizeof(uint16_t) * PHASE_A_DIRTY_SLOTS_CAP, alignof(uint16_t));
217 layout_add(pa_layout, sizeof(uint32_t) * PHASE_A_BUCKET_REP_CAP, alignof(uint32_t));
218 layout_add(pa_layout,
221 layout_add(pa_layout, sizeof(AffineElement) * PHASE_A_CHUNK_CAP, alignof(AffineElement));
222 layout_add(pa_layout, sizeof(uint32_t) * PHASE_A_CHUNK_CAP, alignof(uint32_t));
223 }
224
226
227 // Stage 6 wpb-dependent tail (dense_buckets / is_present / pair
228 // scratch / chunk_infos). Skipped when windows_per_batch == 0 (sizer's
229 // pre-wpb-solve call).
230 if (windows_per_batch != 0) {
231 const size_t dense_total = windows_per_batch * dense_stride_est;
232 const size_t dense_pair_max = dense_total / 2;
233 layout_add(per_worker_per_wpb_layout, sizeof(AffineElement) * dense_total, alignof(AffineElement));
234 layout_add(per_worker_per_wpb_layout, sizeof(uint8_t) * dense_total, alignof(uint8_t));
235 layout_add(per_worker_per_wpb_layout,
236 sizeof(std::pair<uint32_t, uint32_t>) * dense_pair_max,
238 layout_add(per_worker_per_wpb_layout, sizeof(uint32_t) * dense_pair_max, alignof(uint32_t));
239 layout_add(per_worker_per_wpb_layout, sizeof(BaseField) * dense_pair_max, alignof(BaseField));
240 layout_add(per_worker_per_wpb_layout,
241 sizeof(AffineBucketChunkInfo) * windows_per_batch,
242 alignof(AffineBucketChunkInfo));
243 }
244
246 }
247};
248
249// Stride upper bound for `s.dense_buckets`: next_pow2(⌈(B-1)/T⌉), with a floor of 2.
250[[nodiscard]] inline size_t compute_dense_stride(size_t B_eff, size_t num_threads) noexcept
251{
252 const size_t per_thread = (B_eff > 1) ? ((B_eff - 1 + num_threads - 1) / num_threads) : size_t{ 1 };
253 return std::max<size_t>(2, std::bit_ceil(per_thread));
254}
255
256// Upper bound on Σ_t buckets_per_thread[t][w] per window: B + T - 1 (adjacent threads
257// may share one boundary bucket). Returns 0 when B_eff == 0.
258[[nodiscard]] inline size_t compute_bucket_partials_max(size_t B_eff, size_t num_threads) noexcept
259{
260 return (B_eff > 0) ? (B_eff - 1 + num_threads - 1) : size_t{ 0 };
261}
262
263// Per-OS-thread Stage 6a seam overflow capacity (per-window upper bound).
264[[nodiscard]] inline size_t compute_global_max_overflow_per_window(size_t n,
265 size_t num_threads,
266 size_t subchunk_entries_cap) noexcept
267{
268 const size_t global_max_chunk_len = (n + num_threads - 1) / num_threads;
269 return (global_max_chunk_len + subchunk_entries_cap - 1) / subchunk_entries_cap;
270}
271
272// Per-window byte cost for one window in a windows-per-batch slab. Identical formula
273// at three sites (sizer outer, sizer per-schedule lambda, live allocator); centralised
274// here so they cannot drift.
275//
276// schedule = 4·n
277// HIST slot = max(4·t·B, sizeof(ChunkOutput)·t + 96·t) [H ∪ O overlay]
278// DENSE slot = 65 · bucket_partials_max(B, t) [bucket_partials_dense + present]
279// bucket_start = 8·(B+1)
280// chunk arrays = 8·(t+1) + 8·(t+1) + 8·t + 8·t + 8·t + 16·worker + 8·t
281// dense_buckets = 87·worker·stride [s.dense_buckets + aux]
282template <typename Curve>
283[[nodiscard]] inline size_t compute_per_window_bytes(
284 size_t num_threads, size_t B_eff, size_t n, size_t dense_stride, size_t worker_total) noexcept
285{
286 const size_t bucket_partials_max = compute_bucket_partials_max(B_eff, num_threads);
287 const size_t hist_h_bytes_pw = size_t{ 4 } * num_threads * B_eff;
288 const size_t hist_o_bytes_pw = (sizeof(ChunkOutput<Curve>) * num_threads) + (size_t{ 96 } * num_threads);
289 const size_t hist_slot_bytes_pw = std::max(hist_h_bytes_pw, hist_o_bytes_pw);
290 const size_t dense_slot_bytes_pw = size_t{ 65 } * bucket_partials_max;
291 return (size_t{ 4 } * n) + hist_slot_bytes_pw + dense_slot_bytes_pw + (size_t{ 8 } * (B_eff + 1)) +
292 (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * num_threads) +
293 (size_t{ 8 } * num_threads) + (size_t{ 8 } * num_threads) + (size_t{ 16 } * worker_total) +
294 (size_t{ 8 } * num_threads) + (size_t{ 87 } * worker_total * dense_stride);
295}
296
297// Phase-1 prologue bytes living in the per-MSM arena (msb_per_scalar, glv_scalars,
298// glv_points, per_thread_msb_hist). Two-copy duplicate eliminated.
299[[nodiscard]] inline size_t compute_phase_one_prologue_bytes(size_t n,
300 bool use_glv,
301 bool inline_glv_double,
302 size_t profile_threads) noexcept
303{
304 return n // msb_per_scalar
305 + (use_glv ? size_t{ 32 } * n : size_t{ 0 }) // glv_scalars_storage
306 + (inline_glv_double ? size_t{ 64 } * n : size_t{ 0 }) // glv_points_storage
307 + (profile_threads * size_t{ 1024 }); // per_thread_msb_hist
308}
309
313};
314
315// Phase A per-worker caps. `members_cap = min(DEDUP_MAX_MEMBERS, n)` is tight (each
316// scalar contributes ≤ 1 cluster_member entry). `offsets_cap = cids_per_thread + 2`
317// covers the leading-zero sentinel + post-last terminator.
318[[nodiscard]] inline PhaseACaps compute_phase_a_caps(size_t n, size_t num_threads) noexcept
319{
320 return { std::min(DEDUP_MAX_MEMBERS, n), (DEDUP_MAX_CLUSTERS / num_threads) + 2 };
321}
322
323// Solve `wpb · per_window_bytes ≤ available_budget`, clamped to W_R and ≥ 1.
324// Mirrors the three identical wpb-pickers in the sizer and live allocator.
325[[nodiscard]] inline size_t solve_wpb(size_t per_window_bytes, size_t available_budget, size_t W_R) noexcept
326{
327 if (W_R == 0) {
328 return 1;
329 }
330 if (per_window_bytes == 0 || available_budget == 0) {
331 return std::max<size_t>(1, W_R);
332 }
333 return std::min(std::max<size_t>(1, available_budget / per_window_bytes), W_R);
334}
335
336} // namespace bb::scalar_multiplication::round_parallel_detail
typename Group::element Element
Definition grumpkin.hpp:63
typename Group::affine_element AffineElement
Definition grumpkin.hpp:64
constexpr T get_msb(const T in)
Definition get_msb.hpp:50
size_t compute_global_max_overflow_per_window(size_t n, size_t num_threads, size_t subchunk_entries_cap) noexcept
size_t compute_phase_one_prologue_bytes(size_t n, bool use_glv, bool inline_glv_double, size_t profile_threads) noexcept
size_t solve_wpb(size_t per_window_bytes, size_t available_budget, size_t W_R) noexcept
size_t compute_per_window_bytes(size_t num_threads, size_t B_eff, size_t n, size_t dense_stride, size_t worker_total) noexcept
size_t compute_bucket_partials_max(size_t B_eff, size_t num_threads) noexcept
PhaseACaps compute_phase_a_caps(size_t n, size_t num_threads) noexcept
size_t compute_dense_stride(size_t B_eff, size_t num_threads) noexcept
uint32_t choose_window_bits(size_t num_points, size_t num_bits, size_t n_input, size_t num_logical_threads) noexcept
VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t window_bits) noexcept
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
PerWorkerArenaLayout(size_t chunk_capacity, size_t global_max_overflow_per_window, bool dedup_active, size_t phase_a_cluster_members_cap, size_t phase_a_cluster_offsets_cap, size_t windows_per_batch, size_t dense_stride_est) noexcept