Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
scalar_multiplication.test.cpp
Go to the documentation of this file.
11#include <array>
12#include <bit>
13#include <filesystem>
14#include <gtest/gtest.h>
15
16using namespace bb;
17
18namespace {
20
21// Walks the actual Zone P / Zone W / Zone S allocator for a representative BN254
22// MSM shape and asserts the result fits in `compute_arena_bytes_for_msm`'s promise.
23// Mirrors the live allocator inside `pippenger_round_parallel` exactly; the only
24// historical drift bugs (cluster_offsets miscount, wasm aligned_local overflow,
25// NO_GLV abort, t1 abort) all came from this walk falling out of sync.
26bool pippenger_bn254_arena_layout_fits_for_test(size_t n_input,
27 bool external_glv_provided = false,
28 bool dedup_active = false,
29 size_t effective_num_bits_for_test = 0) noexcept
30{
31 using Curve = curve::BN254;
32 using ScalarField = typename Curve::ScalarField;
33 using Element = typename Curve::Element;
34 using AffineElement = typename Curve::AffineElement;
35 namespace rpd = scalar_multiplication::round_parallel_detail;
36
37 constexpr size_t FULL_NUM_BITS = ScalarField::modulus.get_msb() + 1;
38 if (n_input < 4) {
39 return true;
40 }
41
42 const bool use_glv = external_glv_provided || (n_input <= rpd::GLV_SMALL_N_THRESHOLD);
43 const bool inline_glv_double = use_glv && !external_glv_provided;
44 const size_t n = use_glv ? 2 * n_input : n_input;
45 const size_t NUM_BITS = use_glv ? size_t{ 128 } : FULL_NUM_BITS;
46 const size_t arena_capacity =
47 scalar_multiplication::compute_arena_bytes_for_msm<Curve>(n_input, external_glv_provided, dedup_active);
48 if (arena_capacity == 0) {
49 return true;
50 }
51
52 const size_t actual_num_bits = (effective_num_bits_for_test == 0 || effective_num_bits_for_test > NUM_BITS)
53 ? NUM_BITS
54 : effective_num_bits_for_test;
55 const size_t num_logical_threads_for_c =
57 const size_t window_bits = rpd::choose_window_bits(n, actual_num_bits, n_input, num_logical_threads_for_c);
58 const auto sched = rpd::build_var_window_schedule(actual_num_bits, window_bits);
59 const size_t num_buckets = (size_t{ 1 } << (window_bits - 1)) + 1;
60
61 using rpd::BATCH_CAPACITY;
62 constexpr size_t MIN_BATCH_CAPACITY = 32;
63 constexpr size_t BATCH_MEM_BUDGET = 32ULL * 1024ULL * 1024ULL;
64 constexpr size_t SUBCHUNK_ENTRIES_CAP = 2048;
65
66 const size_t desired_threads = std::max<size_t>(1, bb::get_num_cpus());
67 const size_t max_threads_for_min_batch = std::max<size_t>(1, n / MIN_BATCH_CAPACITY);
68 const size_t num_threads = std::min(desired_threads, max_threads_for_min_batch);
69 const size_t profile_threads = std::max<size_t>(1, bb::get_num_cpus());
70 const size_t worker_total = num_threads;
71
72 size_t B_eff = num_buckets;
73 for (size_t w = 0; w < sched.num_windows; ++w) {
74 B_eff = std::max(B_eff, static_cast<size_t>(sched.num_buckets[w]));
75 }
76 const size_t dense_stride_est =
77 std::max<size_t>(2, std::bit_ceil((B_eff > 1) ? ((B_eff - 1 + num_threads - 1) / num_threads) : size_t{ 1 }));
78 const size_t bucket_partials_per_window_max = (B_eff > 0) ? (B_eff - 1 + num_threads - 1) : 0;
79 const size_t hist_h_bytes_pw_shared = (size_t{ 4 } * num_threads * B_eff);
80 const size_t hist_o_bytes_pw_shared =
81 (sizeof(rpd::ChunkOutput<Curve>) * num_threads) + (size_t{ 96 } * num_threads);
82 const size_t hist_slot_bytes_pw_shared = std::max(hist_h_bytes_pw_shared, hist_o_bytes_pw_shared);
83 const size_t dense_slot_bytes_pw_shared = (size_t{ 65 } * bucket_partials_per_window_max);
84 const size_t per_window_bytes_shared =
85 hist_slot_bytes_pw_shared + dense_slot_bytes_pw_shared + (size_t{ 8 } * (B_eff + 1)) +
86 (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * num_threads) +
87 (size_t{ 8 } * num_threads) + (size_t{ 8 } * num_threads) + (size_t{ 16 } * worker_total) +
88 (size_t{ 8 } * num_threads) + (size_t{ 87 } * worker_total * dense_stride_est);
89 const size_t capacity_lo = n;
90 const size_t per_window_bytes_lo = (size_t{ 4 } * capacity_lo) + per_window_bytes_shared;
91
92 const size_t global_max_chunk_len = (n + num_threads - 1) / num_threads;
93 const size_t global_max_overflow_per_window =
94 (global_max_chunk_len + SUBCHUNK_ENTRIES_CAP - 1) / SUBCHUNK_ENTRIES_CAP;
95 const size_t chunk_capacity = std::max(SUBCHUNK_ENTRIES_CAP, 2 * global_max_overflow_per_window);
96
97 const size_t phase_a_cluster_members_cap = std::min(rpd::DEDUP_MAX_MEMBERS, n);
98 const size_t phase_a_cluster_offsets_cap = (rpd::DEDUP_MAX_CLUSTERS / num_threads) + 2;
99
100 const size_t phase_one_prologue_bytes = n + (use_glv ? size_t{ 32 } * n : size_t{ 0 }) +
101 (inline_glv_double ? size_t{ 64 } * n : size_t{ 0 }) +
102 (profile_threads * size_t{ 1024 });
103
104 const rpd::PerWorkerArenaLayout<Curve> budget_layout(
105 /*chunk_capacity=*/SUBCHUNK_ENTRIES_CAP,
106 global_max_overflow_per_window,
107 dedup_active,
108 phase_a_cluster_members_cap,
109 phase_a_cluster_offsets_cap,
110 /*windows_per_batch=*/0,
111 /*dense_stride_est=*/0);
112 const size_t worker_union_bytes_for_budget = budget_layout.per_worker_union_bytes;
113 const size_t fixed_overhead = (worker_union_bytes_for_budget * worker_total) +
114 (size_t{ 96 } * rpd::VAR_WINDOW_MAX_WINDOWS) + (size_t{ 8 } * (num_threads + 1)) +
115 phase_one_prologue_bytes;
116 const size_t available_budget =
117 (BATCH_MEM_BUDGET > fixed_overhead) ? (BATCH_MEM_BUDGET - fixed_overhead) : size_t{ 0 };
118 const size_t windows_per_batch = (per_window_bytes_lo == 0 || available_budget == 0)
119 ? std::max<size_t>(1, sched.num_windows)
120 : std::min(std::max<size_t>(1, available_budget / per_window_bytes_lo),
121 static_cast<size_t>(sched.num_windows));
122
123 auto align_up = [](size_t off, size_t align) -> size_t { return (off + align - 1) & ~(align - 1); };
124 auto layout_add = [&](size_t& off, size_t bytes, size_t align) { off = align_up(off, align) + bytes; };
125 auto bump_fits = [&](size_t count,
126 size_t size,
127 size_t align,
128 size_t& cursor,
129 size_t bound,
130 size_t base_offset,
131 size_t base_misalign) {
132 const size_t cur_addr_mod = (base_misalign + base_offset + cursor) & (align - 1);
133 const size_t align_delta = (cur_addr_mod == 0) ? size_t{ 0 } : (align - cur_addr_mod);
134 const size_t aligned_local = cursor + align_delta;
135 const size_t bytes = count * size;
136 if (aligned_local + bytes > bound) {
137 return false;
138 }
139 cursor = aligned_local + bytes;
140 return true;
141 };
142
143 for (size_t base_misalign = 0; base_misalign < alignof(AffineElement); ++base_misalign) {
144 size_t arena_cursor = 0;
145 if (!bump_fits(n, sizeof(uint8_t), alignof(uint8_t), arena_cursor, arena_capacity, 0, base_misalign)) {
146 return false;
147 }
148 if (!bump_fits(profile_threads,
151 arena_cursor,
152 arena_capacity,
153 0,
154 base_misalign)) {
155 return false;
156 }
157 if (use_glv) {
158 if (!bump_fits(
159 n, sizeof(ScalarField), alignof(ScalarField), arena_cursor, arena_capacity, 0, base_misalign)) {
160 return false;
161 }
162 if (inline_glv_double &&
163 !bump_fits(
164 n, sizeof(AffineElement), alignof(AffineElement), arena_cursor, arena_capacity, 0, base_misalign)) {
165 return false;
166 }
167 }
168 const size_t bytes_P_prefix = arena_cursor;
169
170 const rpd::PerWorkerArenaLayout<Curve> worker_layout(chunk_capacity,
171 global_max_overflow_per_window,
172 dedup_active,
173 phase_a_cluster_members_cap,
174 phase_a_cluster_offsets_cap,
175 windows_per_batch,
176 dense_stride_est);
177 constexpr size_t WORKER_SLAB_ALIGN = rpd::PerWorkerArenaLayout<Curve>::WORKER_SLAB_ALIGN;
178 const size_t per_worker_bytes = worker_layout.per_worker_bytes;
179
180 size_t bytes_P_extra_layout = 0;
181 layout_add(bytes_P_extra_layout, sizeof(Element) * rpd::VAR_WINDOW_MAX_WINDOWS, alignof(Element));
182 if (dedup_active) {
183 layout_add(bytes_P_extra_layout, sizeof(uint32_t) * n, alignof(uint32_t));
184 layout_add(bytes_P_extra_layout, sizeof(AffineElement) * rpd::DEDUP_MAX_CLUSTERS, alignof(AffineElement));
185 }
186 const size_t bytes_P_min = align_up(bytes_P_prefix, alignof(Element)) + bytes_P_extra_layout;
187 const size_t bytes_P = align_up(bytes_P_min + base_misalign, WORKER_SLAB_ALIGN) - base_misalign;
188 const size_t bytes_W = per_worker_bytes * worker_total;
189 if (bytes_P + bytes_W > arena_capacity) {
190 return false;
191 }
192 const size_t bytes_S_total = arena_capacity - bytes_P - bytes_W;
193 size_t zone_S_cursor = 0;
194 const size_t zone_S_base = bytes_P + bytes_W;
195
196 const size_t schedule_total = windows_per_batch * capacity_lo;
197 if (!bump_fits(schedule_total,
198 sizeof(uint32_t),
199 alignof(uint32_t),
200 zone_S_cursor,
201 bytes_S_total,
202 zone_S_base,
203 base_misalign)) {
204 return false;
205 }
206 const size_t hist_h_bytes_total = size_t{ 4 } * windows_per_batch * num_threads * B_eff;
207 size_t o_layout_cur = 0;
208 o_layout_cur = align_up(o_layout_cur, alignof(rpd::ChunkOutput<Curve>));
209 o_layout_cur += sizeof(rpd::ChunkOutput<Curve>) * windows_per_batch * num_threads;
210 o_layout_cur = align_up(o_layout_cur, alignof(Element));
211 o_layout_cur += sizeof(Element) * num_threads * windows_per_batch;
212 const size_t hist_slot_cells =
213 (std::max(hist_h_bytes_total, o_layout_cur) + sizeof(AffineElement) - 1) / sizeof(AffineElement);
214 const size_t dense_slot_cells =
215 ((size_t{ 65 } * windows_per_batch * bucket_partials_per_window_max) + sizeof(AffineElement) - 1) /
216 sizeof(AffineElement);
217 if (!bump_fits(hist_slot_cells,
218 sizeof(AffineElement),
219 alignof(AffineElement),
220 zone_S_cursor,
221 bytes_S_total,
222 zone_S_base,
223 base_misalign) ||
224 !bump_fits(dense_slot_cells,
225 sizeof(AffineElement),
226 alignof(AffineElement),
227 zone_S_cursor,
228 bytes_S_total,
229 zone_S_base,
230 base_misalign) ||
231 !bump_fits(windows_per_batch * (B_eff + 1),
232 sizeof(size_t),
233 alignof(size_t),
234 zone_S_cursor,
235 bytes_S_total,
236 zone_S_base,
237 base_misalign) ||
238 !bump_fits(windows_per_batch * (num_threads + 1),
239 sizeof(size_t),
240 alignof(size_t),
241 zone_S_cursor,
242 bytes_S_total,
243 zone_S_base,
244 base_misalign) ||
245 !bump_fits(windows_per_batch * (num_threads + 1),
246 sizeof(size_t),
247 alignof(size_t),
248 zone_S_cursor,
249 bytes_S_total,
250 zone_S_base,
251 base_misalign) ||
252 !bump_fits(windows_per_batch * num_threads,
253 sizeof(size_t),
254 alignof(size_t),
255 zone_S_cursor,
256 bytes_S_total,
257 zone_S_base,
258 base_misalign) ||
259 !bump_fits((num_threads * windows_per_batch) + 1,
260 sizeof(size_t),
261 alignof(size_t),
262 zone_S_cursor,
263 bytes_S_total,
264 zone_S_base,
265 base_misalign) ||
266 !bump_fits(num_threads + 1,
267 sizeof(size_t),
268 alignof(size_t),
269 zone_S_cursor,
270 bytes_S_total,
271 zone_S_base,
272 base_misalign) ||
273 !bump_fits(windows_per_batch * num_threads,
274 sizeof(size_t),
275 alignof(size_t),
276 zone_S_cursor,
277 bytes_S_total,
278 zone_S_base,
279 base_misalign) ||
280 !bump_fits(windows_per_batch * num_threads,
281 sizeof(size_t),
282 alignof(size_t),
283 zone_S_cursor,
284 bytes_S_total,
285 zone_S_base,
286 base_misalign)) {
287 return false;
288 }
289 }
290 return true;
291}
292} // namespace
293
294template <class Curve> class ScalarMultiplicationTest : public ::testing::Test {
295 public:
296 using Group = typename Curve::Group;
297 using Element = typename Curve::Element;
300
301 static constexpr size_t num_points = 31013;
302
303 // Bounds used by test_batch_multi_scalar_mul. Kept small so num_points (and therefore
304 // SetUpTestSuite, which builds num_points random EC points) stays cheap — especially under wasm,
305 // where the fixture build previously dominated the whole ecc_tests run.
306 static constexpr size_t kMaxBatchMSMs = 32;
307 static constexpr size_t kMaxBatchPointsPerMSM = 400;
308
309 // Pinning invariants: these tests walk generators[]/scalars[] without bounds checks beyond an
310 // occasional runtime ASSERT_LT. Pin the relationships at compile time so changing any one of
311 // these constants in isolation cannot regress into an out-of-bounds walk.
313 "test_batch_multi_scalar_mul can exceed num_points; "
314 "raise num_points or lower kMaxBatchMSMs / kMaxBatchPointsPerMSM");
315
316 static inline std::vector<AffineElement> generators{};
317 static inline std::vector<ScalarField> scalars{};
318
319 static AffineElement naive_msm(std::span<ScalarField> input_scalars, std::span<const AffineElement> input_points)
320 {
321 size_t total_points = input_scalars.size();
322 size_t num_threads = get_num_cpus();
323 std::vector<Element> expected_accs(num_threads);
324 size_t range_per_thread = (total_points + num_threads - 1) / num_threads;
325 parallel_for(num_threads, [&](size_t thread_idx) {
326 Element expected_thread_acc;
327 expected_thread_acc.self_set_infinity();
328 size_t start = thread_idx * range_per_thread;
329 size_t end = ((thread_idx + 1) * range_per_thread > total_points) ? total_points
330 : (thread_idx + 1) * range_per_thread;
331 bool skip = start >= total_points;
332 if (!skip) {
333 for (size_t i = start; i < end; ++i) {
334 expected_thread_acc += input_points[i] * input_scalars[i];
335 }
336 }
337 expected_accs[thread_idx] = expected_thread_acc;
338 });
339
340 Element expected_acc = Element();
341 expected_acc.self_set_infinity();
342 for (auto& acc : expected_accs) {
343 expected_acc += acc;
344 }
345 return AffineElement(expected_acc);
346 }
347
348 static std::vector<AffineElement> make_repeated_test_points(size_t num_pts)
349 {
350 std::vector<AffineElement> points(num_pts);
351 for (size_t i = 0; i < num_pts; ++i) {
352 points[i] = generators[i % generators.size()];
353 }
354 return points;
355 }
356
357 static void SetUpTestSuite()
358 {
359 generators.resize(num_points);
360 scalars.resize(num_points);
361 parallel_for_range(num_points, [&](size_t start, size_t end) {
362 for (size_t i = start; i < end; ++i) {
363 generators[i] = Group::one * Curve::ScalarField::random_element(&engine);
364 scalars[i] = Curve::ScalarField::random_element(&engine);
365 }
366 });
367 for (size_t i = 0; i < num_points - 1; ++i) {
368 ASSERT_EQ(generators[i].x == generators[i + 1].x, false);
369 }
370 };
371
372 // ======================= Test Methods =======================
373
375 {
376 std::span<ScalarField> test_scalars(&scalars[0], num_points);
377 AffineElement result =
379 AffineElement expected = naive_msm(test_scalars, generators);
380 EXPECT_EQ(result, expected);
381 }
382
384 {
385 BB_BENCH_NAME("BatchMultiScalarMul");
386
387 const size_t num_msms = static_cast<size_t>(engine.get_random_uint8()) % kMaxBatchMSMs;
388 std::vector<AffineElement> expected(num_msms);
389
390 std::vector<std::vector<ScalarField>> batch_scalars_copies(num_msms);
391 std::vector<size_t> start_indices(num_msms);
392 std::vector<PolynomialSpan<ScalarField>> batch_scalars_spans;
393
394 size_t vector_offset = 0;
395 for (size_t k = 0; k < num_msms; ++k) {
396 const size_t num_pts = static_cast<size_t>(engine.get_random_uint16()) % kMaxBatchPointsPerMSM;
397
398 ASSERT_LT(vector_offset + num_pts, num_points);
399
400 batch_scalars_copies[k].resize(num_pts);
401 for (size_t i = 0; i < num_pts; ++i) {
402 batch_scalars_copies[k][i] = scalars[vector_offset + i];
403 }
404
405 start_indices[k] = vector_offset;
406 batch_scalars_spans.emplace_back(vector_offset, std::span<ScalarField>(batch_scalars_copies[k]));
407 vector_offset += num_pts;
408
409 std::span<const AffineElement> batch_points(&generators[start_indices[k]], num_pts);
410 expected[k] = naive_msm(batch_scalars_copies[k], batch_points);
411 }
412
413 std::vector<AffineElement> result =
415
416 EXPECT_EQ(result, expected);
417 }
418
420 {
421 const size_t num_msms = 10;
422 std::vector<AffineElement> expected(num_msms);
423
424 std::vector<std::vector<ScalarField>> batch_scalars(num_msms);
425 std::vector<PolynomialSpan<ScalarField>> batch_scalars_spans;
426
427 for (size_t k = 0; k < num_msms; ++k) {
428 const size_t num_pts = 33;
429 auto& test_scalars = batch_scalars[k];
430
431 test_scalars.resize(num_pts);
432
433 size_t fixture_offset = k * num_pts;
434
435 std::span<const AffineElement> batch_points(&generators[fixture_offset], num_pts);
436 for (size_t i = 0; i < 13; ++i) {
437 test_scalars[i] = 0;
438 }
439 for (size_t i = 13; i < 23; ++i) {
440 test_scalars[i] = scalars[fixture_offset + i + 13];
441 }
442 for (size_t i = 23; i < num_pts; ++i) {
443 test_scalars[i] = 0;
444 }
445 batch_scalars_spans.emplace_back(fixture_offset, std::span<ScalarField>(batch_scalars[k]));
446
447 expected[k] = naive_msm(batch_scalars[k], batch_points);
448 }
449
450 std::vector<AffineElement> result =
452
453 EXPECT_EQ(result, expected);
454 }
455
456 // Larger workload that crosses the batched dispatcher's `total_nonzero > 4096` eligibility
457 // threshold so the multi-MSM Phases 1-6b pipeline (REBALANCE path) is exercised, not the
458 // per-MSM delegation fallback.
460 {
461 constexpr size_t num_msms = 4;
462 constexpr size_t per_msm_n = 1 << 13; // 8192 points per MSM, total = 32768
463
464 std::vector<AffineElement> expected(num_msms);
465 std::vector<std::vector<ScalarField>> batch_scalars(num_msms);
466 std::vector<PolynomialSpan<ScalarField>> batch_scalars_spans;
467
468 for (size_t k = 0; k < num_msms; ++k) {
469 batch_scalars[k].resize(per_msm_n);
470 for (size_t i = 0; i < per_msm_n; ++i) {
471 // num_msms * per_msm_n = 32768 > num_points (31013); wrap to stay in bounds
472 // (matches the ragged test's indexing). Caught as an out-of-bounds read by the
473 // _GLIBCXX_DEBUG / ASAN build otherwise.
474 batch_scalars[k][i] = scalars[(k * per_msm_n + i) % num_points];
475 }
476 std::span<const AffineElement> pts(&generators[0], per_msm_n);
477 batch_scalars_spans.emplace_back(0, std::span<ScalarField>(batch_scalars[k]));
478 expected[k] = naive_msm(batch_scalars[k], pts);
479 }
480
481 std::vector<AffineElement> result =
483
484 for (size_t k = 0; k < num_msms; ++k) {
485 EXPECT_EQ(result[k], expected[k]) << "MSM " << k << " mismatched";
486 }
487 }
488
489 // Ragged batch with mixed densities — the workload pattern for translator wires + databus.
490 // K=5 MSMs of varying sizes, varying zero density, all sharing the same SRS prefix.
492 {
493 const std::vector<size_t> sizes = { 16384, 4096, 8192, 1024, 12000 };
494 const size_t num_msms = sizes.size();
495
496 std::vector<AffineElement> expected(num_msms);
497 std::vector<std::vector<ScalarField>> batch_scalars(num_msms);
498 std::vector<PolynomialSpan<ScalarField>> batch_scalars_spans;
499
500 for (size_t k = 0; k < num_msms; ++k) {
501 const size_t n = sizes[k];
502 batch_scalars[k].resize(n);
503 for (size_t i = 0; i < n; ++i) {
504 if ((k == 1 || k == 3) && (i % 4 != 0)) {
505 batch_scalars[k][i] = ScalarField::zero();
506 } else {
507 batch_scalars[k][i] = scalars[(k * 17 + i) % num_points];
508 }
509 }
510 std::span<const AffineElement> pts(&generators[0], n);
511 batch_scalars_spans.emplace_back(0, std::span<ScalarField>(batch_scalars[k]));
512 expected[k] = naive_msm(batch_scalars[k], pts);
513 }
514
515 std::vector<AffineElement> result =
517
518 for (size_t k = 0; k < num_msms; ++k) {
519 EXPECT_EQ(result[k], expected[k]) << "MSM " << k << " (n=" << sizes[k] << ") mismatched";
520 }
521 }
522
523 void test_msm()
524 {
525 const size_t start_index = 1234;
526 const size_t num_pts = num_points - start_index;
527
528 PolynomialSpan<ScalarField> scalar_span =
531
532 std::span<AffineElement> points(&generators[start_index], num_pts);
533 AffineElement expected = naive_msm(scalar_span.span, points);
534 EXPECT_EQ(result, expected);
535 }
536
538 {
539 const size_t start_index = 1234;
540 const size_t num_pts = num_points - start_index;
541 std::vector<ScalarField> test_scalars(num_pts, ScalarField::zero());
542
543 PolynomialSpan<ScalarField> scalar_span = PolynomialSpan<ScalarField>(start_index, test_scalars);
545
546 EXPECT_EQ(result, Group::affine_point_at_infinity);
547 }
548
550 {
551 std::vector<ScalarField> test_scalars;
552 std::vector<AffineElement> input_points;
553 PolynomialSpan<ScalarField> scalar_span = PolynomialSpan<ScalarField>(0, test_scalars);
554 AffineElement result = scalar_multiplication::MSM<Curve>::msm(input_points, scalar_span);
555
556 EXPECT_EQ(result, Group::affine_point_at_infinity);
557 }
558
560 {
561 const size_t num_pts = 100;
562 std::vector<ScalarField> test_scalars(num_pts);
563 std::vector<ScalarField> scalars_copy(num_pts);
564
565 for (size_t i = 0; i < num_pts; ++i) {
566 test_scalars[i] = scalars[i];
567 scalars_copy[i] = test_scalars[i];
568 }
569
570 std::span<const AffineElement> points(&generators[0], num_pts);
571 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
572
573 scalar_multiplication::MSM<Curve>::msm(points, scalar_span);
574
575 for (size_t i = 0; i < num_pts; ++i) {
576 EXPECT_EQ(test_scalars[i], scalars_copy[i]) << "Scalar at index " << i << " was modified";
577 }
578 }
579
581 {
582 const size_t num_msms = 3;
583 const size_t num_pts = 100;
584
585 std::vector<std::vector<ScalarField>> batch_scalars(num_msms);
586 std::vector<std::vector<ScalarField>> scalars_copies(num_msms);
587 std::vector<PolynomialSpan<ScalarField>> batch_scalar_spans;
588
589 for (size_t k = 0; k < num_msms; ++k) {
590 batch_scalars[k].resize(num_pts);
591 scalars_copies[k].resize(num_pts);
592
593 for (size_t i = 0; i < num_pts; ++i) {
594 batch_scalars[k][i] = scalars[k * num_pts + i];
595 scalars_copies[k][i] = batch_scalars[k][i];
596 }
597
598 batch_scalar_spans.emplace_back(k * num_pts, std::span<ScalarField>(batch_scalars[k]));
599 }
600
602
603 for (size_t k = 0; k < num_msms; ++k) {
604 for (size_t i = 0; i < num_pts; ++i) {
605 EXPECT_EQ(batch_scalars[k][i], scalars_copies[k][i])
606 << "Scalar at MSM " << k << ", index " << i << " was modified";
607 }
608 }
609 }
610
612 {
613#ifdef __wasm__
614 GTEST_SKIP() << "WASM GLV threshold exceeds the fixture size; non-GLV restoration is native-only here.";
615#else
616 namespace rpd = scalar_multiplication::round_parallel_detail;
617 const size_t num_pts = rpd::GLV_SMALL_N_THRESHOLD + 257;
618 ASSERT_LE(num_pts, num_points);
619
620 std::vector<ScalarField> test_scalars(num_pts);
621 std::vector<ScalarField> scalars_copy(num_pts);
622 for (size_t i = 0; i < num_pts; ++i) {
623 test_scalars[i] = scalars[i];
624 scalars_copy[i] = test_scalars[i];
625 }
626
627 std::span<const AffineElement> points(&generators[0], num_pts);
628 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
629 scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/false);
630
631 for (size_t i = 0; i < num_pts; ++i) {
632 EXPECT_EQ(test_scalars[i], scalars_copy[i]) << "non-GLV scalar at index " << i << " was modified";
633 }
634#endif
635 }
636
638 {
639 const size_t num_pts = 5;
640 std::vector<ScalarField> test_scalars(num_pts, ScalarField::one());
641 std::span<const AffineElement> points(&generators[0], num_pts);
642
643 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
644 AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);
645
646 Element expected;
647 expected.self_set_infinity();
648 for (size_t i = 0; i < num_pts; ++i) {
649 expected += points[i];
650 }
651
652 EXPECT_EQ(result, AffineElement(expected));
653 }
654
656 {
657 const size_t num_pts = 5;
658 std::vector<ScalarField> test_scalars(num_pts, -ScalarField::one());
659 std::span<const AffineElement> points(&generators[0], num_pts);
660
661 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
662 AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);
663
664 Element expected;
665 expected.self_set_infinity();
666 for (size_t i = 0; i < num_pts; ++i) {
667 expected -= points[i];
668 }
669
670 EXPECT_EQ(result, AffineElement(expected));
671 }
672
674 {
675 std::vector<ScalarField> test_scalars = { scalars[0] };
676 std::span<const AffineElement> points(&generators[0], 1);
677
678 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
679 AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);
680
681 AffineElement expected(points[0] * test_scalars[0]);
682 EXPECT_EQ(result, expected);
683 }
684
686 {
687 std::vector<size_t> test_sizes = { 1, 2, 15, 16, 17, 50, 127, 128, 129, 256, 512 };
688
689 for (size_t num_pts : test_sizes) {
690 ASSERT_LE(num_pts, num_points);
691
692 std::vector<ScalarField> test_scalars(num_pts);
693 for (size_t i = 0; i < num_pts; ++i) {
694 test_scalars[i] = scalars[i];
695 }
696
697 std::span<const AffineElement> points(&generators[0], num_pts);
698 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
699
700 AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);
701 AffineElement expected = naive_msm(test_scalars, points);
702
703 EXPECT_EQ(result, expected) << "Failed for size " << num_pts;
704 }
705 }
706
708 {
709 // Use enough points to trigger Pippenger (> PIPPENGER_THRESHOLD = 16)
710 const size_t num_pts = 32;
711 AffineElement base_point = generators[0];
712
713 std::vector<AffineElement> points(num_pts, base_point);
714 std::vector<ScalarField> test_scalars(num_pts);
715 ScalarField scalar_sum = ScalarField::zero();
716
717 for (size_t i = 0; i < num_pts; ++i) {
718 test_scalars[i] = scalars[i];
719 scalar_sum += test_scalars[i];
720 }
721
722 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
723 // Duplicate points are an edge case (P + P requires doubling, not addition).
724 // Must use handle_edge_cases=true for correctness with Pippenger.
725 AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/true);
726
727 AffineElement expected(base_point * scalar_sum);
728 EXPECT_EQ(result, expected);
729 }
730
732 {
733 const size_t num_pts = 100;
734 std::vector<ScalarField> test_scalars(num_pts);
735 Element expected;
736 expected.self_set_infinity();
737
738 for (size_t i = 0; i < num_pts; ++i) {
739 if (i % 2 == 0) {
740 test_scalars[i] = ScalarField::zero();
741 } else {
742 test_scalars[i] = scalars[i];
743 expected += generators[i] * test_scalars[i];
744 }
745 }
746
747 std::span<const AffineElement> points(&generators[0], num_pts);
748 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
749
750 AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);
751 EXPECT_EQ(result, AffineElement(expected));
752 }
753
755 {
756 const size_t num_pts = 200;
757 std::vector<ScalarField> test_scalars(num_pts);
758 for (size_t i = 0; i < num_pts; ++i) {
759 test_scalars[i] = scalars[i];
760 }
761
762 std::span<const AffineElement> points(&generators[0], num_pts);
763 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
764
765 auto result = scalar_multiplication::pippenger<Curve>(scalar_span, points);
766
767 AffineElement expected = naive_msm(test_scalars, points);
768 EXPECT_EQ(AffineElement(result), expected);
769 }
770
772 {
773 const size_t num_pts = 200;
774 std::vector<ScalarField> test_scalars(num_pts);
775 for (size_t i = 0; i < num_pts; ++i) {
776 test_scalars[i] = scalars[i];
777 }
778
779 std::span<const AffineElement> points(&generators[0], num_pts);
780 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
781
782 auto result = scalar_multiplication::pippenger_unsafe<Curve>(scalar_span, points);
783
784 AffineElement expected = naive_msm(test_scalars, points);
785 EXPECT_EQ(AffineElement(result), expected);
786 }
787
795 void test_offset_span(size_t n_total, size_t start_index, size_t n_used, uint64_t seed)
796 {
797 auto& rng = numeric::get_debug_randomness(true, seed);
798 std::vector<ScalarField> test_scalars(n_total);
799 std::vector<AffineElement> input_points(start_index + n_used);
800 for (size_t i = 0; i < n_total; ++i) {
801 test_scalars[i] = ScalarField::random_element(&rng);
802 }
803 for (size_t i = 0; i < input_points.size(); ++i) {
804 input_points[i] = AffineElement(Element::random_element(&rng));
805 }
806
808 start_index, std::span<const ScalarField>{ test_scalars.data() + start_index, n_used }
809 };
810
811 Element actual = scalar_multiplication::pippenger_unsafe<Curve>(scalar_span, input_points);
812
813 Element expected;
814 expected.self_set_infinity();
815 for (size_t i = 0; i < n_used; ++i) {
816 expected += input_points[start_index + i] * test_scalars[start_index + i];
817 }
818 EXPECT_EQ(AffineElement(actual), AffineElement(expected))
819 << "Offset MSM mismatch at n_total=" << n_total << " start_index=" << start_index << " n_used=" << n_used;
820 }
821
827 {
829 auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 35);
830 std::vector<AffineElement> points(num_pts);
831 std::vector<ScalarField> test_scalars(num_pts);
832 for (size_t i = 0; i < num_pts; ++i) {
833 points[i] = AffineElement(Element::random_element(&rng));
834 test_scalars[i] = ScalarField::random_element(&rng);
835 }
836
837 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
838 AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);
839 AffineElement expected = naive_msm(test_scalars, points);
840 EXPECT_EQ(result, expected);
841 }
842
856 {
857 const size_t num_pts = 100000;
858 auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 36);
859 std::vector<AffineElement> points(num_pts);
860 for (size_t i = 0; i < num_pts; ++i) {
861 points[i] = AffineElement(Element::random_element(&rng));
862 }
863 std::vector<ScalarField> uniform_scalars(num_pts, ScalarField(7));
864 PolynomialSpan<ScalarField> scalar_span(0, uniform_scalars);
865
866 AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);
867 AffineElement expected =
868 naive_msm(std::span<ScalarField>(uniform_scalars), std::span<const AffineElement>(points));
869 EXPECT_EQ(result, expected);
870 }
871
890 {
891 const size_t num_pts = 50000;
892 // Pick a dedup-eligible scalar: msb >= c (c ≈ 11 for n ≈ 50 000), so any value
893 // ≥ 2^11 works. Use 2^200 so msb is firmly large for any c the dispatch picks.
894 const ScalarField val = ScalarField(uint256_t(0, 0, 0, uint64_t{ 1 } << (200 - 192))); // 2^200
895 std::vector<ScalarField> uniform_scalars(num_pts, val);
896 std::vector<AffineElement> points = make_repeated_test_points(num_pts);
897 PolynomialSpan<ScalarField> scalar_span(0, uniform_scalars);
898
900 points, scalar_span, /*handle_edge_cases=*/false, /*dedup_hint=*/true);
901
902 AffineElement expected =
903 naive_msm(std::span<ScalarField>(uniform_scalars), std::span<const AffineElement>(points));
904 EXPECT_EQ(result, expected);
905 }
906
916 {
917 constexpr size_t NUM_CLUSTERS = 12000;
918 constexpr size_t CLUSTER_SIZE = 3;
919 const size_t num_pts = NUM_CLUSTERS * CLUSTER_SIZE;
920
921 std::vector<ScalarField> scalars;
922 scalars.reserve(num_pts);
923 const uint256_t high_bit(0, 0, 0, uint64_t{ 1 } << (200 - 192));
924 for (size_t i = 0; i < NUM_CLUSTERS; ++i) {
925 const ScalarField val = ScalarField(high_bit + uint256_t(i + 1));
926 for (size_t j = 0; j < CLUSTER_SIZE; ++j) {
927 scalars.push_back(val);
928 }
929 }
930
931 std::vector<AffineElement> points = make_repeated_test_points(num_pts);
932 PolynomialSpan<ScalarField> scalar_span(0, scalars);
933
934 AffineElement result =
935 scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/false, true);
936 AffineElement expected = naive_msm(std::span<ScalarField>(scalars), std::span<const AffineElement>(points));
937 EXPECT_EQ(result, expected);
938 }
939
940 // ============================================================================
941 // Dispatch-coverage tests for `pippenger_round_parallel`.
942 //
943 // The function has several branches that need to all be exercised:
944 // * `n_input == 0` → infinity
945 // * `pts_per_thread < MIN_PTS_PER_THREAD_FOR_PIPPENGER` → trivial_msm_threaded
946 // (single-thread → trivial_msm, otherwise straus_msm per worker)
947 // * Otherwise → main pippenger pipeline
948 // - use_glv=true (n_input ≤ GLV_SMALL_N_THRESHOLD)
949 // - use_glv=false (n_input > GLV_SMALL_N_THRESHOLD; only on huge N)
950 // * `external_glv_doubled` provided vs not (drives one of the GLV-split branches)
951 //
952 // Each test below restores `bb::set_parallel_for_concurrency` to its original
953 // value before returning, even if the assertion fails, so subsequent tests are
954 // unaffected.
955 // ============================================================================
956
973
977 void check_internal_against_naive(size_t n, size_t start_index, const char* label)
978 {
979 ASSERT_LE(start_index + n, num_points) << label;
980
981 std::span<ScalarField> scalar_subspan(&scalars[start_index], n);
982 std::span<const AffineElement> point_subspan(&generators[0], start_index + n);
983 PolynomialSpan<const ScalarField> scalar_span{ start_index, scalar_subspan };
984
985 Element actual = scalar_multiplication::pippenger_round_parallel<Curve>(scalar_span, point_subspan);
986
987 Element expected;
988 expected.self_set_infinity();
989 for (size_t i = 0; i < n; ++i) {
990 expected += point_subspan[start_index + i] * scalar_subspan[i];
991 }
992
993 EXPECT_EQ(AffineElement(actual), AffineElement(expected))
994 << label << " (n=" << n << ", start_index=" << start_index << ")";
995 }
996
1001 {
1002 ConcurrencyScope scope(1);
1003 // n_input == 0: infinity short-circuit.
1004 {
1005 std::span<const AffineElement> empty_points;
1006 std::span<ScalarField> empty_scalars;
1007 PolynomialSpan<const ScalarField> empty_span{ 0, empty_scalars };
1008 Element r = scalar_multiplication::pippenger_round_parallel<Curve>(empty_span, empty_points);
1009 EXPECT_TRUE(r.is_point_at_infinity());
1010 }
1011 // Walk N across all dispatch boundaries with a single thread. With 1 thread,
1012 // pts_per_thread == n; the trivial dispatch fires up to N=23, falls through
1013 // at N=24+. The fall-through path then runs the affine pippenger with
1014 // num_threads=1.
1015 for (size_t n : { size_t{ 1 },
1016 size_t{ 2 },
1017 size_t{ 3 },
1018 size_t{ 4 },
1019 size_t{ 23 },
1020 size_t{ 24 },
1021 size_t{ 25 },
1022 size_t{ 32 },
1023 size_t{ 64 },
1024 size_t{ 100 },
1025 size_t{ 192 },
1026 size_t{ 1000 } }) {
1027 check_internal_against_naive(n, 0, "single_thread");
1028 }
1029 }
1030
1034 {
1035 ConcurrencyScope scope(1);
1036 constexpr size_t kThreshold = scalar_multiplication::MIN_PTS_PER_THREAD_FOR_PIPPENGER;
1037 check_internal_against_naive(kThreshold + 1, 0, "single_thread n=Threshold+1");
1038 // Also exercise N just below where `chunk_len = n / num_threads = n / 1 = n`
1039 // approaches MIN_BATCH_CAPACITY=32 — the (now-removed) brittle fallback used
1040 // to fire here; we want the affine path to still run and produce correct
1041 // output even with very small chunks.
1042 for (size_t n : { kThreshold + 1, size_t{ 32 }, size_t{ 33 }, size_t{ 50 }, size_t{ 100 } }) {
1043 check_internal_against_naive(n, 0, "single_thread small-chunk");
1044 }
1045 }
1046
1051 {
1052 constexpr size_t kThreshold = scalar_multiplication::MIN_PTS_PER_THREAD_FOR_PIPPENGER;
1053 for (size_t threads : { size_t{ 2 }, size_t{ 4 }, size_t{ 8 }, size_t{ 16 } }) {
1054 ConcurrencyScope scope(threads);
1055 // Dispatch boundary is at n = threads * kThreshold (= pts_per_thread = 24).
1056 const size_t boundary = threads * kThreshold;
1057 for (size_t n : { boundary - 1, boundary, boundary + 1 }) {
1058 check_internal_against_naive(n, 0, "dispatch_boundary");
1059 }
1060 }
1061 }
1062
1067 {
1068 ConcurrencyScope scope(8);
1069 // Small N (will dispatch to trivial_msm_threaded).
1070 check_internal_against_naive(/*n=*/64, /*start_index=*/17, "offset small-N");
1071 // Just above dispatch threshold (8 threads → boundary at 192).
1072 check_internal_against_naive(/*n=*/200, /*start_index=*/13, "offset just-above-boundary");
1073 // Mid-N falls through into pippenger.
1074 check_internal_against_naive(/*n=*/1024, /*start_index=*/41, "offset mid-N");
1075 }
1076
1081 {
1082 ConcurrencyScope scope(8);
1083 // Save and restore the global scalars buffer.
1084 std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + 1024);
1085 for (size_t i = 0; i < saved.size(); ++i) {
1086 scalars[i] = ScalarField::zero();
1087 }
1088 for (size_t n : { size_t{ 1 }, size_t{ 24 }, size_t{ 100 }, size_t{ 1000 } }) {
1089 std::span<ScalarField> sub(&scalars[0], n);
1090 std::span<const AffineElement> pts(&generators[0], n);
1092 Element r = scalar_multiplication::pippenger_round_parallel<Curve>(sp, pts);
1093 EXPECT_TRUE(r.is_point_at_infinity()) << "all-zero n=" << n;
1094 }
1095 // Restore.
1096 for (size_t i = 0; i < saved.size(); ++i) {
1097 scalars[i] = saved[i];
1098 }
1099 }
1100
1104 {
1105 ConcurrencyScope scope(8);
1106 std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + 1024);
1107 // Zero out every other scalar.
1108 for (size_t i = 0; i < 1024; i += 2) {
1109 scalars[i] = ScalarField::zero();
1110 }
1111 for (size_t n : { size_t{ 24 }, size_t{ 100 }, size_t{ 1024 } }) {
1112 check_internal_against_naive(n, 0, "mixed-zero");
1113 }
1114 // Restore.
1115 for (size_t i = 0; i < saved.size(); ++i) {
1116 scalars[i] = saved[i];
1117 }
1118 }
1119
1124 {
1125 ConcurrencyScope scope(8);
1126 std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + 256);
1127
1128 // Scalar = 1
1129 for (auto& s : saved) {
1130 (void)s;
1131 }
1132 for (size_t i = 0; i < 256; ++i) {
1133 scalars[i] = ScalarField::one();
1134 }
1135 check_internal_against_naive(256, 0, "scalar=1");
1136
1137 // Scalar = -1
1138 for (size_t i = 0; i < 256; ++i) {
1139 scalars[i] = -ScalarField::one();
1140 }
1141 check_internal_against_naive(256, 0, "scalar=-1");
1142
1143 // Restore.
1144 for (size_t i = 0; i < saved.size(); ++i) {
1145 scalars[i] = saved[i];
1146 }
1147 }
1148
1153 {
1154 for (size_t threads : { size_t{ 1 }, size_t{ 2 }, size_t{ 4 }, size_t{ 8 } }) {
1155 ConcurrencyScope scope(threads);
1156 for (size_t n : { size_t{ 1 }, size_t{ 2 }, size_t{ 8 }, size_t{ 32 }, size_t{ 80 }, size_t{ 160 } }) {
1157 std::span<ScalarField> sub(&scalars[0], n);
1158 std::span<const AffineElement> pts(&generators[0], n);
1160 Element actual = scalar_multiplication::trivial_msm_threaded<Curve>(sp, pts);
1161 Element expected;
1162 expected.self_set_infinity();
1163 for (size_t i = 0; i < n; ++i) {
1164 expected += pts[i] * sub[i];
1165 }
1166 EXPECT_EQ(AffineElement(actual), AffineElement(expected))
1167 << "trivial_msm_threaded threads=" << threads << " n=" << n;
1168 }
1169 }
1170 }
1171
1177 {
1178 ConcurrencyScope scope(8);
1179#ifdef __wasm__
1180 constexpr size_t glv_threshold = size_t{ 1 } << 16;
1181#else
1182 constexpr size_t glv_threshold = size_t{ 1 } << 13;
1183#endif
1184 if (glv_threshold >= num_points) {
1185 GTEST_SKIP() << "GLV threshold " << glv_threshold << " not exercisable with " << num_points
1186 << " precomputed points";
1187 }
1188 // Just below threshold: use_glv=true.
1189 check_internal_against_naive(glv_threshold - 1, 0, "glv-boundary minus-1 (use_glv=true)");
1190 // Exactly at threshold: use_glv=true (≤ comparison).
1191 check_internal_against_naive(glv_threshold, 0, "glv-boundary exact (use_glv=true)");
1192 // Just above: use_glv=false.
1193 check_internal_against_naive(glv_threshold + 1, 0, "glv-boundary plus-1 (use_glv=false)");
1194 }
1195
1207 {
1208 ConcurrencyScope scope(1);
1209 constexpr size_t kThreshold = scalar_multiplication::MIN_PTS_PER_THREAD_FOR_PIPPENGER;
1210 for (size_t n : { kThreshold + 1, size_t{ 50 }, size_t{ 100 }, size_t{ 256 } }) {
1211 std::span<ScalarField> scalar_subspan(&scalars[0], n);
1212 std::span<const AffineElement> point_subspan(&generators[0], n);
1213 PolynomialSpan<const ScalarField> scalar_span{ 0, scalar_subspan };
1214
1215 constexpr size_t kArenaCapacity = size_t{ 64 } * 1024 * 1024;
1216 std::vector<std::byte> raw(kArenaCapacity + 64);
1217 // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)
1218 const auto base = reinterpret_cast<uintptr_t>(raw.data());
1219 const uintptr_t aligned32 = (base + 31) & ~uintptr_t{ 31 };
1220 std::byte* misaligned = raw.data() + (aligned32 - base) + 16;
1221 ASSERT_EQ(reinterpret_cast<uintptr_t>(misaligned) % 32, size_t{ 16 });
1222 std::span<std::byte> external_arena(misaligned, kArenaCapacity);
1223
1224 Element actual = scalar_multiplication::pippenger_round_parallel<Curve>(
1225 scalar_span, point_subspan, /*dedup_hint=*/false, {}, external_arena);
1226
1227 Element expected;
1228 expected.self_set_infinity();
1229 for (size_t i = 0; i < n; ++i) {
1230 expected += point_subspan[i] * scalar_subspan[i];
1231 }
1232 EXPECT_EQ(AffineElement(actual), AffineElement(expected)) << "misaligned external arena (n=" << n << ")";
1233 }
1234 }
1235
1236 // ============================================================================
1237 // Degenerate-input edge cases for the `handle_edge_cases=true` (Jacobian) path,
1238 // at LARGE N and forced multi-threading.
1239 //
1240 // `scalar_multiplication_safe_mode.test.cpp` already covers point-at-infinity,
1241 // P/-P negation, and all-infinity — but only at tiny N (≤ 60 points), which on
1242 // native stays single-threaded (the Jacobian path multi-threads only at
1243 // n ≥ 512). These tests push the same degenerate inputs through the
1244 // multi-threaded Jacobian split + cross-thread reduction, where a per-slice
1245 // infinity / equal-x collision is folded into a per-thread partial before the
1246 // final cross-thread sum. We pin 8 threads so the multi-thread path runs
1247 // regardless of the CI machine's core count.
1248 // ============================================================================
1249
1254 {
1255 ConcurrencyScope scope(8);
1256 auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 101);
1257#ifdef __wasm__
1258 const std::vector<size_t> sizes = { 64 };
1259#else
1260 // 3000 crosses the Jacobian path's native multi-thread split (>256 pts/thread).
1261 const std::vector<size_t> sizes = { 64, 3000 };
1262#endif
1263 for (size_t n : sizes) {
1264 std::vector<AffineElement> points(n);
1265 std::vector<ScalarField> test_scalars(n);
1266 for (size_t i = 0; i < n; ++i) {
1267 points[i] = AffineElement(Element::random_element(&rng));
1268 test_scalars[i] = ScalarField::random_element(&rng);
1269 }
1270 for (size_t idx : { size_t{ 0 }, n / 2, n - 1 }) {
1271 points[idx].self_set_infinity();
1272 }
1273 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
1274 AffineElement result =
1275 scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/true);
1276 AffineElement expected = naive_msm(test_scalars, points);
1277 EXPECT_EQ(result, expected) << "point-at-infinity inputs (n=" << n << ")";
1278 }
1279 }
1280
1285 {
1286 ConcurrencyScope scope(8);
1287 auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 102);
1288#ifdef __wasm__
1289 const std::vector<size_t> pair_counts = { 40 };
1290#else
1291 const std::vector<size_t> pair_counts = { 40, 800 };
1292#endif
1293 for (size_t pairs : pair_counts) {
1294 const size_t n = (2 * pairs) + 3; // + a few linearly-independent singletons
1295 std::vector<AffineElement> points(n);
1296 std::vector<ScalarField> test_scalars(n);
1297 for (size_t p = 0; p < pairs; ++p) {
1298 AffineElement r(Element::random_element(&rng));
1299 AffineElement neg_r;
1300 neg_r.x = r.x;
1301 neg_r.y = -r.y;
1302 const ScalarField s = ScalarField::random_element(&rng);
1303 points[2 * p] = r;
1304 points[(2 * p) + 1] = neg_r;
1305 test_scalars[2 * p] = s;
1306 test_scalars[(2 * p) + 1] = s;
1307 }
1308 for (size_t i = 2 * pairs; i < n; ++i) {
1309 points[i] = AffineElement(Element::random_element(&rng));
1310 test_scalars[i] = ScalarField::random_element(&rng);
1311 }
1312 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
1313 AffineElement result =
1314 scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/true);
1315 AffineElement expected = naive_msm(test_scalars, points);
1316 EXPECT_EQ(result, expected) << "inverse-pair bucket collisions (pairs=" << pairs << ")";
1317 }
1318 }
1319
1327 {
1328 using BaseField = typename Curve::BaseField;
1329 const BaseField beta = BaseField::cube_root_of_unity();
1330 namespace rpd = scalar_multiplication::round_parallel_detail;
1331 const std::vector<size_t> sizes = { 50, 1000, rpd::GLV_SMALL_N_THRESHOLD + 64 };
1332 for (size_t n : sizes) {
1333 ASSERT_LE(n, num_points);
1334 std::span<const AffineElement> points(&generators[0], n);
1335 std::vector<ScalarField> test_scalars(n);
1336 for (size_t i = 0; i < n; ++i) {
1337 test_scalars[i] = scalars[i];
1338 }
1339 std::vector<AffineElement> doubled(2 * n);
1340 for (size_t i = 0; i < n; ++i) {
1341 doubled[2 * i] = points[i];
1342 doubled[(2 * i) + 1].x = points[i].x * beta;
1343 doubled[(2 * i) + 1].y = -points[i].y;
1344 }
1345 PolynomialSpan<const ScalarField> scalar_span(0, std::span<const ScalarField>(test_scalars));
1346 Element result =
1347 scalar_multiplication::pippenger_round_parallel<Curve>(scalar_span,
1348 points,
1349 /*dedup_hint=*/false,
1350 std::span<const AffineElement>(doubled));
1351 AffineElement expected = naive_msm(test_scalars, points);
1352 EXPECT_EQ(AffineElement(result), expected) << "external_glv_doubled (n=" << n << ")";
1353 }
1354 }
1355
1356 // ============================================================================
1357 // GLV split / signed-Booth recoder edge cases.
1358 //
1359 // The GLV path (use_glv) feeds `split_into_endomorphism_scalars` output into the
1360 // round-parallel pipeline as 2-limb (≤128-bit) halves k1, k2 with limbs[2],[3]
1361 // forced to 0 (scalar_multiplication_fast.cpp ~1400). The window schedule is built
1362 // for NUM_BITS=128 (+2 carry). If any half's true magnitude needed bit 128+ — or if
1363 // the top signed-Booth window digit carried past the final window — the recoder would
1364 // silently drop those bits and the MSM result would be wrong while no assert trips.
1365 //
1366 // `PippengerInternalExtremeScalars` already pins scalar=±1; this widens coverage to
1367 // the values that maximise |k1| / |k2|: r-1, (r-1)/2, λ, λ±1, the k2-negative-fix
1368 // boundary (data[1] high bit set just below 2^128), and a deterministic sweep that
1369 // checks every split half stays ≤128 bits before trusting the pipeline output. We run
1370 // every value through the real internal path at an N below the GLV threshold so
1371 // use_glv=true, comparing to a naive reference.
1372 // ============================================================================
1374 {
1375 ConcurrencyScope scope(8);
1376 namespace rpd = scalar_multiplication::round_parallel_detail;
1377 // N small enough to force use_glv=true on both native (2^13) and wasm (2^16),
1378 // but large enough to clear MIN_PTS_PER_THREAD_FOR_PIPPENGER and run the affine
1379 // pippenger (not the trivial bail).
1380 constexpr size_t n = 600;
1381 static_assert(n <= 256 + 4096, "keep n below the native GLV threshold");
1382
1383 const uint256_t r = ScalarField::modulus;
1384
1385 // Worst-case magnitude probes. Each is reduced mod r by the uint256_t ctor.
1386 std::vector<ScalarField> probes;
1387 probes.push_back(-ScalarField::one()); // r - 1
1388 probes.push_back(ScalarField(uint256_t(1))); // 1
1389 probes.push_back(ScalarField(r - uint256_t(1)) * ScalarField(uint256_t(2)).invert()); // (r-1)/2
1390 // Powers-of-two boundaries around the 128-bit split width.
1391 for (size_t bit : { size_t{ 126 }, size_t{ 127 }, size_t{ 128 }, size_t{ 129 }, size_t{ 253 } }) {
1392 probes.push_back(ScalarField(uint256_t(1) << bit));
1393 probes.push_back(ScalarField((uint256_t(1) << bit) - uint256_t(1)));
1394 probes.push_back(ScalarField((uint256_t(1) << bit) + uint256_t(1)));
1395 }
1396 // r/2 ± small, and r - small: the region where compute_endomorphism_k2 can drive
1397 // k2 slightly negative (the data[2]||data[3] != 0 → +endo_minus_b1 correction).
1398 for (uint64_t delta = 0; delta < 8; ++delta) {
1399 probes.push_back(ScalarField(r - uint256_t(delta + 1)));
1400 probes.push_back(ScalarField((r >> 1) + uint256_t(delta)));
1401 }
1402 // Deterministic pseudo-random fill of additional probes (Knuth multiplicative
1403 // hash) so the sweep also covers interior values, not just the curated extremes.
1404 for (uint64_t i = 1; i <= 16; ++i) {
1405 const uint64_t h0 = i * uint64_t{ 0x9E3779B97F4A7C15ULL };
1406 const uint64_t h1 = (i + 1) * uint64_t{ 0xC2B2AE3D27D4EB4FULL };
1407 const uint64_t h2 = (i + 2) * uint64_t{ 0x165667B19E3779F9ULL };
1408 const uint64_t h3 = (i + 3) * uint64_t{ 0xD6E8FEB86659FD93ULL };
1409 probes.push_back(ScalarField(uint256_t(h0, h1, h2, h3)));
1410 }
1411
1412 // Invariant the whole GLV path leans on: the two 128-bit halves the pipeline stores
1413 // (limbs[2]/[3] forced to 0) recombine — via the SAME doubled-point convention the
1414 // MSM uses, φP = (β·x, −y) — back to k·P. If the field ever produced a half needing
1415 // bit 128+, the 2-limb storage at ~line 1400 would truncate it and this point identity
1416 // would break, catching the silent wrong result independent of the GLV λ-sign details.
1417 {
1418 using BaseField = typename Curve::BaseField;
1419 const BaseField beta = BaseField::cube_root_of_unity();
1420 const AffineElement base = generators[0];
1421 AffineElement phi;
1422 phi.x = base.x * beta;
1423 phi.y = -base.y;
1424 for (const ScalarField& s : probes) {
1425 const ScalarField canonical = s.from_montgomery_form_reduced();
1426 const auto split = ScalarField::split_into_endomorphism_scalars(canonical);
1427 // Rebuild k1, k2 from ONLY the 2 returned limbs (exactly what the pipeline does).
1428 const ScalarField k1 = ScalarField(uint256_t(split.first[0], split.first[1], 0, 0));
1429 const ScalarField k2 = ScalarField(uint256_t(split.second[0], split.second[1], 0, 0));
1430 const Element recombined = Element(base) * k1 + Element(phi) * k2;
1431 EXPECT_EQ(AffineElement(recombined), AffineElement(Element(base) * s))
1432 << "GLV split half exceeded 128-bit storage or mis-signed";
1433 }
1434 }
1435
1436 // Now run each probe through the real internal pipeline (use_glv=true) and compare
1437 // to naive. We fill all n scalars with the probe so every working half hits the
1438 // same extreme window, maximising any top-window carry interaction.
1439 std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + n);
1440 for (size_t p = 0; p < probes.size(); ++p) {
1441 for (size_t i = 0; i < n; ++i) {
1442 scalars[i] = probes[p];
1443 }
1444 check_internal_against_naive(n, 0, "glv-extreme-magnitude");
1445 }
1446 // Mixed: alternate r-1 and (interior) probes so k1/k2 of adjacent working scalars
1447 // land in different windows within one MSM.
1448 for (size_t i = 0; i < n; ++i) {
1449 scalars[i] = (i & 1) ? -ScalarField::one() : probes[probes.size() - 1 - (i % probes.size())];
1450 }
1451 check_internal_against_naive(n, 0, "glv-extreme-mixed");
1452
1453 for (size_t i = 0; i < saved.size(); ++i) {
1454 scalars[i] = saved[i];
1455 }
1456 }
1457
1458 // ============================================================================
1459 // effective_num_bits schedule divergence (sizer vs live allocator).
1460 //
1461 // The live path (scalar_multiplication_fast.cpp ~1474) shrinks the window-bit budget
1462 // to the highest observed scalar msb, then re-runs choose_window_bits — which can pick
1463 // a SMALLER window_bits (⇒ MORE windows ⇒ a larger wpb·per_window_bytes Zone S) than the
1464 // full-NUM_BITS pre-sizer. `compute_arena_bytes_for_msm`'s defensive bit-budget sweep
1465 // (lines 1246-1250) only runs for use_glv OR n_input ≥ 2^17. For the native non-GLV
1466 // mid-band 2^13 < n < 2^17 the sweep is SKIPPED, so a workload of uniformly small-msb
1467 // scalars selects a schedule the sizer never sized for. If the live Zone S then exceeds
1468 // the arena, this is a guaranteed out-of-bounds write (SIGSEGV), not a wrong result.
1469 //
1470 // We drive the real MSM in that band with all-small scalars (msb ≪ 254) and compare to
1471 // naive. A correct sizer makes this pass; an under-count crashes under ASAN / corrupts
1472 // the answer. Native-only: the band does not exist on wasm (GLV up to 2^16).
1473 // ============================================================================
1475 {
1476#ifdef __wasm__
1477 GTEST_SKIP() << "non-GLV mid-band (2^13<n<2^17) does not exist on wasm";
1478#else
1479 ConcurrencyScope scope(8);
1480 namespace rpd = scalar_multiplication::round_parallel_detail;
1481 const size_t glv_threshold = rpd::GLV_SMALL_N_THRESHOLD; // 2^13 native
1482 // Sizes just above the GLV threshold (use_glv=false) and inside the sweep-skipped
1483 // band. num_points is 31013, so every size below stays in-bounds.
1484 const std::vector<size_t> sizes = { glv_threshold + 1, size_t{ 1 } << 14 };
1485 // Bit budgets to drive effective_num_bits to representative small/mid schedules.
1486 const std::vector<size_t> bit_widths = { 1, 64, 120 };
1487
1488 size_t max_size = 0;
1489 for (size_t s : sizes) {
1490 if (s <= num_points) {
1491 max_size = std::max(max_size, s);
1492 }
1493 }
1494 std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + static_cast<std::ptrdiff_t>(max_size));
1495 auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 909);
1496 for (size_t n : sizes) {
1497 if (n > num_points) {
1498 continue;
1499 }
1500 for (size_t bits : bit_widths) {
1501 const uint256_t mask = (bits >= 256) ? ~uint256_t(0) : ((uint256_t(1) << bits) - uint256_t(1));
1502 for (size_t i = 0; i < n; ++i) {
1503 // Random value masked to `bits` bits; force the top bit so msb == bits-1
1504 // for at least some scalars, pinning effective_num_bits == bits.
1505 uint256_t v(rng.get_random_uint64(),
1506 rng.get_random_uint64(),
1507 rng.get_random_uint64(),
1508 rng.get_random_uint64());
1509 v = v & mask;
1510 if (i == 0 && bits >= 1) {
1511 v = v | (uint256_t(1) << (bits - 1));
1512 }
1513 scalars[i] = ScalarField(v);
1514 }
1515 check_internal_against_naive(n, 0, "effective-num-bits-band");
1516 }
1517 }
1518 for (size_t i = 0; i < saved.size(); ++i) {
1519 scalars[i] = saved[i];
1520 }
1521#endif
1522 }
1523
1524 // ============================================================================
1525 // Dedup multi-chunk tree-reduce carry + cap fallback.
1526 //
1527 // pippenger_dedup.hpp's Phase A tree-reduce (lines ~470-520) consolidates each
1528 // equal-value cluster's base points. A cluster larger than DEDUP_MAX_CHUNK_MEMBERS
1529 // (2048) is split across chunks and its partial sum threaded via the `carry` slot;
1530 // a cluster larger than the per-bucket staged cap (~1024) is only PARTIALLY deduped,
1531 // with the overflow members falling through to the normal pippenger path with their
1532 // original signed digits. Both the carry-threading and the partial-consolidation
1533 // fallback must produce the same sum as no dedup at all.
1534 //
1535 // We build inputs with one (or a few) huge equal-value clusters of identical points
1536 // so a single combined point + many fall-through members coexist in one MSM, then
1537 // compare dedup_hint=true against naive. Sizes are chosen to exceed both the chunk
1538 // cap (2048) and the staged cap, exercising the carry and the cap fallback in the
1539 // same run. Native-only for the largest sizes (wasm runs the smaller ones).
1540 // ============================================================================
1542 {
1543 ConcurrencyScope scope(8);
1544 auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 303);
1545
1546#ifdef __wasm__
1547 const std::vector<size_t> cluster_sizes = { 2100 };
1548#else
1549 // 2100 > DEDUP_MAX_CHUNK_MEMBERS(2048) → multi-chunk carry.
1550 // 5000 → several carries + staged-cap partial-consolidation fallback.
1551 const std::vector<size_t> cluster_sizes = { 2100, 5000 };
1552#endif
1553 for (size_t cluster : cluster_sizes) {
1554 // Dedup clusters by equal scalar VALUE, combining the cluster's DISTINCT base
1555 // points into one (rep, Σpoints) pair. So the giant cluster is `cluster` distinct
1556 // generators all sharing scalar s_big — the realistic dedup trigger (range-check /
1557 // counter polynomials) and a valid input for the affine path (distinct x). The
1558 // tree-reduce then sums distinct points (no equal-x affine-add edge case), and the
1559 // cluster size drives the multi-chunk carry + staged-cap fallback.
1560 const size_t singles = 400;
1561 const size_t medium = 600;
1562 const size_t n = cluster + medium + singles;
1563 ASSERT_LE(n, num_points);
1564
1565 std::vector<ScalarField> test_scalars(n);
1566 std::vector<AffineElement> points(n);
1567
1568 const ScalarField s_big = ScalarField::random_element(&rng);
1569 const ScalarField s_med = ScalarField::random_element(&rng);
1570 ASSERT_NE(s_big, s_med);
1571
1572 for (size_t i = 0; i < cluster; ++i) {
1573 points[i] = generators[i];
1574 test_scalars[i] = s_big;
1575 }
1576 for (size_t i = 0; i < medium; ++i) {
1577 points[cluster + i] = generators[cluster + i];
1578 test_scalars[cluster + i] = s_med;
1579 }
1580 for (size_t i = 0; i < singles; ++i) {
1581 points[cluster + medium + i] = generators[cluster + medium + i];
1582 test_scalars[cluster + medium + i] = ScalarField::random_element(&rng);
1583 }
1584
1585 PolynomialSpan<ScalarField> scalar_span(0, test_scalars);
1586 // dedup_hint=true forces Phase A; handle_edge_cases=false stays on the affine
1587 // path (all points/distinct-x are valid for it).
1589 points, scalar_span, /*handle_edge_cases=*/false, /*dedup_hint=*/true);
1590 AffineElement expected = naive_msm(test_scalars, points);
1591 EXPECT_EQ(deduped, expected) << "dedup large-cluster carry/caps (cluster=" << cluster << ", n=" << n << ")";
1592
1593 // Cross-check: the same input with dedup_hint=false must agree (dedup is a pure
1594 // optimisation; a mismatch isolates the regression to the dedup pre-pass).
1595 PolynomialSpan<ScalarField> scalar_span2(0, test_scalars);
1597 points, scalar_span2, /*handle_edge_cases=*/false, /*dedup_hint=*/false);
1598 EXPECT_EQ(deduped, undeduped) << "dedup vs no-dedup divergence (cluster=" << cluster << ")";
1599 }
1600 }
1601
1616 void run_batch_driver_paths(bool handle_edge_cases)
1617 {
1618 // 16384 > native GLV threshold (2^13) → its own non-GLV group; the rest are GLV.
1619 const std::vector<size_t> sizes = { 0, 1, 5, 0, 4096, 33, 0, 16384, 64, 2 };
1620 const size_t num_msms = sizes.size();
1621
1622 std::vector<std::vector<ScalarField>> batch_scalars(num_msms);
1623 std::vector<std::vector<ScalarField>> scalar_copies(num_msms);
1625 std::vector<AffineElement> expected(num_msms);
1626 std::vector<uint8_t> dedup_hints(num_msms, 0);
1627
1628 size_t offset = 0;
1629 for (size_t k = 0; k < num_msms; ++k) {
1630 const size_t n = sizes[k];
1631 ASSERT_LE(offset + n, num_points);
1632 batch_scalars[k].resize(n);
1633 scalar_copies[k].resize(n);
1634 const bool all_zero = (k % 4 == 2); // a couple of fully-zero MSMs → infinity result
1635 for (size_t i = 0; i < n; ++i) {
1636 batch_scalars[k][i] = all_zero ? ScalarField::zero() : scalars[offset + i];
1637 scalar_copies[k][i] = batch_scalars[k][i];
1638 }
1639 dedup_hints[k] = static_cast<uint8_t>((k % 2 == 0) ? 1 : 0);
1640 spans.emplace_back(offset, std::span<ScalarField>(batch_scalars[k]));
1641 std::span<const AffineElement> pts(&generators[offset], n);
1642 expected[k] = naive_msm(batch_scalars[k], pts);
1643 offset += n;
1644 }
1645
1646 std::vector<AffineElement> result = scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(
1647 generators, spans, handle_edge_cases, std::span<const uint8_t>(dedup_hints));
1648
1649 ASSERT_EQ(result.size(), num_msms);
1650 for (size_t k = 0; k < num_msms; ++k) {
1651 EXPECT_EQ(result[k], expected[k]) << "batch MSM " << k << " (n=" << sizes[k]
1652 << ", handle_edge_cases=" << handle_edge_cases << ") mismatched";
1653 EXPECT_EQ(batch_scalars[k], scalar_copies[k]) << "batch MSM " << k << " scalars were not restored";
1654 }
1655 }
1656
1657 void test_batch_driver_shared_path() { run_batch_driver_paths(/*handle_edge_cases=*/false); }
1658};
1659
1660using CurveTypes = ::testing::Types<bb::curve::BN254, bb::curve::Grumpkin>;
1662
1663TEST(ScalarMultiplicationArenaTest, LargeBn254RecursionVkShapeFitsComputedArena)
1664{
1665 const size_t saved_threads = bb::get_num_cpus();
1666
1667 // CI regression from HonkRecursionConstraintTestWithoutPredicate/2.GenerateVKFromConstraints:
1668 // Zone S attempted a uint32_t schedule allocation whose aligned end was 26,454,272
1669 // bytes after the computed arena left only 25,505,329 bytes in Zone S. The log does
1670 // not expose windows_per_batch, so cover every plausible n_input divisor for that
1671 // schedule size.
1672 constexpr size_t schedule_slots = size_t{ 26454272 } / sizeof(uint32_t);
1673 constexpr std::array<size_t, 8> candidate_window_batches{ 1, 2, 4, 8, 13, 16, 26, 32 };
1674 for (const size_t threads : { size_t{ 4 }, size_t{ 32 } }) {
1676 for (const size_t windows_per_batch : candidate_window_batches) {
1677 const size_t n = schedule_slots / windows_per_batch;
1678 for (size_t effective_num_bits = 1; effective_num_bits <= 254; ++effective_num_bits) {
1679 EXPECT_TRUE(pippenger_bn254_arena_layout_fits_for_test(
1680 n, /*external_glv_provided=*/false, /*dedup_active=*/false, effective_num_bits))
1681 << "threads=" << threads << " windows_per_batch=" << windows_per_batch << " n=" << n
1682 << " effective_num_bits=" << effective_num_bits;
1683 }
1684 }
1685 }
1686
1687 bb::set_parallel_for_concurrency(saved_threads);
1688}
1689
1690// Sweeps the sizer/allocator agreement (the historical "arena drift" bug class) across the
1691// full dispatch parameter space rather than the single recursion-VK shape above: thread count,
1692// N around every dispatch boundary, GLV-provided vs not, dedup-active vs not, and a range of
1693// effective bit budgets. `pippenger_bn254_arena_layout_fits_for_test` walks the live
1694// Zone P / Zone W / Zone S allocator and must always fit inside `compute_arena_bytes_for_msm`'s
1695// promise; any `false` here is an under-count (a guaranteed Zone overflow / SIGSEGV at runtime).
1696// Notably this is the first coverage of `dedup_active=true` and `external_glv_provided=true`
1697// against the sizer.
1698TEST(ScalarMultiplicationArenaTest, ArenaLayoutFitsAcrossDispatchSpace)
1699{
1700 const size_t saved_threads = bb::get_num_cpus();
1701
1702 constexpr std::array<size_t, 6> thread_counts{ 1, 3, 8, 16, 32, 64 };
1703 // Dispatch boundaries: MIN_PTS_PER_THREAD (24), powers of two ±1, and both GLV thresholds
1704 // (2^13 native, 2^16 wasm), up to a large 2^18 shape.
1705 constexpr std::array<size_t, 21> ns{ 4, 5, 23, 24, 25, 31, 32, 33, 63, 64, 65,
1706 255, 256, 257, 4095, 4096, 4097, 8191, 8192, 8193, 262144 };
1707 constexpr std::array<size_t, 4> bit_budgets{ 0, 1, 128, 254 };
1708
1709 for (const size_t threads : thread_counts) {
1711 for (const size_t n : ns) {
1712 for (const bool ext_glv : { false, true }) {
1713 for (const bool dedup : { false, true }) {
1714 for (const size_t bits : bit_budgets) {
1715 EXPECT_TRUE(pippenger_bn254_arena_layout_fits_for_test(n, ext_glv, dedup, bits))
1716 << "threads=" << threads << " n=" << n << " ext_glv=" << ext_glv << " dedup=" << dedup
1717 << " bits=" << bits;
1718 }
1719 }
1720 }
1721 }
1722 }
1723
1724 // Deterministic pseudo-random N (Knuth multiplicative hash) to catch under-counts that do
1725 // not sit on a curated boundary. Date/Math.random are unavailable; the hash keeps CI runs
1726 // reproducible.
1727 for (const size_t threads : { size_t{ 4 }, size_t{ 8 }, size_t{ 16 }, size_t{ 32 } }) {
1729 for (size_t i = 1; i <= 32; ++i) {
1730 const size_t n = 4 + ((i * size_t{ 2654435761ULL }) % (size_t{ 1 } << 20));
1731 for (const bool dedup : { false, true }) {
1732 EXPECT_TRUE(pippenger_bn254_arena_layout_fits_for_test(n, /*external_glv_provided=*/false, dedup, 254))
1733 << "random n=" << n << " threads=" << threads << " dedup=" << dedup;
1734 }
1735 }
1736 }
1737
1738 bb::set_parallel_for_concurrency(saved_threads);
1739}
1740
1741// ======================= Test Wrappers =======================
1742
1744{
1745 this->test_pippenger_low_memory();
1746}
1748{
1749 this->test_batch_multi_scalar_mul();
1750}
1751TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMulSparse)
1752{
1753 this->test_batch_multi_scalar_mul_sparse();
1754}
1755TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMulLargeDense)
1756{
1757 this->test_batch_multi_scalar_mul_large_dense();
1758}
1759TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMulRagged)
1760{
1761 this->test_batch_multi_scalar_mul_ragged();
1762}
1764{
1765 this->test_msm();
1766}
1768{
1769 this->test_msm_all_zeroes();
1770}
1772{
1773 this->test_msm_empty_polynomial();
1774}
1775TYPED_TEST(ScalarMultiplicationTest, ScalarsUnchangedAfterMSM)
1776{
1777 this->test_scalars_unchanged_after_msm();
1778}
1779TYPED_TEST(ScalarMultiplicationTest, ScalarsUnchangedAfterBatchMultiScalarMul)
1780{
1781 this->test_scalars_unchanged_after_batch_multi_scalar_mul();
1782}
1783TYPED_TEST(ScalarMultiplicationTest, ScalarsUnchangedAfterLargeNonGlvMSM)
1784{
1785 this->test_scalars_unchanged_after_large_non_glv_msm();
1786}
1788{
1789 this->test_scalar_one();
1790}
1792{
1793 this->test_scalar_minus_one();
1794}
1796{
1797 this->test_single_point();
1798}
1800{
1801 this->test_size_thresholds();
1802}
1804{
1805 this->test_duplicate_points();
1806}
1808{
1809 this->test_mixed_zero_scalars();
1810}
1812{
1813 this->test_pippenger_free_function();
1814}
1815TYPED_TEST(ScalarMultiplicationTest, PippengerUnsafeFreeFunction)
1816{
1817 this->test_pippenger_unsafe_free_function();
1818}
1820{
1821 this->test_offset_span(/*n_total=*/4096, /*start_index=*/7, /*n_used=*/512, 0x5eedu + 33);
1822 this->test_offset_span(/*n_total=*/8192, /*start_index=*/4097, /*n_used=*/2048, 0x5eedu + 34);
1823}
1825{
1826#ifdef __wasm__
1827 GTEST_SKIP() << "Large synthetic MSM coverage is native-only; WASM coverage comes from integration flows.";
1828#endif
1829 this->test_large_n_non_glv();
1830}
1832{
1833#ifdef __wasm__
1834 GTEST_SKIP() << "Large synthetic MSM coverage is native-only; WASM coverage comes from integration flows.";
1835#endif
1836 this->test_msm_single_digit_mega_run();
1837}
1839{
1840#ifdef __wasm__
1841 GTEST_SKIP() << "Large synthetic MSM coverage is native-only; WASM coverage comes from integration flows.";
1842#endif
1843 this->test_msm_dedup_cap_and_carry();
1844}
1845TYPED_TEST(ScalarMultiplicationTest, MSMDedupManySmallClustersCap)
1846{
1847#ifdef __wasm__
1848 GTEST_SKIP() << "Large synthetic MSM coverage is native-only; WASM coverage comes from integration flows.";
1849#endif
1850 this->test_msm_dedup_many_small_clusters_cap();
1851}
1852
1853// Dispatch-coverage tests for `pippenger_round_parallel`.
1854TYPED_TEST(ScalarMultiplicationTest, PippengerInternalSingleThread)
1855{
1856 this->test_pippenger_internal_single_thread();
1857}
1858TYPED_TEST(ScalarMultiplicationTest, PippengerInternalSingleThreadAtDispatchThresholdPlusOne)
1859{
1860 this->test_pippenger_internal_single_thread_at_dispatch_threshold_plus_one();
1861}
1862TYPED_TEST(ScalarMultiplicationTest, PippengerInternalDispatchThresholdPerThreadCount)
1863{
1864 this->test_pippenger_internal_dispatch_threshold_per_thread_count();
1865}
1866TYPED_TEST(ScalarMultiplicationTest, PippengerInternalOffsetSpanDispatch)
1867{
1868 this->test_pippenger_internal_offset_span_dispatch();
1869}
1870TYPED_TEST(ScalarMultiplicationTest, PippengerInternalAllZeroScalars)
1871{
1872 this->test_pippenger_internal_all_zero_scalars();
1873}
1874TYPED_TEST(ScalarMultiplicationTest, PippengerInternalMixedZeroScalars)
1875{
1876 this->test_pippenger_internal_mixed_zero_scalars();
1877}
1878TYPED_TEST(ScalarMultiplicationTest, PippengerInternalExtremeScalars)
1879{
1880 this->test_pippenger_internal_extreme_scalars();
1881}
1882TYPED_TEST(ScalarMultiplicationTest, TrivialMsmThreadedPerWorkerPaths)
1883{
1884 this->test_trivial_msm_threaded_per_worker_paths();
1885}
1886TYPED_TEST(ScalarMultiplicationTest, PippengerInternalGlvBoundary)
1887{
1888 this->test_pippenger_internal_glv_boundary();
1889}
1890TYPED_TEST(ScalarMultiplicationTest, PippengerInternalMisalignedExternalArena)
1891{
1892 this->test_pippenger_internal_misaligned_external_arena();
1893}
1894TYPED_TEST(ScalarMultiplicationTest, HandleEdgeCasesPointAtInfinity)
1895{
1896 this->test_handle_edge_cases_point_at_infinity();
1897}
1898TYPED_TEST(ScalarMultiplicationTest, HandleEdgeCasesInversePairs)
1899{
1900 this->test_handle_edge_cases_inverse_pairs();
1901}
1902TYPED_TEST(ScalarMultiplicationTest, ExternalGlvDoubledDirect)
1903{
1904#ifdef __wasm__
1905 GTEST_SKIP() << "external_glv_doubled direct coverage is native-only; WASM coverage comes from batch flows.";
1906#endif
1907 this->test_external_glv_doubled_matches_naive();
1908}
1909TYPED_TEST(ScalarMultiplicationTest, GlvExtremeMagnitudeScalars)
1910{
1911 this->test_glv_extreme_magnitude_scalars();
1912}
1913TYPED_TEST(ScalarMultiplicationTest, EffectiveNumBitsBandSmallScalars)
1914{
1915 this->test_effective_num_bits_band_small_scalars();
1916}
1917TYPED_TEST(ScalarMultiplicationTest, DedupLargeClusterCarryAndCaps)
1918{
1919 this->test_dedup_large_cluster_carry_and_caps();
1920}
1921TYPED_TEST(ScalarMultiplicationTest, BatchDriverSharedPathRagged)
1922{
1923#ifdef __wasm__
1924 GTEST_SKIP() << "Large ragged batch coverage is native-only; WASM coverage comes from integration flows.";
1925#endif
1926 this->test_batch_driver_shared_path();
1927}
1928
1929// NOTE: the curve-independent `PartitionByWeight` unit tests that previously lived here
1930// exercised `MSM<>::MSMWorkUnit` / `MSM<>::partition_by_weight` from the OLD radix-sort +
1931// bucket-accumulator pippenger. Both were removed in the round-parallel refactor; the
1932// equivalent multi-MSM work-unit balancing logic has not yet been built (will live in
1933// `pippenger_round_parallel_batched` once Phases 1-6b are complete). The tests are left
1934// out for now and will be rewritten against the batched dispatcher's partitioner.
1935
1936// Variable-c (split-c) Pippenger dispatch — synthetic distributions per spec §"Validation".
1937// These force SPLIT to fire (cliff / decaying / half-zero / all-large) or to fall through
1938// (uniform-random / all-zero) and validate the result against `naive_msm`.
1939template <class Curve> class VariableWindowSplitDispatchTest : public ::testing::Test {
1940 public:
1941 using Group = typename Curve::Group;
1942 using Element = typename Curve::Element;
1945
1946 static AffineElement naive_msm(std::span<ScalarField> input_scalars, std::span<const AffineElement> input_points)
1947 {
1948 return ScalarMultiplicationTest<Curve>::naive_msm(input_scalars, input_points);
1949 }
1950
1951 static std::vector<AffineElement> make_points(size_t n)
1952 {
1953 std::vector<AffineElement> pts(n);
1954 parallel_for_range(n, [&](size_t s, size_t e) {
1955 for (size_t i = s; i < e; ++i) {
1956 pts[i] = Group::one * Curve::ScalarField::random_element(&engine);
1957 }
1958 });
1959 return pts;
1960 }
1961
1962 static ScalarField scalar_below_2pow(size_t bits)
1963 {
1964 // Random scalar with canonical-form msb < `bits`. We pull a random ScalarField
1965 // (Montgomery), reduce to canonical, mask the canonical representation, and
1966 // reconstruct via the canonical-uint256_t constructor (which re-Montgomery-encodes).
1967 // Masking the .data field directly would mask the Montgomery form, producing garbage.
1968 if (bits >= 254) {
1969 return ScalarField::random_element(&engine);
1970 }
1971 ScalarField r = ScalarField::random_element(&engine);
1972 ScalarField canonical = r.from_montgomery_form_reduced();
1973 auto& d = canonical.data;
1974 size_t bits_remaining = bits;
1975 for (size_t l = 0; l < 4; ++l) {
1976 const size_t take = std::min<size_t>(64, bits_remaining);
1977 const uint64_t mask = (take == 64) ? ~uint64_t{ 0 }
1978 : (take == 0) ? uint64_t{ 0 }
1979 : ((uint64_t{ 1 } << take) - 1);
1980 d[l] &= mask;
1981 if (bits_remaining > take) {
1982 bits_remaining -= take;
1983 } else {
1984 bits_remaining = 0;
1985 }
1986 }
1987 return ScalarField(uint256_t(d[0], d[1], d[2], d[3]));
1988 }
1989
1990 static void check_against_naive(std::span<ScalarField> scalars, std::span<const AffineElement> points)
1991 {
1992 AffineElement expected = naive_msm(scalars, points);
1994 EXPECT_EQ(actual, expected);
1995 }
1996
1997 static constexpr size_t kN = 131072;
1998
2000 {
2001 // All scalars < 2^30 plus 16 large scalars (full 254-bit). SPLIT must fire.
2002 constexpr size_t large_count = 16;
2003 auto pts = make_points(kN);
2004 std::vector<ScalarField> ss(kN);
2005 for (size_t i = 0; i < kN - large_count; ++i) {
2006 ss[i] = scalar_below_2pow(30);
2007 }
2008 for (size_t i = kN - large_count; i < kN; ++i) {
2009 ss[i] = ScalarField::random_element(&engine);
2010 }
2011 check_against_naive(ss, pts);
2012 }
2013
2015 {
2016 // Half below-128 + half below-160.
2017 auto pts = make_points(kN);
2018 std::vector<ScalarField> ss(kN);
2019 for (size_t k = 0; k < kN / 2; ++k) {
2020 ss[k] = scalar_below_2pow(128);
2021 }
2022 for (size_t k = kN / 2; k < kN; ++k) {
2023 ss[k] = scalar_below_2pow(160);
2024 }
2025 check_against_naive(ss, pts);
2026 }
2027
2029 {
2030 // Standard random scalars — must hit the NO_SPLIT fall-through.
2031 auto pts = make_points(kN);
2032 std::vector<ScalarField> ss(kN);
2033 for (size_t k = 0; k < kN; ++k) {
2034 ss[k] = ScalarField::random_element(&engine);
2035 }
2036 check_against_naive(ss, pts);
2037 }
2038
2040 {
2041 auto pts = make_points(kN);
2042 std::vector<ScalarField> ss(kN, ScalarField::zero());
2043 AffineElement actual =
2045 EXPECT_TRUE(actual.is_point_at_infinity());
2046 }
2047
2049 {
2050 // Half zero, half full-random.
2051 auto pts = make_points(kN);
2052 std::vector<ScalarField> ss(kN, ScalarField::zero());
2053 for (size_t k = 0; k < kN / 2; ++k) {
2054 ss[k] = ScalarField::random_element(&engine);
2055 }
2056 check_against_naive(ss, pts);
2057 }
2058
2060 {
2061 // Every scalar full-range — NO_SPLIT (Guard A rejects).
2062 auto pts = make_points(kN);
2063 std::vector<ScalarField> ss(kN);
2064 for (size_t k = 0; k < kN; ++k) {
2065 ss[k] = ScalarField::random_element(&engine);
2066 }
2067 check_against_naive(ss, pts);
2068 }
2069
2070 // Synthetic minimal repro for the SPLIT bookkeeping bug:
2071 // half scalars with msb < 64, half full-range. SPLIT may fire (set VAR_WINDOW_FORCE_SPLIT to be sure).
2073 {
2074 auto pts = make_points(kN);
2075 std::vector<ScalarField> ss(kN);
2076 for (size_t k = 0; k < kN / 2; ++k) {
2077 ss[k] = scalar_below_2pow(60);
2078 }
2079 for (size_t k = kN / 2; k < kN; ++k) {
2080 ss[k] = ScalarField::random_element(&engine);
2081 }
2082 check_against_naive(ss, pts);
2083 }
2084
2085 // All scalars with canonical msb < 192. Triggers GLV path's regular (non-shortcut) lattice
2086 // reduction for inputs that fit in 192 bits but not 128 — exposing whether scalars
2087 // strictly below the 128-bit shortcut threshold but with non-trivial msb cause a SPLIT
2088 // bookkeeping bug.
2090 {
2091 auto pts = make_points(kN);
2092 std::vector<ScalarField> ss(kN);
2093 for (size_t k = 0; k < kN; ++k) {
2094 ss[k] = scalar_below_2pow(192);
2095 }
2096 check_against_naive(ss, pts);
2097 }
2098
2099 // Pin-style bitwise-identity check: with VAR_WINDOW_FORCE_SPLIT setting window_bits_lo == window_bits_hi ==
2100 // window_bits_unsplit and b_star at a clean multiple of window_bits_unsplit, the SPLIT path's window decomposition
2101 // is structurally identical to NO_SPLIT. Any divergence in the resulting MSM points to a bookkeeping bug
2102 // (per-region driver, schedule layout, idx_large gating in upper region).
2104 {
2105 auto pts = make_points(kN);
2106 std::vector<ScalarField> ss(kN);
2107 for (size_t k = 0; k < kN; ++k) {
2108 ss[k] = scalar_below_2pow(160);
2109 }
2110 check_against_naive(ss, pts);
2111 }
2112};
2113
2114#ifndef __wasm__
2115using VariableWindowCurveTypes = ::testing::Types<bb::curve::BN254, bb::curve::Grumpkin>;
2117
2119{
2120 this->test_cliff();
2121}
2123{
2124 this->test_decaying();
2125}
2127{
2128 this->test_uniform_random();
2129}
2131{
2132 this->test_all_zero();
2133}
2135{
2136 this->test_half_zero();
2137}
2139{
2140 this->test_all_large();
2141}
2143{
2144 this->test_mid_distribution();
2145}
2147{
2148 this->test_below_192();
2149}
2151{
2152 this->test_force_split_bitwise_identity();
2153}
2154#endif
2155
2156// Non-templated test for explicit small inputs
2157TEST(ScalarMultiplication, SmallInputsExplicit)
2158{
2159 uint256_t x0(0x68df84429941826a, 0xeb08934ed806781c, 0xc14b6a2e4f796a73, 0x08dc1a9a11a3c8db);
2160 uint256_t y0(0x8ae5c31aa997f141, 0xe85f20c504f2c11b, 0x81a94193f3b1ce2b, 0x26f2c37372adb5b7);
2161 uint256_t x1(0x80f5a592d919d32f, 0x1362652b984e51ca, 0xa0b26666f770c2a1, 0x142c6e1964e5c3c5);
2162 uint256_t y1(0xb6c322ebb5ae4bc5, 0xf9fef6c7909c00f8, 0xb37ca1cc9af3b421, 0x1e331c7fa73d6a59);
2163 uint256_t s0(0xe48bf12a24272e08, 0xf8dd0182577f3567, 0xec8fd222b8a6becb, 0x102d76b945612c9b);
2164 uint256_t s1(0x098ae8d69f1e4e9e, 0xb5c8313c0f6040ed, 0xf78041e30cc46c44, 0x1d1e6e0c21892e13);
2165
2166 std::vector<grumpkin::fr> scalars{ s0, s1 };
2167
2170
2172
2173 auto result = scalar_multiplication::MSM<curve::Grumpkin>::msm(points, scalar_span);
2174
2175 grumpkin::g1::element expected = (points[0] * scalars[0]) + (points[1] * scalars[1]);
2176
2177 EXPECT_EQ(result, grumpkin::g1::affine_element(expected));
2178}
#define BB_BENCH_NAME(name)
Definition bb_bench.hpp:264
RAII helper to scope a bb::set_parallel_for_concurrency change to one test.
ConcurrencyScope & operator=(const ConcurrencyScope &)=delete
ConcurrencyScope & operator=(ConcurrencyScope &&)=delete
ConcurrencyScope(const ConcurrencyScope &)=delete
ConcurrencyScope(ConcurrencyScope &&)=delete
void test_offset_span(size_t n_total, size_t start_index, size_t n_used, uint64_t seed)
Validate that a non-zero start_index in the PolynomialSpan is honoured.
void run_batch_driver_paths(bool handle_edge_cases)
typename Curve::ScalarField ScalarField
static std::vector< AffineElement > generators
void test_large_n_non_glv()
Coverage at very large N (exercises the non-GLV path on WASM, where n_input > 2^16 disables the GLV d...
void test_msm_dedup_many_small_clusters_cap()
Stress-test dedup cap fallback across many small clusters.
void test_msm_dedup_cap_and_carry()
Stress-test the dedup pass's worst-case caps and the split-cluster carry.
static std::vector< AffineElement > make_repeated_test_points(size_t num_pts)
static constexpr size_t kMaxBatchPointsPerMSM
static std::vector< ScalarField > scalars
typename Curve::AffineElement AffineElement
static AffineElement naive_msm(std::span< ScalarField > input_scalars, std::span< const AffineElement > input_points)
void test_pippenger_internal_single_thread_at_dispatch_threshold_plus_one()
void test_msm_single_digit_mega_run()
Force every Pippenger window to contain a single mega-run of one digit.
void check_internal_against_naive(size_t n, size_t start_index, const char *label)
static void check_against_naive(std::span< ScalarField > scalars, std::span< const AffineElement > points)
static AffineElement naive_msm(std::span< ScalarField > input_scalars, std::span< const AffineElement > input_points)
typename Curve::AffineElement AffineElement
static ScalarField scalar_below_2pow(size_t bits)
static std::vector< AffineElement > make_points(size_t n)
typename Group::element Element
Definition grumpkin.hpp:63
typename grumpkin::g1 Group
Definition grumpkin.hpp:62
typename Group::affine_element AffineElement
Definition grumpkin.hpp:64
element class. Implements ecc group arithmetic using Jacobian coordinates See https://hyperelliptic....
Definition element.hpp:35
group_elements::affine_element< Fq, Fr, Params > affine_element
Definition group.hpp:44
virtual uint8_t get_random_uint8()=0
virtual uint16_t get_random_uint16()=0
static std::vector< AffineElement > batch_multi_scalar_mul(std::span< const AffineElement > points, std::span< PolynomialSpan< ScalarField > > scalars, bool handle_edge_cases=true, std::span< const uint8_t > dedup_hints={}) noexcept
static AffineElement msm(std::span< const AffineElement > points, PolynomialSpan< const ScalarField > scalars, bool handle_edge_cases=false, bool dedup_hint=false) noexcept
::testing::Types< bb::curve::BN254, bb::curve::Grumpkin > VariableWindowCurveTypes
numeric::RNG & engine
ssize_t offset
Definition engine.cpp:62
RNG & get_debug_randomness(bool reset, std::uint_fast64_t seed)
Definition engine.cpp:245
RNG & get_randomness()
Definition engine.cpp:258
size_t window_bits_tuning_oversub_factor(size_t n_input)
N-dependent oversubscription factor used ONLY for choose_window_bits' target_load formula (not for ac...
Entry point for Barretenberg command-line interface.
Definition api.hpp:5
TYPED_TEST_SUITE(CommitmentKeyTest, Curves)
size_t get_num_cpus()
Definition thread.cpp:33
::testing::Types< curve::BN254, curve::Grumpkin > CurveTypes
TYPED_TEST(CommitmentKeyTest, CommitToZeroPoly)
TEST(BoomerangMegaCircuitBuilder, BasicCircuit)
void set_parallel_for_concurrency(size_t num_cores)
Definition thread.cpp:23
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:141
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
Curve::Element Element
std::span< Fr > span
constexpr field invert() const noexcept