// Barretenberg: src/barretenberg/ecc/scalar_multiplication/scalar_multiplication.test.cpp Source File

#include "scalar_multiplication.hpp"

#include "barretenberg/api/file_io.hpp"

#include "barretenberg/common/thread.hpp"

#include "barretenberg/ecc/curves/bn254/bn254.hpp"

#include "barretenberg/ecc/curves/grumpkin/grumpkin.hpp"

#include "barretenberg/ecc/curves/types.hpp"

#include "barretenberg/ecc/scalar_multiplication/pippenger_arena_layout.hpp"

#include "barretenberg/numeric/random/engine.hpp"

#include "barretenberg/polynomials/polynomial.hpp"

#include "barretenberg/srs/factories/mem_bn254_crs_factory.hpp"

#include <array>

#include <bit>

#include <filesystem>

#include <gtest/gtest.h>


using namespace bb;


namespace {

auto& engine = numeric::get_randomness();


// Test-side replay of the Zone P / Zone W / Zone S bump allocation for a representative
// BN254 MSM shape. Returns true when every allocation in the walk stays inside the byte
// count promised by `compute_arena_bytes_for_msm`, and false as soon as one does not.
//
// Mirrors the live allocator inside `pippenger_round_parallel` exactly; the historical
// drift bugs (cluster_offsets miscount, wasm aligned_local overflow, NO_GLV abort,
// t1 abort) all came from this walk falling out of sync.
//
// Parameters:
//   n_input                     - number of inputs; values below 4 are reported as fitting
//                                 without performing any walk.
//   external_glv_provided       - forwarded to the size estimator; forces the GLV shape
//                                 (2 * n_input entries, 128-bit scalars) and skips the
//                                 n-entry AffineElement buffer in the Zone P prefix.
//   dedup_active                - forwarded to the size estimator and to
//                                 PerWorkerArenaLayout; adds two extra arrays to Zone P.
//   effective_num_bits_for_test - optional scalar bit-width override; 0, or any value above
//                                 the width selected for the shape, is ignored.
//
// The whole walk is repeated for every base misalignment in [0, alignof(AffineElement)).

bool pippenger_bn254_arena_layout_fits_for_test(size_t n_input,

                                                bool external_glv_provided = false,

                                                bool dedup_active = false,

                                                size_t effective_num_bits_for_test = 0) noexcept

{

    using Curve = curve::BN254;

    using ScalarField = typename Curve::ScalarField;

    using Element = typename Curve::Element;

    using AffineElement = typename Curve::AffineElement;

    namespace rpd = scalar_multiplication::round_parallel_detail;


    // Bit length of the scalar field modulus.
    constexpr size_t FULL_NUM_BITS = ScalarField::modulus.get_msb() + 1;

    // NOTE(review): presumably inputs this small never reach the arena-backed path — confirm.
    if (n_input < 4) {

        return true;

    }


    // GLV is used when the caller supplies it or when the input is at most GLV_SMALL_N_THRESHOLD.
    const bool use_glv = external_glv_provided || (n_input <= rpd::GLV_SMALL_N_THRESHOLD);

    // GLV selected by size rather than supplied by the caller.
    const bool inline_glv_double = use_glv && !external_glv_provided;

    // Under GLV the entry count doubles and the scalar width drops to 128 bits.
    const size_t n = use_glv ? 2 * n_input : n_input;

    const size_t NUM_BITS = use_glv ? size_t{ 128 } : FULL_NUM_BITS;

    // The promise under test: every allocation below must fit inside this many bytes.
    const size_t arena_capacity =

        scalar_multiplication::compute_arena_bytes_for_msm<Curve>(n_input, external_glv_provided, dedup_active);

    // NOTE(review): a zero capacity is treated as "nothing to check"; presumably the estimator
    // returns 0 for shapes that do not use the arena — confirm.
    if (arena_capacity == 0) {

        return true;

    }


    // Honour the override only when it is non-zero and does not exceed NUM_BITS.
    const size_t actual_num_bits = (effective_num_bits_for_test == 0 || effective_num_bits_for_test > NUM_BITS)

                                       ? NUM_BITS

                                       : effective_num_bits_for_test;

    // Thread count fed to window-size tuning: CPU count scaled by the oversubscription factor.
    const size_t num_logical_threads_for_c =

        bb::get_num_cpus() * scalar_multiplication::window_bits_tuning_oversub_factor(n_input);

    const size_t window_bits = rpd::choose_window_bits(n, actual_num_bits, n_input, num_logical_threads_for_c);

    const auto sched = rpd::build_var_window_schedule(actual_num_bits, window_bits);

    const size_t num_buckets = (size_t{ 1 } << (window_bits - 1)) + 1;


    using rpd::BATCH_CAPACITY;

    // NOTE(review): these three constants are local copies; presumably they must match the
    // values used by the production allocator — confirm when either side changes.
    constexpr size_t MIN_BATCH_CAPACITY = 32;

    constexpr size_t BATCH_MEM_BUDGET = 32ULL * 1024ULL * 1024ULL;

    constexpr size_t SUBCHUNK_ENTRIES_CAP = 2048;


    // Thread count: the CPU count, capped so each thread gets at least MIN_BATCH_CAPACITY
    // entries, and never below 1.
    const size_t desired_threads = std::max<size_t>(1, bb::get_num_cpus());

    const size_t max_threads_for_min_batch = std::max<size_t>(1, n / MIN_BATCH_CAPACITY);

    const size_t num_threads = std::min(desired_threads, max_threads_for_min_batch);

    const size_t profile_threads = std::max<size_t>(1, bb::get_num_cpus());

    const size_t worker_total = num_threads;


    // Largest bucket count over the nominal window and every window in the schedule.
    size_t B_eff = num_buckets;

    for (size_t w = 0; w < sched.num_windows; ++w) {

        B_eff = std::max(B_eff, static_cast<size_t>(sched.num_buckets[w]));

    }

    // Per-thread share of the (B_eff - 1) buckets, rounded up to a power of two, minimum 2.
    const size_t dense_stride_est =

        std::max<size_t>(2, std::bit_ceil((B_eff > 1) ? ((B_eff - 1 + num_threads - 1) / num_threads) : size_t{ 1 }));

    const size_t bucket_partials_per_window_max = (B_eff > 0) ? (B_eff - 1 + num_threads - 1) : 0;

    // Per-window byte estimate used only to derive windows_per_batch below.
    // NOTE(review): the byte weights (4, 8, 16, 65, 87, 96) are taken as-is; presumably they
    // reproduce the production budget formula — confirm.
    const size_t hist_h_bytes_pw_shared = (size_t{ 4 } * num_threads * B_eff);

    const size_t hist_o_bytes_pw_shared =

        (sizeof(rpd::ChunkOutput<Curve>) * num_threads) + (size_t{ 96 } * num_threads);

    // The two histogram views are budgeted as one slot sized for the larger of them.
    const size_t hist_slot_bytes_pw_shared = std::max(hist_h_bytes_pw_shared, hist_o_bytes_pw_shared);

    const size_t dense_slot_bytes_pw_shared = (size_t{ 65 } * bucket_partials_per_window_max);

    const size_t per_window_bytes_shared =

        hist_slot_bytes_pw_shared + dense_slot_bytes_pw_shared + (size_t{ 8 } * (B_eff + 1)) +

        (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * (num_threads + 1)) + (size_t{ 8 } * num_threads) +

        (size_t{ 8 } * num_threads) + (size_t{ 8 } * num_threads) + (size_t{ 16 } * worker_total) +

        (size_t{ 8 } * num_threads) + (size_t{ 87 } * worker_total * dense_stride_est);

    const size_t capacity_lo = n;

    const size_t per_window_bytes_lo = (size_t{ 4 } * capacity_lo) + per_window_bytes_shared;


    // Largest per-thread chunk, and how many SUBCHUNK_ENTRIES_CAP-sized pieces it splits into.
    const size_t global_max_chunk_len = (n + num_threads - 1) / num_threads;

    const size_t global_max_overflow_per_window =

        (global_max_chunk_len + SUBCHUNK_ENTRIES_CAP - 1) / SUBCHUNK_ENTRIES_CAP;

    const size_t chunk_capacity = std::max(SUBCHUNK_ENTRIES_CAP, 2 * global_max_overflow_per_window);


    const size_t phase_a_cluster_members_cap = std::min(rpd::DEDUP_MAX_MEMBERS, n);

    const size_t phase_a_cluster_offsets_cap = (rpd::DEDUP_MAX_CLUSTERS / num_threads) + 2;


    // Byte estimate for the Zone P prefix walked below: n bytes, plus 32 bytes per entry under
    // GLV, plus 64 bytes per entry for the inline GLV point buffer, plus 1024 bytes per
    // profile thread (sizeof(std::array<uint32_t, 256>)).
    // NOTE(review): 32 and 64 presumably equal sizeof(ScalarField) / sizeof(AffineElement) — confirm.
    const size_t phase_one_prologue_bytes = n + (use_glv ? size_t{ 32 } * n : size_t{ 0 }) +

                                            (inline_glv_double ? size_t{ 64 } * n : size_t{ 0 }) +

                                            (profile_threads * size_t{ 1024 });


    // Layout built with zero windows and zero dense stride; only its per_worker_union_bytes
    // is read, to size the fixed overhead of the batch budget.
    const rpd::PerWorkerArenaLayout<Curve> budget_layout(

        /*chunk_capacity=*/SUBCHUNK_ENTRIES_CAP,

        global_max_overflow_per_window,

        dedup_active,

        phase_a_cluster_members_cap,

        phase_a_cluster_offsets_cap,

        /*windows_per_batch=*/0,

        /*dense_stride_est=*/0);

    const size_t worker_union_bytes_for_budget = budget_layout.per_worker_union_bytes;

    const size_t fixed_overhead = (worker_union_bytes_for_budget * worker_total) +

                                  (size_t{ 96 } * rpd::VAR_WINDOW_MAX_WINDOWS) + (size_t{ 8 } * (num_threads + 1)) +

                                  phase_one_prologue_bytes;

    // Whatever is left of BATCH_MEM_BUDGET after the fixed overhead (saturating at 0).
    const size_t available_budget =

        (BATCH_MEM_BUDGET > fixed_overhead) ? (BATCH_MEM_BUDGET - fixed_overhead) : size_t{ 0 };

    // With no budget left, all windows go in one batch; otherwise as many windows as the
    // budget covers, clamped to [1, sched.num_windows].
    const size_t windows_per_batch = (per_window_bytes_lo == 0 || available_budget == 0)

                                         ? std::max<size_t>(1, sched.num_windows)

                                         : std::min(std::max<size_t>(1, available_budget / per_window_bytes_lo),

                                                    static_cast<size_t>(sched.num_windows));


    // Rounds `off` up to a multiple of `align`; the mask form requires `align` to be a power of two.
    auto align_up = [](size_t off, size_t align) -> size_t { return (off + align - 1) & ~(align - 1); };

    // Appends `bytes` to a running layout offset after aligning it.
    auto layout_add = [&](size_t& off, size_t bytes, size_t align) { off = align_up(off, align) + bytes; };

    // Simulates one bump allocation of `count * size` bytes. Alignment is applied to the
    // absolute address (base_misalign + base_offset + cursor), not to the zone-local cursor.
    // Returns false without moving `cursor` when the allocation would end past `bound`.
    auto bump_fits = [&](size_t count,

                         size_t size,

                         size_t align,

                         size_t& cursor,

                         size_t bound,

                         size_t base_offset,

                         size_t base_misalign) {

        const size_t cur_addr_mod = (base_misalign + base_offset + cursor) & (align - 1);

        const size_t align_delta = (cur_addr_mod == 0) ? size_t{ 0 } : (align - cur_addr_mod);

        const size_t aligned_local = cursor + align_delta;

        const size_t bytes = count * size;

        if (aligned_local + bytes > bound) {

            return false;

        }

        cursor = aligned_local + bytes;

        return true;

    };


    // Repeat the walk for every misalignment of the arena base relative to alignof(AffineElement).
    for (size_t base_misalign = 0; base_misalign < alignof(AffineElement); ++base_misalign) {

        // --- Zone P prefix: bump allocations made from offset 0 of the arena ---
        size_t arena_cursor = 0;

        // n single-byte entries.
        if (!bump_fits(n, sizeof(uint8_t), alignof(uint8_t), arena_cursor, arena_capacity, 0, base_misalign)) {

            return false;

        }

        // One 256-entry uint32_t array per profile thread.
        if (!bump_fits(profile_threads,

                       sizeof(std::array<uint32_t, 256>),

                       alignof(std::array<uint32_t, 256>),

                       arena_cursor,

                       arena_capacity,

                       0,

                       base_misalign)) {

            return false;

        }

        if (use_glv) {

            // n scalars under GLV.
            if (!bump_fits(

                    n, sizeof(ScalarField), alignof(ScalarField), arena_cursor, arena_capacity, 0, base_misalign)) {

                return false;

            }

            // n affine points, only when GLV was selected by size rather than supplied.
            if (inline_glv_double &&

                !bump_fits(

                    n, sizeof(AffineElement), alignof(AffineElement), arena_cursor, arena_capacity, 0, base_misalign)) {

                return false;

            }

        }

        const size_t bytes_P_prefix = arena_cursor;


        // Per-worker layout for the real run: actual chunk capacity, batch size and dense stride.
        const rpd::PerWorkerArenaLayout<Curve> worker_layout(chunk_capacity,

                                                             global_max_overflow_per_window,

                                                             dedup_active,

                                                             phase_a_cluster_members_cap,

                                                             phase_a_cluster_offsets_cap,

                                                             windows_per_batch,

                                                             dense_stride_est);

        constexpr size_t WORKER_SLAB_ALIGN = rpd::PerWorkerArenaLayout<Curve>::WORKER_SLAB_ALIGN;

        const size_t per_worker_bytes = worker_layout.per_worker_bytes;


        // --- Remainder of Zone P: VAR_WINDOW_MAX_WINDOWS Elements, plus the dedup arrays ---
        size_t bytes_P_extra_layout = 0;

        layout_add(bytes_P_extra_layout, sizeof(Element) * rpd::VAR_WINDOW_MAX_WINDOWS, alignof(Element));

        if (dedup_active) {

            layout_add(bytes_P_extra_layout, sizeof(uint32_t) * n, alignof(uint32_t));

            layout_add(bytes_P_extra_layout, sizeof(AffineElement) * rpd::DEDUP_MAX_CLUSTERS, alignof(AffineElement));

        }

        const size_t bytes_P_min = align_up(bytes_P_prefix, alignof(Element)) + bytes_P_extra_layout;

        // Pad Zone P so that its end, as an absolute address (offset + base_misalign), is a
        // multiple of WORKER_SLAB_ALIGN.
        const size_t bytes_P = align_up(bytes_P_min + base_misalign, WORKER_SLAB_ALIGN) - base_misalign;

        // --- Zone W: one fixed-size slab per worker ---
        const size_t bytes_W = per_worker_bytes * worker_total;

        if (bytes_P + bytes_W > arena_capacity) {

            return false;

        }

        // --- Zone S: whatever remains after Zones P and W; cursor is local to the zone ---
        const size_t bytes_S_total = arena_capacity - bytes_P - bytes_W;

        size_t zone_S_cursor = 0;

        const size_t zone_S_base = bytes_P + bytes_W;


        // windows_per_batch * n uint32_t entries.
        const size_t schedule_total = windows_per_batch * capacity_lo;

        if (!bump_fits(schedule_total,

                       sizeof(uint32_t),

                       alignof(uint32_t),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign)) {

            return false;

        }

        // The histogram slot is sized for the larger of two layouts: the 4-byte-per-cell
        // histogram, or ChunkOutput records followed by Elements.
        const size_t hist_h_bytes_total = size_t{ 4 } * windows_per_batch * num_threads * B_eff;

        size_t o_layout_cur = 0;

        o_layout_cur = align_up(o_layout_cur, alignof(rpd::ChunkOutput<Curve>));

        o_layout_cur += sizeof(rpd::ChunkOutput<Curve>) * windows_per_batch * num_threads;

        o_layout_cur = align_up(o_layout_cur, alignof(Element));

        o_layout_cur += sizeof(Element) * num_threads * windows_per_batch;

        // Both slots are expressed as a count of AffineElement-sized cells, rounded up.
        const size_t hist_slot_cells =

            (std::max(hist_h_bytes_total, o_layout_cur) + sizeof(AffineElement) - 1) / sizeof(AffineElement);

        const size_t dense_slot_cells =

            ((size_t{ 65 } * windows_per_batch * bucket_partials_per_window_max) + sizeof(AffineElement) - 1) /

            sizeof(AffineElement);

        // Remaining Zone S allocations, in order: the two AffineElement-cell slots, then eight
        // size_t arrays. The || chain stops at the first allocation that does not fit.
        if (!bump_fits(hist_slot_cells,

                       sizeof(AffineElement),

                       alignof(AffineElement),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign) ||

            !bump_fits(dense_slot_cells,

                       sizeof(AffineElement),

                       alignof(AffineElement),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign) ||

            !bump_fits(windows_per_batch * (B_eff + 1),

                       sizeof(size_t),

                       alignof(size_t),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign) ||

            !bump_fits(windows_per_batch * (num_threads + 1),

                       sizeof(size_t),

                       alignof(size_t),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign) ||

            !bump_fits(windows_per_batch * (num_threads + 1),

                       sizeof(size_t),

                       alignof(size_t),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign) ||

            !bump_fits(windows_per_batch * num_threads,

                       sizeof(size_t),

                       alignof(size_t),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign) ||

            !bump_fits((num_threads * windows_per_batch) + 1,

                       sizeof(size_t),

                       alignof(size_t),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign) ||

            !bump_fits(num_threads + 1,

                       sizeof(size_t),

                       alignof(size_t),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign) ||

            !bump_fits(windows_per_batch * num_threads,

                       sizeof(size_t),

                       alignof(size_t),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign) ||

            !bump_fits(windows_per_batch * num_threads,

                       sizeof(size_t),

                       alignof(size_t),

                       zone_S_cursor,

                       bytes_S_total,

                       zone_S_base,

                       base_misalign)) {

            return false;

        }

    }

    // Every misalignment walked without exceeding the promised capacity.
    return true;

}

} // namespace


template <class Curve> class ScalarMultiplicationTest : public ::testing::Test {

  public:

    using Group = typename Curve::Group;

    using Element = typename Curve::Element;

    using AffineElement = typename Curve::AffineElement;

    using ScalarField = typename Curve::ScalarField;


    static constexpr size_t num_points = 31013;


    // Bounds used by test_batch_multi_scalar_mul. Kept small so num_points (and therefore

    // SetUpTestSuite, which builds num_points random EC points) stays cheap — especially under wasm,

    // where the fixture build previously dominated the whole ecc_tests run.

    static constexpr size_t kMaxBatchMSMs = 32;

    static constexpr size_t kMaxBatchPointsPerMSM = 400;


    // Pinning invariants: these tests walk generators[]/scalars[] without bounds checks beyond an

    // occasional runtime ASSERT_LT. Pin the relationships at compile time so changing any one of

    // these constants in isolation cannot regress into an out-of-bounds walk.

    static_assert(kMaxBatchMSMs * kMaxBatchPointsPerMSM < num_points,

                  "test_batch_multi_scalar_mul can exceed num_points; "

                  "raise num_points or lower kMaxBatchMSMs / kMaxBatchPointsPerMSM");


    static inline std::vector<AffineElement> generators{};

    static inline std::vector<ScalarField> scalars{};


    /**
     * @brief Reference MSM used as the oracle by the tests: sums input_points[i] * input_scalars[i]
     *        with one plain scalar multiplication per term.
     *
     * The term range is cut into get_num_cpus() contiguous slices handed to parallel_for. Each task
     * writes only its own slot of a per-task accumulator array; the slots are then folded serially.
     *
     * @param input_scalars Scalars; the number of terms summed is input_scalars.size().
     * @param input_points  Points, read at the same indices as input_scalars.
     * @return The total, converted to an AffineElement.
     */
    static AffineElement naive_msm(std::span<ScalarField> input_scalars, std::span<const AffineElement> input_points)
    {
        const size_t term_count = input_scalars.size();
        const size_t task_count = get_num_cpus();
        const size_t slice_len = (term_count + task_count - 1) / task_count;

        std::vector<Element> partial_sums(task_count);
        parallel_for(task_count, [&](size_t task) {
            Element slice_sum;
            slice_sum.self_set_infinity();

            // Clamp the slice to the input. A slice that starts at or past the end is empty, so the
            // loop body never runs and the slot keeps the point at infinity.
            const size_t first = task * slice_len;
            const size_t last = std::min((task + 1) * slice_len, term_count);
            for (size_t i = first; i < last; ++i) {
                slice_sum += input_points[i] * input_scalars[i];
            }
            partial_sums[task] = slice_sum;
        });

        // Serial fold of the per-task partial sums.
        Element total = Element();
        total.self_set_infinity();
        for (const auto& slice_sum : partial_sums) {
            total += slice_sum;
        }
        return AffineElement(total);
    }


    static std::vector<AffineElement> make_repeated_test_points(size_t num_pts)

    {

        std::vector<AffineElement> points(num_pts);

        for (size_t i = 0; i < num_pts; ++i) {

            points[i] = generators[i % generators.size()];

        }

        return points;

    }


    static void SetUpTestSuite()

    {

        generators.resize(num_points);

        scalars.resize(num_points);

        parallel_for_range(num_points, [&](size_t start, size_t end) {

            for (size_t i = start; i < end; ++i) {

                generators[i] = Group::one * Curve::ScalarField::random_element(&engine);

                scalars[i] = Curve::ScalarField::random_element(&engine);

            }

        });

        for (size_t i = 0; i < num_points - 1; ++i) {

            ASSERT_EQ(generators[i].x == generators[i + 1].x, false);

        }

    };


    // ======================= Test Methods =======================


    void test_pippenger_low_memory()

    {

        std::span<ScalarField> test_scalars(&scalars[0], num_points);

        AffineElement result =

            scalar_multiplication::MSM<Curve>::msm(generators, PolynomialSpan<ScalarField>(0, test_scalars));

        AffineElement expected = naive_msm(test_scalars, generators);

        EXPECT_EQ(result, expected);

    }


    void test_batch_multi_scalar_mul()

    {

        BB_BENCH_NAME("BatchMultiScalarMul");


        const size_t num_msms = static_cast<size_t>(engine.get_random_uint8()) % kMaxBatchMSMs;

        std::vector<AffineElement> expected(num_msms);


        std::vector<std::vector<ScalarField>> batch_scalars_copies(num_msms);

        std::vector<size_t> start_indices(num_msms);

        std::vector<PolynomialSpan<ScalarField>> batch_scalars_spans;


        size_t vector_offset = 0;

        for (size_t k = 0; k < num_msms; ++k) {

            const size_t num_pts = static_cast<size_t>(engine.get_random_uint16()) % kMaxBatchPointsPerMSM;


            ASSERT_LT(vector_offset + num_pts, num_points);


            batch_scalars_copies[k].resize(num_pts);

            for (size_t i = 0; i < num_pts; ++i) {

                batch_scalars_copies[k][i] = scalars[vector_offset + i];

            }


            start_indices[k] = vector_offset;

            batch_scalars_spans.emplace_back(vector_offset, std::span<ScalarField>(batch_scalars_copies[k]));

            vector_offset += num_pts;


            std::span<const AffineElement> batch_points(&generators[start_indices[k]], num_pts);

            expected[k] = naive_msm(batch_scalars_copies[k], batch_points);

        }


        std::vector<AffineElement> result =

            scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(generators, batch_scalars_spans);


        EXPECT_EQ(result, expected);

    }


    void test_batch_multi_scalar_mul_sparse()

    {

        const size_t num_msms = 10;

        std::vector<AffineElement> expected(num_msms);


        std::vector<std::vector<ScalarField>> batch_scalars(num_msms);

        std::vector<PolynomialSpan<ScalarField>> batch_scalars_spans;


        for (size_t k = 0; k < num_msms; ++k) {

            const size_t num_pts = 33;

            auto& test_scalars = batch_scalars[k];


            test_scalars.resize(num_pts);


            size_t fixture_offset = k * num_pts;


            std::span<const AffineElement> batch_points(&generators[fixture_offset], num_pts);

            for (size_t i = 0; i < 13; ++i) {

                test_scalars[i] = 0;

            }

            for (size_t i = 13; i < 23; ++i) {

                test_scalars[i] = scalars[fixture_offset + i + 13];

            }

            for (size_t i = 23; i < num_pts; ++i) {

                test_scalars[i] = 0;

            }

            batch_scalars_spans.emplace_back(fixture_offset, std::span<ScalarField>(batch_scalars[k]));


            expected[k] = naive_msm(batch_scalars[k], batch_points);

        }


        std::vector<AffineElement> result =

            scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(generators, batch_scalars_spans);


        EXPECT_EQ(result, expected);

    }


    // Larger workload that crosses the batched dispatcher's `total_nonzero > 4096` eligibility

    // threshold so the multi-MSM Phases 1-6b pipeline (REBALANCE path) is exercised, not the

    // per-MSM delegation fallback.


    void test_batch_multi_scalar_mul_large_dense()

    {

        constexpr size_t num_msms = 4;

        constexpr size_t per_msm_n = 1 << 13; // 8192 points per MSM, total = 32768


        std::vector<AffineElement> expected(num_msms);

        std::vector<std::vector<ScalarField>> batch_scalars(num_msms);

        std::vector<PolynomialSpan<ScalarField>> batch_scalars_spans;


        for (size_t k = 0; k < num_msms; ++k) {

            batch_scalars[k].resize(per_msm_n);

            for (size_t i = 0; i < per_msm_n; ++i) {

                // num_msms * per_msm_n = 32768 > num_points (31013); wrap to stay in bounds

                // (matches the ragged test's indexing). Caught as an out-of-bounds read by the

                // _GLIBCXX_DEBUG / ASAN build otherwise.

                batch_scalars[k][i] = scalars[(k * per_msm_n + i) % num_points];

            }

            std::span<const AffineElement> pts(&generators[0], per_msm_n);

            batch_scalars_spans.emplace_back(0, std::span<ScalarField>(batch_scalars[k]));

            expected[k] = naive_msm(batch_scalars[k], pts);

        }


        std::vector<AffineElement> result =

            scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(generators, batch_scalars_spans);


        for (size_t k = 0; k < num_msms; ++k) {

            EXPECT_EQ(result[k], expected[k]) << "MSM " << k << " mismatched";

        }

    }


    // Ragged batch with mixed densities — the workload pattern for translator wires + databus.

    // K=5 MSMs of varying sizes, varying zero density, all sharing the same SRS prefix.


    void test_batch_multi_scalar_mul_ragged()

    {

        const std::vector<size_t> sizes = { 16384, 4096, 8192, 1024, 12000 };

        const size_t num_msms = sizes.size();


        std::vector<AffineElement> expected(num_msms);

        std::vector<std::vector<ScalarField>> batch_scalars(num_msms);

        std::vector<PolynomialSpan<ScalarField>> batch_scalars_spans;


        for (size_t k = 0; k < num_msms; ++k) {

            const size_t n = sizes[k];

            batch_scalars[k].resize(n);

            for (size_t i = 0; i < n; ++i) {

                if ((k == 1 || k == 3) && (i % 4 != 0)) {

                    batch_scalars[k][i] = ScalarField::zero();

                } else {

                    batch_scalars[k][i] = scalars[(k * 17 + i) % num_points];

                }

            }

            std::span<const AffineElement> pts(&generators[0], n);

            batch_scalars_spans.emplace_back(0, std::span<ScalarField>(batch_scalars[k]));

            expected[k] = naive_msm(batch_scalars[k], pts);

        }


        std::vector<AffineElement> result =

            scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(generators, batch_scalars_spans);


        for (size_t k = 0; k < num_msms; ++k) {

            EXPECT_EQ(result[k], expected[k]) << "MSM " << k << " (n=" << sizes[k] << ") mismatched";

        }

    }


    void test_msm()

    {

        const size_t start_index = 1234;

        const size_t num_pts = num_points - start_index;


        PolynomialSpan<ScalarField> scalar_span =

            PolynomialSpan<ScalarField>(start_index, std::span<ScalarField>(&scalars[0], num_pts));

        AffineElement result = scalar_multiplication::MSM<Curve>::msm(generators, scalar_span);


        std::span<AffineElement> points(&generators[start_index], num_pts);

        AffineElement expected = naive_msm(scalar_span.span, points);

        EXPECT_EQ(result, expected);

    }


    void test_msm_all_zeroes()

    {

        const size_t start_index = 1234;

        const size_t num_pts = num_points - start_index;

        std::vector<ScalarField> test_scalars(num_pts, ScalarField::zero());


        PolynomialSpan<ScalarField> scalar_span = PolynomialSpan<ScalarField>(start_index, test_scalars);

        AffineElement result = scalar_multiplication::MSM<Curve>::msm(generators, scalar_span);


        EXPECT_EQ(result, Group::affine_point_at_infinity);

    }


    void test_msm_empty_polynomial()

    {

        std::vector<ScalarField> test_scalars;

        std::vector<AffineElement> input_points;

        PolynomialSpan<ScalarField> scalar_span = PolynomialSpan<ScalarField>(0, test_scalars);

        AffineElement result = scalar_multiplication::MSM<Curve>::msm(input_points, scalar_span);


        EXPECT_EQ(result, Group::affine_point_at_infinity);

    }


    void test_scalars_unchanged_after_msm()

    {

        const size_t num_pts = 100;

        std::vector<ScalarField> test_scalars(num_pts);

        std::vector<ScalarField> scalars_copy(num_pts);


        for (size_t i = 0; i < num_pts; ++i) {

            test_scalars[i] = scalars[i];

            scalars_copy[i] = test_scalars[i];

        }


        std::span<const AffineElement> points(&generators[0], num_pts);

        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);


        scalar_multiplication::MSM<Curve>::msm(points, scalar_span);


        for (size_t i = 0; i < num_pts; ++i) {

            EXPECT_EQ(test_scalars[i], scalars_copy[i]) << "Scalar at index " << i << " was modified";

        }

    }


    void test_scalars_unchanged_after_batch_multi_scalar_mul()

    {

        const size_t num_msms = 3;

        const size_t num_pts = 100;


        std::vector<std::vector<ScalarField>> batch_scalars(num_msms);

        std::vector<std::vector<ScalarField>> scalars_copies(num_msms);

        std::vector<PolynomialSpan<ScalarField>> batch_scalar_spans;


        for (size_t k = 0; k < num_msms; ++k) {

            batch_scalars[k].resize(num_pts);

            scalars_copies[k].resize(num_pts);


            for (size_t i = 0; i < num_pts; ++i) {

                batch_scalars[k][i] = scalars[k * num_pts + i];

                scalars_copies[k][i] = batch_scalars[k][i];

            }


            batch_scalar_spans.emplace_back(k * num_pts, std::span<ScalarField>(batch_scalars[k]));

        }


        scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(generators, batch_scalar_spans);


        for (size_t k = 0; k < num_msms; ++k) {

            for (size_t i = 0; i < num_pts; ++i) {

                EXPECT_EQ(batch_scalars[k][i], scalars_copies[k][i])

                    << "Scalar at MSM " << k << ", index " << i << " was modified";

            }

        }

    }


    void test_scalars_unchanged_after_large_non_glv_msm()

    {

#ifdef __wasm__

        GTEST_SKIP() << "WASM GLV threshold exceeds the fixture size; non-GLV restoration is native-only here.";

#else

        namespace rpd = scalar_multiplication::round_parallel_detail;

        const size_t num_pts = rpd::GLV_SMALL_N_THRESHOLD + 257;

        ASSERT_LE(num_pts, num_points);


        std::vector<ScalarField> test_scalars(num_pts);

        std::vector<ScalarField> scalars_copy(num_pts);

        for (size_t i = 0; i < num_pts; ++i) {

            test_scalars[i] = scalars[i];

            scalars_copy[i] = test_scalars[i];

        }


        std::span<const AffineElement> points(&generators[0], num_pts);

        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);

        scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/false);


        for (size_t i = 0; i < num_pts; ++i) {

            EXPECT_EQ(test_scalars[i], scalars_copy[i]) << "non-GLV scalar at index " << i << " was modified";

        }

#endif

    }


    void test_scalar_one()

    {

        const size_t num_pts = 5;

        std::vector<ScalarField> test_scalars(num_pts, ScalarField::one());

        std::span<const AffineElement> points(&generators[0], num_pts);


        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);

        AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);


        Element expected;

        expected.self_set_infinity();

        for (size_t i = 0; i < num_pts; ++i) {

            expected += points[i];

        }


        EXPECT_EQ(result, AffineElement(expected));

    }


    void test_scalar_minus_one()

    {

        const size_t num_pts = 5;

        std::vector<ScalarField> test_scalars(num_pts, -ScalarField::one());

        std::span<const AffineElement> points(&generators[0], num_pts);


        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);

        AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);


        Element expected;

        expected.self_set_infinity();

        for (size_t i = 0; i < num_pts; ++i) {

            expected -= points[i];

        }


        EXPECT_EQ(result, AffineElement(expected));

    }


    void test_single_point()

    {

        std::vector<ScalarField> test_scalars = { scalars[0] };

        std::span<const AffineElement> points(&generators[0], 1);


        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);

        AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);


        AffineElement expected(points[0] * test_scalars[0]);

        EXPECT_EQ(result, expected);

    }


    void test_size_thresholds()

    {

        std::vector<size_t> test_sizes = { 1, 2, 15, 16, 17, 50, 127, 128, 129, 256, 512 };


        for (size_t num_pts : test_sizes) {

            ASSERT_LE(num_pts, num_points);


            std::vector<ScalarField> test_scalars(num_pts);

            for (size_t i = 0; i < num_pts; ++i) {

                test_scalars[i] = scalars[i];

            }


            std::span<const AffineElement> points(&generators[0], num_pts);

            PolynomialSpan<ScalarField> scalar_span(0, test_scalars);


            AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);

            AffineElement expected = naive_msm(test_scalars, points);


            EXPECT_EQ(result, expected) << "Failed for size " << num_pts;

        }

    }


    void test_duplicate_points()

    {

        // Use enough points to trigger Pippenger (> PIPPENGER_THRESHOLD = 16)

        const size_t num_pts = 32;

        AffineElement base_point = generators[0];


        std::vector<AffineElement> points(num_pts, base_point);

        std::vector<ScalarField> test_scalars(num_pts);

        ScalarField scalar_sum = ScalarField::zero();


        for (size_t i = 0; i < num_pts; ++i) {

            test_scalars[i] = scalars[i];

            scalar_sum += test_scalars[i];

        }


        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);

        // Duplicate points are an edge case (P + P requires doubling, not addition).

        // Must use handle_edge_cases=true for correctness with Pippenger.

        AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/true);


        AffineElement expected(base_point * scalar_sum);

        EXPECT_EQ(result, expected);

    }


    void test_mixed_zero_scalars()

    {

        const size_t num_pts = 100;

        std::vector<ScalarField> test_scalars(num_pts);

        Element expected;

        expected.self_set_infinity();


        for (size_t i = 0; i < num_pts; ++i) {

            if (i % 2 == 0) {

                test_scalars[i] = ScalarField::zero();

            } else {

                test_scalars[i] = scalars[i];

                expected += generators[i] * test_scalars[i];

            }

        }


        std::span<const AffineElement> points(&generators[0], num_pts);

        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);


        AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);

        EXPECT_EQ(result, AffineElement(expected));

    }


    void test_pippenger_free_function()

    {

        const size_t num_pts = 200;

        std::vector<ScalarField> test_scalars(num_pts);

        for (size_t i = 0; i < num_pts; ++i) {

            test_scalars[i] = scalars[i];

        }


        std::span<const AffineElement> points(&generators[0], num_pts);

        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);


        auto result = scalar_multiplication::pippenger<Curve>(scalar_span, points);


        AffineElement expected = naive_msm(test_scalars, points);

        EXPECT_EQ(AffineElement(result), expected);

    }


    void test_pippenger_unsafe_free_function()

    {

        const size_t num_pts = 200;

        std::vector<ScalarField> test_scalars(num_pts);

        for (size_t i = 0; i < num_pts; ++i) {

            test_scalars[i] = scalars[i];

        }


        std::span<const AffineElement> points(&generators[0], num_pts);

        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);


        auto result = scalar_multiplication::pippenger_unsafe<Curve>(scalar_span, points);


        AffineElement expected = naive_msm(test_scalars, points);

        EXPECT_EQ(AffineElement(result), expected);

    }


    void test_offset_span(size_t n_total, size_t start_index, size_t n_used, uint64_t seed)

    {

        auto& rng = numeric::get_debug_randomness(true, seed);

        std::vector<ScalarField> test_scalars(n_total);

        std::vector<AffineElement> input_points(start_index + n_used);

        for (size_t i = 0; i < n_total; ++i) {

            test_scalars[i] = ScalarField::random_element(&rng);

        }

        for (size_t i = 0; i < input_points.size(); ++i) {

            input_points[i] = AffineElement(Element::random_element(&rng));

        }


        PolynomialSpan<const ScalarField> scalar_span{

            start_index, std::span<const ScalarField>{ test_scalars.data() + start_index, n_used }

        };


        Element actual = scalar_multiplication::pippenger_unsafe<Curve>(scalar_span, input_points);


        Element expected;

        expected.self_set_infinity();

        for (size_t i = 0; i < n_used; ++i) {

            expected += input_points[start_index + i] * test_scalars[start_index + i];

        }

        EXPECT_EQ(AffineElement(actual), AffineElement(expected))

            << "Offset MSM mismatch at n_total=" << n_total << " start_index=" << start_index << " n_used=" << n_used;

    }


    void test_large_n_non_glv()

    {

        const size_t num_pts = scalar_multiplication::round_parallel_detail::GLV_SMALL_N_THRESHOLD + 31;

        auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 35);

        std::vector<AffineElement> points(num_pts);

        std::vector<ScalarField> test_scalars(num_pts);

        for (size_t i = 0; i < num_pts; ++i) {

            points[i] = AffineElement(Element::random_element(&rng));

            test_scalars[i] = ScalarField::random_element(&rng);

        }


        PolynomialSpan<ScalarField> scalar_span(0, test_scalars);

        AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);

        AffineElement expected = naive_msm(test_scalars, points);

        EXPECT_EQ(result, expected);

    }


    void test_msm_single_digit_mega_run()

    {

        const size_t num_pts = 100000;

        auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 36);

        std::vector<AffineElement> points(num_pts);

        for (size_t i = 0; i < num_pts; ++i) {

            points[i] = AffineElement(Element::random_element(&rng));

        }

        std::vector<ScalarField> uniform_scalars(num_pts, ScalarField(7));

        PolynomialSpan<ScalarField> scalar_span(0, uniform_scalars);


        AffineElement result = scalar_multiplication::MSM<Curve>::msm(points, scalar_span);

        AffineElement expected =

            naive_msm(std::span<ScalarField>(uniform_scalars), std::span<const AffineElement>(points));

        EXPECT_EQ(result, expected);

    }


    void test_msm_dedup_cap_and_carry()

    {

        const size_t num_pts = 50000;

        // Pick a dedup-eligible scalar: msb >= c (c ≈ 11 for n ≈ 50 000), so any value

        // ≥ 2^11 works. Use 2^200 so msb is firmly large for any c the dispatch picks.

        const ScalarField val = ScalarField(uint256_t(0, 0, 0, uint64_t{ 1 } << (200 - 192))); // 2^200

        std::vector<ScalarField> uniform_scalars(num_pts, val);

        std::vector<AffineElement> points = make_repeated_test_points(num_pts);

        PolynomialSpan<ScalarField> scalar_span(0, uniform_scalars);


        AffineElement result = scalar_multiplication::MSM<Curve>::msm(

            points, scalar_span, /*handle_edge_cases=*/false, /*dedup_hint=*/true);


        AffineElement expected =

            naive_msm(std::span<ScalarField>(uniform_scalars), std::span<const AffineElement>(points));

        EXPECT_EQ(result, expected);

    }


    void test_msm_dedup_many_small_clusters_cap()

    {

        constexpr size_t NUM_CLUSTERS = 12000;

        constexpr size_t CLUSTER_SIZE = 3;

        const size_t num_pts = NUM_CLUSTERS * CLUSTER_SIZE;


        std::vector<ScalarField> scalars;

        scalars.reserve(num_pts);

        const uint256_t high_bit(0, 0, 0, uint64_t{ 1 } << (200 - 192));

        for (size_t i = 0; i < NUM_CLUSTERS; ++i) {

            const ScalarField val = ScalarField(high_bit + uint256_t(i + 1));

            for (size_t j = 0; j < CLUSTER_SIZE; ++j) {

                scalars.push_back(val);

            }

        }


        std::vector<AffineElement> points = make_repeated_test_points(num_pts);

        PolynomialSpan<ScalarField> scalar_span(0, scalars);


        AffineElement result =

            scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/false, true);

        AffineElement expected = naive_msm(std::span<ScalarField>(scalars), std::span<const AffineElement>(points));

        EXPECT_EQ(result, expected);

    }


    // ============================================================================

    // Dispatch-coverage tests for `pippenger_round_parallel`.

    //

    // The function has several branches that need to all be exercised:

    //   * `n_input == 0` → infinity

    //   * `pts_per_thread < MIN_PTS_PER_THREAD_FOR_PIPPENGER` → trivial_msm_threaded

    //         (single-thread → trivial_msm, otherwise straus_msm per worker)

    //   * Otherwise → main pippenger pipeline

    //         - use_glv=true (n_input ≤ GLV_SMALL_N_THRESHOLD)

    //         - use_glv=false (n_input > GLV_SMALL_N_THRESHOLD; only on huge N)

    //   * `external_glv_doubled` provided vs not (drives one of the GLV-split branches)

    //

    // Each test below restores `bb::set_parallel_for_concurrency` to its original

    // value before returning, even if the assertion fails, so subsequent tests are

    // unaffected.

    // ============================================================================


    // Scope guard for the parallel-for concurrency setting: the constructor records `bb::get_num_cpus()` and
    // applies the requested value; the destructor re-applies the recorded value. Neither copyable nor movable.
    class ConcurrencyScope {
      public:
        ConcurrencyScope(const ConcurrencyScope&) = delete;
        ConcurrencyScope(ConcurrencyScope&&) = delete;
        ConcurrencyScope& operator=(const ConcurrencyScope&) = delete;
        ConcurrencyScope& operator=(ConcurrencyScope&&) = delete;

        // Records the current value, then switches to `n`.
        explicit ConcurrencyScope(size_t n)
            : previous_concurrency_(bb::get_num_cpus())
        {
            bb::set_parallel_for_concurrency(n);
        }

        // Puts back the value recorded at construction.
        ~ConcurrencyScope() { bb::set_parallel_for_concurrency(previous_concurrency_); }

      private:
        size_t previous_concurrency_;
    };


    void check_internal_against_naive(size_t n, size_t start_index, const char* label)

    {

        ASSERT_LE(start_index + n, num_points) << label;


        std::span<ScalarField> scalar_subspan(&scalars[start_index], n);

        std::span<const AffineElement> point_subspan(&generators[0], start_index + n);

        PolynomialSpan<const ScalarField> scalar_span{ start_index, scalar_subspan };


        Element actual = scalar_multiplication::pippenger_round_parallel<Curve>(scalar_span, point_subspan);


        Element expected;

        expected.self_set_infinity();

        for (size_t i = 0; i < n; ++i) {

            expected += point_subspan[start_index + i] * scalar_subspan[i];

        }


        EXPECT_EQ(AffineElement(actual), AffineElement(expected))

            << label << " (n=" << n << ", start_index=" << start_index << ")";

    }


    void test_pippenger_internal_single_thread()

    {

        ConcurrencyScope scope(1);

        // n_input == 0: infinity short-circuit.

        {

            std::span<const AffineElement> empty_points;

            std::span<ScalarField> empty_scalars;

            PolynomialSpan<const ScalarField> empty_span{ 0, empty_scalars };

            Element r = scalar_multiplication::pippenger_round_parallel<Curve>(empty_span, empty_points);

            EXPECT_TRUE(r.is_point_at_infinity());

        }

        // Walk N across all dispatch boundaries with a single thread. With 1 thread,

        // pts_per_thread == n; the trivial dispatch fires up to N=23, falls through

        // at N=24+. The fall-through path then runs the affine pippenger with

        // num_threads=1.

        for (size_t n : { size_t{ 1 },

                          size_t{ 2 },

                          size_t{ 3 },

                          size_t{ 4 },

                          size_t{ 23 },

                          size_t{ 24 },

                          size_t{ 25 },

                          size_t{ 32 },

                          size_t{ 64 },

                          size_t{ 100 },

                          size_t{ 192 },

                          size_t{ 1000 } }) {

            check_internal_against_naive(n, 0, "single_thread");

        }

    }


    // With concurrency pinned to 1: compares `pippenger_round_parallel` with the naive sum at
    // MIN_PTS_PER_THREAD_FOR_PIPPENGER + 1 and at a handful of other small sizes.
    void test_pippenger_internal_single_thread_at_dispatch_threshold_plus_one()
    {
        ConcurrencyScope scope(1);
        constexpr size_t kThreshold = scalar_multiplication::MIN_PTS_PER_THREAD_FOR_PIPPENGER;
        constexpr size_t kJustAbove = kThreshold + 1;

        check_internal_against_naive(kJustAbove, 0, "single_thread n=Threshold+1");

        // Also exercise N just below where `chunk_len = n / num_threads = n / 1 = n`
        // approaches MIN_BATCH_CAPACITY=32 — the (now-removed) brittle fallback used
        // to fire here; we want the affine path to still run and produce correct
        // output even with very small chunks.
        constexpr std::array<size_t, 5> kSmallChunkSizes{ kJustAbove, 32, 33, 50, 100 };
        for (const size_t count : kSmallChunkSizes) {
            check_internal_against_naive(count, 0, "single_thread small-chunk");
        }
    }


    // For 2, 4, 8 and 16 threads: compares `pippenger_round_parallel` with the naive sum at
    // threads * MIN_PTS_PER_THREAD_FOR_PIPPENGER and at one point either side of it.
    void test_pippenger_internal_dispatch_threshold_per_thread_count()
    {
        constexpr size_t kThreshold = scalar_multiplication::MIN_PTS_PER_THREAD_FOR_PIPPENGER;
        constexpr std::array<size_t, 4> kThreadCounts{ 2, 4, 8, 16 };

        for (const size_t thread_count : kThreadCounts) {
            ConcurrencyScope scope(thread_count);
            // Dispatch boundary is at n = threads * kThreshold (= pts_per_thread = 24).
            const size_t boundary = thread_count * kThreshold;
            const std::array<size_t, 3> around_boundary{ boundary - 1, boundary, boundary + 1 };
            for (const size_t count : around_boundary) {
                check_internal_against_naive(count, 0, "dispatch_boundary");
            }
        }
    }


    // With 8 threads: compares `pippenger_round_parallel` with the naive sum for three scalar windows that
    // start at a non-zero index.
    void test_pippenger_internal_offset_span_dispatch()
    {
        ConcurrencyScope scope(8);

        struct OffsetCase {
            size_t n;
            size_t start_index;
            const char* label;
        };
        const std::array<OffsetCase, 3> cases{ {
            // Small N (will dispatch to trivial_msm_threaded).
            { 64, 17, "offset small-N" },
            // Just above dispatch threshold (8 threads → boundary at 192).
            { 200, 13, "offset just-above-boundary" },
            // Mid-N falls through into pippenger.
            { 1024, 41, "offset mid-N" },
        } };

        for (const OffsetCase& entry : cases) {
            check_internal_against_naive(entry.n, entry.start_index, entry.label);
        }
    }


    void test_pippenger_internal_all_zero_scalars()

    {

        ConcurrencyScope scope(8);

        // Save and restore the global scalars buffer.

        std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + 1024);

        for (size_t i = 0; i < saved.size(); ++i) {

            scalars[i] = ScalarField::zero();

        }

        for (size_t n : { size_t{ 1 }, size_t{ 24 }, size_t{ 100 }, size_t{ 1000 } }) {

            std::span<ScalarField> sub(&scalars[0], n);

            std::span<const AffineElement> pts(&generators[0], n);

            PolynomialSpan<const ScalarField> sp{ 0, sub };

            Element r = scalar_multiplication::pippenger_round_parallel<Curve>(sp, pts);

            EXPECT_TRUE(r.is_point_at_infinity()) << "all-zero n=" << n;

        }

        // Restore.

        for (size_t i = 0; i < saved.size(); ++i) {

            scalars[i] = saved[i];

        }

    }


    void test_pippenger_internal_mixed_zero_scalars()

    {

        ConcurrencyScope scope(8);

        std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + 1024);

        // Zero out every other scalar.

        for (size_t i = 0; i < 1024; i += 2) {

            scalars[i] = ScalarField::zero();

        }

        for (size_t n : { size_t{ 24 }, size_t{ 100 }, size_t{ 1024 } }) {

            check_internal_against_naive(n, 0, "mixed-zero");

        }

        // Restore.

        for (size_t i = 0; i < saved.size(); ++i) {

            scalars[i] = saved[i];

        }

    }


    void test_pippenger_internal_extreme_scalars()

    {

        ConcurrencyScope scope(8);

        std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + 256);


        // Scalar = 1

        for (auto& s : saved) {

            (void)s;

        }

        for (size_t i = 0; i < 256; ++i) {

            scalars[i] = ScalarField::one();

        }

        check_internal_against_naive(256, 0, "scalar=1");


        // Scalar = -1

        for (size_t i = 0; i < 256; ++i) {

            scalars[i] = -ScalarField::one();

        }

        check_internal_against_naive(256, 0, "scalar=-1");


        // Restore.

        for (size_t i = 0; i < saved.size(); ++i) {

            scalars[i] = saved[i];

        }

    }


    void test_trivial_msm_threaded_per_worker_paths()

    {

        for (size_t threads : { size_t{ 1 }, size_t{ 2 }, size_t{ 4 }, size_t{ 8 } }) {

            ConcurrencyScope scope(threads);

            for (size_t n : { size_t{ 1 }, size_t{ 2 }, size_t{ 8 }, size_t{ 32 }, size_t{ 80 }, size_t{ 160 } }) {

                std::span<ScalarField> sub(&scalars[0], n);

                std::span<const AffineElement> pts(&generators[0], n);

                PolynomialSpan<const ScalarField> sp{ 0, sub };

                Element actual = scalar_multiplication::trivial_msm_threaded<Curve>(sp, pts);

                Element expected;

                expected.self_set_infinity();

                for (size_t i = 0; i < n; ++i) {

                    expected += pts[i] * sub[i];

                }

                EXPECT_EQ(AffineElement(actual), AffineElement(expected))

                    << "trivial_msm_threaded threads=" << threads << " n=" << n;

            }

        }

    }


    void test_pippenger_internal_glv_boundary()

    {

        ConcurrencyScope scope(8);

#ifdef __wasm__

        constexpr size_t glv_threshold = size_t{ 1 } << 16;

#else

        constexpr size_t glv_threshold = size_t{ 1 } << 13;

#endif

        if (glv_threshold >= num_points) {

            GTEST_SKIP() << "GLV threshold " << glv_threshold << " not exercisable with " << num_points

                         << " precomputed points";

        }

        // Just below threshold: use_glv=true.

        check_internal_against_naive(glv_threshold - 1, 0, "glv-boundary minus-1 (use_glv=true)");

        // Exactly at threshold: use_glv=true (≤ comparison).

        check_internal_against_naive(glv_threshold, 0, "glv-boundary exact (use_glv=true)");

        // Just above: use_glv=false.

        check_internal_against_naive(glv_threshold + 1, 0, "glv-boundary plus-1 (use_glv=false)");

    }


    void test_pippenger_internal_misaligned_external_arena()

    {

        ConcurrencyScope scope(1);

        constexpr size_t kThreshold = scalar_multiplication::MIN_PTS_PER_THREAD_FOR_PIPPENGER;

        for (size_t n : { kThreshold + 1, size_t{ 50 }, size_t{ 100 }, size_t{ 256 } }) {

            std::span<ScalarField> scalar_subspan(&scalars[0], n);

            std::span<const AffineElement> point_subspan(&generators[0], n);

            PolynomialSpan<const ScalarField> scalar_span{ 0, scalar_subspan };


            constexpr size_t kArenaCapacity = size_t{ 64 } * 1024 * 1024;

            std::vector<std::byte> raw(kArenaCapacity + 64);

            // NOLINTNEXTLINE(cppcoreguidelines-pro-type-reinterpret-cast)

            const auto base = reinterpret_cast<uintptr_t>(raw.data());

            const uintptr_t aligned32 = (base + 31) & ~uintptr_t{ 31 };

            std::byte* misaligned = raw.data() + (aligned32 - base) + 16;

            ASSERT_EQ(reinterpret_cast<uintptr_t>(misaligned) % 32, size_t{ 16 });

            std::span<std::byte> external_arena(misaligned, kArenaCapacity);


            Element actual = scalar_multiplication::pippenger_round_parallel<Curve>(

                scalar_span, point_subspan, /*dedup_hint=*/false, {}, external_arena);


            Element expected;

            expected.self_set_infinity();

            for (size_t i = 0; i < n; ++i) {

                expected += point_subspan[i] * scalar_subspan[i];

            }

            EXPECT_EQ(AffineElement(actual), AffineElement(expected)) << "misaligned external arena (n=" << n << ")";

        }

    }


    // ============================================================================

    // Degenerate-input edge cases for the `handle_edge_cases=true` (Jacobian) path,

    // at LARGE N and forced multi-threading.

    //

    // `scalar_multiplication_safe_mode.test.cpp` already covers point-at-infinity,

    // P/-P negation, and all-infinity — but only at tiny N (≤ 60 points), which on

    // native stays single-threaded (the Jacobian path multi-threads only at

    // n ≥ 512). These tests push the same degenerate inputs through the

    // multi-threaded Jacobian split + cross-thread reduction, where a per-slice

    // infinity / equal-x collision is folded into a per-thread partial before the

    // final cross-thread sum. We pin 8 threads so the multi-thread path runs

    // regardless of the CI machine's core count.

    // ============================================================================


    void test_handle_edge_cases_point_at_infinity()

    {

        ConcurrencyScope scope(8);

        auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 101);

#ifdef __wasm__

        const std::vector<size_t> sizes = { 64 };

#else

        // 3000 crosses the Jacobian path's native multi-thread split (>256 pts/thread).

        const std::vector<size_t> sizes = { 64, 3000 };

#endif

        for (size_t n : sizes) {

            std::vector<AffineElement> points(n);

            std::vector<ScalarField> test_scalars(n);

            for (size_t i = 0; i < n; ++i) {

                points[i] = AffineElement(Element::random_element(&rng));

                test_scalars[i] = ScalarField::random_element(&rng);

            }

            for (size_t idx : { size_t{ 0 }, n / 2, n - 1 }) {

                points[idx].self_set_infinity();

            }

            PolynomialSpan<ScalarField> scalar_span(0, test_scalars);

            AffineElement result =

                scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/true);

            AffineElement expected = naive_msm(test_scalars, points);

            EXPECT_EQ(result, expected) << "point-at-infinity inputs (n=" << n << ")";

        }

    }


    void test_handle_edge_cases_inverse_pairs()

    {

        ConcurrencyScope scope(8);

        auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 102);

#ifdef __wasm__

        const std::vector<size_t> pair_counts = { 40 };

#else

        const std::vector<size_t> pair_counts = { 40, 800 };

#endif

        for (size_t pairs : pair_counts) {

            const size_t n = (2 * pairs) + 3; // + a few linearly-independent singletons

            std::vector<AffineElement> points(n);

            std::vector<ScalarField> test_scalars(n);

            for (size_t p = 0; p < pairs; ++p) {

                AffineElement r(Element::random_element(&rng));

                AffineElement neg_r;

                neg_r.x = r.x;

                neg_r.y = -r.y;

                const ScalarField s = ScalarField::random_element(&rng);

                points[2 * p] = r;

                points[(2 * p) + 1] = neg_r;

                test_scalars[2 * p] = s;

                test_scalars[(2 * p) + 1] = s;

            }

            for (size_t i = 2 * pairs; i < n; ++i) {

                points[i] = AffineElement(Element::random_element(&rng));

                test_scalars[i] = ScalarField::random_element(&rng);

            }

            PolynomialSpan<ScalarField> scalar_span(0, test_scalars);

            AffineElement result =

                scalar_multiplication::MSM<Curve>::msm(points, scalar_span, /*handle_edge_cases=*/true);

            AffineElement expected = naive_msm(test_scalars, points);

            EXPECT_EQ(result, expected) << "inverse-pair bucket collisions (pairs=" << pairs << ")";

        }

    }


    // Passes `pippenger_round_parallel` a caller-built "doubled" point table (entry 2i is P_i, entry 2i + 1 is
    // (beta * x_i, -y_i) with beta = BaseField::cube_root_of_unity()) and compares the result with `naive_msm`
    // over the original points.
    //
    // Sizes larger than the fixture's `num_points` are skipped rather than failing the test: the last entry is
    // derived from GLV_SMALL_N_THRESHOLD, and per the skip messages elsewhere in this file that threshold
    // exceeds the fixture size on wasm.
    void test_external_glv_doubled_matches_naive()
    {
        using BaseField = typename Curve::BaseField;
        const BaseField beta = BaseField::cube_root_of_unity();
        namespace rpd = scalar_multiplication::round_parallel_detail;
        const std::vector<size_t> sizes = { 50, 1000, rpd::GLV_SMALL_N_THRESHOLD + 64 };
        for (size_t n : sizes) {
            if (n > num_points) {
                // Not enough precomputed generators/scalars for this size on this build.
                continue;
            }
            std::span<const AffineElement> points(&generators[0], n);
            std::vector<ScalarField> test_scalars(n);
            for (size_t i = 0; i < n; ++i) {
                test_scalars[i] = scalars[i];
            }
            // Interleave each point with its (beta * x, -y) companion.
            std::vector<AffineElement> doubled(2 * n);
            for (size_t i = 0; i < n; ++i) {
                doubled[2 * i] = points[i];
                doubled[(2 * i) + 1].x = points[i].x * beta;
                doubled[(2 * i) + 1].y = -points[i].y;
            }
            PolynomialSpan<const ScalarField> scalar_span(0, std::span<const ScalarField>(test_scalars));
            Element result =
                scalar_multiplication::pippenger_round_parallel<Curve>(scalar_span,
                                                                       points,
                                                                       /*dedup_hint=*/false,
                                                                       std::span<const AffineElement>(doubled));
            AffineElement expected = naive_msm(test_scalars, points);
            EXPECT_EQ(AffineElement(result), expected) << "external_glv_doubled (n=" << n << ")";
        }
    }


    // ============================================================================

    // GLV split / signed-Booth recoder edge cases.

    //

    // The GLV path (use_glv) feeds `split_into_endomorphism_scalars` output into the

    // round-parallel pipeline as 2-limb (≤128-bit) halves k1, k2 with limbs[2],[3]

    // forced to 0 (scalar_multiplication_fast.cpp ~1400). The window schedule is built

    // for NUM_BITS=128 (+2 carry). If any half's true magnitude needed bit 128+ — or if

    // the top signed-Booth window digit carried past the final window — the recoder would

    // silently drop those bits and the MSM result would be wrong while no assert trips.

    //

    // `PippengerInternalExtremeScalars` already pins scalar=±1; this widens coverage to

    // the values that maximise |k1| / |k2|: r-1, (r-1)/2, λ, λ±1, the k2-negative-fix

    // boundary (data[1] high bit set just below 2^128), and a deterministic sweep that

    // checks every split half stays ≤128 bits before trusting the pipeline output. We run

    // every value through the real internal path at an N below the GLV threshold so

    // use_glv=true, comparing to a naive reference.

    // ============================================================================


    void test_glv_extreme_magnitude_scalars()

    {

        ConcurrencyScope scope(8);

        namespace rpd = scalar_multiplication::round_parallel_detail;

        // N small enough to force use_glv=true on both native (2^13) and wasm (2^16),

        // but large enough to clear MIN_PTS_PER_THREAD_FOR_PIPPENGER and run the affine

        // pippenger (not the trivial bail).

        constexpr size_t n = 600;

        static_assert(n <= 256 + 4096, "keep n below the native GLV threshold");


        const uint256_t r = ScalarField::modulus;


        // Worst-case magnitude probes. Each is reduced mod r by the uint256_t ctor.

        std::vector<ScalarField> probes;

        probes.push_back(-ScalarField::one());                                                // r - 1

        probes.push_back(ScalarField(uint256_t(1)));                                          // 1

        probes.push_back(ScalarField(r - uint256_t(1)) * ScalarField(uint256_t(2)).invert()); // (r-1)/2

        // Powers-of-two boundaries around the 128-bit split width.

        for (size_t bit : { size_t{ 126 }, size_t{ 127 }, size_t{ 128 }, size_t{ 129 }, size_t{ 253 } }) {

            probes.push_back(ScalarField(uint256_t(1) << bit));

            probes.push_back(ScalarField((uint256_t(1) << bit) - uint256_t(1)));

            probes.push_back(ScalarField((uint256_t(1) << bit) + uint256_t(1)));

        }

        // r/2 ± small, and r - small: the region where compute_endomorphism_k2 can drive

        // k2 slightly negative (the data[2]||data[3] != 0 → +endo_minus_b1 correction).

        for (uint64_t delta = 0; delta < 8; ++delta) {

            probes.push_back(ScalarField(r - uint256_t(delta + 1)));

            probes.push_back(ScalarField((r >> 1) + uint256_t(delta)));

        }

        // Deterministic pseudo-random fill of additional probes (Knuth multiplicative

        // hash) so the sweep also covers interior values, not just the curated extremes.

        for (uint64_t i = 1; i <= 16; ++i) {

            const uint64_t h0 = i * uint64_t{ 0x9E3779B97F4A7C15ULL };

            const uint64_t h1 = (i + 1) * uint64_t{ 0xC2B2AE3D27D4EB4FULL };

            const uint64_t h2 = (i + 2) * uint64_t{ 0x165667B19E3779F9ULL };

            const uint64_t h3 = (i + 3) * uint64_t{ 0xD6E8FEB86659FD93ULL };

            probes.push_back(ScalarField(uint256_t(h0, h1, h2, h3)));

        }


        // Invariant the whole GLV path leans on: the two 128-bit halves the pipeline stores

        // (limbs[2]/[3] forced to 0) recombine — via the SAME doubled-point convention the

        // MSM uses, φP = (β·x, −y) — back to k·P. If the field ever produced a half needing

        // bit 128+, the 2-limb storage at ~line 1400 would truncate it and this point identity

        // would break, catching the silent wrong result independent of the GLV λ-sign details.

        {

            using BaseField = typename Curve::BaseField;

            const BaseField beta = BaseField::cube_root_of_unity();

            const AffineElement base = generators[0];

            AffineElement phi;

            phi.x = base.x * beta;

            phi.y = -base.y;

            for (const ScalarField& s : probes) {

                const ScalarField canonical = s.from_montgomery_form_reduced();

                const auto split = ScalarField::split_into_endomorphism_scalars(canonical);

                // Rebuild k1, k2 from ONLY the 2 returned limbs (exactly what the pipeline does).

                const ScalarField k1 = ScalarField(uint256_t(split.first[0], split.first[1], 0, 0));

                const ScalarField k2 = ScalarField(uint256_t(split.second[0], split.second[1], 0, 0));

                const Element recombined = Element(base) * k1 + Element(phi) * k2;

                EXPECT_EQ(AffineElement(recombined), AffineElement(Element(base) * s))

                    << "GLV split half exceeded 128-bit storage or mis-signed";

            }

        }


        // Now run each probe through the real internal pipeline (use_glv=true) and compare

        // to naive. We fill all n scalars with the probe so every working half hits the

        // same extreme window, maximising any top-window carry interaction.

        std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + n);

        for (size_t p = 0; p < probes.size(); ++p) {

            for (size_t i = 0; i < n; ++i) {

                scalars[i] = probes[p];

            }

            check_internal_against_naive(n, 0, "glv-extreme-magnitude");

        }

        // Mixed: alternate r-1 and (interior) probes so k1/k2 of adjacent working scalars

        // land in different windows within one MSM.

        for (size_t i = 0; i < n; ++i) {

            scalars[i] = (i & 1) ? -ScalarField::one() : probes[probes.size() - 1 - (i % probes.size())];

        }

        check_internal_against_naive(n, 0, "glv-extreme-mixed");


        for (size_t i = 0; i < saved.size(); ++i) {

            scalars[i] = saved[i];

        }

    }


    // ============================================================================

    // effective_num_bits schedule divergence (sizer vs live allocator).

    //

    // The live path (scalar_multiplication_fast.cpp ~1474) shrinks the window-bit budget

    // to the highest observed scalar msb, then re-runs choose_window_bits — which can pick

    // a SMALLER window_bits (⇒ MORE windows ⇒ a larger wpb·per_window_bytes Zone S) than the

    // full-NUM_BITS pre-sizer. `compute_arena_bytes_for_msm`'s defensive bit-budget sweep

    // (lines 1246-1250) only runs for use_glv OR n_input ≥ 2^17. For the native non-GLV

    // mid-band 2^13 < n < 2^17 the sweep is SKIPPED, so a workload of uniformly small-msb

    // scalars selects a schedule the sizer never sized for. If the live Zone S then exceeds

    // the arena, this is a guaranteed out-of-bounds write (SIGSEGV), not a wrong result.

    //

    // We drive the real MSM in that band with all-small scalars (msb ≪ 254) and compare to

    // naive. A correct sizer makes this pass; an under-count crashes under ASAN / corrupts

    // the answer. Native-only: the band does not exist on wasm (GLV up to 2^16).

    // ============================================================================


    /**
     * @brief Checks the internal MSM against the naive reference for sizes above the GLV threshold
     * when every scalar has a small magnitude.
     *
     * @details For each size (GLV threshold + 1 and 2^14) and each bit width (1, 64, 120) the first
     * `n` fixture scalars are overwritten with random values masked to that width, scalar 0 gets bit
     * `bits - 1` set, and `check_internal_against_naive` is run. The overwritten fixture scalars are
     * written back before returning. Skipped entirely on wasm builds.
     */
    void test_effective_num_bits_band_small_scalars()
    {
#ifdef __wasm__
        GTEST_SKIP() << "non-GLV mid-band (2^13<n<2^17) does not exist on wasm";
#else
        ConcurrencyScope scope(8);
        namespace rpd = scalar_multiplication::round_parallel_detail;
        const size_t glv_threshold = rpd::GLV_SMALL_N_THRESHOLD; // 2^13 native

        // Candidate sizes; anything larger than the fixture's point count is dropped up front.
        std::vector<size_t> usable_sizes;
        for (const size_t candidate : { glv_threshold + 1, size_t{ 1 } << 14 }) {
            if (candidate <= num_points) {
                usable_sizes.push_back(candidate);
            }
        }
        // Scalar bit widths to exercise.
        const std::vector<size_t> bit_widths = { 1, 64, 120 };

        // Back up the longest prefix of the shared scalars that is about to be overwritten.
        size_t backup_len = 0;
        for (const size_t candidate : usable_sizes) {
            backup_len = std::max(backup_len, candidate);
        }
        std::vector<ScalarField> saved(scalars.begin(), scalars.begin() + static_cast<std::ptrdiff_t>(backup_len));

        auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 909);

        // All-ones mask over the low `width` bits (saturates to the full 256 bits).
        const auto low_bits_mask = [](size_t width) -> uint256_t {
            if (width >= 256) {
                return ~uint256_t(0);
            }
            return (uint256_t(1) << width) - uint256_t(1);
        };

        for (const size_t n : usable_sizes) {
            for (const size_t bits : bit_widths) {
                const uint256_t mask = low_bits_mask(bits);
                for (size_t idx = 0; idx < n; ++idx) {
                    uint256_t v(rng.get_random_uint64(),
                                rng.get_random_uint64(),
                                rng.get_random_uint64(),
                                rng.get_random_uint64());
                    v = v & mask;
                    // Only the first scalar has its top bit pinned, so at least one value reaches msb == bits - 1.
                    const bool pin_top_bit = (idx == 0) && (bits >= 1);
                    if (pin_top_bit) {
                        v = v | (uint256_t(1) << (bits - 1));
                    }
                    scalars[idx] = ScalarField(v);
                }
                check_internal_against_naive(n, 0, "effective-num-bits-band");
            }
        }

        // Put the fixture scalars back for whichever test runs next.
        for (size_t idx = 0; idx < saved.size(); ++idx) {
            scalars[idx] = saved[idx];
        }
#endif
    }


    // ============================================================================

    // Dedup multi-chunk tree-reduce carry + cap fallback.

    //

    // pippenger_dedup.hpp's Phase A tree-reduce (lines ~470-520) consolidates each

    // equal-value cluster's base points. A cluster larger than DEDUP_MAX_CHUNK_MEMBERS

    // (2048) is split across chunks and its partial sum threaded via the `carry` slot;

    // a cluster larger than the per-bucket staged cap (~1024) is only PARTIALLY deduped,

    // with the overflow members falling through to the normal pippenger path with their

    // original signed digits. Both the carry-threading and the partial-consolidation

    // fallback must produce the same sum as no dedup at all.

    //

    // We build inputs with one (or a few) huge equal-value clusters of identical points

    // so a single combined point + many fall-through members coexist in one MSM, then

    // compare dedup_hint=true against naive. Sizes are chosen to exceed both the chunk

    // cap (2048) and the staged cap, exercising the carry and the cap fallback in the

    // same run. Native-only for the largest sizes (wasm runs the smaller ones).

    // ============================================================================


    void test_dedup_large_cluster_carry_and_caps()

    {

        ConcurrencyScope scope(8);

        auto& rng = numeric::get_debug_randomness(true, 0x5eedu + 303);


#ifdef __wasm__

        const std::vector<size_t> cluster_sizes = { 2100 };

#else

        // 2100 > DEDUP_MAX_CHUNK_MEMBERS(2048) → multi-chunk carry.

        // 5000 → several carries + staged-cap partial-consolidation fallback.

        const std::vector<size_t> cluster_sizes = { 2100, 5000 };

#endif

        for (size_t cluster : cluster_sizes) {

            // Dedup clusters by equal scalar VALUE, combining the cluster's DISTINCT base

            // points into one (rep, Σpoints) pair. So the giant cluster is `cluster` distinct

            // generators all sharing scalar s_big — the realistic dedup trigger (range-check /

            // counter polynomials) and a valid input for the affine path (distinct x). The

            // tree-reduce then sums distinct points (no equal-x affine-add edge case), and the

            // cluster size drives the multi-chunk carry + staged-cap fallback.

            const size_t singles = 400;

            const size_t medium = 600;

            const size_t n = cluster + medium + singles;

            ASSERT_LE(n, num_points);


            std::vector<ScalarField> test_scalars(n);

            std::vector<AffineElement> points(n);


            const ScalarField s_big = ScalarField::random_element(&rng);

            const ScalarField s_med = ScalarField::random_element(&rng);

            ASSERT_NE(s_big, s_med);


            for (size_t i = 0; i < cluster; ++i) {

                points[i] = generators[i];

                test_scalars[i] = s_big;

            }

            for (size_t i = 0; i < medium; ++i) {

                points[cluster + i] = generators[cluster + i];

                test_scalars[cluster + i] = s_med;

            }

            for (size_t i = 0; i < singles; ++i) {

                points[cluster + medium + i] = generators[cluster + medium + i];

                test_scalars[cluster + medium + i] = ScalarField::random_element(&rng);

            }


            PolynomialSpan<ScalarField> scalar_span(0, test_scalars);

            // dedup_hint=true forces Phase A; handle_edge_cases=false stays on the affine

            // path (all points/distinct-x are valid for it).

            AffineElement deduped = scalar_multiplication::MSM<Curve>::msm(

                points, scalar_span, /*handle_edge_cases=*/false, /*dedup_hint=*/true);

            AffineElement expected = naive_msm(test_scalars, points);

            EXPECT_EQ(deduped, expected) << "dedup large-cluster carry/caps (cluster=" << cluster << ", n=" << n << ")";


            // Cross-check: the same input with dedup_hint=false must agree (dedup is a pure

            // optimisation; a mismatch isolates the regression to the dedup pre-pass).

            PolynomialSpan<ScalarField> scalar_span2(0, test_scalars);

            AffineElement undeduped = scalar_multiplication::MSM<Curve>::msm(

                points, scalar_span2, /*handle_edge_cases=*/false, /*dedup_hint=*/false);

            EXPECT_EQ(deduped, undeduped) << "dedup vs no-dedup divergence (cluster=" << cluster << ")";

        }

    }


    void run_batch_driver_paths(bool handle_edge_cases)

    {

        // 16384 > native GLV threshold (2^13) → its own non-GLV group; the rest are GLV.

        const std::vector<size_t> sizes = { 0, 1, 5, 0, 4096, 33, 0, 16384, 64, 2 };

        const size_t num_msms = sizes.size();


        std::vector<std::vector<ScalarField>> batch_scalars(num_msms);

        std::vector<std::vector<ScalarField>> scalar_copies(num_msms);

        std::vector<PolynomialSpan<ScalarField>> spans;

        std::vector<AffineElement> expected(num_msms);

        std::vector<uint8_t> dedup_hints(num_msms, 0);


        size_t offset = 0;

        for (size_t k = 0; k < num_msms; ++k) {

            const size_t n = sizes[k];

            ASSERT_LE(offset + n, num_points);

            batch_scalars[k].resize(n);

            scalar_copies[k].resize(n);

            const bool all_zero = (k % 4 == 2); // a couple of fully-zero MSMs → infinity result

            for (size_t i = 0; i < n; ++i) {

                batch_scalars[k][i] = all_zero ? ScalarField::zero() : scalars[offset + i];

                scalar_copies[k][i] = batch_scalars[k][i];

            }

            dedup_hints[k] = static_cast<uint8_t>((k % 2 == 0) ? 1 : 0);

            spans.emplace_back(offset, std::span<ScalarField>(batch_scalars[k]));

            std::span<const AffineElement> pts(&generators[offset], n);

            expected[k] = naive_msm(batch_scalars[k], pts);

            offset += n;

        }


        std::vector<AffineElement> result = scalar_multiplication::MSM<Curve>::batch_multi_scalar_mul(

            generators, spans, handle_edge_cases, std::span<const uint8_t>(dedup_hints));


        ASSERT_EQ(result.size(), num_msms);

        for (size_t k = 0; k < num_msms; ++k) {

            EXPECT_EQ(result[k], expected[k]) << "batch MSM " << k << " (n=" << sizes[k]

                                              << ", handle_edge_cases=" << handle_edge_cases << ") mismatched";

            EXPECT_EQ(batch_scalars[k], scalar_copies[k]) << "batch MSM " << k << " scalars were not restored";

        }

    }


    // Runs the ragged batch-driver scenario with handle_edge_cases disabled.
    void test_batch_driver_shared_path() { run_batch_driver_paths(/*handle_edge_cases=*/false); }

};


// Curves for which every ScalarMultiplicationTest typed test is instantiated.
using CurveTypes = ::testing::Types<bb::curve::BN254, bb::curve::Grumpkin>;

TYPED_TEST_SUITE(ScalarMultiplicationTest, CurveTypes);


TEST(ScalarMultiplicationArenaTest, LargeBn254RecursionVkShapeFitsComputedArena)
{
    // CI regression from HonkRecursionConstraintTestWithoutPredicate/2.GenerateVKFromConstraints:
    // Zone S attempted a uint32_t schedule allocation whose aligned end was 26,454,272
    // bytes after the computed arena left only 25,505,329 bytes in Zone S. The log does
    // not expose windows_per_batch, so every plausible divisor of the schedule size is
    // tried as a candidate, each one implying its own n_input.
    constexpr size_t schedule_bytes = 26454272;
    constexpr size_t schedule_slots = schedule_bytes / sizeof(uint32_t);
    constexpr size_t max_effective_bits = 254;
    constexpr std::array<size_t, 8> candidate_window_batches{ 1, 2, 4, 8, 13, 16, 26, 32 };
    constexpr std::array<size_t, 2> thread_counts{ 4, 32 };

    // The layout walk is probed with GLV not provided and dedup inactive.
    const auto expect_fits = [](size_t threads, size_t windows_per_batch, size_t n, size_t effective_num_bits) {
        EXPECT_TRUE(pippenger_bn254_arena_layout_fits_for_test(
            n, /*external_glv_provided=*/false, /*dedup_active=*/false, effective_num_bits))
            << "threads=" << threads << " windows_per_batch=" << windows_per_batch << " n=" << n
            << " effective_num_bits=" << effective_num_bits;
    };

    // The concurrency setting is changed below; remember it so it can be put back at the end.
    const size_t saved_threads = bb::get_num_cpus();

    for (const size_t threads : thread_counts) {
        bb::set_parallel_for_concurrency(threads);
        for (const size_t windows_per_batch : candidate_window_batches) {
            const size_t n = schedule_slots / windows_per_batch;
            for (size_t bits = 1; bits <= max_effective_bits; ++bits) {
                expect_fits(threads, windows_per_batch, n, bits);
            }
        }
    }

    bb::set_parallel_for_concurrency(saved_threads);
}


// Sweeps the sizer/allocator agreement (the historical "arena drift" bug class) across the

// full dispatch parameter space rather than the single recursion-VK shape above: thread count,

// N around every dispatch boundary, GLV-provided vs not, dedup-active vs not, and a range of

// effective bit budgets. `pippenger_bn254_arena_layout_fits_for_test` walks the live

// Zone P / Zone W / Zone S allocator and must always fit inside `compute_arena_bytes_for_msm`'s

// promise; any `false` here is an under-count (a guaranteed Zone overflow / SIGSEGV at runtime).

// Notably this is the first coverage of `dedup_active=true` and `external_glv_provided=true`

// against the sizer.


TEST(ScalarMultiplicationArenaTest, ArenaLayoutFitsAcrossDispatchSpace)
{
    // The concurrency setting is changed throughout; remember it so it can be put back at the end.
    const size_t saved_threads = bb::get_num_cpus();

    constexpr std::array<size_t, 6> thread_counts{ 1, 3, 8, 16, 32, 64 };
    // Dispatch boundaries: MIN_PTS_PER_THREAD (24), powers of two ±1, and both GLV thresholds
    // (2^13 native, 2^16 wasm) with their ±1 neighbours, up to a large 2^18 shape. The 2^16
    // neighbourhood is listed explicitly so the boundary is probed on wasm builds as well.
    constexpr std::array<size_t, 24> ns{ 4,    5,    23,   24,   25,   31,   32,    33,    63,    64,    65,  255,
                                         256,  257,  4095, 4096, 4097, 8191, 8192,  8193,  65535, 65536, 65537,
                                         262144 };
    // 0 is passed through unchanged as the helper's default for effective_num_bits_for_test.
    constexpr std::array<size_t, 4> bit_budgets{ 0, 1, 128, 254 };

    for (const size_t threads : thread_counts) {
        bb::set_parallel_for_concurrency(threads);
        for (const size_t n : ns) {
            for (const bool ext_glv : { false, true }) {
                for (const bool dedup : { false, true }) {
                    for (const size_t bits : bit_budgets) {
                        EXPECT_TRUE(pippenger_bn254_arena_layout_fits_for_test(n, ext_glv, dedup, bits))
                            << "threads=" << threads << " n=" << n << " ext_glv=" << ext_glv << " dedup=" << dedup
                            << " bits=" << bits;
                    }
                }
            }
        }
    }

    // Deterministic pseudo-random N (Knuth multiplicative hash) to catch under-counts that do
    // not sit on a curated boundary. A fixed hash is used instead of a runtime random source so
    // CI runs stay reproducible. The same 32 values of N are reused for every thread count.
    for (const size_t threads : { size_t{ 4 }, size_t{ 8 }, size_t{ 16 }, size_t{ 32 } }) {
        bb::set_parallel_for_concurrency(threads);
        for (size_t i = 1; i <= 32; ++i) {
            const size_t n = 4 + ((i * size_t{ 2654435761ULL }) % (size_t{ 1 } << 20));
            for (const bool dedup : { false, true }) {
                EXPECT_TRUE(pippenger_bn254_arena_layout_fits_for_test(n, /*external_glv_provided=*/false, dedup, 254))
                    << "random n=" << n << " threads=" << threads << " dedup=" << dedup;
            }
        }
    }

    bb::set_parallel_for_concurrency(saved_threads);
}


// ======================= Test Wrappers =======================


// Typed-test wrapper: invokes the fixture's test_pippenger_low_memory() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerLowMemory)

{

    this->test_pippenger_low_memory();

}


// Typed-test wrapper: invokes the fixture's test_batch_multi_scalar_mul() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMul)

{

    this->test_batch_multi_scalar_mul();

}


// Typed-test wrapper: invokes the fixture's test_batch_multi_scalar_mul_sparse() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMulSparse)

{

    this->test_batch_multi_scalar_mul_sparse();

}


// Typed-test wrapper: invokes the fixture's test_batch_multi_scalar_mul_large_dense() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMulLargeDense)

{

    this->test_batch_multi_scalar_mul_large_dense();

}


// Typed-test wrapper: invokes the fixture's test_batch_multi_scalar_mul_ragged() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, BatchMultiScalarMulRagged)

{

    this->test_batch_multi_scalar_mul_ragged();

}


// Typed-test wrapper: invokes the fixture's test_msm() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, MSM)

{

    this->test_msm();

}


// Typed-test wrapper: invokes the fixture's test_msm_all_zeroes() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, MSMAllZeroes)

{

    this->test_msm_all_zeroes();

}


// Typed-test wrapper: invokes the fixture's test_msm_empty_polynomial() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, MSMEmptyPolynomial)

{

    this->test_msm_empty_polynomial();

}


// Typed-test wrapper: invokes the fixture's test_scalars_unchanged_after_msm() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, ScalarsUnchangedAfterMSM)

{

    this->test_scalars_unchanged_after_msm();

}


// Typed-test wrapper: invokes the fixture's test_scalars_unchanged_after_batch_multi_scalar_mul() for every curve
// in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, ScalarsUnchangedAfterBatchMultiScalarMul)

{

    this->test_scalars_unchanged_after_batch_multi_scalar_mul();

}


// Typed-test wrapper: invokes the fixture's test_scalars_unchanged_after_large_non_glv_msm() for every curve
// in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, ScalarsUnchangedAfterLargeNonGlvMSM)

{

    this->test_scalars_unchanged_after_large_non_glv_msm();

}


// Typed-test wrapper: invokes the fixture's test_scalar_one() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, ScalarOne)

{

    this->test_scalar_one();

}


// Typed-test wrapper: invokes the fixture's test_scalar_minus_one() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, ScalarMinusOne)

{

    this->test_scalar_minus_one();

}


// Typed-test wrapper: invokes the fixture's test_single_point() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, SinglePoint)

{

    this->test_single_point();

}


// Typed-test wrapper: invokes the fixture's test_size_thresholds() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, SizeThresholds)

{

    this->test_size_thresholds();

}


// Typed-test wrapper: invokes the fixture's test_duplicate_points() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, DuplicatePoints)

{

    this->test_duplicate_points();

}


// Typed-test wrapper: invokes the fixture's test_mixed_zero_scalars() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, MixedZeroScalars)

{

    this->test_mixed_zero_scalars();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_free_function() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerFreeFunction)

{

    this->test_pippenger_free_function();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_unsafe_free_function() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerUnsafeFreeFunction)

{

    this->test_pippenger_unsafe_free_function();

}


// Typed-test wrapper: runs the fixture's test_offset_span() twice per curve in CurveTypes, with two different
// (n_total, start_index, n_used) shapes and a distinct seed for each.
TYPED_TEST(ScalarMultiplicationTest, OffsetSpan)

{

    this->test_offset_span(/*n_total=*/4096, /*start_index=*/7, /*n_used=*/512, 0x5eedu + 33);

    this->test_offset_span(/*n_total=*/8192, /*start_index=*/4097, /*n_used=*/2048, 0x5eedu + 34);

}


// Native-only typed test: skipped on wasm builds, otherwise runs the fixture's test_large_n_non_glv()
// for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, LargeNNonGLV)
{
#ifdef __wasm__
    GTEST_SKIP() << "Large synthetic MSM coverage is native-only; WASM coverage comes from integration flows.";
#else
    this->test_large_n_non_glv();
#endif
}


// Native-only typed test: skipped on wasm builds, otherwise runs the fixture's test_msm_single_digit_mega_run()
// for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, MSMSingleDigitMegaRun)
{
#ifdef __wasm__
    GTEST_SKIP() << "Large synthetic MSM coverage is native-only; WASM coverage comes from integration flows.";
#else
    this->test_msm_single_digit_mega_run();
#endif
}


// Native-only typed test: skipped on wasm builds, otherwise runs the fixture's test_msm_dedup_cap_and_carry()
// for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, MSMDedupCapAndCarry)
{
#ifdef __wasm__
    GTEST_SKIP() << "Large synthetic MSM coverage is native-only; WASM coverage comes from integration flows.";
#else
    this->test_msm_dedup_cap_and_carry();
#endif
}


// Native-only typed test: skipped on wasm builds, otherwise runs the fixture's
// test_msm_dedup_many_small_clusters_cap() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, MSMDedupManySmallClustersCap)
{
#ifdef __wasm__
    GTEST_SKIP() << "Large synthetic MSM coverage is native-only; WASM coverage comes from integration flows.";
#else
    this->test_msm_dedup_many_small_clusters_cap();
#endif
}


// Dispatch-coverage tests for `pippenger_round_parallel`.


// Typed-test wrapper: invokes the fixture's test_pippenger_internal_single_thread() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerInternalSingleThread)

{

    this->test_pippenger_internal_single_thread();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_internal_single_thread_at_dispatch_threshold_plus_one()
// for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerInternalSingleThreadAtDispatchThresholdPlusOne)

{

    this->test_pippenger_internal_single_thread_at_dispatch_threshold_plus_one();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_internal_dispatch_threshold_per_thread_count()
// for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerInternalDispatchThresholdPerThreadCount)

{

    this->test_pippenger_internal_dispatch_threshold_per_thread_count();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_internal_offset_span_dispatch() for every curve
// in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerInternalOffsetSpanDispatch)

{

    this->test_pippenger_internal_offset_span_dispatch();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_internal_all_zero_scalars() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerInternalAllZeroScalars)

{

    this->test_pippenger_internal_all_zero_scalars();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_internal_mixed_zero_scalars() for every curve
// in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerInternalMixedZeroScalars)

{

    this->test_pippenger_internal_mixed_zero_scalars();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_internal_extreme_scalars() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerInternalExtremeScalars)

{

    this->test_pippenger_internal_extreme_scalars();

}


// Typed-test wrapper: invokes the fixture's test_trivial_msm_threaded_per_worker_paths() for every curve
// in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, TrivialMsmThreadedPerWorkerPaths)

{

    this->test_trivial_msm_threaded_per_worker_paths();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_internal_glv_boundary() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerInternalGlvBoundary)

{

    this->test_pippenger_internal_glv_boundary();

}


// Typed-test wrapper: invokes the fixture's test_pippenger_internal_misaligned_external_arena() for every curve
// in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, PippengerInternalMisalignedExternalArena)

{

    this->test_pippenger_internal_misaligned_external_arena();

}


// Typed-test wrapper: invokes the fixture's test_handle_edge_cases_point_at_infinity() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, HandleEdgeCasesPointAtInfinity)

{

    this->test_handle_edge_cases_point_at_infinity();

}


// Typed-test wrapper: invokes the fixture's test_handle_edge_cases_inverse_pairs() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, HandleEdgeCasesInversePairs)

{

    this->test_handle_edge_cases_inverse_pairs();

}


// Native-only typed test: skipped on wasm builds, otherwise runs the fixture's
// test_external_glv_doubled_matches_naive() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, ExternalGlvDoubledDirect)
{
#ifdef __wasm__
    GTEST_SKIP() << "external_glv_doubled direct coverage is native-only; WASM coverage comes from batch flows.";
#else
    this->test_external_glv_doubled_matches_naive();
#endif
}


// Typed-test wrapper: invokes the fixture's test_glv_extreme_magnitude_scalars() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, GlvExtremeMagnitudeScalars)

{

    this->test_glv_extreme_magnitude_scalars();

}


// Typed-test wrapper: invokes the fixture's test_effective_num_bits_band_small_scalars() for every curve
// in CurveTypes. The fixture method itself skips on wasm builds.
TYPED_TEST(ScalarMultiplicationTest, EffectiveNumBitsBandSmallScalars)

{

    this->test_effective_num_bits_band_small_scalars();

}


// Typed-test wrapper: invokes the fixture's test_dedup_large_cluster_carry_and_caps() for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, DedupLargeClusterCarryAndCaps)

{

    this->test_dedup_large_cluster_carry_and_caps();

}


// Native-only typed test: skipped on wasm builds, otherwise runs the fixture's test_batch_driver_shared_path()
// for every curve in CurveTypes.
TYPED_TEST(ScalarMultiplicationTest, BatchDriverSharedPathRagged)
{
#ifdef __wasm__
    GTEST_SKIP() << "Large ragged batch coverage is native-only; WASM coverage comes from integration flows.";
#else
    this->test_batch_driver_shared_path();
#endif
}


// NOTE: the curve-independent `PartitionByWeight` unit tests that previously lived here

// exercised `MSM<>::MSMWorkUnit` / `MSM<>::partition_by_weight` from the OLD radix-sort +

// bucket-accumulator pippenger. Both were removed in the round-parallel refactor; the

// equivalent multi-MSM work-unit balancing logic has not yet been built (will live in

// `pippenger_round_parallel_batched` once Phases 1-6b are complete). The tests are left

// out for now and will be rewritten against the batched dispatcher's partitioner.


// Variable-c (split-c) Pippenger dispatch — synthetic distributions per spec §"Validation".

// These force SPLIT to fire (cliff / decaying / half-zero / all-large) or to fall through

// (uniform-random / all-zero) and validate the result against `naive_msm`.


/**
 * @brief Typed fixture for the variable-window (split-c) dispatch tests.
 *
 * @details Every test builds kN random points and a synthetic scalar distribution. All tests except
 * test_all_zero compare `MSM<Curve>::msm` with the naive reference; test_all_zero checks for the point
 * at infinity. Remarks about which dispatch path (SPLIT / NO_SPLIT) a distribution is meant to reach
 * state the test's intent; the tests only assert on the MSM result.
 */
template <class Curve> class VariableWindowSplitDispatchTest : public ::testing::Test {
  public:
    using Group = typename Curve::Group;
    using Element = typename Curve::Element;
    using AffineElement = typename Curve::AffineElement;
    using ScalarField = typename Curve::ScalarField;

    /// Number of points and scalars in every MSM built by this fixture (2^17).
    static constexpr size_t kN = 131072;

    /// Reference MSM; forwards to ScalarMultiplicationTest<Curve>::naive_msm.
    static AffineElement naive_msm(std::span<ScalarField> input_scalars, std::span<const AffineElement> input_points)
    {
        return ScalarMultiplicationTest<Curve>::naive_msm(input_scalars, input_points);
    }

    /// Builds `n` points, each the group generator multiplied by a fresh random scalar.
    static std::vector<AffineElement> make_points(size_t n)
    {
        std::vector<AffineElement> result(n);
        parallel_for_range(n, [&result](size_t begin, size_t end) {
            for (size_t idx = begin; idx < end; ++idx) {
                result[idx] = Group::one * ScalarField::random_element(&engine);
            }
        });
        return result;
    }

    /**
     * @brief Random scalar whose canonical (non-Montgomery) value has its msb below `bits`.
     *
     * @details For `bits >= 254` the random element is returned untouched. Otherwise the element is
     * reduced out of Montgomery form, the canonical limbs are masked, and the scalar is rebuilt through
     * the uint256_t constructor. The mask is applied to the canonical limbs rather than to the
     * Montgomery representation, which would not bound the value.
     */
    static ScalarField scalar_below_2pow(size_t bits)
    {
        ScalarField sample = ScalarField::random_element(&engine);
        if (bits >= 254) {
            return sample;
        }
        ScalarField reduced = sample.from_montgomery_form_reduced();
        auto& limbs = reduced.data;
        for (size_t limb = 0; limb < 4; ++limb) {
            // Number of low bits of this 64-bit limb that survive the mask.
            const size_t consumed = 64 * limb;
            const size_t keep = (bits > consumed) ? std::min<size_t>(64, bits - consumed) : 0;
            uint64_t limb_mask = 0;
            if (keep == 64) {
                limb_mask = ~uint64_t{ 0 };
            } else if (keep != 0) {
                limb_mask = (uint64_t{ 1 } << keep) - 1;
            }
            limbs[limb] &= limb_mask;
        }
        return ScalarField(uint256_t(limbs[0], limbs[1], limbs[2], limbs[3]));
    }

    /// Expects `MSM<Curve>::msm` over (points, scalars) to equal the naive reference.
    static void check_against_naive(std::span<ScalarField> scalars, std::span<const AffineElement> points)
    {
        const AffineElement reference = naive_msm(scalars, points);
        const AffineElement computed =
            scalar_multiplication::MSM<Curve>::msm(points, PolynomialSpan<ScalarField>(0, scalars));
        EXPECT_EQ(computed, reference);
    }

    // All scalars < 2^30 plus 16 large scalars (full 254-bit) at the tail. Intended to make SPLIT fire.
    void test_cliff()
    {
        constexpr size_t large_count = 16;
        run_two_band_case(
            kN - large_count,
            [] { return scalar_below_2pow(30); },
            [] { return ScalarField::random_element(&engine); });
    }

    // First half below 2^128, second half below 2^160.
    void test_decaying()
    {
        run_two_band_case(
            kN / 2, [] { return scalar_below_2pow(128); }, [] { return scalar_below_2pow(160); });
    }

    // Standard random scalars. Intended to reach the NO_SPLIT fall-through.
    void test_uniform_random()
    {
        run_uniform_case([] { return ScalarField::random_element(&engine); });
    }

    // Every scalar is zero, so the MSM result must be the point at infinity.
    void test_all_zero()
    {
        auto pts = make_points(kN);
        std::vector<ScalarField> zeros(kN, ScalarField::zero());
        AffineElement result =
            scalar_multiplication::MSM<Curve>::msm(pts, PolynomialSpan<ScalarField>(0, std::span<ScalarField>(zeros)));
        EXPECT_TRUE(result.is_point_at_infinity());
    }

    // First half full-random, second half zero.
    void test_half_zero()
    {
        run_two_band_case(
            kN / 2, [] { return ScalarField::random_element(&engine); }, [] { return ScalarField::zero(); });
    }

    // Every scalar full-range. Intended as a NO_SPLIT case (Guard A rejects).
    void test_all_large()
    {
        run_uniform_case([] { return ScalarField::random_element(&engine); });
    }

    // Synthetic minimal repro for the SPLIT bookkeeping bug: first half with msb < 60, second half
    // full-range. SPLIT may fire (set VAR_WINDOW_FORCE_SPLIT to be sure).
    void test_mid_distribution()
    {
        run_two_band_case(
            kN / 2, [] { return scalar_below_2pow(60); }, [] { return ScalarField::random_element(&engine); });
    }

    // All scalars with canonical msb < 192, i.e. values that fit in 192 bits but generally not in 128.
    // Intended to drive the GLV path's regular (non-shortcut) lattice reduction and expose SPLIT
    // bookkeeping bugs for such inputs.
    void test_below_192()
    {
        run_uniform_case([] { return scalar_below_2pow(192); });
    }

    // Pin-style bitwise-identity check: with VAR_WINDOW_FORCE_SPLIT setting window_bits_lo == window_bits_hi ==
    // window_bits_unsplit and b_star at a clean multiple of window_bits_unsplit, the SPLIT path's window
    // decomposition is structurally identical to NO_SPLIT. Any divergence in the resulting MSM points to a
    // bookkeeping bug (per-region driver, schedule layout, idx_large gating in upper region).
    // The test body itself only feeds scalars below 2^160 and compares with the naive reference.
    void test_force_split_bitwise_identity()
    {
        run_uniform_case([] { return scalar_below_2pow(160); });
    }

  private:
    /**
     * @brief Builds kN points, fills scalars [0, boundary) from `draw_low` and [boundary, kN) from
     * `draw_high` in index order, then compares the MSM with the naive reference.
     */
    template <typename LowDraw, typename HighDraw>
    static void run_two_band_case(size_t boundary, LowDraw draw_low, HighDraw draw_high)
    {
        auto pts = make_points(kN);
        std::vector<ScalarField> ss(kN);
        for (size_t i = 0; i < boundary; ++i) {
            ss[i] = draw_low();
        }
        for (size_t i = boundary; i < kN; ++i) {
            ss[i] = draw_high();
        }
        check_against_naive(ss, pts);
    }

    /// Same as run_two_band_case, with every scalar coming from a single generator.
    template <typename Draw> static void run_uniform_case(Draw draw)
    {
        run_two_band_case(kN, draw, draw);
    }
};


#ifndef __wasm__

// Curves for which every VariableWindowSplitDispatchTest typed test is instantiated.
using VariableWindowCurveTypes = ::testing::Types<bb::curve::BN254, bb::curve::Grumpkin>;

TYPED_TEST_SUITE(VariableWindowSplitDispatchTest, VariableWindowCurveTypes);


// Typed-test wrapper: invokes the fixture's test_cliff() for every curve in VariableWindowCurveTypes.
TYPED_TEST(VariableWindowSplitDispatchTest, Cliff)

{

    this->test_cliff();

}


// Typed-test wrapper: invokes the fixture's test_decaying() for every curve in VariableWindowCurveTypes.
TYPED_TEST(VariableWindowSplitDispatchTest, Decaying)

{

    this->test_decaying();

}


// Typed-test wrapper: invokes the fixture's test_uniform_random() for every curve in VariableWindowCurveTypes.
TYPED_TEST(VariableWindowSplitDispatchTest, UniformRandom)

{

    this->test_uniform_random();

}


// Typed-test wrapper: invokes the fixture's test_all_zero() for every curve in VariableWindowCurveTypes.
TYPED_TEST(VariableWindowSplitDispatchTest, AllZero)

{

    this->test_all_zero();

}


// Typed-test wrapper: invokes the fixture's test_half_zero() for every curve in VariableWindowCurveTypes.
TYPED_TEST(VariableWindowSplitDispatchTest, HalfZero)

{

    this->test_half_zero();

}


// Typed-test wrapper: invokes the fixture's test_all_large() for every curve in VariableWindowCurveTypes.
TYPED_TEST(VariableWindowSplitDispatchTest, AllLarge)

{

    this->test_all_large();

}


// Typed-test wrapper: invokes the fixture's test_mid_distribution() for every curve in VariableWindowCurveTypes.
TYPED_TEST(VariableWindowSplitDispatchTest, MidDistribution)

{

    this->test_mid_distribution();

}


// Typed-test wrapper: invokes the fixture's test_below_192() for every curve in VariableWindowCurveTypes.
TYPED_TEST(VariableWindowSplitDispatchTest, Below192)

{

    this->test_below_192();

}


// Typed-test wrapper: invokes the fixture's test_force_split_bitwise_identity() for every curve in
// VariableWindowCurveTypes.
TYPED_TEST(VariableWindowSplitDispatchTest, ForceSplitBitwiseIdentity)

{

    this->test_force_split_bitwise_identity();

}


#endif


// Non-templated test for explicit small inputs


// Two-point Grumpkin MSM on hard-coded inputs, checked against s0·P0 + s1·P1 computed directly.
TEST(ScalarMultiplication, SmallInputsExplicit)
{
    using Fr = grumpkin::fr;
    using Affine = grumpkin::g1::affine_element;

    // Affine coordinates of the two input points.
    const uint256_t x0(0x68df84429941826a, 0xeb08934ed806781c, 0xc14b6a2e4f796a73, 0x08dc1a9a11a3c8db);
    const uint256_t y0(0x8ae5c31aa997f141, 0xe85f20c504f2c11b, 0x81a94193f3b1ce2b, 0x26f2c37372adb5b7);
    const uint256_t x1(0x80f5a592d919d32f, 0x1362652b984e51ca, 0xa0b26666f770c2a1, 0x142c6e1964e5c3c5);
    const uint256_t y1(0xb6c322ebb5ae4bc5, 0xf9fef6c7909c00f8, 0xb37ca1cc9af3b421, 0x1e331c7fa73d6a59);
    // The two scalars.
    const uint256_t s0(0xe48bf12a24272e08, 0xf8dd0182577f3567, 0xec8fd222b8a6becb, 0x102d76b945612c9b);
    const uint256_t s1(0x098ae8d69f1e4e9e, 0xb5c8313c0f6040ed, 0xf78041e30cc46c44, 0x1d1e6e0c21892e13);

    std::vector<Fr> scalars{ s0, s1 };
    std::vector<Affine> points{ Affine(x0, y0), Affine(x1, y1) };

    PolynomialSpan<Fr> scalar_span(0, scalars);
    auto result = scalar_multiplication::MSM<curve::Grumpkin>::msm(points, scalar_span);

    // The reference value is computed after the MSM call, from the same vectors.
    grumpkin::g1::element expected = (points[0] * scalars[0]) + (points[1] * scalars[1]);

    EXPECT_EQ(result, Affine(expected));
}


BB_BENCH_NAME
#define BB_BENCH_NAME(name)
Definition bb_bench.hpp:264

ScalarMultiplicationTest::ConcurrencyScope
RAII helper to scope a bb::set_parallel_for_concurrency change to one test.
Definition scalar_multiplication.test.cpp:958

ScalarMultiplicationTest::ConcurrencyScope::operator=
ConcurrencyScope & operator=(const ConcurrencyScope &)=delete

ScalarMultiplicationTest::ConcurrencyScope::operator=
ConcurrencyScope & operator=(ConcurrencyScope &&)=delete

ScalarMultiplicationTest::ConcurrencyScope::prev_
size_t prev_
Definition scalar_multiplication.test.cpp:959

ScalarMultiplicationTest::ConcurrencyScope::ConcurrencyScope
ConcurrencyScope(size_t n)
Definition scalar_multiplication.test.cpp:962

ScalarMultiplicationTest::ConcurrencyScope::ConcurrencyScope
ConcurrencyScope(const ConcurrencyScope &)=delete

ScalarMultiplicationTest::ConcurrencyScope::ConcurrencyScope
ConcurrencyScope(ConcurrencyScope &&)=delete

ScalarMultiplicationTest::ConcurrencyScope::~ConcurrencyScope
~ConcurrencyScope()
Definition scalar_multiplication.test.cpp:967

ScalarMultiplicationTest
Definition scalar_multiplication.test.cpp:294

ScalarMultiplicationTest::test_batch_driver_shared_path
void test_batch_driver_shared_path()
Definition scalar_multiplication.test.cpp:1657

ScalarMultiplicationTest::test_offset_span
void test_offset_span(size_t n_total, size_t start_index, size_t n_used, uint64_t seed)
Validate that a non-zero start_index in the PolynomialSpan is honoured.
Definition scalar_multiplication.test.cpp:795

ScalarMultiplicationTest::test_pippenger_low_memory
void test_pippenger_low_memory()
Definition scalar_multiplication.test.cpp:374

ScalarMultiplicationTest::run_batch_driver_paths
void run_batch_driver_paths(bool handle_edge_cases)
Definition scalar_multiplication.test.cpp:1616

ScalarMultiplicationTest::ScalarField
typename Curve::ScalarField ScalarField
Definition scalar_multiplication.test.cpp:299

ScalarMultiplicationTest::test_pippenger_internal_dispatch_threshold_per_thread_count
void test_pippenger_internal_dispatch_threshold_per_thread_count()
Definition scalar_multiplication.test.cpp:1050

ScalarMultiplicationTest::test_msm
void test_msm()
Definition scalar_multiplication.test.cpp:523

ScalarMultiplicationTest::test_pippenger_internal_extreme_scalars
void test_pippenger_internal_extreme_scalars()
Definition scalar_multiplication.test.cpp:1123

ScalarMultiplicationTest::test_pippenger_internal_single_thread
void test_pippenger_internal_single_thread()
Definition scalar_multiplication.test.cpp:1000

ScalarMultiplicationTest::test_dedup_large_cluster_carry_and_caps
void test_dedup_large_cluster_carry_and_caps()
Definition scalar_multiplication.test.cpp:1541

ScalarMultiplicationTest::generators
static std::vector< AffineElement > generators
Definition scalar_multiplication.test.cpp:316

ScalarMultiplicationTest::kMaxBatchMSMs
static constexpr size_t kMaxBatchMSMs
Definition scalar_multiplication.test.cpp:306

ScalarMultiplicationTest::test_mixed_zero_scalars
void test_mixed_zero_scalars()
Definition scalar_multiplication.test.cpp:731

ScalarMultiplicationTest::test_batch_multi_scalar_mul_sparse
void test_batch_multi_scalar_mul_sparse()
Definition scalar_multiplication.test.cpp:419

ScalarMultiplicationTest::test_duplicate_points
void test_duplicate_points()
Definition scalar_multiplication.test.cpp:707

ScalarMultiplicationTest::test_single_point
void test_single_point()
Definition scalar_multiplication.test.cpp:673

ScalarMultiplicationTest::test_batch_multi_scalar_mul_large_dense
void test_batch_multi_scalar_mul_large_dense()
Definition scalar_multiplication.test.cpp:459

ScalarMultiplicationTest::test_scalars_unchanged_after_batch_multi_scalar_mul
void test_scalars_unchanged_after_batch_multi_scalar_mul()
Definition scalar_multiplication.test.cpp:580

ScalarMultiplicationTest::test_large_n_non_glv
void test_large_n_non_glv()
Coverage at very large N (exercises the non-GLV path on WASM, where n_input > 2^16 disables the GLV d...
Definition scalar_multiplication.test.cpp:826

ScalarMultiplicationTest::test_msm_all_zeroes
void test_msm_all_zeroes()
Definition scalar_multiplication.test.cpp:537

ScalarMultiplicationTest::test_msm_dedup_many_small_clusters_cap
void test_msm_dedup_many_small_clusters_cap()
Stress-test dedup cap fallback across many small clusters.
Definition scalar_multiplication.test.cpp:915

ScalarMultiplicationTest::test_pippenger_unsafe_free_function
void test_pippenger_unsafe_free_function()
Definition scalar_multiplication.test.cpp:771

ScalarMultiplicationTest::test_batch_multi_scalar_mul
void test_batch_multi_scalar_mul()
Definition scalar_multiplication.test.cpp:383

ScalarMultiplicationTest::num_points
static constexpr size_t num_points
Definition scalar_multiplication.test.cpp:301

ScalarMultiplicationTest::test_msm_dedup_cap_and_carry
void test_msm_dedup_cap_and_carry()
Stress-test the dedup pass's worst-case caps and the split-cluster carry.
Definition scalar_multiplication.test.cpp:889

ScalarMultiplicationTest::SetUpTestSuite
static void SetUpTestSuite()
Definition scalar_multiplication.test.cpp:357

ScalarMultiplicationTest::test_scalar_minus_one
void test_scalar_minus_one()
Definition scalar_multiplication.test.cpp:655

ScalarMultiplicationTest::test_pippenger_internal_all_zero_scalars
void test_pippenger_internal_all_zero_scalars()
Definition scalar_multiplication.test.cpp:1080

ScalarMultiplicationTest::test_glv_extreme_magnitude_scalars
void test_glv_extreme_magnitude_scalars()
Definition scalar_multiplication.test.cpp:1373

ScalarMultiplicationTest::test_handle_edge_cases_inverse_pairs
void test_handle_edge_cases_inverse_pairs()
Definition scalar_multiplication.test.cpp:1284

ScalarMultiplicationTest::Element
typename Curve::Element Element
Definition scalar_multiplication.test.cpp:297

ScalarMultiplicationTest::test_external_glv_doubled_matches_naive
void test_external_glv_doubled_matches_naive()
Definition scalar_multiplication.test.cpp:1326

ScalarMultiplicationTest::make_repeated_test_points
static std::vector< AffineElement > make_repeated_test_points(size_t num_pts)
Definition scalar_multiplication.test.cpp:348

ScalarMultiplicationTest::kMaxBatchPointsPerMSM
static constexpr size_t kMaxBatchPointsPerMSM
Definition scalar_multiplication.test.cpp:307

ScalarMultiplicationTest::test_msm_empty_polynomial
void test_msm_empty_polynomial()
Definition scalar_multiplication.test.cpp:549

ScalarMultiplicationTest::test_scalars_unchanged_after_large_non_glv_msm
void test_scalars_unchanged_after_large_non_glv_msm()
Definition scalar_multiplication.test.cpp:611

ScalarMultiplicationTest::test_pippenger_internal_misaligned_external_arena
void test_pippenger_internal_misaligned_external_arena()
Definition scalar_multiplication.test.cpp:1206

ScalarMultiplicationTest::test_pippenger_free_function
void test_pippenger_free_function()
Definition scalar_multiplication.test.cpp:754

ScalarMultiplicationTest::Group
typename Curve::Group Group
Definition scalar_multiplication.test.cpp:296

ScalarMultiplicationTest::test_scalars_unchanged_after_msm
void test_scalars_unchanged_after_msm()
Definition scalar_multiplication.test.cpp:559

ScalarMultiplicationTest::test_pippenger_internal_glv_boundary
void test_pippenger_internal_glv_boundary()
Definition scalar_multiplication.test.cpp:1176

ScalarMultiplicationTest::test_handle_edge_cases_point_at_infinity
void test_handle_edge_cases_point_at_infinity()
Definition scalar_multiplication.test.cpp:1253

ScalarMultiplicationTest::test_pippenger_internal_offset_span_dispatch
void test_pippenger_internal_offset_span_dispatch()
Definition scalar_multiplication.test.cpp:1066

ScalarMultiplicationTest::scalars
static std::vector< ScalarField > scalars
Definition scalar_multiplication.test.cpp:317

ScalarMultiplicationTest::test_scalar_one
void test_scalar_one()
Definition scalar_multiplication.test.cpp:637

ScalarMultiplicationTest::test_pippenger_internal_mixed_zero_scalars
void test_pippenger_internal_mixed_zero_scalars()
Definition scalar_multiplication.test.cpp:1103

ScalarMultiplicationTest::AffineElement
typename Curve::AffineElement AffineElement
Definition scalar_multiplication.test.cpp:298

ScalarMultiplicationTest::test_effective_num_bits_band_small_scalars
void test_effective_num_bits_band_small_scalars()
Definition scalar_multiplication.test.cpp:1474

ScalarMultiplicationTest::naive_msm
static AffineElement naive_msm(std::span< ScalarField > input_scalars, std::span< const AffineElement > input_points)
Definition scalar_multiplication.test.cpp:319

ScalarMultiplicationTest::test_batch_multi_scalar_mul_ragged
void test_batch_multi_scalar_mul_ragged()
Definition scalar_multiplication.test.cpp:491

ScalarMultiplicationTest::test_pippenger_internal_single_thread_at_dispatch_threshold_plus_one
void test_pippenger_internal_single_thread_at_dispatch_threshold_plus_one()
Definition scalar_multiplication.test.cpp:1033

ScalarMultiplicationTest::test_msm_single_digit_mega_run
void test_msm_single_digit_mega_run()
Force every Pippenger window to contain a single mega-run of one digit.
Definition scalar_multiplication.test.cpp:855

ScalarMultiplicationTest::test_trivial_msm_threaded_per_worker_paths
void test_trivial_msm_threaded_per_worker_paths()
Definition scalar_multiplication.test.cpp:1152

ScalarMultiplicationTest::check_internal_against_naive
void check_internal_against_naive(size_t n, size_t start_index, const char *label)
Definition scalar_multiplication.test.cpp:977

ScalarMultiplicationTest::test_size_thresholds
void test_size_thresholds()
Definition scalar_multiplication.test.cpp:685

VariableWindowSplitDispatchTest
Definition scalar_multiplication.test.cpp:1939

VariableWindowSplitDispatchTest::ScalarField
typename Curve::ScalarField ScalarField
Definition scalar_multiplication.test.cpp:1944

VariableWindowSplitDispatchTest::check_against_naive
static void check_against_naive(std::span< ScalarField > scalars, std::span< const AffineElement > points)
Definition scalar_multiplication.test.cpp:1990

VariableWindowSplitDispatchTest::kN
static constexpr size_t kN
Definition scalar_multiplication.test.cpp:1997

VariableWindowSplitDispatchTest::test_uniform_random
void test_uniform_random()
Definition scalar_multiplication.test.cpp:2028

VariableWindowSplitDispatchTest::test_force_split_bitwise_identity
void test_force_split_bitwise_identity()
Definition scalar_multiplication.test.cpp:2103

VariableWindowSplitDispatchTest::naive_msm
static AffineElement naive_msm(std::span< ScalarField > input_scalars, std::span< const AffineElement > input_points)
Definition scalar_multiplication.test.cpp:1946

VariableWindowSplitDispatchTest::AffineElement
typename Curve::AffineElement AffineElement
Definition scalar_multiplication.test.cpp:1943

VariableWindowSplitDispatchTest::test_mid_distribution
void test_mid_distribution()
Definition scalar_multiplication.test.cpp:2072

VariableWindowSplitDispatchTest::test_below_192
void test_below_192()
Definition scalar_multiplication.test.cpp:2089

VariableWindowSplitDispatchTest::scalar_below_2pow
static ScalarField scalar_below_2pow(size_t bits)
Definition scalar_multiplication.test.cpp:1962

VariableWindowSplitDispatchTest::Element
typename Curve::Element Element
Definition scalar_multiplication.test.cpp:1942

VariableWindowSplitDispatchTest::test_cliff
void test_cliff()
Definition scalar_multiplication.test.cpp:1999

VariableWindowSplitDispatchTest::test_half_zero
void test_half_zero()
Definition scalar_multiplication.test.cpp:2048

VariableWindowSplitDispatchTest::test_all_zero
void test_all_zero()
Definition scalar_multiplication.test.cpp:2039

VariableWindowSplitDispatchTest::make_points
static std::vector< AffineElement > make_points(size_t n)
Definition scalar_multiplication.test.cpp:1951

VariableWindowSplitDispatchTest::test_all_large
void test_all_large()
Definition scalar_multiplication.test.cpp:2059

VariableWindowSplitDispatchTest::test_decaying
void test_decaying()
Definition scalar_multiplication.test.cpp:2014

VariableWindowSplitDispatchTest::Group
typename Curve::Group Group
Definition scalar_multiplication.test.cpp:1941

bb::curve::BN254
Definition bn254.hpp:16

bb::curve::Grumpkin::Element
typename Group::element Element
Definition grumpkin.hpp:63

bb::curve::Grumpkin::Group
typename grumpkin::g1 Group
Definition grumpkin.hpp:62

bb::curve::Grumpkin::BaseField
bb::fr BaseField
Definition grumpkin.hpp:61

bb::curve::Grumpkin::AffineElement
typename Group::affine_element AffineElement
Definition grumpkin.hpp:64

bb::curve::Grumpkin::ScalarField
bb::fq ScalarField
Definition grumpkin.hpp:60

bb::group_elements::affine_element
Definition affine_element.hpp:27

bb::group_elements::element
element class. Implements ecc group arithmetic using Jacobian coordinates See https://hyperelliptic....
Definition element.hpp:35

bb::group::affine_element
group_elements::affine_element< Fq, Fr, Params > affine_element
Definition group.hpp:44

bb::numeric::RNG::get_random_uint8
virtual uint8_t get_random_uint8()=0

bb::numeric::RNG::get_random_uint16
virtual uint16_t get_random_uint16()=0

bb::numeric::uint256_t
Definition uint256.hpp:32

bb::scalar_multiplication::MSM::batch_multi_scalar_mul
static std::vector< AffineElement > batch_multi_scalar_mul(std::span< const AffineElement > points, std::span< PolynomialSpan< ScalarField > > scalars, bool handle_edge_cases=true, std::span< const uint8_t > dedup_hints={}) noexcept
Definition scalar_multiplication.cpp:679

bb::scalar_multiplication::MSM::msm
static AffineElement msm(std::span< const AffineElement > points, PolynomialSpan< const ScalarField > scalars, bool handle_edge_cases=false, bool dedup_hint=false) noexcept
Definition scalar_multiplication.cpp:670

bn254.hpp

grumpkin.hpp

types.hpp

VariableWindowCurveTypes
::testing::Types< bb::curve::BN254, bb::curve::Grumpkin > VariableWindowCurveTypes
Definition scalar_multiplication.test.cpp:2115

engine
numeric::RNG & engine
Definition eccvm_transcript.test.cpp:282

offset
ssize_t offset
Definition engine.cpp:62

engine.hpp

file_io.hpp

mem_bn254_crs_factory.hpp

bb::numeric::get_debug_randomness
RNG & get_debug_randomness(bool reset, std::uint_fast64_t seed)
Definition engine.cpp:245

bb::numeric::get_randomness
RNG & get_randomness()
Definition engine.cpp:258

bb::scalar_multiplication::round_parallel_detail::BATCH_MEM_BUDGET
constexpr size_t BATCH_MEM_BUDGET
Definition pippenger_arena_layout.hpp:154

bb::scalar_multiplication::round_parallel_detail::MIN_BATCH_CAPACITY
constexpr size_t MIN_BATCH_CAPACITY
Definition pippenger_arena_layout.hpp:151

bb::scalar_multiplication::round_parallel_detail::SUBCHUNK_ENTRIES_CAP
constexpr size_t SUBCHUNK_ENTRIES_CAP
Definition pippenger_arena_layout.hpp:153

bb::scalar_multiplication::round_parallel_detail::GLV_SMALL_N_THRESHOLD
constexpr size_t GLV_SMALL_N_THRESHOLD
Definition scalar_multiplication_fast.hpp:191

bb::scalar_multiplication::MIN_PTS_PER_THREAD_FOR_PIPPENGER
constexpr size_t MIN_PTS_PER_THREAD_FOR_PIPPENGER
Definition scalar_multiplication_fast.hpp:177

bb::scalar_multiplication::window_bits_tuning_oversub_factor
size_t window_bits_tuning_oversub_factor(size_t n_input)
N-dependent oversubscription factor used ONLY for choose_window_bits' target_load formula (not for ac...
Definition scalar_multiplication_fast.cpp:30

bb
Entry point for Barretenberg command-line interface.
Definition api.hpp:5

bb::TYPED_TEST_SUITE
TYPED_TEST_SUITE(CommitmentKeyTest, Curves)

bb::get_num_cpus
size_t get_num_cpus()
Definition thread.cpp:33

bb::CurveTypes
::testing::Types< curve::BN254, curve::Grumpkin > CurveTypes
Definition shplonk.test.cpp:17

bb::TYPED_TEST
TYPED_TEST(CommitmentKeyTest, CommitToZeroPoly)
Definition commitment_key.test.cpp:217

bb::TEST
TEST(BoomerangMegaCircuitBuilder, BasicCircuit)
Definition graph_description_megacircuitbuilder.test.cpp:22

bb::set_parallel_for_concurrency
void set_parallel_for_concurrency(size_t num_cores)
Definition thread.cpp:23

bb::parallel_for
void parallel_for(size_t num_iterations, const std::function< void(size_t)> &func)
Definition thread.cpp:111

bb::parallel_for_range
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:141

std::get
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13

pippenger_arena_layout.hpp

polynomial.hpp

scalar_multiplication.hpp

cursor
size_t cursor
Definition scalar_multiplication_fast.cpp:209

Element
Curve::Element Element
Definition small_msm_matrix.bench.cpp:40

bb::PolynomialSpan
Definition polynomial.hpp:27

bb::PolynomialSpan::span
std::span< Fr > span
Definition polynomial.hpp:29

bb::PolynomialSpan::data
Fr * data()
Definition polynomial.hpp:35

bb::field::invert
constexpr field invert() const noexcept
Definition field_impl.hpp:388

thread.hpp