Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
bb::scalar_multiplication::round_parallel_detail Namespace Reference

Classes

struct  AffineBucketChunkInfo
 
struct  ChunkOutput
 
struct  ConstantineSliceParamsU32
 
struct  DedupResult
 
struct  PerWorkerArenaLayout
 
struct  PhaseACaps
 
struct  PhaseAScratch
 
struct  VariableWindowSchedule
 

Typedefs

using ConstantineSliceParams = BoothSliceParams
 
using SimdU32x4 = uint32_t __attribute__((vector_size(16)))
 

Enumerations

enum class  ConstantineSlicePath : uint8_t { Localised = 0 , Bottom = 1 , Boundary = 2 }
 

Functions

uint32_t choose_window_bits (size_t num_points, size_t num_bits, size_t n_input, size_t num_logical_threads) noexcept
 
VariableWindowSchedule build_var_window_schedule (size_t num_bits, size_t window_bits) noexcept
 
size_t compute_dense_stride (size_t B_eff, size_t num_threads) noexcept
 
size_t compute_bucket_partials_max (size_t B_eff, size_t num_threads) noexcept
 
size_t compute_global_max_overflow_per_window (size_t n, size_t num_threads, size_t subchunk_entries_cap) noexcept
 
template<typename Curve >
size_t compute_per_window_bytes (size_t num_threads, size_t B_eff, size_t n, size_t dense_stride, size_t worker_total) noexcept
 
size_t compute_phase_one_prologue_bytes (size_t n, bool use_glv, bool inline_glv_double, size_t profile_threads) noexcept
 
PhaseACaps compute_phase_a_caps (size_t n, size_t num_threads) noexcept
 
size_t solve_wpb (size_t per_window_bytes, size_t available_budget, size_t W_R) noexcept
 
ConstantineSliceParams compute_constantine_slice_params (size_t bit_offset, size_t window_bits, size_t num_uint64_limbs) noexcept
 
uint32_t get_constantine_packed_digit (const uint64_t *scalar_data, uint32_t lo_limb, uint32_t hi_limb, uint32_t lo_off, uint32_t lo_bits, uint32_t lo_mask, uint32_t hi_mask, bool slice_localised_to_one_u64, size_t window_bits) noexcept
 Read (window_bits+1) bits from scalar_data (uint64 limbs) using precomputed slice params and apply Constantine's signedWindowEncoding to produce a (sign | bucket) packed digit.
 
ConstantineSliceParamsU32 compute_constantine_slice_params_u32 (size_t bit_offset, size_t window_bits, size_t num_u32_limbs) noexcept
 
SimdU32x4 gather_x4_u32 (const uint32_t *p0, const uint32_t *p1, const uint32_t *p2, const uint32_t *p3, uint32_t idx) noexcept
 
void simd_u32x4_store (uint32_t *dst, SimdU32x4 v) noexcept
 
void store_constantine_packed_digits_x4_localised (uint32_t *dst, const uint32_t *scalar_data_0, const uint32_t *scalar_data_1, const uint32_t *scalar_data_2, const uint32_t *scalar_data_3, uint32_t lo_limb, uint32_t lo_off, SimdU32x4 lo_mask_v, SimdU32x4 one_v, SimdU32x4 val_mask, uint32_t window_bits) noexcept
 
void store_constantine_packed_digits_x4_bottom (uint32_t *dst, const uint32_t *scalar_data_0, const uint32_t *scalar_data_1, const uint32_t *scalar_data_2, const uint32_t *scalar_data_3, uint32_t hi_limb, uint32_t lo_bits, SimdU32x4 hi_mask_v, SimdU32x4 one_v, SimdU32x4 val_mask, uint32_t window_bits) noexcept
 
void store_constantine_packed_digits_x4_boundary (uint32_t *dst, const uint32_t *scalar_data_0, const uint32_t *scalar_data_1, const uint32_t *scalar_data_2, const uint32_t *scalar_data_3, uint32_t lo_limb, uint32_t hi_limb, uint32_t lo_off, uint32_t lo_bits, SimdU32x4 lo_mask_v, SimdU32x4 hi_mask_v, SimdU32x4 one_v, SimdU32x4 val_mask, uint32_t window_bits) noexcept
 
ConstantineSlicePath classify_slice_path_u32 (const ConstantineSliceParamsU32 &sp) noexcept
 
uint64_t dedup_scalar_fingerprint (const uint64_t *scalar_data) noexcept
 
size_t dedup_fingerprint_slot (uint64_t fingerprint, size_t mask) noexcept
 
template<typename Curve >
size_t dedup_tree_reduce_in_place (typename Curve::AffineElement *pts, uint32_t *ids, size_t initial_len, typename Curve::AffineElement *scratch_pts, uint32_t *pair_dest, typename Curve::BaseField *inversion_scratch) noexcept
 
template<typename Curve >
size_t dedup_phase_a_worker_hash (const uint32_t *schedule_w0, const size_t *w0_bucket_start, size_t b_lo, size_t b_hi, std::span< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, std::span< typename Curve::AffineElement > extra_points, std::span< uint32_t > redirect_lookup, const uint8_t *msb_per_scalar, size_t c_threshold, uint32_t cid_lo, uint32_t cid_max, PhaseAScratch< Curve > &scratch) noexcept
 
template<typename Curve >
void dedup_patch_schedule_window (uint32_t *__restrict sched_w, size_t *__restrict bucket_start, size_t num_buckets, const uint32_t *__restrict redirect_lookup) noexcept
 
template<typename Curve >
Curve::Element pippenger_round_parallel_jacobian_fast (std::span< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, size_t min_pts_per_thread_override) noexcept
 Small-N fast-path: per-thread Jacobian Pippenger over a partition of the input.
 
template curve::BN254::Element pippenger_round_parallel_jacobian_fast< curve::BN254 > (std::span< const curve::BN254::ScalarField > scalars, std::span< const curve::BN254::AffineElement > points, size_t min_pts_per_thread_override) noexcept
 
template curve::Grumpkin::Element pippenger_round_parallel_jacobian_fast< curve::Grumpkin > (std::span< const curve::Grumpkin::ScalarField > scalars, std::span< const curve::Grumpkin::AffineElement > points, size_t min_pts_per_thread_override) noexcept
 

Variables

constexpr size_t VAR_WINDOW_MAX_WINDOWS = 128
 
constexpr size_t DEDUP_MAX_CLUSTERS = 16384
 
constexpr size_t DEDUP_MAX_MEMBERS = 32768
 
constexpr size_t BATCH_CAPACITY = 256
 
constexpr size_t DEDUP_MAX_CHUNK_MEMBERS = 2048
 
constexpr size_t MIN_BATCH_CAPACITY = 32
 
constexpr size_t MIN_AFFINE_THREAD_RATIO = 2
 
constexpr size_t SUBCHUNK_ENTRIES_CAP = 2048
 
constexpr size_t BATCH_MEM_BUDGET = 32ULL * 1024ULL * 1024ULL
 
constexpr uint32_t SCHEDULE_SIGN_BIT = uint32_t{ 1 } << 31
 
constexpr uint32_t DEDUP_REDIRECT_BIT = uint32_t{ 1 } << 30
 
constexpr uint32_t DEDUP_SKIP_BIT = uint32_t{ 1 } << 29
 
constexpr uint32_t SCHEDULE_INDEX_MASK = DEDUP_SKIP_BIT - 1
 
constexpr uint32_t DEDUP_INVALID_EXTRA = ~uint32_t{ 0 }
 
constexpr size_t GLV_SMALL_N_THRESHOLD = size_t{ 1 } << 13
 

Typedef Documentation

◆ ConstantineSliceParams

◆ SimdU32x4

Definition at line 138 of file pippenger_constantine.hpp.

Enumeration Type Documentation

◆ ConstantineSlicePath

Enumerator
Localised 
Bottom 
Boundary 

Definition at line 306 of file pippenger_constantine.hpp.

Function Documentation

◆ build_var_window_schedule()

VariableWindowSchedule bb::scalar_multiplication::round_parallel_detail::build_var_window_schedule ( size_t  num_bits,
size_t  window_bits 
)
inlinenoexcept

Definition at line 122 of file pippenger_arena_layout.hpp.

◆ choose_window_bits()

uint32_t bb::scalar_multiplication::round_parallel_detail::choose_window_bits ( size_t  num_points,
size_t  num_bits,
size_t  n_input,
size_t  num_logical_threads 
)
inlinenoexcept

Definition at line 76 of file pippenger_arena_layout.hpp.

◆ classify_slice_path_u32()

ConstantineSlicePath bb::scalar_multiplication::round_parallel_detail::classify_slice_path_u32 ( const ConstantineSliceParamsU32 sp)
inlinenoexcept

Definition at line 312 of file pippenger_constantine.hpp.

◆ compute_bucket_partials_max()

size_t bb::scalar_multiplication::round_parallel_detail::compute_bucket_partials_max ( size_t  B_eff,
size_t  num_threads 
)
inlinenoexcept

Definition at line 258 of file pippenger_arena_layout.hpp.

◆ compute_constantine_slice_params()

ConstantineSliceParams bb::scalar_multiplication::round_parallel_detail::compute_constantine_slice_params ( size_t  bit_offset,
size_t  window_bits,
size_t  num_uint64_limbs 
)
inlinenoexcept

Definition at line 49 of file pippenger_constantine.hpp.

◆ compute_constantine_slice_params_u32()

ConstantineSliceParamsU32 bb::scalar_multiplication::round_parallel_detail::compute_constantine_slice_params_u32 ( size_t  bit_offset,
size_t  window_bits,
size_t  num_u32_limbs 
)
inlinenoexcept

Definition at line 157 of file pippenger_constantine.hpp.

◆ compute_dense_stride()

size_t bb::scalar_multiplication::round_parallel_detail::compute_dense_stride ( size_t  B_eff,
size_t  num_threads 
)
inlinenoexcept

Definition at line 250 of file pippenger_arena_layout.hpp.

◆ compute_global_max_overflow_per_window()

size_t bb::scalar_multiplication::round_parallel_detail::compute_global_max_overflow_per_window ( size_t  n,
size_t  num_threads,
size_t  subchunk_entries_cap 
)
inlinenoexcept

Definition at line 264 of file pippenger_arena_layout.hpp.

◆ compute_per_window_bytes()

template<typename Curve >
size_t bb::scalar_multiplication::round_parallel_detail::compute_per_window_bytes ( size_t  num_threads,
size_t  B_eff,
size_t  n,
size_t  dense_stride,
size_t  worker_total 
)
inlinenoexcept

Definition at line 283 of file pippenger_arena_layout.hpp.

◆ compute_phase_a_caps()

PhaseACaps bb::scalar_multiplication::round_parallel_detail::compute_phase_a_caps ( size_t  n,
size_t  num_threads 
)
inlinenoexcept

Definition at line 318 of file pippenger_arena_layout.hpp.

◆ compute_phase_one_prologue_bytes()

size_t bb::scalar_multiplication::round_parallel_detail::compute_phase_one_prologue_bytes ( size_t  n,
bool  use_glv,
bool  inline_glv_double,
size_t  profile_threads 
)
inlinenoexcept

Definition at line 299 of file pippenger_arena_layout.hpp.

◆ dedup_fingerprint_slot()

size_t bb::scalar_multiplication::round_parallel_detail::dedup_fingerprint_slot ( uint64_t  fingerprint,
size_t  mask 
)
inlinenoexcept

Definition at line 60 of file pippenger_dedup.hpp.

◆ dedup_patch_schedule_window()

template<typename Curve >
void bb::scalar_multiplication::round_parallel_detail::dedup_patch_schedule_window ( uint32_t *__restrict  sched_w,
size_t *__restrict  bucket_start,
size_t  num_buckets,
const uint32_t *__restrict  redirect_lookup 
)
inlinenoexcept

Definition at line 554 of file pippenger_dedup.hpp.

◆ dedup_phase_a_worker_hash()

template<typename Curve >
size_t bb::scalar_multiplication::round_parallel_detail::dedup_phase_a_worker_hash ( const uint32_t *  schedule_w0,
const size_t *  w0_bucket_start,
size_t  b_lo,
size_t  b_hi,
std::span< const typename Curve::ScalarField scalars,
std::span< const typename Curve::AffineElement points,
std::span< typename Curve::AffineElement extra_points,
std::span< uint32_t >  redirect_lookup,
const uint8_t *  msb_per_scalar,
size_t  c_threshold,
uint32_t  cid_lo,
uint32_t  cid_max,
PhaseAScratch< Curve > &  scratch 
)
noexcept

Definition at line 232 of file pippenger_dedup.hpp.

◆ dedup_scalar_fingerprint()

uint64_t bb::scalar_multiplication::round_parallel_detail::dedup_scalar_fingerprint ( const uint64_t *  scalar_data)
inlinenoexcept

Definition at line 55 of file pippenger_dedup.hpp.

◆ dedup_tree_reduce_in_place()

template<typename Curve >
size_t bb::scalar_multiplication::round_parallel_detail::dedup_tree_reduce_in_place ( typename Curve::AffineElement pts,
uint32_t *  ids,
size_t  initial_len,
typename Curve::AffineElement scratch_pts,
uint32_t *  pair_dest,
typename Curve::BaseField inversion_scratch 
)
inlinenoexcept

Definition at line 110 of file pippenger_dedup.hpp.

◆ gather_x4_u32()

SimdU32x4 bb::scalar_multiplication::round_parallel_detail::gather_x4_u32 ( const uint32_t *  p0,
const uint32_t *  p1,
const uint32_t *  p2,
const uint32_t *  p3,
uint32_t  idx 
)
inlinenoexcept

Definition at line 198 of file pippenger_constantine.hpp.

◆ get_constantine_packed_digit()

uint32_t bb::scalar_multiplication::round_parallel_detail::get_constantine_packed_digit ( const uint64_t *  scalar_data,
uint32_t  lo_limb,
uint32_t  hi_limb,
uint32_t  lo_off,
uint32_t  lo_bits,
uint32_t  lo_mask,
uint32_t  hi_mask,
bool  slice_localised_to_one_u64,
size_t  window_bits 
)
inlinenoexcept

Read (window_bits+1) bits from scalar_data (uint64 limbs) using precomputed slice params and apply Constantine's signedWindowEncoding to produce a (sign | bucket) packed digit.

Takes the slice params as scalar value parameters rather than a struct reference so the compiler can keep them in registers across the caller loop.

slice_localised_to_one_u64 selects the single-load fast path: ~75% of windows on typical 254-bit scalars (window_bits in [12, 19]) hit this.

Definition at line 66 of file pippenger_constantine.hpp.

◆ pippenger_round_parallel_jacobian_fast()

template<typename Curve >
Curve::Element bb::scalar_multiplication::round_parallel_detail::pippenger_round_parallel_jacobian_fast ( std::span< const typename Curve::ScalarField scalars,
std::span< const typename Curve::AffineElement points,
size_t  min_pts_per_thread_override 
)
noexcept

Small-N fast-path: per-thread Jacobian Pippenger over a partition of the input.

Single-MSM_fast, no-affine-trick Pippenger over window_bits-wide windows.

Bypasses the round-parallel scaffolding (biased recoding, count histogram, prefix sum, scatter, partition, recursive affine bucket reduction) entirely. Caller must have already converted scalars from Montgomery form to standard form. Each thread runs a textbook Pippenger over its slice of the input, with the result summed across threads at the end.

Per round (high-bit slice → low-bit slice):

  • Reset present bitmap.
  • For each point in the thread's range, extract the window_bits-wide scalar slice; if non-zero, either ASSIGN the bucket (Z = 1) on first hit or Element += AffineElement (mixed Jacobian-affine, 7M+4S, no inversion) on subsequent hits.
  • Running suffix sum over populated buckets only.
  • Double the running result by window_bits bits (or remainder for the last round) and add the bucket sum.

No batched-affine path, no modular inversions, no count-sort.

min_pts_per_thread_override lets benchmarks pin behaviour:

  • 0 (default) → use the internal MIN_PTS_PER_THREAD heuristic (256 native, single-threaded on WASM).
  • SIZE_MAX → force single-threaded.
  • 1 → maximally multi-threaded (one worker per logical CPU).

Definition at line 941 of file scalar_multiplication_fast.cpp.

◆ pippenger_round_parallel_jacobian_fast< curve::BN254 >()

template curve::BN254::Element bb::scalar_multiplication::round_parallel_detail::pippenger_round_parallel_jacobian_fast< curve::BN254 > ( std::span< const curve::BN254::ScalarField scalars,
std::span< const curve::BN254::AffineElement points,
size_t  min_pts_per_thread_override 
)
noexcept

◆ pippenger_round_parallel_jacobian_fast< curve::Grumpkin >()

template curve::Grumpkin::Element bb::scalar_multiplication::round_parallel_detail::pippenger_round_parallel_jacobian_fast< curve::Grumpkin > ( std::span< const curve::Grumpkin::ScalarField scalars,
std::span< const curve::Grumpkin::AffineElement points,
size_t  min_pts_per_thread_override 
)
noexcept

◆ simd_u32x4_store()

void bb::scalar_multiplication::round_parallel_detail::simd_u32x4_store ( uint32_t *  dst,
SimdU32x4  v 
)
inlinenoexcept

Definition at line 217 of file pippenger_constantine.hpp.

◆ solve_wpb()

size_t bb::scalar_multiplication::round_parallel_detail::solve_wpb ( size_t  per_window_bytes,
size_t  available_budget,
size_t  W_R 
)
inlinenoexcept

Definition at line 325 of file pippenger_arena_layout.hpp.

◆ store_constantine_packed_digits_x4_bottom()

void bb::scalar_multiplication::round_parallel_detail::store_constantine_packed_digits_x4_bottom ( uint32_t *  dst,
const uint32_t *  scalar_data_0,
const uint32_t *  scalar_data_1,
const uint32_t *  scalar_data_2,
const uint32_t *  scalar_data_3,
uint32_t  hi_limb,
uint32_t  lo_bits,
SimdU32x4  hi_mask_v,
SimdU32x4  one_v,
SimdU32x4  val_mask,
uint32_t  window_bits 
)
inlinenoexcept

Definition at line 254 of file pippenger_constantine.hpp.

◆ store_constantine_packed_digits_x4_boundary()

void bb::scalar_multiplication::round_parallel_detail::store_constantine_packed_digits_x4_boundary ( uint32_t *  dst,
const uint32_t *  scalar_data_0,
const uint32_t *  scalar_data_1,
const uint32_t *  scalar_data_2,
const uint32_t *  scalar_data_3,
uint32_t  lo_limb,
uint32_t  hi_limb,
uint32_t  lo_off,
uint32_t  lo_bits,
SimdU32x4  lo_mask_v,
SimdU32x4  hi_mask_v,
SimdU32x4  one_v,
SimdU32x4  val_mask,
uint32_t  window_bits 
)
inlinenoexcept

Definition at line 276 of file pippenger_constantine.hpp.

◆ store_constantine_packed_digits_x4_localised()

void bb::scalar_multiplication::round_parallel_detail::store_constantine_packed_digits_x4_localised ( uint32_t *  dst,
const uint32_t *  scalar_data_0,
const uint32_t *  scalar_data_1,
const uint32_t *  scalar_data_2,
const uint32_t *  scalar_data_3,
uint32_t  lo_limb,
uint32_t  lo_off,
SimdU32x4  lo_mask_v,
SimdU32x4  one_v,
SimdU32x4  val_mask,
uint32_t  window_bits 
)
inlinenoexcept

Definition at line 232 of file pippenger_constantine.hpp.

Variable Documentation

◆ BATCH_CAPACITY

constexpr size_t bb::scalar_multiplication::round_parallel_detail::BATCH_CAPACITY = 256
inlineconstexpr

Definition at line 145 of file pippenger_arena_layout.hpp.

◆ BATCH_MEM_BUDGET

constexpr size_t bb::scalar_multiplication::round_parallel_detail::BATCH_MEM_BUDGET = 32ULL * 1024ULL * 1024ULL
inlineconstexpr

Definition at line 154 of file pippenger_arena_layout.hpp.

◆ DEDUP_INVALID_EXTRA

constexpr uint32_t bb::scalar_multiplication::round_parallel_detail::DEDUP_INVALID_EXTRA = ~uint32_t{ 0 }
inlineconstexpr

Definition at line 53 of file pippenger_dedup.hpp.

◆ DEDUP_MAX_CHUNK_MEMBERS

constexpr size_t bb::scalar_multiplication::round_parallel_detail::DEDUP_MAX_CHUNK_MEMBERS = 2048
inlineconstexpr

Definition at line 149 of file pippenger_arena_layout.hpp.

◆ DEDUP_MAX_CLUSTERS

constexpr size_t bb::scalar_multiplication::round_parallel_detail::DEDUP_MAX_CLUSTERS = 16384
inlineconstexpr

Definition at line 53 of file pippenger_arena_layout.hpp.

◆ DEDUP_MAX_MEMBERS

constexpr size_t bb::scalar_multiplication::round_parallel_detail::DEDUP_MAX_MEMBERS = 32768
inlineconstexpr

Definition at line 54 of file pippenger_arena_layout.hpp.

◆ DEDUP_REDIRECT_BIT

constexpr uint32_t bb::scalar_multiplication::round_parallel_detail::DEDUP_REDIRECT_BIT = uint32_t{ 1 } << 30
inlineconstexpr

Definition at line 45 of file pippenger_dedup.hpp.

◆ DEDUP_SKIP_BIT

constexpr uint32_t bb::scalar_multiplication::round_parallel_detail::DEDUP_SKIP_BIT = uint32_t{ 1 } << 29
inlineconstexpr

Definition at line 46 of file pippenger_dedup.hpp.

◆ GLV_SMALL_N_THRESHOLD

constexpr size_t bb::scalar_multiplication::round_parallel_detail::GLV_SMALL_N_THRESHOLD = size_t{ 1 } << 13
inlineconstexpr

Definition at line 191 of file scalar_multiplication_fast.hpp.

◆ MIN_AFFINE_THREAD_RATIO

constexpr size_t bb::scalar_multiplication::round_parallel_detail::MIN_AFFINE_THREAD_RATIO = 2
inlineconstexpr

Definition at line 152 of file pippenger_arena_layout.hpp.

◆ MIN_BATCH_CAPACITY

constexpr size_t bb::scalar_multiplication::round_parallel_detail::MIN_BATCH_CAPACITY = 32
inlineconstexpr

Definition at line 151 of file pippenger_arena_layout.hpp.

◆ SCHEDULE_INDEX_MASK

constexpr uint32_t bb::scalar_multiplication::round_parallel_detail::SCHEDULE_INDEX_MASK = DEDUP_SKIP_BIT - 1
inlineconstexpr

Definition at line 47 of file pippenger_dedup.hpp.

◆ SCHEDULE_SIGN_BIT

constexpr uint32_t bb::scalar_multiplication::round_parallel_detail::SCHEDULE_SIGN_BIT = uint32_t{ 1 } << 31
inlineconstexpr

Definition at line 44 of file pippenger_dedup.hpp.

◆ SUBCHUNK_ENTRIES_CAP

constexpr size_t bb::scalar_multiplication::round_parallel_detail::SUBCHUNK_ENTRIES_CAP = 2048
inlineconstexpr

Definition at line 153 of file pippenger_arena_layout.hpp.

◆ VAR_WINDOW_MAX_WINDOWS

constexpr size_t bb::scalar_multiplication::round_parallel_detail::VAR_WINDOW_MAX_WINDOWS = 128
inlineconstexpr

Definition at line 49 of file pippenger_arena_layout.hpp.