|
| uint32_t | choose_window_bits (size_t num_points, size_t num_bits, size_t n_input, size_t num_logical_threads) noexcept |
| |
| VariableWindowSchedule | build_var_window_schedule (size_t num_bits, size_t window_bits) noexcept |
| |
| size_t | compute_dense_stride (size_t B_eff, size_t num_threads) noexcept |
| |
| size_t | compute_bucket_partials_max (size_t B_eff, size_t num_threads) noexcept |
| |
| size_t | compute_global_max_overflow_per_window (size_t n, size_t num_threads, size_t subchunk_entries_cap) noexcept |
| |
| template<typename Curve > |
| size_t | compute_per_window_bytes (size_t num_threads, size_t B_eff, size_t n, size_t dense_stride, size_t worker_total) noexcept |
| |
| size_t | compute_phase_one_prologue_bytes (size_t n, bool use_glv, bool inline_glv_double, size_t profile_threads) noexcept |
| |
| PhaseACaps | compute_phase_a_caps (size_t n, size_t num_threads) noexcept |
| |
| size_t | solve_wpb (size_t per_window_bytes, size_t available_budget, size_t W_R) noexcept |
| |
| ConstantineSliceParams | compute_constantine_slice_params (size_t bit_offset, size_t window_bits, size_t num_uint64_limbs) noexcept |
| |
| uint32_t | get_constantine_packed_digit (const uint64_t *scalar_data, uint32_t lo_limb, uint32_t hi_limb, uint32_t lo_off, uint32_t lo_bits, uint32_t lo_mask, uint32_t hi_mask, bool slice_localised_to_one_u64, size_t window_bits) noexcept |
| | Read (window_bits+1) bits from scalar_data (uint64 limbs) using precomputed slice params and apply Constantine's signedWindowEncoding to produce a (sign | bucket) packed digit.
|
| |
| ConstantineSliceParamsU32 | compute_constantine_slice_params_u32 (size_t bit_offset, size_t window_bits, size_t num_u32_limbs) noexcept |
| |
| SimdU32x4 | gather_x4_u32 (const uint32_t *p0, const uint32_t *p1, const uint32_t *p2, const uint32_t *p3, uint32_t idx) noexcept |
| |
| void | simd_u32x4_store (uint32_t *dst, SimdU32x4 v) noexcept |
| |
| void | store_constantine_packed_digits_x4_localised (uint32_t *dst, const uint32_t *scalar_data_0, const uint32_t *scalar_data_1, const uint32_t *scalar_data_2, const uint32_t *scalar_data_3, uint32_t lo_limb, uint32_t lo_off, SimdU32x4 lo_mask_v, SimdU32x4 one_v, SimdU32x4 val_mask, uint32_t window_bits) noexcept |
| |
| void | store_constantine_packed_digits_x4_bottom (uint32_t *dst, const uint32_t *scalar_data_0, const uint32_t *scalar_data_1, const uint32_t *scalar_data_2, const uint32_t *scalar_data_3, uint32_t hi_limb, uint32_t lo_bits, SimdU32x4 hi_mask_v, SimdU32x4 one_v, SimdU32x4 val_mask, uint32_t window_bits) noexcept |
| |
| void | store_constantine_packed_digits_x4_boundary (uint32_t *dst, const uint32_t *scalar_data_0, const uint32_t *scalar_data_1, const uint32_t *scalar_data_2, const uint32_t *scalar_data_3, uint32_t lo_limb, uint32_t hi_limb, uint32_t lo_off, uint32_t lo_bits, SimdU32x4 lo_mask_v, SimdU32x4 hi_mask_v, SimdU32x4 one_v, SimdU32x4 val_mask, uint32_t window_bits) noexcept |
| |
| ConstantineSlicePath | classify_slice_path_u32 (const ConstantineSliceParamsU32 &sp) noexcept |
| |
| uint64_t | dedup_scalar_fingerprint (const uint64_t *scalar_data) noexcept |
| |
| size_t | dedup_fingerprint_slot (uint64_t fingerprint, size_t mask) noexcept |
| |
| template<typename Curve > |
| size_t | dedup_tree_reduce_in_place (typename Curve::AffineElement *pts, uint32_t *ids, size_t initial_len, typename Curve::AffineElement *scratch_pts, uint32_t *pair_dest, typename Curve::BaseField *inversion_scratch) noexcept |
| |
| template<typename Curve > |
| size_t | dedup_phase_a_worker_hash (const uint32_t *schedule_w0, const size_t *w0_bucket_start, size_t b_lo, size_t b_hi, std::span< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, std::span< typename Curve::AffineElement > extra_points, std::span< uint32_t > redirect_lookup, const uint8_t *msb_per_scalar, size_t c_threshold, uint32_t cid_lo, uint32_t cid_max, PhaseAScratch< Curve > &scratch) noexcept |
| |
| template<typename Curve > |
| void | dedup_patch_schedule_window (uint32_t *__restrict sched_w, size_t *__restrict bucket_start, size_t num_buckets, const uint32_t *__restrict redirect_lookup) noexcept |
| |
| template<typename Curve > |
| Curve::Element | pippenger_round_parallel_jacobian_fast (std::span< const typename Curve::ScalarField > scalars, std::span< const typename Curve::AffineElement > points, size_t min_pts_per_thread_override) noexcept |
| | Small-N fast-path: per-thread Jacobian Pippenger over a partition of the input.
|
| |
| template curve::BN254::Element | pippenger_round_parallel_jacobian_fast< curve::BN254 > (std::span< const curve::BN254::ScalarField > scalars, std::span< const curve::BN254::AffineElement > points, size_t min_pts_per_thread_override) noexcept |
| |
| template curve::Grumpkin::Element | pippenger_round_parallel_jacobian_fast< curve::Grumpkin > (std::span< const curve::Grumpkin::ScalarField > scalars, std::span< const curve::Grumpkin::AffineElement > points, size_t min_pts_per_thread_override) noexcept |
| |
template<typename Curve >
Small-N fast-path: per-thread Jacobian Pippenger over a partition of the input.
Single-MSM_fast, no-affine-trick Pippenger over window_bits-wide windows.
Bypasses the round-parallel scaffolding (biased recoding, count histogram, prefix sum, scatter, partition, recursive affine bucket reduction) entirely. Caller must have already converted scalars from Montgomery form to standard form. Each thread runs a textbook Pippenger over its slice of the input, with the result summed across threads at the end.
Per round (high-bit slice → low-bit slice):
- Reset the `present` bitmap.
- For each point in the thread's range, extract the window_bits-wide scalar slice; if non-zero, either ASSIGN the bucket (Z = 1) on first hit or perform `Element += AffineElement` (mixed Jacobian-affine, 7M+4S, no inversion) on subsequent hits.
- Running suffix sum over populated buckets only.
- Double the running result by `window_bits` bits (or the remainder for the last round) and add the bucket sum.
No batched-affine path, no modular inversions, no count-sort.
min_pts_per_thread_override lets benchmarks pin behaviour:
- 0 (default) → use the internal `MIN_PTS_PER_THREAD` heuristic (256 native, single-threaded on WASM).
- SIZE_MAX → force single-threaded.
- 1 → maximally multi-threaded (one worker per logical CPU).
Definition at line 941 of file scalar_multiplication_fast.cpp.