Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
pippenger_constantine.hpp
Go to the documentation of this file.
1// Constantine-style signed-Booth window recoder for Pippenger MSM.
2//
3// Given a scalar s = sum_i s_i 2^i and a window [b, b + c), this module computes a
4// signed digit d in [-(2^c - 1), 2^c - 1] such that the scalar can be reconstructed as
5// s = sum_w d_w 2^{b_w}. It returns d as a packed `(sign | bucket)` value, where
6// `bucket = |d|` and `sign` records whether d is negative.
7//
8// Implements the carry-less `signedWindowEncoding` / `getSignedFullWindowAt` pattern from
9// `constantine/math/arithmetic/bigints.nim`: each window reads c+1 bits including the
10// previous window boundary bit, lets that shared boundary bit substitute for an explicit
11// carry, and produces a `(sign | bucket)` packed digit.
12//
13// Assumptions: production callers pass `window_bits` in [1, 19] and bit offsets within a
14// 256-bit scalar. The bit-twiddling below assumes `window_bits < 32`.
15//
16// Two parallel paths:
17// * scalar path — `ConstantineSliceParams` + `get_constantine_packed_digit` (uint64-
18// indexed limbs).
19// * SIMD x4 path — `ConstantineSliceParamsU32` + `store_constantine_packed_digits_x4_*`
20// (uint32-indexed limbs, processes 4 scalars per call via GCC vector_size).
21//
22// The SIMD helpers split on slice-path (Localised / Bottom / Boundary) so the per-window
23// branch is hoisted out of the per-scalar loop. `classify_slice_path_u32` returns the
24// matching enum for callers to dispatch on once per window.
25
26#pragma once
27
29
30#include <cstddef>
31#include <cstdint>
32
33#ifdef __wasm_simd128__
34#include <wasm_simd128.h>
35#endif
36
38
39// Bring the shared signed-Booth slice primitive (from ecc/groups/booth_recode.hpp) into
40// this namespace so the MSM-specific readers below can call it unqualified. The same
41// primitive is also used by the GLV-endo straus path in element_impl.hpp; only the
42// MSM-specific u32-indexed variant and perf-tuned packed-digit readers stay local.
45
46// Backward-compat aliases for the MSM-local names; the canonical definitions live in
47// ecc/groups/booth_recode.hpp and are pulled in by the using-declarations above.
49[[nodiscard]] [[gnu::always_inline]] inline ConstantineSliceParams compute_constantine_slice_params(
50 size_t bit_offset, size_t window_bits, size_t num_uint64_limbs) noexcept
51{
52 return compute_booth_slice_params(bit_offset, window_bits, num_uint64_limbs);
53}
54
66[[nodiscard]] [[gnu::always_inline]] inline uint32_t get_constantine_packed_digit(const uint64_t* scalar_data,
67 uint32_t lo_limb,
68 uint32_t hi_limb,
69 uint32_t lo_off,
70 uint32_t lo_bits,
71 uint32_t lo_mask,
72 uint32_t hi_mask,
73 bool slice_localised_to_one_u64,
74 size_t window_bits) noexcept
75{
76 uint64_t raw_wide = 0;
77 if (slice_localised_to_one_u64) {
78 // Fast path: one load + shift + mask. hi_part vanishes (hi_mask == 0); skip it.
79 raw_wide = (scalar_data[lo_limb] >> lo_off) & lo_mask;
80 } else if (lo_mask == 0) {
81 // Bottom-window fast path: synthetic-zero lookback bit, so the lo_part contribution is
82 // always 0 (lo_mask == 0). Skip the lo limb load entirely. lo_bits == 1 here, so the
83 // shift plants the window_bits-bit slice at bits 1..window_bits with bit 0 = 0.
84 // sp_lo_mask is loop-invariant within a window but is a runtime stack value, so the
85 // compiler does NOT constant-fold the `(s_lo >> lo_off) & 0 = 0` path inside the
86 // boundary branch; this explicit check saves ~3 ALU ops per scalar on the bottom window.
87 raw_wide = (scalar_data[hi_limb] & hi_mask) << lo_bits;
88 } else {
89 // Slow path: window straddles a uint64 boundary.
90 const uint64_t s_lo = scalar_data[lo_limb];
91 const uint64_t s_hi = scalar_data[hi_limb];
92 const uint64_t lo_part = (s_lo >> lo_off) & lo_mask;
93 const uint64_t hi_part = (s_hi & hi_mask) << lo_bits;
94 raw_wide = lo_part | hi_part;
95 }
96 // raw fits in window_bits+1 ≤ 32 bits, safe to narrow.
97 const uint32_t raw = static_cast<uint32_t>(raw_wide);
98
99 // signedWindowEncoding(raw, window_bits). raw fits in window_bits+1 bits; bit
100 // `window_bits` is the sign indicator.
101 //
102 // The conditional-negate trick `((encode + neg_mask) ^ neg_mask)` is the standard
103 // branchless idiom. We use the equivalent `(encode - neg) ^ neg_mask` to break the
104 // latency chain: `encode - neg` and `neg_mask = -neg` can issue in parallel (both
105 // depend only on `neg` / `encode`), whereas `encode + neg_mask` first waits for
106 // `neg_mask` to materialise. Saves one cycle on the inner-loop critical path
107 // (neg → neg_mask → +neg_mask → ^neg_mask → &val_mask vs neg → {neg_mask, enc_neg}
108 // in parallel → ^neg_mask → &val_mask). Identical result by:
109 // neg=0: enc_neg = encode, xored = encode ^ 0 = encode. ✓
110 // neg=1: enc_neg = encode−1, xored = (encode−1) ^ −1 = ~(encode−1) = −encode. ✓
111 const uint32_t neg = (raw >> window_bits) & uint32_t{ 1 };
112 const uint32_t neg_mask = uint32_t{ 0 } - neg; // 0 or 0xFFFFFFFF
113 const uint32_t val_mask = (uint32_t{ 1 } << window_bits) - 1;
114 const uint32_t encode = (raw + 1) >> 1;
115 const uint32_t bucket_idx = ((encode - neg) ^ neg_mask) & val_mask;
116
117 // Pack into (sign | bucket): sign in bit 31, bucket magnitude in the low bits.
118 return (neg << 31) | bucket_idx;
119}
120
121// 128-bit SIMD-friendly 4-wide variant of get_constantine_packed_digit. Computes 4 packed
122// digits in parallel via GCC's vector_size extension, which lowers to native SIMD on x86
123// (SSE2), ARM (NEON), and WASM (wasm-simd128). The branch on slice path is hoisted from
124// the per-call site to the per-window outer loop, so callers select the localised / bottom /
125// boundary specialisation once per window.
126//
127// We index the scalar via a `const uint32_t*` view rather than the natural `uint64_t*`:
128// each lane is one uint32, so a 128-bit SIMD register holds 4 (raw, encode, bucket, …)
129// values. `scalar.data` is a `std::array<uint64_t, 4>` whose byte layout is identical to
130// `uint32_t[8]` on every target we ship to (x86 / ARM / WASM are all little-endian, and the
131// codebase already assumes this layout in many places — `from_montgomery`, `uint256_t`,
132// etc.). The reinterpret_cast is the same alias pattern.
133//
134// Returns the four packed digits in `out[0..3]`. The caller scatters them individually,
135// since the consuming writes are not vectorisable. Switching from 2-wide uint64 to 4-wide
136// uint32 doubles the compute throughput per SIMD instruction at the cost of slightly more
137// straddle hits.
138using SimdU32x4 = uint32_t __attribute__((vector_size(16)));
139
140// Helpers return `SimdU32x4` directly so the v128 stays in the SIMD register file end-to-end.
141// Wrapping in a 4-uint32 struct round-tripped the v128 through 4 scalar memory slots.
142
143// uint32-indexed Constantine slice params, mirroring `ConstantineSliceParams` but with
144// limb indices measured in 32-bit (rather than 64-bit) chunks. Computed once per window in
145// `compute_constantine_slice_params_u32`; consumed by the SIMD x4 helpers below.
147 uint32_t lo_mask;
148 uint32_t hi_mask;
149 uint32_t lo_limb; // u32 limb index of the lookback bit
150 uint32_t hi_limb; // == lo_limb + 1, clamped to last in-range u32 limb at the top window
151 uint32_t lo_off; // bit-offset of the lookback bit within `lo_limb`
152 uint32_t lo_bits; // # bits read from `lo_limb` (also acts as the hi_part left-shift amount)
155};
156
158 size_t window_bits,
159 size_t num_u32_limbs) noexcept
160{
161 constexpr size_t LIMB_BITS_U32 = 32;
163 if (bit_offset == 0) {
164 sp.lo_limb = 0;
165 sp.hi_limb = 0;
166 sp.lo_off = LIMB_BITS_U32 - 1;
167 sp.lo_bits = 1;
168 sp.lo_mask = 0;
169 sp.hi_mask = (uint32_t{ 1 } << window_bits) - 1;
171 sp.is_bottom_window = true;
172 } else {
173 const size_t lookback_bit = bit_offset - 1;
174 const size_t bits_to_read = window_bits + 1;
175 sp.lo_limb = static_cast<uint32_t>(lookback_bit / LIMB_BITS_U32);
176 sp.lo_off = static_cast<uint32_t>(lookback_bit & (LIMB_BITS_U32 - 1));
177 const uint32_t in_lo = static_cast<uint32_t>(LIMB_BITS_U32 - sp.lo_off);
178 sp.lo_bits = (in_lo < static_cast<uint32_t>(bits_to_read)) ? in_lo : static_cast<uint32_t>(bits_to_read);
179 const uint32_t hi_bits = static_cast<uint32_t>(bits_to_read) - sp.lo_bits;
180 sp.lo_mask = (sp.lo_bits == LIMB_BITS_U32) ? ~uint32_t{ 0 } : ((uint32_t{ 1 } << sp.lo_bits) - 1);
181 if (static_cast<size_t>(sp.lo_limb) + 1 >= num_u32_limbs) {
182 sp.hi_limb = sp.lo_limb;
183 sp.hi_mask = 0;
184 } else {
185 sp.hi_limb = sp.lo_limb + 1;
186 sp.hi_mask = (uint32_t{ 1 } << hi_bits) - 1;
187 }
188 sp.slice_localised_to_one_u32 = (hi_bits == 0);
189 sp.is_bottom_window = false;
190 }
191 return sp;
192}
193
194// Gather 4 disjoint uint32 values into one v128 via wasm v128.load32_lane. On WASM this
195// is 1 splat + 3 load32_lane (4 ops); brace-init `{a, b, c, d}` with runtime values emits
196// 4 scalar i32.load + 1 splat + 3 replace_lane (8 ops). On native it falls back to brace-
197// init which clang lowers to NEON ins / SSE2 pinsrd.
198[[nodiscard]] [[gnu::always_inline]] inline SimdU32x4 gather_x4_u32(
199 const uint32_t* p0, const uint32_t* p1, const uint32_t* p2, const uint32_t* p3, uint32_t idx) noexcept
200{
201#ifdef __wasm_simd128__
202 v128_t v = wasm_i32x4_splat(0);
203 v = wasm_v128_load32_lane(p0 + idx, v, 0);
204 v = wasm_v128_load32_lane(p1 + idx, v, 1);
205 v = wasm_v128_load32_lane(p2 + idx, v, 2);
206 v = wasm_v128_load32_lane(p3 + idx, v, 3);
207 return reinterpret_cast<SimdU32x4>(v);
208#else
209 return SimdU32x4{ p0[idx], p1[idx], p2[idx], p3[idx] };
210#endif
211}
212
213// Store a `SimdU32x4` to a 4-lane uint32 destination as a single 128-bit op.
214// Precondition: `dst` is 16-byte aligned.
215// On WASM the explicit intrinsic guarantees a `v128.store`; on native the typed
216// vector store lets the compiler use aligned SIMD stores (e.g. x86 movaps/movdqa).
217[[gnu::always_inline]] inline void simd_u32x4_store(uint32_t* dst, SimdU32x4 v) noexcept
218{
219#ifdef __wasm_simd128__
220 wasm_v128_store(dst, reinterpret_cast<v128_t>(v));
221#else
222 *reinterpret_cast<SimdU32x4*>(dst) = v;
223#endif
224}
225
226// All four mask / constant v128s (lo_mask_v, hi_mask_v, one_v, val_mask) are loop-invariant
227// within a window. Callers build them ONCE per window in the outer-w loop and pass them in,
228// so the inner-i compute loop has zero v128.const / splat / shl+sub for the masks.
229// `neg_mask = -neg` uses GCC vector-ext unary minus which lowers to `i32x4.neg` on WASM.
230//
231// Helpers write the v128 result directly into the caller-provided 4-lane destination buffer.
232[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_localised(uint32_t* dst,
233 const uint32_t* scalar_data_0,
234 const uint32_t* scalar_data_1,
235 const uint32_t* scalar_data_2,
236 const uint32_t* scalar_data_3,
237 uint32_t lo_limb,
238 uint32_t lo_off,
239 SimdU32x4 lo_mask_v,
240 SimdU32x4 one_v,
241 SimdU32x4 val_mask,
242 uint32_t window_bits) noexcept
243{
244 const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb);
245 const SimdU32x4 raw = (lo >> lo_off) & lo_mask_v;
246 const SimdU32x4 neg = (raw >> window_bits) & one_v;
247 const SimdU32x4 neg_mask = -neg;
248 const SimdU32x4 encode = (raw + one_v) >> 1;
249 const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
250 const SimdU32x4 packed = (neg << 31) | bucket;
251 simd_u32x4_store(dst, packed);
252}
253
254[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_bottom(uint32_t* dst,
255 const uint32_t* scalar_data_0,
256 const uint32_t* scalar_data_1,
257 const uint32_t* scalar_data_2,
258 const uint32_t* scalar_data_3,
259 uint32_t hi_limb,
260 uint32_t lo_bits,
261 SimdU32x4 hi_mask_v,
262 SimdU32x4 one_v,
263 SimdU32x4 val_mask,
264 uint32_t window_bits) noexcept
265{
266 const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb);
267 const SimdU32x4 raw = (hi & hi_mask_v) << lo_bits;
268 const SimdU32x4 neg = (raw >> window_bits) & one_v;
269 const SimdU32x4 neg_mask = -neg;
270 const SimdU32x4 encode = (raw + one_v) >> 1;
271 const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
272 const SimdU32x4 packed = (neg << 31) | bucket;
273 simd_u32x4_store(dst, packed);
274}
275
276[[gnu::always_inline]] inline void store_constantine_packed_digits_x4_boundary(uint32_t* dst,
277 const uint32_t* scalar_data_0,
278 const uint32_t* scalar_data_1,
279 const uint32_t* scalar_data_2,
280 const uint32_t* scalar_data_3,
281 uint32_t lo_limb,
282 uint32_t hi_limb,
283 uint32_t lo_off,
284 uint32_t lo_bits,
285 SimdU32x4 lo_mask_v,
286 SimdU32x4 hi_mask_v,
287 SimdU32x4 one_v,
288 SimdU32x4 val_mask,
289 uint32_t window_bits) noexcept
290{
291 const SimdU32x4 lo = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, lo_limb);
292 const SimdU32x4 hi = gather_x4_u32(scalar_data_0, scalar_data_1, scalar_data_2, scalar_data_3, hi_limb);
293 const SimdU32x4 lo_part = (lo >> lo_off) & lo_mask_v;
294 const SimdU32x4 hi_part = (hi & hi_mask_v) << lo_bits;
295 const SimdU32x4 raw = lo_part | hi_part;
296 const SimdU32x4 neg = (raw >> window_bits) & one_v;
297 const SimdU32x4 neg_mask = -neg;
298 const SimdU32x4 encode = (raw + one_v) >> 1;
299 const SimdU32x4 bucket = ((encode - neg) ^ neg_mask) & val_mask;
300 const SimdU32x4 packed = (neg << 31) | bucket;
301 simd_u32x4_store(dst, packed);
302}
303
304// Path-selector enum used to dispatch on the SIMD specialisation once per window rather
305// than once per scalar.
306enum class ConstantineSlicePath : uint8_t {
307 Localised = 0,
308 Bottom = 1,
309 Boundary = 2,
310};
311
312[[nodiscard]] [[gnu::always_inline]] inline ConstantineSlicePath classify_slice_path_u32(
313 const ConstantineSliceParamsU32& sp) noexcept
314{
315 if (sp.is_bottom_window) {
317 }
318 if (sp.slice_localised_to_one_u32) {
320 }
322}
323
324} // namespace bb::scalar_multiplication::round_parallel_detail
__attribute__((section("__libfuzzer_extra_counters"))) uint8_t num_events
BoothSliceParams compute_booth_slice_params(size_t bit_offset, size_t window_bits, size_t num_uint64_limbs) noexcept
uint32_t get_constantine_packed_digit(const uint64_t *scalar_data, uint32_t lo_limb, uint32_t hi_limb, uint32_t lo_off, uint32_t lo_bits, uint32_t lo_mask, uint32_t hi_mask, bool slice_localised_to_one_u64, size_t window_bits) noexcept
Read (window_bits+1) bits from scalar_data (uint64 limbs) using precomputed slice params and apply Co...
ConstantineSlicePath classify_slice_path_u32(const ConstantineSliceParamsU32 &sp) noexcept
void store_constantine_packed_digits_x4_bottom(uint32_t *dst, const uint32_t *scalar_data_0, const uint32_t *scalar_data_1, const uint32_t *scalar_data_2, const uint32_t *scalar_data_3, uint32_t hi_limb, uint32_t lo_bits, SimdU32x4 hi_mask_v, SimdU32x4 one_v, SimdU32x4 val_mask, uint32_t window_bits) noexcept
void store_constantine_packed_digits_x4_boundary(uint32_t *dst, const uint32_t *scalar_data_0, const uint32_t *scalar_data_1, const uint32_t *scalar_data_2, const uint32_t *scalar_data_3, uint32_t lo_limb, uint32_t hi_limb, uint32_t lo_off, uint32_t lo_bits, SimdU32x4 lo_mask_v, SimdU32x4 hi_mask_v, SimdU32x4 one_v, SimdU32x4 val_mask, uint32_t window_bits) noexcept
SimdU32x4 gather_x4_u32(const uint32_t *p0, const uint32_t *p1, const uint32_t *p2, const uint32_t *p3, uint32_t idx) noexcept
uint32_t __attribute__((vector_size(16))) SimdU32x4
ConstantineSliceParams compute_constantine_slice_params(size_t bit_offset, size_t window_bits, size_t num_uint64_limbs) noexcept
void simd_u32x4_store(uint32_t *dst, SimdU32x4 v) noexcept
void store_constantine_packed_digits_x4_localised(uint32_t *dst, const uint32_t *scalar_data_0, const uint32_t *scalar_data_1, const uint32_t *scalar_data_2, const uint32_t *scalar_data_3, uint32_t lo_limb, uint32_t lo_off, SimdU32x4 lo_mask_v, SimdU32x4 one_v, SimdU32x4 val_mask, uint32_t window_bits) noexcept
ConstantineSliceParamsU32 compute_constantine_slice_params_u32(size_t bit_offset, size_t window_bits, size_t num_u32_limbs) noexcept
Per-window precomputed slice parameters for the carry-less signed-Booth window recoding....