Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
element_impl.hpp
Go to the documentation of this file.
1// === AUDIT STATUS ===
2// internal: { status: Planned, auditors: [], commit: }
3// external_1: { status: not started, auditors: [], commit: }
4// external_2: { status: not started, auditors: [], commit: }
5// =====================
6
7#pragma once
13#include "element.hpp"
14#include <cstdint>
15
16// NOLINTBEGIN(readability-implicit-bool-conversion, cppcoreguidelines-avoid-c-arrays)
17namespace bb::group_elements {
18template <class Fq, class Fr, class T>
19constexpr element<Fq, Fr, T>::element(const Fq& a, const Fq& b, const Fq& c) noexcept
20 : x(a)
21 , y(b)
22 , z(c)
23{}
24
25template <class Fq, class Fr, class T>
27 : x(other.x)
28 , y(other.y)
29 , z(other.z)
30{}
31
32template <class Fq, class Fr, class T>
34 : x(other.x)
35 , y(other.y)
36 , z(other.z)
37{}
38
39template <class Fq, class Fr, class T>
41 : x(other.x)
42 , y(other.y)
43 , z(Fq::one())
44{}
45
46template <class Fq, class Fr, class T>
48{
49 if (this == &other) {
50 return *this;
51 }
52 x = other.x;
53 y = other.y;
54 z = other.z;
55 return *this;
56}
57
58template <class Fq, class Fr, class T>
60{
61 x = other.x;
62 y = other.y;
63 z = other.z;
64 return *this;
65}
66
67// Warning: variable-time — calls `z.invert()` (Bernstein-Yang safegcd). Do not
68// use on points derived from secret material (signing nonces, private keys, DH
69// shared secrets). For those, call `to_affine_const_time()` explicitly; the
70// implicit conversion does NOT pick up the const-time path.
71template <class Fq, class Fr, class T> constexpr element<Fq, Fr, T>::operator affine_element<Fq, Fr, T>() const noexcept
72{
73 if (is_point_at_infinity()) {
75 result.x = Fq(0);
76 result.y = Fq(0);
77 result.self_set_infinity();
78 return result;
79 }
80 Fq z_inv = z.invert();
81 Fq zz_inv = z_inv.sqr();
82 Fq zzz_inv = zz_inv * z_inv;
83 affine_element<Fq, Fr, T> result(x * zz_inv, y * zzz_inv);
84 return result;
85}
86
87template <class Fq, class Fr, class T>
89{
90 if (is_point_at_infinity()) {
92 result.x = Fq(0);
93 result.y = Fq(0);
94 result.self_set_infinity();
95 return result;
96 }
97 Fq z_inv = z.invert_const_time();
98 Fq zz_inv = z_inv.sqr();
99 Fq zzz_inv = zz_inv * z_inv;
100 affine_element<Fq, Fr, T> result(x * zz_inv, y * zzz_inv);
101 return result;
102}
103
104template <class Fq, class Fr, class T> constexpr void element<Fq, Fr, T>::self_dbl() noexcept
105{
106 if constexpr (Fq::modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
107 if (is_point_at_infinity()) {
108 return;
109 }
110 } else {
111 if (x.is_msb_set_word()) {
112 return;
113 }
114 }
115
116 // T0 = x*x
117 Fq T0 = x.sqr();
118
119 // T1 = y*y
120 Fq T1 = y.sqr();
121
122 // T2 = T1*T1 = y*y*y*y
123 Fq T2 = T1.sqr();
124
125 // T1 = T1 + x = x + y*y
126 T1 += x;
127
128 // T1 = T1 * T1
129 T1.self_sqr();
130
131 // T3 = T0 + T2 = xx + y*y*y*y
132 Fq T3 = T0 + T2;
133
134 // T1 = T1 - T3 = x*x + y*y*y*y + 2*x*x*y*y*y*y - x*x - y*y*y*y = 2*x*x*y*y*y*y = 2*S
135 T1 -= T3;
136
137 // T1 = 2T1 = 4*S
138 T1 += T1;
139
140 // T3 = 3T0
141 T3 = T0 + T0;
142 T3 += T0;
143 if constexpr (T::has_a) {
144 T3 += (T::a * z.sqr().sqr());
145 }
146
147 // z2 = 2*y*z
148 z += z;
149 z *= y;
150
151 // T0 = 2T1
152 T0 = T1 + T1;
153
154 // x2 = T3*T3
155 x = T3.sqr();
156
157 // x2 = x2 - 2T1
158 x -= T0;
159
160 // T2 = 8T2
161 T2 += T2;
162 T2 += T2;
163 T2 += T2;
164
165 // y2 = T1 - x2
166 y = T1 - x;
167
168 // y2 = y2 * T3 - T2
169 y *= T3;
170 y -= T2;
171}
172
173template <class Fq, class Fr, class T> constexpr element<Fq, Fr, T> element<Fq, Fr, T>::dbl() const noexcept
174{
175 element result(*this);
176 result.self_dbl();
177 return result;
178}
179
180template <class Fq, class Fr, class T>
182{
183 if constexpr (Fq::modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
184 // If either point is infinity, return the other point
185 if (other.is_point_at_infinity()) {
186 return *this;
187 }
188 if (is_point_at_infinity()) {
189 *this = { other.x, other.y, Fq::one() };
190 return *this;
191 }
192 } else {
193 const bool edge_case_trigger = x.is_msb_set() || other.x.is_msb_set();
194 if (edge_case_trigger) {
195 if (x.is_msb_set()) {
196 *this = { other.x, other.y, Fq::one() };
197 }
198 return *this;
199 }
200 }
201
202 // T0 = z1.z1
203 Fq T0 = z.sqr();
204
205 // T1 = x2.t0 - x1 = x2.z1.z1 - x1
206 Fq T1 = other.x * T0;
207 T1 -= x;
208
209 // T2 = T0.z1 = z1.z1.z1
210 // T2 = T2.y2 - y1 = y2.z1.z1.z1 - y1
211 Fq T2 = z * T0;
212 T2 *= other.y;
213 T2 -= y;
214
215 if (__builtin_expect(T1.is_zero(), 0)) {
216 if (T2.is_zero()) {
217 self_dbl();
218 return *this;
219 }
220 self_set_infinity();
221 return *this;
222 }
223
224 // T2 = 2T2 = 2(y2.z1.z1.z1 - y1) = R
225 // z3 = z1 + H
226 T2 += T2;
227 z += T1;
228
229 // T3 = T1*T1 = HH
230 Fq T3 = T1.sqr();
231
232 // z3 = z3 - z1z1 - HH
233 T0 += T3;
234
235 // z3 = (z1 + H)*(z1 + H)
236 z.self_sqr();
237 z -= T0;
238
239 // T3 = 4HH
240 T3 += T3;
241 T3 += T3;
242
243 // T1 = T1*T3 = 4HHH
244 T1 *= T3;
245
246 // T3 = T3 * x1 = 4HH*x1
247 T3 *= x;
248
249 // T0 = 2T3
250 T0 = T3 + T3;
251
252 // T0 = T0 + T1 = 2(4HH*x1) + 4HHH
253 T0 += T1;
254 x = T2.sqr();
255
256 // x3 = x3 - T0 = R*R - 8HH*x1 -4HHH
257 x -= T0;
258
259 // T3 = T3 - x3 = 4HH*x1 - x3
260 T3 -= x;
261
262 T1 *= y;
263 T1 += T1;
264
265 // T3 = T2 * T3 = R*(4HH*x1 - x3)
266 T3 *= T2;
267
268 // y3 = T3 - T1
269 y = T3 - T1;
270 return *this;
271}
272
273template <class Fq, class Fr, class T>
274constexpr element<Fq, Fr, T> element<Fq, Fr, T>::operator+(const affine_element<Fq, Fr, T>& other) const noexcept
275{
276 element result(*this);
277 return (result += other);
278}
279
280template <class Fq, class Fr, class T>
281constexpr element<Fq, Fr, T> element<Fq, Fr, T>::operator-=(const affine_element<Fq, Fr, T>& other) noexcept
282{
283 const affine_element<Fq, Fr, T> to_add{ other.x, -other.y };
284 return operator+=(to_add);
285}
286
287template <class Fq, class Fr, class T>
288constexpr element<Fq, Fr, T> element<Fq, Fr, T>::operator-(const affine_element<Fq, Fr, T>& other) const noexcept
289{
290 element result(*this);
291 return (result -= other);
292}
293
294template <class Fq, class Fr, class T>
296{
297 if constexpr (Fq::modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
298 bool p1_zero = is_point_at_infinity();
299 bool p2_zero = other.is_point_at_infinity();
300 if (__builtin_expect((p1_zero || p2_zero), 0)) {
301 if (p1_zero && !p2_zero) {
302 *this = other;
303 return *this;
304 }
305 if (p2_zero && !p1_zero) {
306 return *this;
307 }
308 self_set_infinity();
309 return *this;
310 }
311 } else {
312 bool p1_zero = x.is_msb_set();
313 bool p2_zero = other.x.is_msb_set();
314 if (__builtin_expect((p1_zero || p2_zero), 0)) {
315 if (p1_zero && !p2_zero) {
316 *this = other;
317 return *this;
318 }
319 if (p2_zero && !p1_zero) {
320 return *this;
321 }
322 self_set_infinity();
323 return *this;
324 }
325 }
326 Fq Z1Z1(z.sqr());
327 Fq Z2Z2(other.z.sqr());
328 Fq S2(Z1Z1 * z);
329 Fq U2(Z1Z1 * other.x);
330 S2 *= other.y;
331 Fq U1(Z2Z2 * x);
332 Fq S1(Z2Z2 * other.z);
333 S1 *= y;
334
335 Fq F(S2 - S1);
336
337 Fq H(U2 - U1);
338
339 if (__builtin_expect(H.is_zero(), 0)) {
340 if (F.is_zero()) {
341 self_dbl();
342 return *this;
343 }
344 self_set_infinity();
345 return *this;
346 }
347
348 F += F;
349
350 Fq I(H + H);
351 I.self_sqr();
352
353 Fq J(H * I);
354
355 U1 *= I;
356
357 U2 = U1 + U1;
358 U2 += J;
359
360 x = F.sqr();
361
362 x -= U2;
363
364 J *= S1;
365 J += J;
366
367 y = U1 - x;
368
369 y *= F;
370
371 y -= J;
372
373 z += other.z;
374
375 Z1Z1 += Z2Z2;
376
377 z.self_sqr();
378 z -= Z1Z1;
379 z *= H;
380 return *this;
381}
382
383template <class Fq, class Fr, class T>
384constexpr element<Fq, Fr, T> element<Fq, Fr, T>::operator+(const element& other) const noexcept
385{
386 element result(*this);
387 return (result += other);
388}
389
390template <class Fq, class Fr, class T>
392{
393 const element to_add{ other.x, -other.y, other.z };
394 return operator+=(to_add);
395}
396
397template <class Fq, class Fr, class T>
398constexpr element<Fq, Fr, T> element<Fq, Fr, T>::operator-(const element& other) const noexcept
399{
400 element result(*this);
401 return (result -= other);
402}
403
404template <class Fq, class Fr, class T> constexpr element<Fq, Fr, T> element<Fq, Fr, T>::operator-() const noexcept
405{
406 return { x, -y, z };
407}
408
409template <class Fq, class Fr, class T>
411{
412 if constexpr (T::USE_ENDOMORPHISM) {
413 return mul_with_endomorphism(exponent);
414 }
415 return mul_without_endomorphism(exponent);
416}
417
418template <class Fq, class Fr, class T> element<Fq, Fr, T> element<Fq, Fr, T>::operator*=(const Fr& exponent) noexcept
419{
420 *this = operator*(exponent);
421 return *this;
422}
423
424template <class Fq, class Fr, class T>
426{
427 if (engine == nullptr) {
429 }
430
431 // Convert the scalar to canonical u256 form
432 const uint256_t k = uint256_t(scalar);
433
434 // Coron's first DPA countermeasure (J.-S. Coron, "Resistance against Differential Power Analysis
435 // for Elliptic Curve Cryptosystems", CHES 1999, LNCS 1717, pp. 292-302, Section 5.1): blind the
436 // scalar with k' = k + r * n where r is a fresh random 64-bit value sampled per call. Since
437 // n * P = O for any P in the prime-order subgroup, k' * P = k * P. The randomization defeats
438 // DPA: per-bit traces of two signings with the same k decorrelate because the bit pattern of k'
439 // differs across calls.
440 //
441 // We force the high bit of r to be 1 so that r is sampled uniformly from [2^63, 2^64). This
442 // guarantees r * n has a fixed-width range (MSB at position M+63 or M+64 for n with MSB at M),
443 // so the iteration count remains exactly NUM_BITS regardless of the sampled r.
444 const uint64_t r = engine->get_random_uint64() | (UINT64_C(1) << 63);
446 const uint512_t k_blinded = uint512_t(k) + r_times_n;
447
448 // For n with MSB at position M, r * n < 2^(M + 65), so k_blinded < 2^(M + 65) + n < 2^(M + 66).
449 // Iterating M+65 bits is safe because k < n means the additional bit from k cannot push k_blinded
450 // past 2^(M + 65) when n is at the lower end of [2^M, 2^(M+1)); we add one extra bit (M + 66
451 // total) to cover the worst case where n is close to 2^(M+1).
452 constexpr size_t NUM_BITS = static_cast<size_t>(uint256_t(Fr::modulus).get_msb()) + 66;
453
454 // Constant-time conditional swap of two Fq coordinates. `mask` is 0 (no swap) or all-ones (swap),
455 // derived from the secret bit via integer subtraction so no branch is emitted.
456 auto cs_fq = [](Fq& a, Fq& b, uint64_t mask) {
457 constexpr size_t NUM_LIMBS = sizeof(Fq) / sizeof(uint64_t);
458 for (size_t i = 0; i < NUM_LIMBS; ++i) {
459 uint64_t t = mask & (a.data[i] ^ b.data[i]);
460 a.data[i] ^= t;
461 b.data[i] ^= t;
462 }
463 };
464 auto cswap = [&cs_fq](element& a, element& b, uint64_t mask) {
465 cs_fq(a.x, b.x, mask);
466 cs_fq(a.y, b.y, mask);
467 cs_fq(a.z, b.z, mask);
468 };
469
470 // Montgomery ladder. Invariant after each iteration: R1 - R0 = P.
471 // Once R0 first becomes non-infinity (after the first 1-bit of k_blinded is processed), the
472 // invariant guarantees R0 + R1 and 2 * R0 do not hit the doubling/infinity special-case branches.
474 element R1(*this);
475
476 for (size_t i = NUM_BITS; i-- > 0;) {
477 const uint64_t mask = 0ULL - static_cast<uint64_t>(k_blinded.get_bit(i));
478 cswap(R0, R1, mask);
479 R1 = R0 + R1;
480 R0 = R0.dbl();
481 cswap(R0, R1, mask);
482 }
483 return R0;
484}
485
486// Warning: variable-time via the implicit affine conversion above. For
487// secret-input points use `normalize_const_time()`.
488template <class Fq, class Fr, class T> constexpr element<Fq, Fr, T> element<Fq, Fr, T>::normalize() const noexcept
489{
490 const affine_element<Fq, Fr, T> converted = *this;
491 return element(converted);
492}
493
494template <class Fq, class Fr, class T>
496{
497 return element(to_affine_const_time());
498}
499
500template <class Fq, class Fr, class T> element<Fq, Fr, T> element<Fq, Fr, T>::infinity()
501{
504 return e;
505}
506
507template <class Fq, class Fr, class T> constexpr element<Fq, Fr, T> element<Fq, Fr, T>::set_infinity() const noexcept
508{
509 element result(*this);
510 result.self_set_infinity();
511 return result;
512}
513
514template <class Fq, class Fr, class T> constexpr void element<Fq, Fr, T>::self_set_infinity() noexcept
515{
516 if constexpr (Fq::modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
517 // We set the value of x equal to modulus to represent inifinty
518 x.data[0] = Fq::modulus.data[0];
519 x.data[1] = Fq::modulus.data[1];
520 x.data[2] = Fq::modulus.data[2];
521 x.data[3] = Fq::modulus.data[3];
522
523 // Clear y and z so the infinity representation is canonical regardless of prior state
524 y = Fq::zero();
525 z = Fq::zero();
526 } else {
527 (*this).x = Fq::zero();
528 (*this).y = Fq::zero();
529 (*this).z = Fq::zero();
530 x.self_set_msb();
531 }
532}
533
534template <class Fq, class Fr, class T> constexpr bool element<Fq, Fr, T>::is_point_at_infinity() const noexcept
535{
536 if constexpr (Fq::modulus.data[3] >= MODULUS_TOP_LIMB_LARGE_THRESHOLD) {
537 // We check if the value of x is equal to modulus to represent inifinty
538 return ((x.data[0] ^ Fq::modulus.data[0]) | (x.data[1] ^ Fq::modulus.data[1]) |
539 (x.data[2] ^ Fq::modulus.data[2]) | (x.data[3] ^ Fq::modulus.data[3])) == 0;
540 } else {
541 return (x.is_msb_set());
542 }
543}
544
545template <class Fq, class Fr, class T> constexpr bool element<Fq, Fr, T>::on_curve() const noexcept
546{
547 if (is_point_at_infinity()) {
548 return true;
549 }
550 // We specify the point at inifinity not by (0 \lambda 0), so z should not be 0
551 if (z.is_zero()) {
552 return false;
553 }
554 Fq zz = z.sqr();
555 Fq zzzz = zz.sqr();
556 Fq bz_6 = zzzz * zz * T::b;
557 if constexpr (T::has_a) {
558 bz_6 += (x * T::a) * zzzz;
559 }
560 Fq xxx = x.sqr() * x + bz_6;
561 Fq yy = y.sqr();
562 return (xxx == yy);
563}
564
565template <class Fq, class Fr, class T>
566constexpr bool element<Fq, Fr, T>::operator==(const element& other) const noexcept
567{
568 // If one of points is not on curve, we have no business comparing them.
569 if ((!on_curve()) || (!other.on_curve())) {
570 return false;
571 }
572 bool am_infinity = is_point_at_infinity();
573 bool is_infinity = other.is_point_at_infinity();
574 bool both_infinity = am_infinity && is_infinity;
575 // If just one is infinity, then they are obviously not equal.
576 if ((!both_infinity) && (am_infinity || is_infinity)) {
577 return false;
578 }
579 const Fq lhs_zz = z.sqr();
580 const Fq lhs_zzz = lhs_zz * z;
581 const Fq rhs_zz = other.z.sqr();
582 const Fq rhs_zzz = rhs_zz * other.z;
583
584 const Fq lhs_x = x * rhs_zz;
585 const Fq lhs_y = y * rhs_zzz;
586
587 const Fq rhs_x = other.x * lhs_zz;
588 const Fq rhs_y = other.y * lhs_zzz;
589 return both_infinity || ((lhs_x == rhs_x) && (lhs_y == rhs_y));
590}
591
592template <class Fq, class Fr, class T>
594{
595 if constexpr (T::can_hash_to_curve) {
596 element result = random_coordinates_on_curve(engine);
597 result.z = Fq::random_element(engine);
598 Fq zz = result.z.sqr();
599 Fq zzz = zz * result.z;
600 result.x *= zz;
601 result.y *= zzz;
602 return result;
603 } else {
604 Fr scalar = Fr::random_element(engine);
605 return (element{ T::one_x, T::one_y, Fq::one() } * scalar);
606 }
607}
608
609template <class Fq, class Fr, class T>
611{
612 const uint256_t converted_scalar(scalar);
613
614 if (converted_scalar == 0) {
615 return element::infinity();
616 }
617
618 element accumulator(*this);
619 const uint64_t maximum_set_bit = converted_scalar.get_msb();
620 // NOT constant-time: the loop bound leaks bit-length and the per-bit branch leaks Hamming
621 // weight. This is acceptable only for public scalars; secret scalars must go through
622 // mul_const_time.
623 for (uint64_t i = maximum_set_bit - 1; i < maximum_set_bit; --i) {
624 accumulator.self_dbl();
625 if (converted_scalar.get_bit(i)) {
626 accumulator += *this;
627 }
628 }
629 return accumulator;
630}
631
632namespace detail {
633// Represents the result of `split_into_endomorphism_scalars` — a pair of 128-bit halves
634// (k1, k2) such that `k = k1 - k2·λ (mod r)`, where λ = endomorphism scalar.
636
637// GLV endomorphism multiplication recodes each 128-bit split scalar with signed Booth windows.
638// K1 uses a standard 4-bit grid; batch_mul gives K2 a separate offset grid below.
642
643// Booth window size for the GLV endomorphism path: c=4 over each 128-bit endomorphism
644// half gives ceil(128/4) = 32 windows per half.
645inline constexpr size_t BOOTH_ENDO_WINDOW_BITS = 4;
646static_assert(BOOTH_ENDO_WINDOW_BITS + 1 <= 32);
647inline constexpr size_t BOOTH_ENDO_NUM_WINDOWS = 32;
648// Lookup table holds [1·P, 2·P, ..., 8·P] (= 2^(c-1) entries). Magnitude m ∈ [1, 8]
649// indexes the table at m-1; magnitude 0 skips the window.
650inline constexpr size_t BOOTH_ENDO_LOOKUP_SIZE = 1U << (BOOTH_ENDO_WINDOW_BITS - 1);
651// 128 bits / 64 = 2 uint64 limbs per endomorphism half.
652inline constexpr size_t BOOTH_ENDO_NUM_LIMBS_U64 = 2;
653
654// K2's offset window decomposition: a 2-bit bottom window at bit 0, then 32 × 4-bit
655// windows starting at bit 2. Pairing with K1's standard 4-bit grid at bit 0 yields
656// a union of bit positions {0, 2, 4, ..., 124, 126}, so every transition between
657// adjacent positions is exactly 2 doublings — every (2·dbl + add) pair fuses with
658// batch_affine_combined_double_add_impl, saving 1 doubling per 4-bit chunk vs. the
659// symmetric layout. With K2 < 2^127 (proven via the lattice analysis on BN254/Grumpkin
660// — see Fr/Fq.hpp endomorphism comments), the top 4-bit window covers only bit 126,
661// so its magnitude is in {0, 1, 2} and is empty ~50% of the time.
662inline constexpr size_t BOOTH_ENDO_K2_LOW_WINDOW_BITS = 2;
663static_assert(BOOTH_ENDO_K2_LOW_WINDOW_BITS + 1 <= 32);
664inline constexpr size_t BOOTH_ENDO_K2_NUM_WINDOWS = BOOTH_ENDO_NUM_WINDOWS + 1; // 33
665
666} // namespace detail
667
668template <class Fq, class Fr, class T>
670{
671 if (is_point_at_infinity()) {
672 return element::infinity();
673 }
674 const Fr converted_scalar = scalar.from_montgomery_form();
675 if (converted_scalar.is_zero()) {
676 return element::infinity();
677 }
678
679 // Booth lookup: [1·P, 2·P, ..., 8·P]. Magnitude m ∈ [1, 2^(c-1)] indexes at m-1;
680 // magnitude 0 (no contribution) just skips the add.
681 constexpr size_t LOOKUP_SIZE = detail::BOOTH_ENDO_LOOKUP_SIZE;
683 lookup_table[0] = element(*this);
684 for (size_t i = 1; i < LOOKUP_SIZE; ++i) {
685 lookup_table[i] = lookup_table[i - 1] + *this;
686 }
687
688 const detail::EndoScalars endo_scalars = Fr::split_into_endomorphism_scalars(converted_scalar);
689 constexpr auto slice_params = detail::make_booth_slice_params<detail::BOOTH_ENDO_NUM_WINDOWS,
692 const uint64_t* k1 = endo_scalars.first.data();
693 const uint64_t* k2 = endo_scalars.second.data();
694
695 element accumulator{ T::one_x, T::one_y, Fq::one() };
696 accumulator.self_set_infinity();
697 const Fq beta = Fq::cube_root_of_unity();
698
699 // Process windows high-to-low; within each window add k1's digit, then k2's
700 // (with x·=β and the GLV `k = k1 - k2·λ` sign flip on y), then double 4× to
701 // shift the next window's contribution into place. No skew correction needed —
702 // the signed-Booth digits already span the full c-bit range so an even scalar
703 // doesn't require a post-pass.
704 for (size_t w = detail::BOOTH_ENDO_NUM_WINDOWS; w-- > 0;) {
705 for (size_t h = 0; h < 2; ++h) {
706 const uint64_t* s = (h == 0) ? k1 : k2;
707 const uint32_t digit = detail::booth_packed_digit(s, slice_params[w], detail::BOOTH_ENDO_WINDOW_BITS);
708 const uint32_t magnitude = digit & 0x7FFFFFFFU;
709 if (magnitude == 0) {
710 continue;
711 }
712 const bool sign = (digit >> 31) != 0;
713 element to_add = lookup_table[magnitude - 1];
714 to_add.y.self_conditional_negate(sign ^ (h == 1));
715 if (h == 1) {
716 to_add.x *= beta;
717 }
718 accumulator += to_add;
719 }
720 if (w != 0) {
721 for (size_t d = 0; d < detail::BOOTH_ENDO_WINDOW_BITS; ++d) {
723 }
724 }
725 }
726 return accumulator;
727}
728
729template <class Fq, class Fr, class T>
731 std::span<const Fr> scalars) noexcept
732{
733 BB_BENCH_NAME("Element::straus_msm");
734 const size_t n = std::min(points.size(), scalars.size());
735 if (n == 0) {
736 return element::infinity();
737 }
738
739 if constexpr (T::USE_ENDOMORPHISM) {
740 // Endomorphism-Booth path: build a small lookup table per active point and walk
741 // the split-scalar windows high-to-low. The signed-Booth digits span the full
742 // c-bit range, so no post-pass skew correction is needed.
743 constexpr size_t LOOKUP_SIZE = detail::BOOTH_ENDO_LOOKUP_SIZE;
744 constexpr size_t NUM_WINDOWS = detail::BOOTH_ENDO_NUM_WINDOWS;
745 constexpr size_t WINDOW_BITS = detail::BOOTH_ENDO_WINDOW_BITS;
746 constexpr auto slice_params = detail::make_booth_slice_params<detail::BOOTH_ENDO_NUM_WINDOWS,
749
750 struct ActiveScalar {
754 };
755
757 active.reserve(n);
758 for (size_t i = 0; i < n; ++i) {
759 if (points[i].is_point_at_infinity()) {
760 continue;
761 }
762 const Fr converted = scalars[i].from_montgomery_form();
763 if (converted.is_zero()) {
764 continue;
765 }
766 ActiveScalar e;
767 const element pt(points[i]);
768 e.lookup[0] = pt;
769 for (size_t k = 1; k < LOOKUP_SIZE; ++k) {
770 e.lookup[k] = e.lookup[k - 1] + pt;
771 }
773 e.k1 = endo.first;
774 e.k2 = endo.second;
775 active.push_back(std::move(e));
776 }
777 if (active.empty()) {
778 return element::infinity();
779 }
780
781 element accumulator{ T::one_x, T::one_y, Fq::one() };
782 accumulator.self_set_infinity();
783 const Fq beta = Fq::cube_root_of_unity();
784
785 for (size_t w = NUM_WINDOWS; w-- > 0;) {
786 for (size_t h = 0; h < 2; ++h) {
787 for (auto& a : active) {
788 const uint64_t* s = (h == 0) ? a.k1.data() : a.k2.data();
789 const uint32_t digit = detail::booth_packed_digit(s, slice_params[w], WINDOW_BITS);
790 const uint32_t magnitude = digit & 0x7FFFFFFFU;
791 if (magnitude == 0) {
792 continue;
793 }
794 const bool sign = (digit >> 31) != 0;
795 element to_add = a.lookup[magnitude - 1];
796 to_add.y.self_conditional_negate(sign ^ (h == 1));
797 if (h == 1) {
798 to_add.x *= beta;
799 }
800 accumulator += to_add;
801 }
802 }
803 if (w != 0) {
804 for (size_t d = 0; d < WINDOW_BITS; ++d) {
806 }
807 }
808 }
809 return accumulator;
810 } else {
811 // No endomorphism: bit-by-bit simultaneous double-and-add over the active subset.
813 std::vector<uint256_t> active_scalars;
814 active_points.reserve(n);
815 active_scalars.reserve(n);
816 uint64_t max_set_bit = 0;
817 for (size_t i = 0; i < n; ++i) {
818 if (points[i].is_point_at_infinity()) {
819 continue;
820 }
821 uint256_t s(scalars[i]);
822 if (s == 0) {
823 continue;
824 }
825 max_set_bit = std::max(max_set_bit, s.get_msb());
826 active_points.push_back(points[i]);
827 active_scalars.push_back(s);
828 }
829 if (active_points.empty()) {
830 return element::infinity();
831 }
832
834 for (uint64_t bit = max_set_bit + 1; bit-- > 0;) {
835 accumulator.self_dbl();
836 for (size_t i = 0; i < active_points.size(); ++i) {
837 if (active_scalars[i].get_bit(bit)) {
838 accumulator += active_points[i];
839 }
840 }
841 }
842 return accumulator;
843 }
844}
845
860template <typename AffineElement, typename Fq>
861__attribute__((always_inline)) inline void batch_affine_add_impl(const AffineElement* lhs,
862 AffineElement* rhs,
863 const size_t num_pairs,
864 Fq* scratch_space) noexcept
865{
867
868 // Forward pass: prepare batch inversion
869 for (size_t i = 0; i < num_pairs; ++i) {
870 scratch_space[i] = lhs[i].x + rhs[i].x;
871 rhs[i].x -= lhs[i].x;
872 rhs[i].y -= lhs[i].y;
875 }
876
878 throw_or_abort("attempted to invert zero in batch_affine_add_impl");
879 }
881
882 // Backward pass: compute additions
883 for (size_t i = num_pairs - 1; i < num_pairs; --i) {
884 // lambda = (y2 - y1) / (x2 - x1)
887 rhs[i].x = rhs[i].y.sqr();
888 rhs[i].x -= scratch_space[i]; // x3 = lambda^2 - (x1 + x2)
889
890 // y3 = lambda * (x1 - x3) - y1
891 Fq temp = lhs[i].x - rhs[i].x;
892 temp *= rhs[i].y;
893 rhs[i].y = temp - lhs[i].y;
894 }
895}
896
906template <typename AffineElement, typename Fq>
907__attribute__((always_inline)) inline void batch_affine_add_interleaved(AffineElement* points,
908 const size_t num_points,
909 Fq* scratch_space) noexcept
910{
912
913 // Forward pass: accumulate (x2 - x1) products for batch inversion
914 for (size_t i = 0; i < num_points; i += 2) {
915 scratch_space[i >> 1] = points[i].x + points[i + 1].x; // x1 + x2 (saved for later)
916 points[i + 1].x -= points[i].x; // x2 - x1
917 points[i + 1].y -= points[i].y; // y2 - y1
918 points[i + 1].y *= batch_inversion_accumulator;
919 batch_inversion_accumulator *= points[i + 1].x;
920 }
921
923 throw_or_abort("attempted to invert zero in batch_affine_add_interleaved");
924 }
926
927 // Backward pass: complete inversions and compute additions
928 for (size_t i = num_points - 2; i < num_points; i -= 2) {
929 // lambda = (y2 - y1) / (x2 - x1)
930 points[i + 1].y *= batch_inversion_accumulator;
931 batch_inversion_accumulator *= points[i + 1].x;
932 points[i + 1].x = points[i + 1].y.sqr();
933 // x3 = lambda^2 - (x1 + x2)
934 points[(i + num_points) >> 1].x = points[i + 1].x - scratch_space[i >> 1];
935
936 if (i >= 2) {
937 __builtin_prefetch(points + i - 2);
938 __builtin_prefetch(points + i - 1);
939 __builtin_prefetch(points + ((i + num_points - 2) >> 1));
940 __builtin_prefetch(scratch_space + ((i - 2) >> 1));
941 }
942
943 // y3 = lambda * (x1 - x3) - y1
944 points[i].x -= points[(i + num_points) >> 1].x;
945 points[i].x *= points[i + 1].y;
946 points[(i + num_points) >> 1].y = points[i].x - points[i].y;
947 }
948}
949
964template <typename AffineElement, typename Fq, typename T>
965__attribute__((always_inline)) inline void batch_affine_double_impl(AffineElement* points,
966 const size_t num_points,
967 Fq* scratch_space) noexcept
968{
970
971 // Forward pass: prepare batch inversion
972 for (size_t i = 0; i < num_points; ++i) {
973 scratch_space[i] = points[i].x.sqr();
974 if constexpr (T::has_a) {
975 scratch_space[i] += T::a; // adjust slope in numerator
976 }
977 scratch_space[i] = scratch_space[i] + scratch_space[i] + scratch_space[i];
978 scratch_space[i] *= batch_inversion_accumulator;
979 batch_inversion_accumulator *= (points[i].y + points[i].y);
980 }
981
983 throw_or_abort("attempted to invert zero in batch_affine_double_impl");
984 }
986
987 // Backward pass: compute doublings
989 for (size_t i_plus_1 = num_points; i_plus_1 > 0; --i_plus_1) {
990 size_t i = i_plus_1 - 1;
991
992 scratch_space[i] *= batch_inversion_accumulator;
993 batch_inversion_accumulator *= (points[i].y + points[i].y);
994
995 temp_x = points[i].x;
996 points[i].x = scratch_space[i].sqr() - (points[i].x + points[i].x);
997 points[i].y = scratch_space[i] * (temp_x - points[i].x) - points[i].y;
998 }
999}
1000
1029template <typename AffineElement, typename Fq>
1030__attribute__((always_inline)) inline void batch_affine_combined_double_add_impl(const AffineElement* to_add,
1031 AffineElement* accumulator,
1032 const size_t num_pairs,
1035 Fq* scratch_c) noexcept
1036{
1037 // === Phase 1: batch-invert (x2 − x1), produce λ1 and (x3 − x1). ===
1039 for (size_t i = 0; i < num_pairs; ++i) {
1040 // (x1 + x2): retained for x3 = λ1² − (x1 + x2) in the backward pass.
1041 scratch_a[i] = accumulator[i].x + to_add[i].x;
1042 // (x2 − x1): feeds Montgomery's batch inversion product.
1043 scratch_b[i] = to_add[i].x - accumulator[i].x;
1044 // (y2 − y1) × Π_{j<i}(x2_j − x1_j): partial numerator for λ1.
1045 scratch_c[i] = to_add[i].y - accumulator[i].y;
1046 scratch_c[i] *= batch_inv_acc;
1048 }
1050 throw_or_abort("attempted to invert zero in batch_affine_combined_double_add_impl phase 1");
1051 }
1053 for (size_t k = num_pairs; k-- > 0;) {
1054 // λ1 = (y2 − y1) / (x2 − x1).
1055 scratch_c[k] *= batch_inv_acc;
1057 // x3 = λ1² − (x1 + x2); overwrite scratch_b with (x3 − x1) for phase 2.
1058 Fq x3 = scratch_c[k].sqr();
1059 x3 -= scratch_a[k];
1060 scratch_b[k] = x3 - accumulator[k].x;
1061 // scratch_c[k] now holds λ1, retained for phase 2.
1062 }
1063
1064 // === Phase 2: batch-invert (x3 − x1), produce λ2, write x4 and y4. ===
1066 for (size_t i = 0; i < num_pairs; ++i) {
1067 // 2·y1 × Π_{j<i}(x3_j − x1_j): partial numerator for 2·y1 / (x3 − x1).
1068 scratch_a[i] = accumulator[i].y + accumulator[i].y;
1071 }
1072 if (batch_inv_acc == Fq::zero()) {
1073 throw_or_abort("attempted to invert zero in batch_affine_combined_double_add_impl phase 2");
1074 }
1075 batch_inv_acc = batch_inv_acc.invert();
1076 for (size_t k = num_pairs; k-- > 0;) {
1077 // 2·y1 / (x3 − x1).
1080 // λ2 = −λ1 − 2·y1 / (x3 − x1).
1081 Fq lambda2 = -scratch_c[k];
1082 lambda2 -= scratch_a[k];
1083 // x4 = λ2² − x1 − x3, where x3 = (x3 − x1) + x1 = scratch_b[k] + accumulator[k].x.
1084 Fq x4 = lambda2.sqr();
1085 x4 -= accumulator[k].x;
1086 x4 -= (scratch_b[k] + accumulator[k].x);
1087 // y4 = λ2 · (x1 − x4) − y1.
1088 Fq y4 = accumulator[k].x - x4;
1089 y4 *= lambda2;
1090 y4 -= accumulator[k].y;
1091 accumulator[k].x = x4;
1092 accumulator[k].y = y4;
1093 }
1094}
1095
1121template <typename AffineElement, typename Fq>
1122__attribute__((always_inline)) inline void batch_affine_add_indexed_impl(AffineElement* buckets,
1124 const size_t num_pairs,
1125 Fq* scratch_space) noexcept
1126{
1127 if (num_pairs == 0) {
1128 return;
1129 }
1130
1131 // Sparse indexed bucket accesses are hard for hardware prefetchers. A small fixed
1132 // lookahead overlaps the next bucket load with the current pair's field arithmetic.
1133 constexpr size_t PREFETCH_AHEAD = 4;
1134
1136
1137 // Forward pass: prepare batch inversion via the standard Montgomery trick.
1138 // Treats the dst slot as `rhs` (mutated in place) and src slot as `lhs` (read-only).
1139 for (size_t i = 0; i < num_pairs; ++i) {
1140 if (i + PREFETCH_AHEAD < num_pairs) {
1141 __builtin_prefetch(buckets + pairs[i + PREFETCH_AHEAD].first, 1, 3); // dst: write
1142 __builtin_prefetch(buckets + pairs[i + PREFETCH_AHEAD].second, 0, 3); // src: read
1143 }
1144 AffineElement& dst = buckets[pairs[i].first];
1145 const AffineElement& src = buckets[pairs[i].second];
1146 scratch_space[i] = src.x + dst.x; // x1 + x2 (saved for backward pass)
1147 dst.x -= src.x; // x2 - x1 (denominator)
1148 dst.y -= src.y; // y2 - y1 (numerator before scaling)
1151 }
1152
1154 throw_or_abort("attempted to invert zero in batch_affine_add_indexed_impl");
1155 }
1157
1158 // Backward pass: complete each pair's slope and write x3, y3 into the dst slot.
1159 for (size_t j = num_pairs; j > 0; --j) {
1160 const size_t i = j - 1;
1161 if (i >= PREFETCH_AHEAD) {
1162 __builtin_prefetch(buckets + pairs[i - PREFETCH_AHEAD].first, 1, 3); // dst: write
1163 __builtin_prefetch(buckets + pairs[i - PREFETCH_AHEAD].second, 0, 3); // src: read
1164 __builtin_prefetch(scratch_space + (i - PREFETCH_AHEAD), 0, 3);
1165 }
1166 AffineElement& dst = buckets[pairs[i].first];
1167 const AffineElement& src = buckets[pairs[i].second];
1168
1169 // lambda = (y2 - y1) / (x2 - x1)
1172 dst.x = dst.y.sqr();
1173 dst.x -= scratch_space[i]; // x3 = lambda^2 - (x1 + x2)
1174
1175 // y3 = lambda * (x1 - x3) - y1
1176 Fq temp = src.x - dst.x;
1177 temp *= dst.y;
1178 dst.y = temp - src.y;
1179 }
1180}
1181
1198template <typename AffineElement, typename Fq>
1199__attribute__((always_inline)) inline void batch_affine_double_indexed_impl(AffineElement* buckets,
1200 const uint32_t* indices,
1201 const size_t num_points,
1202 Fq* scratch_space) noexcept
1203{
1204 if (num_points == 0) {
1205 return;
1206 }
1207
1208 constexpr size_t PREFETCH_AHEAD = 4;
1209
1211
1212 // Forward pass.
1213 for (size_t i = 0; i < num_points; ++i) {
1214 if (i + PREFETCH_AHEAD < num_points) {
1215 __builtin_prefetch(buckets + indices[i + PREFETCH_AHEAD], 1, 3);
1216 }
1217 AffineElement& p = buckets[indices[i]];
1218 scratch_space[i] = p.x.sqr();
1219 scratch_space[i] = scratch_space[i] + scratch_space[i] + scratch_space[i]; // 3 x^2
1220 scratch_space[i] *= batch_inversion_accumulator;
1221 batch_inversion_accumulator *= (p.y + p.y);
1222 }
1223
1225 throw_or_abort("attempted to invert zero in batch_affine_double_indexed_impl");
1226 }
1228
1229 // Backward pass.
1230 Fq temp_x;
1231 for (size_t j = num_points; j > 0; --j) {
1232 const size_t i = j - 1;
1233 if (i >= PREFETCH_AHEAD) {
1234 __builtin_prefetch(buckets + indices[i - PREFETCH_AHEAD], 1, 3);
1235 __builtin_prefetch(scratch_space + (i - PREFETCH_AHEAD), 0, 3);
1236 }
1237 AffineElement& p = buckets[indices[i]];
1238
1239 scratch_space[i] *= batch_inversion_accumulator;
1240 batch_inversion_accumulator *= (p.y + p.y);
1241
1242 temp_x = p.x;
1243 p.x = scratch_space[i].sqr() - (p.x + p.x);
1244 p.y = scratch_space[i] * (temp_x - p.x) - p.y;
1245 }
1246}
1247
1259template <class Fq, class Fr, class T>
1261 const std::span<affine_element<Fq, Fr, T>>& second_group,
1262 const std::span<affine_element<Fq, Fr, T>>& results) noexcept
1263{
1265 const size_t num_points = first_group.size();
1266 BB_ASSERT_EQ(second_group.size(), first_group.size());
1267
1268 // Space for temporary values
1269 std::vector<Fq> scratch_space(num_points);
1270
1272 num_points, [&](size_t i) { results[i] = first_group[i]; }, thread_heuristics::FF_COPY_COST * 2);
1273
1274 // Perform batch affine addition: (lhs[i], rhs[i]) -> rhs[i]
1276 num_points,
1277 [&](size_t start, size_t end, BB_UNUSED size_t chunk_index) {
1278 batch_affine_add_impl<affine_element, Fq>(
1279 &second_group[start], &results[start], end - start, &scratch_space[start]);
1280 },
1282}
1283
1294template <class Fq, class Fr, class T>
1296 const std::span<const affine_element<Fq, Fr, T>>& points, const Fr& scalar) noexcept
1297{
1298 BB_BENCH();
1300 const size_t num_points = points.size();
1301 if (num_points == 0) {
1302 return {};
1303 }
1304
1305 // Scratch for batch inversions.
1306 // [0 .. N) — single-inversion buffer for batch_affine_add_impl / batch_affine_double_impl
1307 // [0 .. 3N) — batch_affine_combined_double_add_impl needs three N-sized scratches
1308 // (one each for (x1+x2), (x2−x1)/(x3−x1), and the λ1/numerator product).
1309 std::vector<Fq> scratch_space(3 * num_points);
1310 Fq* const scratch_a = &scratch_space[0];
1311 Fq* const scratch_b = &scratch_space[num_points];
1312 Fq* const scratch_c = &scratch_space[2 * num_points];
1313
1314 // p−1 still produces an "infinity" path at the end of the sum under any
1315 // signed-digit encoding (the partial sums hit a doubling edge), so short-circuit
1316 // it here to keep edge-case handling out of the hot loop.
1317 if (scalar == -Fr::one()) {
1319 parallel_for_heuristic(num_points, [&](size_t i) { results[i] = -points[i]; }, thread_heuristics::FF_COPY_COST);
1320 return results;
1321 }
1322 const Fr converted_scalar = scalar.from_montgomery_form();
1323 if (converted_scalar.is_zero()) {
1324 affine_element result{ Fq::zero(), Fq::zero() };
1325 result.self_set_infinity();
1327 parallel_for_heuristic(num_points, [&](size_t i) { results[i] = result; }, thread_heuristics::FF_COPY_COST);
1328 return results;
1329 }
1330
1331 constexpr size_t LOOKUP_SIZE = detail::BOOTH_ENDO_LOOKUP_SIZE;
1332 constexpr size_t NUM_WINDOWS = detail::BOOTH_ENDO_NUM_WINDOWS;
1333 constexpr size_t K2_NUM_WINDOWS = detail::BOOTH_ENDO_K2_NUM_WINDOWS;
1334 constexpr size_t WINDOW_BITS = detail::BOOTH_ENDO_WINDOW_BITS;
1335 constexpr size_t K2_LOW_WINDOW_BITS = detail::BOOTH_ENDO_K2_LOW_WINDOW_BITS;
1336 constexpr auto slice_params = detail::make_booth_slice_params<detail::BOOTH_ENDO_NUM_WINDOWS,
1339 constexpr auto k2_slice_params = detail::make_offset_booth_slice_params<detail::BOOTH_ENDO_K2_NUM_WINDOWS,
1343
1344 // K1 keeps the standard 4-bit Booth grid at bit positions {0, 4, ..., 124}.
1345 // K2 uses the offset grid: a 2-bit window at bit 0, then 4-bit windows at
1346 // {2, 6, ..., 126}. The union of K1/K2 bit positions is {0, 2, 4, ..., 126},
1347 // so the main loop visits each position once with 2 doublings between
1348 // adjacent positions — every (2·dbl + add) pair fuses with combined_chunked.
1349 const detail::EndoScalars endo_scalars = Fr::split_into_endomorphism_scalars(converted_scalar);
1350 const uint64_t* k1 = endo_scalars.first.data();
1351 const uint64_t* k2 = endo_scalars.second.data();
1352 BB_ASSERT((k2[1] >> 63) == 0, "GLV K2 split must fit below 2^127 for the offset Booth window schedule");
1353
1356 for (size_t w = 0; w < NUM_WINDOWS; ++w) {
1357 k1_digits[w] = detail::booth_packed_digit(k1, slice_params[w], WINDOW_BITS);
1358 }
1359 k2_digits[0] = detail::booth_packed_digit(k2, k2_slice_params[0], K2_LOW_WINDOW_BITS);
1360 for (size_t w = 1; w < K2_NUM_WINDOWS; ++w) {
1361 k2_digits[w] = detail::booth_packed_digit(k2, k2_slice_params[w], WINDOW_BITS);
1362 }
1363
1364 // Precompute, for every chunked-combined/add call below, whether
1365 // batch_affine_combined_double_add_impl's edge conditions could fire.
1366 // Both edges (x(2·accum) == x(to_add), and 2·accum + to_add == O) are a function
1367 // only of the (k1, k2) Booth digit stream and the (P, φP) basis, so we simulate the
1368 // accumulator's coefficients in that basis as int64s and set one mask bit per call site.
1369 //
1370 // Layout:
1371 // bits 0..61: main loop, step s ↔ pos 124 - 2·s.
1372 // bit 62: pos-0 combined_chunked.
1373 // bit 63: pos-0 trailing add_chunked when both pos-0 digits are non-zero.
1374 // The pos-0 add_chunked on the !initialised seed path is omitted: there the accumulator
1375 // is d0·P and to_add is d1·φP with both digits
1376 // non-zero, so the two affine points cannot share an x-coordinate (P and φP are
1377 // independent generators of the prime-order subgroup), so its check is always false.
1378 auto compute_safe_mask = [&]() -> uint64_t {
1379 uint64_t mask = 0;
1380 int64_t a = 0;
1381 int64_t b = 0;
1382 bool initialised = false;
1383
1384 const auto signed_digit = [](uint32_t packed) -> int64_t {
1385 const int64_t mag = static_cast<int64_t>(packed & 0x7FFFFFFFU);
1386 return ((packed >> 31) != 0) ? -mag : mag;
1387 };
1388
1389 // Edge predicate for combined_chunked (does dbl-then-add internally).
1390 // Pre-call accum = (a)P + (b)φP, internal accumulator post-double = (2a)P + (2b)φP.
1391 // Edge 1: x((2a)P + (2b)φP) == x(d·BASE) ⇔ one of (a, b) is 0 AND 2·other = ±d.
1392 // Edge 2: (4a + d)P + 4b·φP == O ⇔ b=0 AND 4a = -d (K1 case)
1393 // or a=0 AND 4b = -d (K2 case).
1394 const auto edge_for_combined = [&](int64_t d, bool is_k1) -> bool {
1395 if (is_k1) {
1396 if (b != 0) {
1397 return false;
1398 }
1399 if ((d % 2 == 0) && (2 * a == d || 2 * a == -d)) {
1400 return true;
1401 }
1402 return (d % 4 == 0) && (4 * a == -d);
1403 }
1404 if (a != 0) {
1405 return false;
1406 }
1407 if ((d % 2 == 0) && (2 * b == d || 2 * b == -d)) {
1408 return true;
1409 }
1410 return (d % 4 == 0) && (4 * b == -d);
1411 };
1412
1413 // Edge predicate for plain add_chunked: accum = (a)P + (b)φP, to_add = d·BASE.
1414 // x(accum) == x(to_add) ⇔ accum = ±to_add ⇔ the orthogonal basis coord is 0
1415 // AND |the aligned coord| = |d|.
1416 const auto edge_for_add = [&](int64_t d, bool is_k1) -> bool {
1417 if (is_k1) {
1418 return (b == 0) && (a == d || a == -d);
1419 }
1420 return (a == 0) && (b == d || b == -d);
1421 };
1422
1423 // === Pos 126: K2 window 32 (top, 4-bit but value ≤ 2 since K2 < 2^127). ===
1424 {
1425 const uint32_t d126 = k2_digits[K2_NUM_WINDOWS - 1];
1426 if ((d126 & 0x7FFFFFFFU) != 0) {
1427 b = signed_digit(d126);
1428 initialised = true;
1429 }
1430 }
1431
1432 // === Positions 124, 122, ..., 2 (62 iterations). ===
1433 for (size_t step = 0; step < 62; ++step) {
1434 // Once either basis coordinate is non-zero and outside ±4, no later edge predicate can fire:
1435 // same-basis additions are too small to collide, and opposite-basis additions see a non-zero
1436 // orthogonal coordinate. Stop before the coefficient simulation can overflow int64_t.
1437 if (((a != 0) && (std::abs(a) > 4)) || ((b != 0) && (std::abs(b) > 4))) {
1438 break;
1439 }
1440 const size_t pos = 124 - 2 * step;
1441 const bool is_k1 = (pos % 4 == 0);
1442 const uint32_t digit = is_k1 ? k1_digits[pos / 4] : k2_digits[(pos + 2) / 4];
1443 const uint32_t m = digit & 0x7FFFFFFFU;
1444 const int64_t d = signed_digit(digit);
1445
1446 if (!initialised) {
1447 if (m != 0) {
1448 if (is_k1) {
1449 a = d;
1450 } else {
1451 b = d;
1452 }
1453 initialised = true;
1454 }
1455 continue;
1456 }
1457 if (m == 0) {
1458 a *= 4;
1459 b *= 4;
1460 continue;
1461 }
1462 if (edge_for_combined(d, is_k1)) {
1463 mask |= (uint64_t{ 1 } << step);
1464 }
1465 a *= 4;
1466 b *= 4;
1467 if (is_k1) {
1468 a += d;
1469 } else {
1470 b += d;
1471 }
1472 }
1473
1474 // === Pos 0: K1 window 0 (4-bit) + K2 window 0 (2-bit). ===
1475 // Mirrors the runtime pos-0 branch structure below.
1476 {
1477 const uint32_t d0 = k1_digits[0];
1478 const uint32_t d1 = k2_digits[0];
1479 const uint32_t m0 = d0 & 0x7FFFFFFFU;
1480 const uint32_t m1 = d1 & 0x7FFFFFFFU;
1481 const int64_t s0 = signed_digit(d0);
1482 const int64_t s1 = signed_digit(d1);
1483
1484 if (!initialised) {
1485 if (m0 != 0) {
1486 a = s0;
1487 initialised = true;
1488 if (m1 != 0) { // accum=d0·P, to_add=d1·φP. Linear independence ⇒ no x-coordinate collision.
1489 b += s1;
1490 }
1491 } else if (m1 != 0) {
1492 b = s1;
1493 initialised = true;
1494 }
1495 } else if (m0 == 0 && m1 == 0) {
1496 a *= 4;
1497 b *= 4;
1498 } else {
1499 const bool fuse_with_h1 = (m0 == 0);
1500 const int64_t fused_d = fuse_with_h1 ? s1 : s0;
1501 if (edge_for_combined(fused_d, /*is_k1=*/!fuse_with_h1)) {
1502 mask |= (uint64_t{ 1 } << 62);
1503 }
1504 a *= 4;
1505 b *= 4;
1506 if (fuse_with_h1) {
1507 b += fused_d;
1508 } else {
1509 a += fused_d;
1510 }
1511 if (m0 != 0 && m1 != 0) {
1512 // Combined consumed d0 (the K1 contribution); the trailing add_chunked
1513 // stacks d1 (the K2 contribution) on top.
1514 if (edge_for_add(s1, /*is_k1=*/false)) {
1515 mask |= (uint64_t{ 1 } << 63);
1516 }
1517 }
1518 b += s1;
1519 }
1520 }
1521
1522 return mask;
1523 };
1524 const uint64_t safe_mask = compute_safe_mask();
1525
1527 std::array<std::vector<affine_element>, LOOKUP_SIZE> lookup_table;
1528 for (auto& table : lookup_table) {
1529 table.resize(num_points);
1530 }
1531 std::vector<affine_element> temp_point_vector(num_points);
1532
1533 auto execute_range = [&](size_t start, size_t end) {
1534 BB_BENCH_TRACY_NAME("batch_mul_with_endo/execute_range");
1535 const auto add_chunked = [&](const affine_element* lhs, affine_element* rhs) {
1536 batch_affine_add_impl<affine_element, Fq>(&lhs[start], &rhs[start], end - start, &scratch_a[start]);
1537 };
1538 const auto add_safe_chunked = [&](const affine_element* lhs, affine_element* rhs) {
1539 for (size_t i = start; i < end; ++i) {
1540 element acc(rhs[i]);
1541 acc += lhs[i];
1542 rhs[i] = affine_element(acc);
1543 }
1544 };
1545 const auto double_chunked = [&](affine_element* lhs) {
1546 batch_affine_double_impl<affine_element, Fq, T>(&lhs[start], end - start, &scratch_a[start]);
1547 };
1548 // Fused 2·accum + to_add — saves 1 mul + 1 sqr per point vs. (double + add).
1549 const auto combined_chunked = [&](const affine_element* to_add, affine_element* accum) {
1550 batch_affine_combined_double_add_impl<affine_element, Fq>(
1551 &to_add[start], &accum[start], end - start, &scratch_a[start], &scratch_b[start], &scratch_c[start]);
1552 };
1553 const auto combined_safe_chunked = [&](const affine_element* to_add, affine_element* accum) {
1554 for (size_t i = start; i < end; ++i) {
1555 element acc(accum[i]);
1556 acc.self_dbl();
1557 acc += to_add[i];
1558 accum[i] = affine_element(acc);
1559 }
1560 };
1561 // Build lookup table [1·P, 2·P, 3·P, ..., 8·P]. Substitute affine::one()
1562 // for points-at-infinity to keep the batch arithmetic edge-case-free; the
1563 // final pass below sets work_elements[i] to infinity for those slots.
1564 for (size_t i = start; i < end; ++i) {
1565 if (points[i].is_point_at_infinity()) {
1566 lookup_table[0][i] = affine_element::one();
1567 temp_point_vector[i] = affine_element::one();
1568 } else {
1569 lookup_table[0][i] = points[i];
1570 temp_point_vector[i] = points[i];
1571 }
1572 }
1573 // lookup[1] = 2·P via batch double (lookup[1] = lookup[0] + lookup[0] would
1574 // trip the equal-x guard in batch_affine_add_impl).
1575 for (size_t i = start; i < end; ++i) {
1576 lookup_table[1][i] = lookup_table[0][i];
1577 }
1578 double_chunked(&lookup_table[1][0]);
1579 // lookup[j] = lookup[j-1] + P for j ≥ 2 (lookup[j-1] = j·P, never equals P).
1580 for (size_t j = 2; j < LOOKUP_SIZE; ++j) {
1581 for (size_t i = start; i < end; ++i) {
1582 lookup_table[j][i] = lookup_table[j - 1][i];
1583 }
1584 add_chunked(&temp_point_vector[0], &lookup_table[j][0]);
1585 }
1586
1587 constexpr Fq beta = Fq::cube_root_of_unity();
1588
1589 // Materialise lookup[mag-1] (possibly negated, possibly β-twisted) for one
1590 // (window, half). Skips the points-at-infinity slots via the lookup entry
1591 // already being affine::one(); those slots get zeroed at the end.
1592 auto fill_to_add = [&](uint32_t digit, bool half_idx, affine_element* dst) {
1593 const uint32_t magnitude = digit & 0x7FFFFFFFU;
1594 const bool sign = (digit >> 31) != 0;
1595 const bool flip_y = sign ^ half_idx;
1596 for (size_t i = start; i < end; ++i) {
1597 affine_element pt = lookup_table[magnitude - 1][i];
1598 pt.y.self_conditional_negate(flip_y);
1599 if (half_idx) {
1600 pt.x *= beta;
1601 }
1602 dst[i] = pt;
1603 }
1604 };
1605
1606 // Walk K1/K2 bit positions {126, 124, 122, ..., 4, 2, 0} top-to-bottom.
1607 // Mapping: pos % 4 == 0 → K1 window pos/4; pos % 4 == 2 → K2 window (pos+2)/4.
1608 // Pos 126 hosts K2 window 32 (top, 4-bit but value ≤ 2 since K2 < 2^127, so
1609 // empty ~50% of the time). Pos 0 hosts both K1 window 0 and K2 window 0 (2-bit).
1610 //
1611 // Once initialised, every transition between adjacent positions is "2 dbl + add",
1612 // fused as "1 dbl + 1 combined_chunked". Booth digit 0 turns the add into a no-op,
1613 // so a zero digit just becomes 2 unfused doublings to shift past.
1614 bool initialised = false;
1615 const auto update_initialised_from_work = [&]() { initialised = !work_elements[start].is_point_at_infinity(); };
1616 auto seed_or_skip = [&](uint32_t digit, bool half_idx) {
1617 // Pre-init: no doublings (accumulator is conceptually identity).
1618 if ((digit & 0x7FFFFFFFU) != 0) {
1619 fill_to_add(digit, half_idx, &work_elements[0]);
1620 initialised = true;
1621 }
1622 };
1623
1624 // Pos 126: K2 window 32 (top). Magnitude in {0, 1, 2} given K2 < 2^127.
1625 seed_or_skip(k2_digits[K2_NUM_WINDOWS - 1], /*half_idx=*/true);
1626
1627 // Positions 124, 122, ..., 2 (62 positions, alternating K1 / K2).
1628 for (size_t step = 0; step < 62; ++step) {
1629 const size_t pos = 124 - 2 * step;
1630 const bool is_k1 = (pos % 4 == 0);
1631 const uint32_t digit = is_k1 ? k1_digits[pos / 4] : k2_digits[(pos + 2) / 4];
1632 const bool half_idx = !is_k1;
1633 const uint32_t m = digit & 0x7FFFFFFFU;
1634
1635 if (!initialised) {
1636 if (m != 0) {
1637 fill_to_add(digit, half_idx, &work_elements[0]);
1638 initialised = true;
1639 }
1640 continue;
1641 }
1642
1643 if (m == 0) {
1644 // 2 unfused doublings to shift past this empty position.
1645 double_chunked(&work_elements[0]);
1646 double_chunked(&work_elements[0]);
1647 continue;
1648 }
1649
1650 // (2·dbl + add) fused as (1·dbl + combined_chunked).
1651 double_chunked(&work_elements[0]);
1652 fill_to_add(digit, half_idx, &temp_point_vector[0]);
1653 if ((safe_mask >> step) & uint64_t{ 1 }) {
1654 combined_safe_chunked(&temp_point_vector[0], &work_elements[0]);
1655 update_initialised_from_work();
1656 } else {
1657 combined_chunked(&temp_point_vector[0], &work_elements[0]);
1658 }
1659 }
1660
1661 // Pos 0: both K1 window 0 (4-bit) and K2 window 0 (2-bit). Transition from
1662 // pos 2 still requires the standard 2-dbl shift; the second contribution rides
1663 // on top via an extra add_chunked.
1664 {
1665 const uint32_t d0 = k1_digits[0];
1666 const uint32_t d1 = k2_digits[0];
1667 const uint32_t m0 = d0 & 0x7FFFFFFFU;
1668 const uint32_t m1 = d1 & 0x7FFFFFFFU;
1669
1670 if (!initialised) {
1671 if (m0 != 0) {
1672 fill_to_add(d0, /*half_idx=*/false, &work_elements[0]);
1673 initialised = true;
1674 if (m1 != 0) {
1675 // accum = d0·P, to_add = d1·φP with both digits non-zero.
1676 // x-coords cannot collide (linear independence of P, φP), so the
1677 // unsafe batch-add formula is always safe here.
1678 fill_to_add(d1, /*half_idx=*/true, &temp_point_vector[0]);
1679 add_chunked(&temp_point_vector[0], &work_elements[0]);
1680 }
1681 } else if (m1 != 0) {
1682 fill_to_add(d1, /*half_idx=*/true, &work_elements[0]);
1683 initialised = true;
1684 }
1685 } else if (m0 == 0 && m1 == 0) {
1686 double_chunked(&work_elements[0]);
1687 double_chunked(&work_elements[0]);
1688 } else {
1689 double_chunked(&work_elements[0]);
1690 const bool fuse_with_h1 = (m0 == 0);
1691 const uint32_t fused_digit = fuse_with_h1 ? d1 : d0;
1692 fill_to_add(fused_digit, fuse_with_h1, &temp_point_vector[0]);
1693 if ((safe_mask >> 62) & uint64_t{ 1 }) {
1694 combined_safe_chunked(&temp_point_vector[0], &work_elements[0]);
1695 update_initialised_from_work();
1696 } else {
1697 combined_chunked(&temp_point_vector[0], &work_elements[0]);
1698 }
1699 if (m0 != 0 && m1 != 0) {
1700 if (!initialised) {
1701 fill_to_add(d1, /*half_idx=*/true, &work_elements[0]);
1702 initialised = true;
1703 } else {
1704 fill_to_add(d1, /*half_idx=*/true, &temp_point_vector[0]);
1705 if ((safe_mask >> 63) & uint64_t{ 1 }) {
1706 add_safe_chunked(&temp_point_vector[0], &work_elements[0]);
1707 update_initialised_from_work();
1708 } else {
1709 add_chunked(&temp_point_vector[0], &work_elements[0]);
1710 }
1711 }
1712 }
1713 }
1714 }
1715
1716 BB_ASSERT(initialised, "non-zero scalar must produce at least one non-zero Booth digit");
1717
1718 // Restore infinity for slots where the input was at infinity.
1719 for (size_t i = start; i < end; ++i) {
1720 if (points[i].is_point_at_infinity()) {
1721 work_elements[i].self_set_infinity();
1722 }
1723 }
1724 };
1725 parallel_for_range(num_points, execute_range);
1726
1727 return work_elements;
1728}
1729
1730template <typename Fq, typename Fr, typename T>
1731void element<Fq, Fr, T>::batch_normalize(element* elements, const size_t num_elements) noexcept
1732{
1733 std::vector<Fq> temporaries;
1734 temporaries.reserve(num_elements * 2);
1736
1737 // Iterate over the points, computing the product of their z-coordinates.
1738 // At each iteration, store the currently-accumulated z-coordinate in `temporaries`
1739 for (size_t i = 0; i < num_elements; ++i) {
1740 temporaries.emplace_back(accumulator);
1741 if (!elements[i].is_point_at_infinity()) {
1742 accumulator *= elements[i].z;
1743 }
1744 }
1745 // For the rest of this method we refer to the product of all z-coordinates as the 'global' z-coordinate
1746 // Invert the global z-coordinate and store in `accumulator`
1747 accumulator = accumulator.invert();
1748
1771 for (size_t i = num_elements - 1; i < num_elements; --i) {
1772 if (!elements[i].is_point_at_infinity()) {
1773 Fq z_inv = accumulator * temporaries[i];
1774 Fq zz_inv = z_inv.sqr();
1775 elements[i].x *= zz_inv;
1776 elements[i].y *= (zz_inv * z_inv);
1777 accumulator *= elements[i].z;
1778 }
1779 elements[i].z = Fq::one();
1780 }
1781}
1782
1783template <typename Fq, typename Fr, typename T>
1784template <typename>
1786{
1787 bool found_one = false;
1788 Fq yy;
1789 Fq x;
1790 Fq y;
1791 while (!found_one) {
1793 yy = x.sqr() * x + T::b;
1794 if constexpr (T::has_a) {
1795 yy += (x * T::a);
1796 }
1797 auto [found_root, y1] = yy.sqrt();
1798 y = y1;
1799 found_one = found_root;
1800 }
1801 return { x, y, Fq::one() };
1802}
1803
1804} // namespace bb::group_elements
1805// NOLINTEND(readability-implicit-bool-conversion, cppcoreguidelines-avoid-c-arrays)
#define BB_ASSERT(expression,...)
Definition assert.hpp:70
#define BB_ASSERT_EQ(actual, expected,...)
Definition assert.hpp:83
#define BB_BENCH_NAME(name)
Definition bb_bench.hpp:264
#define BB_BENCH_TRACY_NAME(name)
Definition bb_bench.hpp:256
#define BB_BENCH()
Definition bb_bench.hpp:268
constexpr bool is_point_at_infinity() const noexcept
constexpr void self_set_infinity() noexcept
static constexpr affine_element one() noexcept
element class. Implements ecc group arithmetic using Jacobian coordinates See https://hyperelliptic....
Definition element.hpp:35
element operator*=(const Fr &exponent) noexcept
BB_INLINE constexpr element set_infinity() const noexcept
element mul_with_endomorphism(const Fr &scalar) const noexcept
static std::vector< affine_element< Fq, Fr, Params > > batch_mul_with_endomorphism(const std::span< const affine_element< Fq, Fr, Params > > &points, const Fr &scalar) noexcept
Multiply each point by the same scalar.
constexpr element operator-=(const element &other) noexcept
constexpr element operator-() const noexcept
constexpr affine_element< Fq, Fr, Params > to_affine_const_time() const noexcept
friend constexpr element operator+(const affine_element< Fq, Fr, Params > &left, const element &right) noexcept
Definition element.hpp:76
constexpr element dbl() const noexcept
constexpr element normalize() const noexcept
constexpr void self_dbl() noexcept
static element random_element(numeric::RNG *engine=nullptr) noexcept
static void batch_normalize(element *elements, size_t num_elements) noexcept
constexpr element operator+=(const element &other) noexcept
static void batch_affine_add(const std::span< affine_element< Fq, Fr, Params > > &first_group, const std::span< affine_element< Fq, Fr, Params > > &second_group, const std::span< affine_element< Fq, Fr, Params > > &results) noexcept
Pairwise affine add points in first and second group.
element mul_const_time(const Fr &scalar, numeric::RNG *engine=nullptr) const noexcept
Constant-time scalar multiplication intended for secret scalars (e.g. ECDSA / Schnorr nonces).
BB_INLINE constexpr bool on_curve() const noexcept
BB_INLINE constexpr bool operator==(const element &other) const noexcept
element operator*(const Fr &exponent) const noexcept
static element straus_msm(std::span< const affine_element< Fq, Fr, Params > > points, std::span< const Fr > scalars) noexcept
Straus-style multi-scalar multiplication.
element() noexcept=default
static element random_coordinates_on_curve(numeric::RNG *engine=nullptr) noexcept
element mul_without_endomorphism(const Fr &scalar) const noexcept
constexpr element & operator=(const element &other) noexcept
BB_INLINE constexpr void self_set_infinity() noexcept
constexpr element normalize_const_time() const noexcept
BB_INLINE constexpr bool is_point_at_infinity() const noexcept
constexpr bool get_bit(uint64_t bit_index) const
constexpr uint64_t get_msb() const
bool get_bit(uint64_t bit_index) const
#define BB_UNUSED
FF a
FF b
numeric::RNG & engine
constexpr std::array< BoothSliceParams, NUM_WINDOWS > make_offset_booth_slice_params() noexcept
constexpr std::array< BoothSliceParams, NUM_WINDOWS > make_booth_slice_params() noexcept
uint32_t booth_packed_digit(const uint64_t *s, const BoothSliceParams &sp, size_t window_bits) noexcept
Read a (window_bits+1)-bit window from s[] (uint64 limbs) and apply Constantine's signedWindowEncodin...
constexpr size_t BOOTH_ENDO_K2_NUM_WINDOWS
std::pair< std::array< uint64_t, 2 >, std::array< uint64_t, 2 > > EndoScalars
constexpr size_t BOOTH_ENDO_K2_LOW_WINDOW_BITS
constexpr size_t BOOTH_ENDO_WINDOW_BITS
constexpr size_t BOOTH_ENDO_LOOKUP_SIZE
constexpr size_t BOOTH_ENDO_NUM_WINDOWS
constexpr size_t BOOTH_ENDO_NUM_LIMBS_U64
AffineElement const size_t Fq *scratch_space noexcept
AffineElement const size_t num_pairs
__attribute__((always_inline)) inline void batch_affine_add_impl(const AffineElement *lhs
Batch affine addition for parallel arrays: (lhs[i], rhs[i]) → rhs[i].
const size_t num_points
AffineElement const size_t Fq Fq * scratch_b
const uint32_t * indices
AffineElement * rhs
AffineElement * accumulator
AffineElement const size_t Fq * scratch_a
const std::pair< uint32_t, uint32_t > * pairs
uintx< uint256_t > uint512_t
Definition uintx.hpp:309
RNG & get_randomness()
Definition engine.cpp:258
std::conditional_t< IsGoblinBigGroup< C, Fq, Fr, G >, element_goblin::goblin_element< C, goblin_field< C >, Fr, G >, element_default::element< C, Fq, Fr, G > > element
element wraps either element_default::element or element_goblin::goblin_element depending on parametr...
constexpr size_t FF_COPY_COST
Definition thread.hpp:144
constexpr size_t FF_ADDITION_COST
Definition thread.hpp:132
constexpr size_t FF_MULTIPLICATION_COST
Definition thread.hpp:134
Univariate< Fr, domain_end > operator*(const Fr &ff, const Univariate< Fr, domain_end > &uv)
void parallel_for_heuristic(size_t num_points, const std::function< void(size_t, size_t, size_t)> &func, size_t heuristic_cost)
Split a loop into several loops running in parallel based on operations in 1 iteration.
Definition thread.cpp:171
void parallel_for_range(size_t num_points, const std::function< void(size_t, size_t)> &func, size_t no_multhreading_if_less_or_equal)
Split a loop into several loops running in parallel.
Definition thread.cpp:141
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
Definition tuple.hpp:13
grumpkin::fq Fq
static constexpr field cube_root_of_unity()
static constexpr field one()
static constexpr uint256_t modulus
static void split_into_endomorphism_scalars(const field &k, field &k1, field &k2)
Full-width endomorphism decomposition: k ≡ k1 - k2·λ (mod r). Modifies the field elements k1 and k2.
BB_INLINE constexpr void self_sqr() &noexcept
constexpr field invert() const noexcept
BB_INLINE constexpr bool is_msb_set() const noexcept
static field random_element(numeric::RNG *engine=nullptr) noexcept
BB_INLINE constexpr field sqr() const noexcept
BB_INLINE constexpr bool is_zero() const noexcept
BB_INLINE constexpr field from_montgomery_form() const noexcept
static constexpr field zero()
constexpr field invert_const_time() const noexcept
void throw_or_abort(std::string const &err)