Barretenberg
The ZK-SNARK library at the core of Aztec
Loading...
Searching...
No Matches
bernstein_yang_inverse.hpp
Go to the documentation of this file.
1// Bernstein-Yang safegcd modular inverse.
2//
3// We want a⁻¹ mod p, where p is an odd prime. Run an extended binary GCD on
4// the pair (f, g) starting at (p, a), with Bezout coefficients (d, e)
5// satisfying invariants d·a ≡ f (mod p) and e·a ≡ g (mod p) at all times.
6// When g reaches 0, gcd(p, a) = ±f = ±1 (since p is prime, a ≠ 0), so
7// a⁻¹ = ±d.
8//
9// The cheap operation in a binary GCD is "if g is odd swap-and-subtract to
10// make it even, then divide g by 2"; one such step is a "divstep" and
11// shrinks |g| by ~1 bit. Doing each divstep on the full 256-bit state would
12// be slow, so we use Pornin's trick: do BATCH divsteps purely on the low 64
13// bits of (f, g), accumulating the resulting linear transform as a 2×2
14// integer matrix M, then apply M to the full-precision (f, g, d, e) in one
15// go (with an exact /2^BATCH at the end). The (d, e) side needs a 2-adic
16// correction k·p added before dividing so the result stays integer-valued
17// — k is determined by the low BATCH bits of M·(d, e) and p⁻¹ mod 2^BATCH.
18//
19// Native5x64 and Wasm9x29 wrap the platform-specific limb representation
20// behind the same static interface so `invert_vartime` below is a
21// single algorithm for both targets.
22
23#pragma once
24
26#include <cstdint>
27
29
31using u64 = uint64_t;
32using i64 = int64_t;
33
34// The transition matrix produced by BATCH divsteps. With the implicit
35// "/ 2^BATCH" at the end of apply_divstep_matrix, it represents the linear
36// map (f, g) ↦ (M·(f, g) / 2^BATCH). Each divstep doubles one matrix entry,
37// so after BATCH steps |u|, |v|, |q|, |r| ≤ 2^BATCH; they are signed (the
38// swap-and-subtract case introduces negatives).
40 i64 u, v, q, r;
41};
42
43// 5 × 64-bit limbs, top limb two's-complement signed. Products via __int128.
45 public:
46 // Number of divsteps folded into one matrix application. Bigger BATCH
47 // means fewer matrix applications per inversion but bigger matrix
48 // entries. Cap is set by the matrix-times-state product staying inside
49 // an __int128 accumulator: a single (i63 entry) × (u64 limb) is 127 bits,
50 // so BATCH ≤ 63; we use 62 to keep one bit of slack for the running sum.
51 static constexpr int BATCH = 62;
52
53 // Worst-case number of matrix applications needed before g must have
54 // reached 0. The 735-divstep bound for 254-bit inputs is from the BY
55 // paper's convergence proof (rate ≈ 1 / 1.7 bits of g consumed per
56 // divstep). We pick the smallest NUM_ITERATIONS so NUM_ITERATIONS *
57 // BATCH ≥ 735; ⌈735 / 62⌉ = 12. The actual loop usually exits much
58 // earlier via the early break on g == 0.
59 static constexpr int NUM_ITERATIONS = 12;
60
61 // The Bezout coefficients (d, e) live mod p but during the iteration we
62 // hold them as signed integers in the state. Each matrix application grows
63 // |d|, |e| by roughly a factor of 2 (matrix entry × value) plus an
64 // additive p (from the 2-adic correction k·p), so without bringing them
65 // back to [0, p) they would eventually overflow the state.
66 // reduce_to_canonical does that subtract-or-add-p reduction. Calling it
67 // every iter is wasteful — the 5×64-bit signed state has enough room to
68 // let |d|, |e| reach ~2^K · p before they no longer fit, so we only
69 // reduce every REDUCE_INTERVAL = 4 iters (|d|, |e| stay ≤ ~32p between
70 // reductions, plenty of headroom).
71 static constexpr int REDUCE_INTERVAL = 4;
72
73 // Worst-case iteration cap inside reduce_to_canonical. After
74 // REDUCE_INTERVAL iters between reductions, |d|, |e| ≤ (2^(REDUCE_INTERVAL+1) - 1)·p,
75 // so reducing requires that many subtractions plus one break iter.
76 static constexpr int REDUCE_TO_CANONICAL_MAX_ITERS = 36;
77 static_assert((1U << (REDUCE_INTERVAL + 1)) <= REDUCE_TO_CANONICAL_MAX_ITERS,
78 "REDUCE_INTERVAL too large for reduce_to_canonical iteration bound");
79
80 Native5x64() noexcept
81 : l{}
82 {}
83 explicit Native5x64(const uint256_t& x) noexcept
84 : l{ x.data[0], x.data[1], x.data[2], x.data[3], 0 }
85 {}
86 static Native5x64 one() noexcept
87 {
88 Native5x64 r;
89 r.l[0] = 1;
90 return r;
91 }
92
93 uint256_t to_uint256() const noexcept { return { l[0], l[1], l[2], l[3] }; }
94 u64 low_64() const noexcept { return l[0]; }
95 bool is_zero() const noexcept { return (l[0] | l[1] | l[2] | l[3] | l[4]) == 0; }
96 bool is_negative() const noexcept { return (i64)l[4] < 0; }
97 void neg() noexcept
98 {
99 u64 c = 1;
100 for (int i = 0; i < N; ++i) {
101 u64 v = (~l[i]) + c;
102 c = (c && v == 0) ? 1 : 0;
103 l[i] = v;
104 }
105 }
106 // Iter cap chosen by the REDUCE_TO_CANONICAL_MAX_ITERS / REDUCE_INTERVAL
107 // static_assert above; see those constants for the magnitude argument.
108 void reduce_to_canonical(const Native5x64& p) noexcept
109 {
110 for (int it = 0; it < REDUCE_TO_CANONICAL_MAX_ITERS; ++it) {
111 if (is_negative()) {
112 add_inplace(p);
113 } else if (ge(p)) {
114 sub_inplace(p);
115 } else {
116 break;
117 }
118 }
119 }
120
121 // BATCH branchy divsteps on the low 64 bits of (f, g); returns the
122 // transition matrix M and updates δ. Variable-time over the inner
123 // branches — non-secret inputs only.
124 static DivstepMatrix compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept;
125 // (f, g) ← M·(f, g) / 2^BATCH and (d, e) ← (M·(d, e) + k·p) / 2^BATCH,
126 // where k_i = -((M·(d, e))_i · p⁻¹) mod 2^BATCH (the 2-adic correction
127 // that makes (M·(d, e))_i + k_i·p divisible by 2^BATCH).
128 static void apply_divstep_matrix(const DivstepMatrix& m,
129 Native5x64& f,
130 Native5x64& g,
131 Native5x64& d,
132 Native5x64& e,
133 const Native5x64& p,
134 u64 p_inv_mod_2k) noexcept;
135 // r_inv = -p⁻¹ mod 2^64 (barretenberg's Montgomery constant), so p⁻¹ mod
136 // 2^BATCH is the low BATCH bits of -r_inv.
137 static constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept
138 {
139 // r_inv = -p^{-1} mod 2^64, so 0 - r_inv = p^{-1} mod 2^64.
140 return (0ULL - r_inv) & ((1ULL << BATCH) - 1);
141 }
142
143 private:
144 static constexpr int N = 5;
146
147 void add_inplace(const Native5x64& b) noexcept
148 {
149 u64 c = 0;
150 for (int i = 0; i < N; ++i) {
151 __uint128_t s = (__uint128_t)l[i] + b.l[i] + c;
152 l[i] = (u64)s;
153 c = (u64)(s >> 64);
154 }
155 }
156 void sub_inplace(const Native5x64& b) noexcept
157 {
158 u64 borrow = 0;
159 for (int i = 0; i < N; ++i) {
160 __uint128_t s = (__uint128_t)l[i] - (__uint128_t)b.l[i] - borrow;
161 l[i] = (u64)s;
162 borrow = ((i64)(s >> 64) < 0) ? 1 : 0;
163 }
164 }
165 bool ge(const Native5x64& b) const noexcept
166 {
167 i64 a_top = (i64)l[N - 1], b_top = (i64)b.l[N - 1];
168 if (a_top != b_top) {
169 return a_top > b_top;
170 }
171 for (int i = N - 2; i >= 0; --i) {
172 if (l[i] != b.l[i]) {
173 return l[i] > b.l[i];
174 }
175 }
176 return true;
177 }
178
179 friend struct NativeMatrix;
180};
181
182// 6-limb signed product helpers used by Native5x64::apply_divstep_matrix.
184 static void signed_linear_combination(i64 a, const Native5x64& x, i64 b, const Native5x64& y, u64 out[6]) noexcept
185 {
186 __int128 c = 0;
187 for (int i = 0; i < 4; ++i) {
188 c += (__int128)a * (__int128)(u64)x.l[i] + (__int128)b * (__int128)(u64)y.l[i];
189 out[i] = (u64)c;
190 c >>= 64;
191 }
192 c += (__int128)a * (__int128)(i64)x.l[4] + (__int128)b * (__int128)(i64)y.l[4];
193 out[4] = (u64)c;
194 out[5] = (u64)(c >> 64);
195 }
196 // Sign-preserving exact /2^BATCH on the 6-limb signed `t`. Sign of the
197 // result lives in bit 63 of r.l[4], which is bit 61 of t[5]. This is the
198 // sign of t iff t[5] is just sign-extension of the actual magnitude — i.e.,
199 // iff |t| < 2^319. BY guarantees this: |M·(x,y) + k·p| ≤ 2^324 in the
200 // (d,e) row and ≤ 2^319 in the (f,g) row, both with |result/2^62| < 2^263 < 2^319.
201 // A future change widening the matrix entries or state without re-running
202 // this analysis will silently corrupt the sign bit.
203 static Native5x64 arithmetic_shift_by_batch(const u64 t[6]) noexcept
204 {
205 Native5x64 r;
206 for (int i = 0; i < 4; ++i) {
207 r.l[i] = (t[i] >> 62) | (t[i + 1] << 2);
208 }
209 r.l[4] = (t[4] >> 62) | (t[5] << 2);
210 return r;
211 }
212};
213
214// Each inner step shrinks |g| by ~1 bit using a binary-GCD-style move.
215// Three cases, depending on g's parity and the "δ" tracker (which decides
216// whether |f| or |g| is currently smaller):
217// g even : g ← g/2.
218// g odd, δ ≤ 0 : g ← (g + f)/2 (adding f to make g+f even before /2).
219// g odd, δ > 0 : swap roles — (f, g) ← (g, (g - f)/2).
220// The matrix (u, v, q, r) tracks the same linear transform applied
221// symbolically; doubling u, v (or q, r) corresponds to the implicit /2 each
222// inner step performs. After BATCH steps the low BATCH bits of the
223// transformed state are guaranteed zero, so apply_divstep_matrix's implicit
224// "/ 2^BATCH" is an exact integer division.
225inline DivstepMatrix Native5x64::compute_divstep_matrix(i64& delta, u64 f_lo, u64 g_lo) noexcept
226{
227 i64 u = 1, v = 0, q = 0, r = 1;
228 for (int i = 0; i < BATCH; ++i) {
229 if (g_lo & 1) {
230 if (delta > 0) {
231 u64 nf = g_lo, ng = (g_lo - f_lo) >> 1;
232 i64 nu = q << 1, nv = r << 1, nq = q - u, nr = r - v;
233 f_lo = nf;
234 g_lo = ng;
235 u = nu;
236 v = nv;
237 q = nq;
238 r = nr;
239 delta = 1 - delta;
240 } else {
241 g_lo = (g_lo + f_lo) >> 1;
242 q = q + u;
243 r = r + v;
244 u <<= 1;
245 v <<= 1;
246 delta = delta + 1;
247 }
248 } else {
249 g_lo >>= 1;
250 u <<= 1;
251 v <<= 1;
252 delta = delta + 1;
253 }
254 }
255 return { u, v, q, r };
256}
257
259 Native5x64& f,
260 Native5x64& g,
261 Native5x64& d,
262 Native5x64& e,
263 const Native5x64& p,
264 u64 p_inv_mod_2k) noexcept
265{
266 constexpr u64 MASK_BATCH = (1ULL << BATCH) - 1;
267
268 u64 nf[6], ng[6];
269 NativeMatrix::signed_linear_combination(m.u, f, m.v, g, nf);
270 NativeMatrix::signed_linear_combination(m.q, f, m.r, g, ng);
273
274 // k = -t · p_inv_mod_2k mod 2^BATCH makes t + k·p divisible by 2^BATCH.
275 auto apply_corrected_row = [&](i64 a, const Native5x64& da, i64 b, const Native5x64& eb, Native5x64& out) {
276 u64 t[6];
278 u64 k = ((0ULL - t[0]) * p_inv_mod_2k) & MASK_BATCH;
279 u64 kp[6] = {};
280 u64 carry = 0;
281 for (int i = 0; i < 5; ++i) {
282 __uint128_t prod = (__uint128_t)k * (u64)p.l[i] + carry;
283 kp[i] = (u64)prod;
284 carry = (u64)(prod >> 64);
285 }
286 kp[5] = carry;
287 u64 c = 0;
288 for (int i = 0; i < 6; ++i) {
289 __uint128_t s = (__uint128_t)t[i] + kp[i] + c;
290 t[i] = (u64)s;
291 c = (u64)(s >> 64);
292 }
294 };
295 Native5x64 nd, ne;
296 apply_corrected_row(m.u, d, m.v, e, nd);
297 apply_corrected_row(m.q, d, m.r, e, ne);
298 d = nd;
299 e = ne;
300}
301
302} // namespace bb::bernstein_yang
303
305
306namespace bb::bernstein_yang {
307
308#if defined(__wasm__)
309using State = Wasm9x29;
310#else
312#endif
313
324template <class S = State>
325inline uint256_t invert_vartime(const uint256_t& a, const uint256_t& p, u64 p_inv_mod_2k) noexcept
326{
327 if (a == uint256_t(0)) {
328 return uint256_t(0);
329 }
330 S P(p), f = P, g(a), d, e = S::one();
331 // δ is Pornin's auxiliary used by the divstep rule to decide swap-vs-add cases.
332 i64 delta = 1;
333 for (int i = 0; i < S::NUM_ITERATIONS; ++i) {
334 DivstepMatrix m = S::compute_divstep_matrix(delta, f.low_64(), g.low_64());
335 S::apply_divstep_matrix(m, f, g, d, e, P, p_inv_mod_2k);
336 if (g.is_zero()) {
337 break;
338 }
339 if ((i + 1) % S::REDUCE_INTERVAL == 0) {
340 d.reduce_to_canonical(P);
341 e.reduce_to_canonical(P);
342 }
343 }
344 d.reduce_to_canonical(P);
345 if (f.is_negative()) {
346 d.neg();
347 d.reduce_to_canonical(P);
348 }
349 return d.to_uint256();
350}
351
352inline constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept
353{
355}
356
357// True iff `invert_vartime` is usable for field params T: the active
358// kernel must be compilable on this toolchain (Native5x64 needs __int128, the
359// WASM kernel is unconditional) and T's modulus must fit BY's < 2^255
360// precondition. Used to gate the dispatch in `field::invert()`.
361template <class T>
362inline constexpr bool supported_v =
363#if defined(__SIZEOF_INT128__) || defined(__wasm__)
364 T::modulus_3 < (1ULL << 63);
365#else
366 false;
367#endif
368
369} // namespace bb::bernstein_yang
static DivstepMatrix compute_divstep_matrix(i64 &delta, u64 f_lo, u64 g_lo) noexcept
void sub_inplace(const Native5x64 &b) noexcept
static constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept
Native5x64(const uint256_t &x) noexcept
void reduce_to_canonical(const Native5x64 &p) noexcept
static Native5x64 one() noexcept
void add_inplace(const Native5x64 &b) noexcept
bool ge(const Native5x64 &b) const noexcept
uint256_t to_uint256() const noexcept
static void apply_divstep_matrix(const DivstepMatrix &m, Native5x64 &f, Native5x64 &g, Native5x64 &d, Native5x64 &e, const Native5x64 &p, u64 p_inv_mod_2k) noexcept
static constexpr int REDUCE_TO_CANONICAL_MAX_ITERS
FF a
FF b
constexpr u64 p_inv_mod_2k_from_montgomery_r_inv(u64 r_inv) noexcept
uint256_t invert_vartime(const uint256_t &a, const uint256_t &p, u64 p_inv_mod_2k) noexcept
Variable-time safegcd inverse (Bernstein-Yang TCHES 2019, Pornin 2020 §4).
static Native5x64 arithmetic_shift_by_batch(const u64 t[6]) noexcept
static void signed_linear_combination(i64 a, const Native5x64 &x, i64 b, const Native5x64 &y, u64 out[6]) noexcept