167 i64 u = 1, v = 0, q = 0, r = 1;
168 for (
int i = 0; i <
BATCH; ++i) {
171 u64 nf = g_lo, ng = (g_lo - f_lo) >> 1;
172 i64 nu = q << 1, nv = r << 1, nq = q - u, nr = r - v;
181 g_lo = (g_lo + f_lo) >> 1;
195 return { u, v, q, r };
211 u64 p_inv_mod_2k)
noexcept
213 constexpr u64 MASK_BATCH = (1ULL <<
BATCH) - 1;
214 const i64 u_lo = m.
u & (
i64)LIMB_MASK, u_hi = m.u >> LIMB_BITS;
215 const i64 v_lo = m.v & (
i64)LIMB_MASK, v_hi = m.v >> LIMB_BITS;
216 const i64 q_lo = m.q & (
i64)LIMB_MASK, q_hi = m.q >> LIMB_BITS;
217 const i64 r_lo = m.r & (
i64)LIMB_MASK, r_hi = m.r >> LIMB_BITS;
220 i64 cf = 0, cg = 0, fp = 0, gp = 0;
221 for (
int i = 0; i < N; ++i) {
222 const i64 fi = f.l[i], gi = g.l[i];
223 const i64 nf = u_lo * fi + v_lo * gi + u_hi * fp + v_hi * gp + cf;
224 const i64 ng = q_lo * fi + r_lo * gi + q_hi * fp + r_hi * gp + cg;
225 cf = nf >> LIMB_BITS;
226 cg = ng >> LIMB_BITS;
228 f.l[i - 2] = nf & (
i64)LIMB_MASK;
229 g.l[i - 2] = ng & (
i64)LIMB_MASK;
234 const i64 nf9 = u_hi * fp + v_hi * gp + cf;
235 const i64 ng9 = q_hi * fp + r_hi * gp + cg;
236 f.l[N - 2] = nf9 & (
i64)LIMB_MASK;
237 g.l[N - 2] = ng9 & (
i64)LIMB_MASK;
238 f.l[N - 1] = nf9 >> LIMB_BITS;
239 g.l[N - 1] = ng9 >> LIMB_BITS;
245 const i64 d0 = d.l[0], e0 = e.l[0], d1 = d.l[1], e1 = e.l[1];
246 const i64 nd0 = u_lo * d0 + v_lo * e0;
247 const i64 ne0 = q_lo * d0 + r_lo * e0;
248 const i64 nd1 = u_lo * d1 + v_lo * e1 + u_hi * d0 + v_hi * e0;
249 const i64 ne1 = q_lo * d1 + r_lo * e1 + q_hi * d0 + r_hi * e0;
250 const u64 t_d = ((
u64)nd0 & LIMB_MASK) | (((
u64)(nd1 + (nd0 >> LIMB_BITS)) & LIMB_MASK) << LIMB_BITS);
251 const u64 t_e = ((
u64)ne0 & LIMB_MASK) | (((
u64)(ne1 + (ne0 >> LIMB_BITS)) & LIMB_MASK) << LIMB_BITS);
252 const u64 k_d = ((0ULL - t_d) * p_inv_mod_2k) & MASK_BATCH;
253 const u64 k_e = ((0ULL - t_e) * p_inv_mod_2k) & MASK_BATCH;
254 const i64 kd_lo = (
i64)(k_d & LIMB_MASK), kd_hi = (
i64)(k_d >> LIMB_BITS);
255 const i64 ke_lo = (
i64)(k_e & LIMB_MASK), ke_hi = (
i64)(k_e >> LIMB_BITS);
256 i64 cd = (nd1 + kd_lo * p.l[1] + kd_hi * p.l[0] + ((nd0 + kd_lo * p.l[0]) >> LIMB_BITS)) >> LIMB_BITS;
257 i64 ce = (ne1 + ke_lo * p.l[1] + ke_hi * p.l[0] + ((ne0 + ke_lo * p.l[0]) >> LIMB_BITS)) >> LIMB_BITS;
259 i64 dp = d1, ep = e1;
260 for (
int i = 2; i < N; ++i) {
261 const i64 di = d.l[i], ei = e.l[i];
262 const i64 nd = u_lo * di + v_lo * ei + u_hi * dp + v_hi * ep + kd_lo * p.l[i] + kd_hi * p.l[i - 1] + cd;
263 const i64 ne = q_lo * di + r_lo * ei + q_hi * dp + r_hi * ep + ke_lo * p.l[i] + ke_hi * p.l[i - 1] + ce;
264 cd = nd >> LIMB_BITS;
265 ce = ne >> LIMB_BITS;
266 d.l[i - 2] = nd & (
i64)LIMB_MASK;
267 e.l[i - 2] = ne & (
i64)LIMB_MASK;
271 const i64 nd9 = u_hi * dp + v_hi * ep + kd_hi * p.l[N - 1] + cd;
272 const i64 ne9 = q_hi * dp + r_hi * ep + ke_hi * p.l[N - 1] + ce;
273 d.l[N - 2] = nd9 & (
i64)LIMB_MASK;
274 e.l[N - 2] = ne9 & (
i64)LIMB_MASK;
275 d.l[N - 1] = nd9 >> LIMB_BITS;
276 e.l[N - 1] = ne9 >> LIMB_BITS;