233 const size_t* w0_bucket_start,
238 std::span<typename Curve::AffineElement> extra_points,
239 std::span<uint32_t> redirect_lookup,
240 const uint8_t* msb_per_scalar,
248 constexpr uint32_t HT_EMPTY = ~uint32_t{ 0 };
262 constexpr size_t HT_SIZE = 4096;
263 constexpr size_t HT_MASK = HT_SIZE - 1;
264 static_assert((HT_SIZE & (HT_SIZE - 1)) == 0,
"HT_SIZE must be a power of 2");
276 constexpr uint32_t HT_CLUSTER_BIT = uint32_t{ 1 } << 31;
282 uint32_t*
const cluster_members_data = scratch.cluster_members.
data();
283 const size_t cluster_members_cap = scratch.cluster_members.size();
284 size_t cluster_members_size = 0;
285 uint32_t*
const cluster_offsets_data = scratch.cluster_offsets.data();
286 const size_t cluster_offsets_cap = scratch.cluster_offsets.size();
287 size_t cluster_offsets_size = 0;
288 uint16_t*
const dirty_slots_data = scratch.dirty_slots.data();
289 const size_t dirty_slots_cap = scratch.dirty_slots.size();
290 size_t dirty_slots_size = 0;
295 cluster_offsets_data[cluster_offsets_size++] = 0;
305 uint32_t clusters_opened = 0;
311 uint32_t*
const bucket_rep_data = scratch.bucket_rep.data();
312 const size_t bucket_rep_cap = scratch.bucket_rep.size();
313 size_t bucket_rep_size = 0;
315 const size_t staged_cap = scratch.staged.size();
316 size_t staged_size = 0;
318 for (
size_t b = b_lo;
b < b_hi; ++
b) {
319 const size_t lo = w0_bucket_start[
b];
320 const size_t hi = w0_bucket_start[
b + 1];
326 for (
size_t k = 0; k < dirty_slots_size; ++k) {
327 ht[dirty_slots_data[k]] = HT_EMPTY;
329 dirty_slots_size = 0;
333 for (
size_t i = lo; i < hi; ++i) {
335 if (
static_cast<size_t>(msb_per_scalar[idx]) < c_threshold) {
338 const uint64_t* d = scalars[idx].data;
347 size_t probe_count = 0;
349 if (++probe_count > HT_SIZE) {
352 const uint32_t entry = ht[
slot];
353 if (entry == HT_EMPTY) {
355 ht_fingerprint[
slot] = fingerprint;
359 if (
BB_LIKELY(dirty_slots_size < dirty_slots_cap)) {
360 dirty_slots_data[dirty_slots_size++] =
static_cast<uint16_t
>(
slot);
364 if ((entry & HT_CLUSTER_BIT) != 0) {
365 const uint32_t bucket_cid = entry & ~HT_CLUSTER_BIT;
366 const uint32_t rep = bucket_rep_data[bucket_cid];
367 if (ht_fingerprint[
slot] == fingerprint &&
374 staged_data[staged_size++] = { bucket_cid, idx };
381 if (ht_fingerprint[
slot] == fingerprint &&
383 if (clusters_opened >= (cid_max - cid_lo)) {
387 if (
BB_UNLIKELY(bucket_rep_size >= bucket_rep_cap || staged_size >= staged_cap)) {
390 const uint32_t bucket_cid =
static_cast<uint32_t
>(bucket_rep_size);
391 bucket_rep_data[bucket_rep_size++] = entry;
392 staged_data[staged_size++] = { bucket_cid, idx };
393 ht[
slot] = HT_CLUSTER_BIT | bucket_cid;
401 if (bucket_rep_size == 0) {
408 staged_data + staged_size,
411 size_t staged_cursor = 0;
412 for (
size_t bc = 0; bc < bucket_rep_size; ++bc) {
419 size_t this_cluster_members = 1;
420 for (
size_t sc = staged_cursor; sc < staged_size && staged_data[sc].first == bc; ++sc) {
421 ++this_cluster_members;
423 if (cluster_members_size + this_cluster_members > cluster_members_cap) {
426 cluster_members_data[cluster_members_size++] = bucket_rep_data[bc];
427 while (staged_cursor < staged_size && staged_data[staged_cursor].first == bc) {
428 cluster_members_data[cluster_members_size++] = staged_data[staged_cursor].second;
434 cluster_offsets_data[cluster_offsets_size++] =
static_cast<uint32_t
>(cluster_members_size);
443 const size_t num_clusters = cluster_offsets_size - 1;
444 if (num_clusters == 0) {
451 BB_ASSERT_EQ(cluster_offsets_size, num_clusters + 1,
"cluster_offsets layout mismatch");
456 uint32_t*
const chunk_ids_data = scratch.chunk_ids.data();
457 const size_t chunk_cap = scratch.chunk_pts.size();
460 size_t chunk_size = 0;
468 size_t cid_cursor = 0;
469 size_t member_offset_in_cluster = 0;
470 AffineElement carry{};
471 bool has_carry =
false;
473 while (cid_cursor < num_clusters || has_carry) {
476 chunk_pts_data[chunk_size] = carry;
477 chunk_ids_data[chunk_size] =
static_cast<uint32_t
>(cid_cursor);
482 const size_t cluster_lo = cluster_offsets_data[cid_cursor] + member_offset_in_cluster;
483 const size_t cluster_hi = cluster_offsets_data[cid_cursor + 1];
484 const size_t available = cluster_hi - cluster_lo;
486 if (available <= room) {
487 for (
size_t k = 0; k < available; ++k) {
488 chunk_pts_data[chunk_size] = points[cluster_members_data[cluster_lo + k]];
489 chunk_ids_data[chunk_size] =
static_cast<uint32_t
>(cid_cursor);
493 member_offset_in_cluster = 0;
495 for (
size_t k = 0; k < room; ++k) {
496 chunk_pts_data[chunk_size] = points[cluster_members_data[cluster_lo + k]];
497 chunk_ids_data[chunk_size] =
static_cast<uint32_t
>(cid_cursor);
500 member_offset_in_cluster += room;
504 const size_t result_len = dedup_tree_reduce_in_place<Curve>(chunk_pts_data,
510 const bool last_is_partial = (cid_cursor < num_clusters) && (member_offset_in_cluster > 0);
512 for (
size_t k = 0; k < whole_count; ++k) {
513 const uint32_t local_cid = chunk_ids_data[k];
514 extra_points[cid_lo + local_cid] = chunk_pts_data[k];
516 if (last_is_partial) {
525 for (
size_t k = 0; k < num_clusters; ++k) {
526 const size_t mlo = cluster_offsets_data[k];
527 const size_t mhi = cluster_offsets_data[k + 1];
528 const uint32_t rep_idx = cluster_members_data[mlo];
529 const uint32_t global_cid = cid_lo +
static_cast<uint32_t
>(k);
531 for (
size_t m = mlo + 1; m < mhi; ++m) {
532 const uint32_t non_rep_idx = cluster_members_data[m];