61 std::array<uint16_t, VAR_WINDOW_MAX_WINDOWS>
bit_base{};
79 size_t num_logical_threads)
noexcept
81 constexpr uint32_t MAX_C = 20;
85 static_cast<void>(num_bits);
86 const size_t target_load = (n_input > 4096) ? (num_logical_threads * 2 / 3) : (num_logical_threads / 3);
87 if (target_load == 0 || num_points <= target_load) {
90 const size_t ratio = num_points / target_load;
95 }
else if (best >= MAX_C) {
100 static_cast<void>(n_input);
101 static_cast<void>(num_logical_threads);
102 uint64_t best_cost =
static_cast<uint64_t
>(-1);
103 for (uint32_t window_bits = 2; window_bits < MAX_C; ++window_bits) {
104 const uint64_t rounds = (num_bits + 2 + window_bits - 1) / window_bits;
105 const uint64_t buckets = (uint64_t{ 1 } << (window_bits - 1)) + 1;
106 const uint64_t n = num_points;
107 constexpr uint64_t BUCKET_ACC_COST = 15;
108 const uint64_t cost = rounds * (n + (buckets * BUCKET_ACC_COST));
109 if (cost < best_cost) {
126 size_t bits_remaining = num_bits + 2;
127 size_t bit_offset = 0;
131 sched.bit_base[w] =
static_cast<uint16_t
>(bit_offset);
132 sched.window_bits_per_window[w] =
static_cast<uint8_t
>(window_bits_w);
133 sched.num_buckets[w] =
static_cast<uint16_t
>((
size_t{ 1 } << (window_bits_w - 1)) + 1);
134 bit_offset += window_bits_w;
135 bits_remaining -= window_bits_w;
138 sched.num_windows = w;
191 size_t global_max_overflow_per_window,
193 size_t phase_a_cluster_members_cap,
194 size_t phase_a_cluster_offsets_cap,
195 size_t windows_per_batch,
196 size_t dense_stride_est)
noexcept
198 auto align_up = [](
size_t off,
size_t align) ->
size_t {
return (off + align - 1) & ~(align - 1); };
199 auto layout_add = [&](
size_t& off,
size_t bytes,
size_t align) { off = align_up(off, align) + bytes; };
204 layout_add(
ts_fixed_layout,
sizeof(uint32_t) * chunk_capacity,
alignof(uint32_t));
208 layout_add(
ts_fixed_layout,
sizeof(uint32_t) * global_max_overflow_per_window,
alignof(uint32_t));
214 layout_add(
pa_layout,
sizeof(uint32_t) * phase_a_cluster_members_cap,
alignof(uint32_t));
215 layout_add(
pa_layout,
sizeof(uint32_t) * phase_a_cluster_offsets_cap,
alignof(uint32_t));
230 if (windows_per_batch != 0) {
231 const size_t dense_total = windows_per_batch * dense_stride_est;
232 const size_t dense_pair_max = dense_total / 2;
252 const size_t per_thread = (B_eff > 1) ? ((B_eff - 1 + num_threads - 1) / num_threads) :
size_t{ 1 };
260 return (B_eff > 0) ? (B_eff - 1 + num_threads - 1) :
size_t{ 0 };
266 size_t subchunk_entries_cap)
noexcept
268 const size_t global_max_chunk_len = (n + num_threads - 1) / num_threads;
269 return (global_max_chunk_len + subchunk_entries_cap - 1) / subchunk_entries_cap;
282template <
typename Curve>
284 size_t num_threads,
size_t B_eff,
size_t n,
size_t dense_stride,
size_t worker_total)
noexcept
287 const size_t hist_h_bytes_pw =
size_t{ 4 } * num_threads * B_eff;
288 const size_t hist_o_bytes_pw = (
sizeof(
ChunkOutput<Curve>) * num_threads) + (
size_t{ 96 } * num_threads);
289 const size_t hist_slot_bytes_pw =
std::max(hist_h_bytes_pw, hist_o_bytes_pw);
290 const size_t dense_slot_bytes_pw =
size_t{ 65 } * bucket_partials_max;
291 return (
size_t{ 4 } * n) + hist_slot_bytes_pw + dense_slot_bytes_pw + (
size_t{ 8 } * (B_eff + 1)) +
292 (
size_t{ 8 } * (num_threads + 1)) + (
size_t{ 8 } * (num_threads + 1)) + (
size_t{ 8 } * num_threads) +
293 (
size_t{ 8 } * num_threads) + (
size_t{ 8 } * num_threads) + (
size_t{ 16 } * worker_total) +
294 (
size_t{ 8 } * num_threads) + (
size_t{ 87 } * worker_total * dense_stride);
301 bool inline_glv_double,
302 size_t profile_threads)
noexcept
305 + (use_glv ?
size_t{ 32 } * n :
size_t{ 0 })
306 + (inline_glv_double ?
size_t{ 64 } * n :
size_t{ 0 })
307 + (profile_threads *
size_t{ 1024 });
325[[nodiscard]]
inline size_t solve_wpb(
size_t per_window_bytes,
size_t available_budget,
size_t W_R)
noexcept
330 if (per_window_bytes == 0 || available_budget == 0) {
333 return std::min(
std::max<size_t>(1, available_budget / per_window_bytes), W_R);
typename Group::element Element
typename Group::affine_element AffineElement
constexpr T get_msb(const T in)
constexpr size_t BATCH_MEM_BUDGET
size_t compute_global_max_overflow_per_window(size_t n, size_t num_threads, size_t subchunk_entries_cap) noexcept
constexpr size_t BATCH_CAPACITY
constexpr size_t MIN_AFFINE_THREAD_RATIO
constexpr size_t DEDUP_MAX_CLUSTERS
constexpr size_t MIN_BATCH_CAPACITY
size_t compute_phase_one_prologue_bytes(size_t n, bool use_glv, bool inline_glv_double, size_t profile_threads) noexcept
size_t solve_wpb(size_t per_window_bytes, size_t available_budget, size_t W_R) noexcept
constexpr size_t DEDUP_MAX_CHUNK_MEMBERS
constexpr size_t SUBCHUNK_ENTRIES_CAP
constexpr size_t DEDUP_MAX_MEMBERS
size_t compute_per_window_bytes(size_t num_threads, size_t B_eff, size_t n, size_t dense_stride, size_t worker_total) noexcept
constexpr size_t VAR_WINDOW_MAX_WINDOWS
size_t compute_bucket_partials_max(size_t B_eff, size_t num_threads) noexcept
PhaseACaps compute_phase_a_caps(size_t n, size_t num_threads) noexcept
size_t compute_dense_stride(size_t B_eff, size_t num_threads) noexcept
uint32_t choose_window_bits(size_t num_points, size_t num_bits, size_t n_input, size_t num_logical_threads) noexcept
VariableWindowSchedule build_var_window_schedule(size_t num_bits, size_t window_bits) noexcept
constexpr decltype(auto) get(::tuplet::tuple< T... > &&t) noexcept
static constexpr size_t WORKER_SLAB_ALIGN
typename Curve::AffineElement AffineElement
size_t per_worker_union_bytes
static constexpr size_t PHASE_A_STAGED_CAP
size_t per_worker_per_wpb_layout
static constexpr size_t PHASE_A_BUCKET_REP_CAP
static constexpr size_t PHASE_A_DIRTY_SLOTS_CAP
static constexpr size_t PHASE_A_CHUNK_CAP
typename Curve::BaseField BaseField
PerWorkerArenaLayout(size_t chunk_capacity, size_t global_max_overflow_per_window, bool dedup_active, size_t phase_a_cluster_members_cap, size_t phase_a_cluster_offsets_cap, size_t windows_per_batch, size_t dense_stride_est) noexcept
std::array< uint16_t, VAR_WINDOW_MAX_WINDOWS > bit_base
std::array< uint8_t, VAR_WINDOW_MAX_WINDOWS > window_bits_per_window
std::array< uint16_t, VAR_WINDOW_MAX_WINDOWS > num_buckets