mfunc u32 idx(u32 bucket, u32 bit) const { return bucket * 64 + bit; } template func u64 bucket_runs(u64 runs, u32 step1, u32 step2, u32 step3, u32 step4, u32 step5, u32 step6) { if constexpr (N >= 1) runs &= runs >> step1; if constexpr (N >= 2) runs &= runs >> step2; if constexpr (N >= 3) runs &= runs >> step3; if constexpr (N >= 4) runs &= runs >> step4; if constexpr (N >= 5) runs &= runs >> step5; if constexpr (N >= 6) runs &= runs >> step6; return runs; } template mfunc u32 find_run_branch(u32 num_bits) const { auto nb = num_bits; let step1 = nb >> 1; let step2 = (nb -= step1) >> 1; let step3 = (nb -= step2) >> 1; let step4 = (nb -= step3) >> 1; let step5 = (nb -= step4) >> 1; let step6 = (nb -= step5) >> 1; for (u32 bk = 0; bk < num_buckets - 1; ++bk) { let left = BIT ? buckets[bk] : ~buckets[bk]; let filter = FILTER ? popcnt(left) >= num_bits : true; if (filter) { let runs = bucket_runs(left, step1, step2, step3, step4, step5, step6); if (runs) return idx(bk, tzcnt(runs)); } let right = BIT ? ~buckets[bk + 1] : buckets[bk + 1]; let left_bits = lzcnt(~left); let right_bits = tzcnt(right); if (left_bits + right_bits >= num_bits) return idx(bk, 64 - left_bits); } let bk = num_buckets - 1; let left = BIT ? buckets[bk] : ~buckets[bk]; let filter = FILTER ? popcnt(left) >= num_bits : true; if (filter) { let runs = bucket_runs(left, step1, step2, step3, step4, step5, step6); if (runs) return idx(bk, tzcnt(runs)); } return U32_MAX; } template mfunc u32 find_run(u32 num_bits) const { if (num_bits <= 2) return find_run_branch(num_bits); else if (num_bits <= 4) return find_run_branch(num_bits); else if (num_bits <= 8) return find_run_branch(num_bits); else if (num_bits <= 16) return find_run_branch(num_bits); else if (num_bits <= 32) return find_run_branch(num_bits); else if (num_bits <= 40) return find_run_branch(num_bits); else if (num_bits <= 64) return find_run_branch(num_bits); else if (num_bits <= 128) { for (u32 bk = 0; bk < num_buckets - 2; ++bk) { let left = BIT ? ~buckets[bk] : buckets[bk]; let middle = BIT ? ~buckets[bk + 1] : buckets[bk + 1]; let left_bits = lzcnt(left); u32 run = left_bits + tzcnt(middle); if (!middle) { let right = BIT ? ~buckets[bk + 2] : buckets[bk + 2]; run += tzcnt(right); } if (run >= num_bits) return idx(bk, 64 - left_bits); } let bk = num_buckets - 2; let left = BIT ? ~buckets[bk] : buckets[bk]; let right = BIT ? ~buckets[bk + 1] : buckets[bk + 1]; let left_bits = lzcnt(left); let run = left_bits + tzcnt(right); if (run >= num_bits) return idx(bk, 64 - left_bits); } else { let min_full = num_bits / 64 - 1; u32 num_full = 0; for (u32 bk = 0; bk < num_buckets; ++bk) { letx FULL = BIT ? U64_MAX : 0; if (buckets[bk] == FULL) ++num_full; else num_full = 0; if (num_full != min_full) continue; u32 run = min_full * 64; u32 left_bits = 0; let bk_l = i32(bk) - i32(min_full); if (bk_l >= 0) { let left = BIT ? ~buckets[bk_l] : buckets[bk_l]; left_bits = lzcnt(left); } run += left_bits; for (u32 bk_r = bk + 1; bk_r < num_buckets; ++bk_r) { let right = BIT ? ~buckets[bk_r] : buckets[bk_r]; if (!right) run += 64; else { run += tzcnt(right); break; } if (run >= num_bits) break; } if (run >= num_bits) return bk_l < 0 ? idx(bk_l + 1, 0) : idx(bk_l, 64 - left_bits); } } return U32_MAX; }