Shorten x86_64 random slab path with pext asm

Only applies on platforms with BMI2, i.e. Haswell and newer.
Thor Preimesberger 2025-09-22 17:01:59 -04:00
parent ff99511eb4
commit 96668b20e3
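
For reference, pext (parallel bits extract, part of BMI2) copies the bits of its source selected by a mask into the contiguous low bits of its destination. With a mask of 0x3f it extracts the low six bits, i.e. random_index % 64, which is the same shift amount the portable expression random_index - first_bitmap * U64_WIDTH computes. A minimal standalone sketch of that equivalence using the _pext_u64 intrinsic from <immintrin.h> (illustrative only; the commit itself uses inline asm, and the file name and loop below are not part of it):

// Standalone check (illustrative, not part of the commit): pext with a 0x3f
// mask yields the same shift amount as the portable expression.
// Build with: cc -O2 -mbmi2 pext_check.c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

#define U64_WIDTH 64

int main(void) {
    for (uint32_t random_index = 0; random_index < (1u << 16); random_index++) {
        uint64_t first_bitmap = random_index / U64_WIDTH;
        uint64_t portable = random_index - first_bitmap * U64_WIDTH;
        uint64_t via_pext = _pext_u64(random_index, 0x3f); // extract low 6 bits
        if (portable != via_pext) {
            printf("mismatch at %u\n", random_index);
            return 1;
        }
    }
    puts("pext(index, 0x3f) matches the portable shift amount for all u16 values");
    return 0;
}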

@@ -405,8 +405,22 @@ static size_t get_free_slot(struct random_state *rng, size_t slots, const struct
     // randomize start location for linear search (uniform random choice is too slow)
     size_t random_index = get_random_u16_uniform(rng, slots);
     size_t first_bitmap = random_index / U64_WIDTH;
-    u64 random_split = ~(~0UL << (random_index - first_bitmap * U64_WIDTH));
+#if __x86_64__ && __BMI2__
+    u64 tmp;
+    __asm__ (
+        // set up the mask selecting the low 6 bits of the index
+        "mov $0x3f, %%rdx\n\t"
+        // pext gathers those bits, so tmp is the same shift amount as the portable case
+        "pext %%rdx, %[random_index], %[tmp]\n\t"
+        : [tmp] "=r" (tmp)
+        : [random_index] "r" (random_index)
+        : "rdx");
+    // gcc/clang is smart enough to generate code with no spills here
+    u64 random_split = ~(~0UL << (tmp));
+#else
+    u64 random_split = ~(~0UL << (random_index - first_bitmap * U64_WIDTH));
+#endif
     size_t i = first_bitmap;
     u64 masked = metadata->bitmap[i];
     masked |= random_split;
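
The context lines above show how random_split is consumed: its low random_index % 64 bits are OR'd into the first bitmap word, so slots before the random starting point look occupied to the search and the linear scan effectively begins at random_index. Below is a toy sketch of that masking step under assumptions not shown in the diff (the 1 = used convention, the lowest_clear_bit helper, and the two-word bitmap are all hypothetical):

// Toy sketch (not the allocator's code) of the randomized-start linear search
// that random_split enables: bits are 1 = slot used, 0 = slot free.
#include <stdint.h>
#include <stdio.h>

#define U64_WIDTH 64

// Hypothetical helper: index of the lowest clear bit, or -1 if the word is full.
static int lowest_clear_bit(uint64_t word) {
    return word == UINT64_MAX ? -1 : __builtin_ctzll(~word);
}

int main(void) {
    uint64_t bitmap[2] = { 0xffULL, 0 };  // slots 0-7 already allocated
    size_t random_index = 40;             // pretend this came from the rng
    size_t first_bitmap = random_index / U64_WIDTH;
    uint64_t random_split = ~(~0UL << (random_index - first_bitmap * U64_WIDTH));

    uint64_t masked = bitmap[first_bitmap];
    masked |= random_split;  // hide slots before the random start from this pass
    int bit = lowest_clear_bit(masked);
    if (bit >= 0) {
        printf("first free slot at or after %zu: %zu\n",
               random_index, first_bitmap * U64_WIDTH + (size_t)bit);
    }
    return 0;
}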