From 96668b20e37bc4eb13eba8733dce27d39964a5b9 Mon Sep 17 00:00:00 2001
From: Thor Preimesberger
Date: Mon, 22 Sep 2025 17:01:59 -0400
Subject: [PATCH 1/5] Shorten x86_64 random slab path with pext asm

Only applies on platforms with BMI2, i.e. haswell+
---
 h_malloc.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/h_malloc.c b/h_malloc.c
index 6221d0b..c4855cd 100644
--- a/h_malloc.c
+++ b/h_malloc.c
@@ -405,8 +405,22 @@ static size_t get_free_slot(struct random_state *rng, size_t slots, const struct
         // randomize start location for linear search (uniform random choice is too slow)
         size_t random_index = get_random_u16_uniform(rng, slots);
         size_t first_bitmap = random_index / U64_WIDTH;
-        u64 random_split = ~(~0UL << (random_index - first_bitmap * U64_WIDTH));
+#if __x86_64__ && __BMI2__
+        u64 tmp;
+        __asm__ (
+
+            // set up mask
+            "mov $0xfffffffffffffff8, %%rdx\n\t"
+            // tmp is now same as shift amount mod 256 in portable case
+            "pext %[tmp], %[random_index], %%rdx\n\t"
+
+            : [tmp] "=r" (tmp)
+            : [random_index] "r" (random_index));
+        // gcc/clang is smart enough to generate code with no spills here
+        u64 random_split = ~(~0UL << (tmp));
+#else
+        u64 random_split = ~(~0UL << (random_index - first_bitmap * U64_WIDTH));
+#endif
 
         size_t i = first_bitmap;
         u64 masked = metadata->bitmap[i];
         masked |= random_split;

From 42a635282ae50900b1d12fdc669ceffa27be939b Mon Sep 17 00:00:00 2001
From: Thor Preimesberger
Date: Tue, 23 Sep 2025 17:13:15 -0400
Subject: [PATCH 2/5] Fix breaking tests, let compiler do register alloc

---
 h_malloc.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/h_malloc.c b/h_malloc.c
index c4855cd..5dfa800 100644
--- a/h_malloc.c
+++ b/h_malloc.c
@@ -410,9 +410,9 @@ static size_t get_free_slot(struct random_state *rng, size_t slots, const struct
         __asm__ (
 
             // set up mask
-            "mov $0xfffffffffffffff8, %%rdx\n\t"
+            "mov $0xfffffffffffffff8, %1\n\t"
             // tmp is now same as shift amount mod 256 in portable case
-            "pext %[tmp], %[random_index], %%rdx\n\t"
+            "pext %[tmp], %[random_index], %1\n\t"
 
             : [tmp] "=r" (tmp)
             : [random_index] "r" (random_index));

From 592a25aaeb5bcdeb47bcea539cfe30ac90222ca6 Mon Sep 17 00:00:00 2001
From: Thor Preimesberger
Date: Tue, 23 Sep 2025 17:55:57 -0400
Subject: [PATCH 3/5] Use intrinsics instead

---
 h_malloc.c | 19 ++++++-------------
 1 file changed, 6 insertions(+), 13 deletions(-)

diff --git a/h_malloc.c b/h_malloc.c
index 5dfa800..ad002a4 100644
--- a/h_malloc.c
+++ b/h_malloc.c
@@ -20,6 +20,10 @@
 #include "random.h"
 #include "util.h"
 
+#if __x86_64__
+#include "immintrin.h"
+#endif
+
 #ifdef USE_PKEY
 #include <sys/mman.h>
 #endif
@@ -405,19 +409,8 @@ static size_t get_free_slot(struct random_state *rng, size_t slots, const struct
         // randomize start location for linear search (uniform random choice is too slow)
         size_t random_index = get_random_u16_uniform(rng, slots);
         size_t first_bitmap = random_index / U64_WIDTH;
-#if __x86_64__ && __BMI2__
-        u64 tmp;
-        __asm__ (
-
-            // set up mask
-            "mov $0xfffffffffffffff8, %1\n\t"
-            // tmp is now same as shift amount mod 256 in portable case
-            "pext %[tmp], %[random_index], %1\n\t"
-
-            : [tmp] "=r" (tmp)
-            : [random_index] "r" (random_index));
-        // gcc/clang is smart enough to generate code with no spills here
-        u64 random_split = ~(~0UL << (tmp));
+#if __x86_64__ && (__BMI2__)
+        u64 random_split = ~(~0UL << _pext_u64(random_index, 8));
 #else
         u64 random_split = ~(~0UL << (random_index - first_bitmap * U64_WIDTH));
 #endif

From 10e27e21420aaf034b033a3e99593420806656f9 Mon Sep 17 00:00:00 2001
From: Thor Preimesberger
Date: Tue, 23 Sep 2025 18:20:54 -0400
Subject: [PATCH 4/5] Integrate into build system on other architectures, flags for gcc and clang respectively

---
 h_malloc.c | 9 ++++++++-
 1 file changed, 8 insertions(+), 1 deletion(-)

diff --git a/h_malloc.c b/h_malloc.c
index ad002a4..b83007f 100644
--- a/h_malloc.c
+++ b/h_malloc.c
@@ -20,9 +20,12 @@
 #include "random.h"
 #include "util.h"
 
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wundef"
 #if __x86_64__
 #include "immintrin.h"
 #endif
+#pragma GCC diagnostic pop
 
 #ifdef USE_PKEY
 #include <sys/mman.h>
@@ -409,7 +412,11 @@ static size_t get_free_slot(struct random_state *rng, size_t slots, const struct
         // randomize start location for linear search (uniform random choice is too slow)
         size_t random_index = get_random_u16_uniform(rng, slots);
         size_t first_bitmap = random_index / U64_WIDTH;
-#if __x86_64__ && (__BMI2__)
+#pragma GCC diagnostic push
+#pragma GCC diagnostic ignored "-Wundef"
+// __BMI2__ is idiomatic to gcc unfortunately.
+#if __x86_64__ && (__BMI2__ || (__clang__ && __BMI2INTRIN_H_))
+#pragma GCC diagnostic pop
         u64 random_split = ~(~0UL << _pext_u64(random_index, 8));
 #else
         u64 random_split = ~(~0UL << (random_index - first_bitmap * U64_WIDTH));

From 5deea96fe193f1495366195ea08049af071b1eb4 Mon Sep 17 00:00:00 2001
From: Thor Preimesberger
Date: Tue, 23 Sep 2025 19:04:08 -0400
Subject: [PATCH 5/5] Clean up feature test macros to better communicate intent.

Semantics unchanged.
---
 h_malloc.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/h_malloc.c b/h_malloc.c
index b83007f..4f6ed19 100644
--- a/h_malloc.c
+++ b/h_malloc.c
@@ -414,8 +414,7 @@ static size_t get_free_slot(struct random_state *rng, size_t slots, const struct
         size_t first_bitmap = random_index / U64_WIDTH;
 #pragma GCC diagnostic push
 #pragma GCC diagnostic ignored "-Wundef"
-// __BMI2__ is idiomatic to gcc unfortunately.
-#if __x86_64__ && (__BMI2__ || (__clang__ && __BMI2INTRIN_H_))
+#if __x86_64__ && ((__GNU__ && __BMI2__ ) || (__clang__ && __BMI2INTRIN_H_))
 #pragma GCC diagnostic pop
         u64 random_split = ~(~0UL << _pext_u64(random_index, 8));
 #else
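
Note, not part of the patch series: the portable branch computes the shift amount as random_index - first_bitmap * U64_WIDTH, which reduces to random_index % U64_WIDTH, and pext can reproduce that remainder by extracting the low six bits of random_index. The standalone sketch below checks that equivalence only; the 0x3f mask is an assumption made for this illustration (it is not the mask used in the patches), the file name pext_check.c is hypothetical, and the BMI2 branch is only exercised when built with e.g. gcc -O2 -mbmi2 pext_check.c (or -march=haswell or newer).

    #include <stdint.h>
    #include <stdio.h>
    #if defined(__x86_64__) && defined(__BMI2__)
    #include <immintrin.h>
    #endif

    #define U64_WIDTH 64

    int main(void) {
        for (uint64_t random_index = 0; random_index < 4096; random_index++) {
            uint64_t first_bitmap = random_index / U64_WIDTH;
            // portable path: the shift amount is random_index % U64_WIDTH
            uint64_t portable = ~(~UINT64_C(0) << (random_index - first_bitmap * U64_WIDTH));
            (void)portable;
    #if defined(__x86_64__) && defined(__BMI2__)
            // pext with mask 0x3f packs bits 0..5 of random_index into the low
            // bits of the result, i.e. random_index % 64 (mask chosen only for
            // this illustration)
            uint64_t bmi2 = ~(~UINT64_C(0) << _pext_u64(random_index, 0x3f));
            if (portable != bmi2) {
                printf("mismatch at %llu\n", (unsigned long long)random_index);
                return 1;
            }
    #endif
        }
    #if defined(__x86_64__) && defined(__BMI2__)
        puts("portable and pext shift amounts agree");
    #else
        puts("built without BMI2; only the portable path was exercised");
    #endif
        return 0;
    }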