implement the option of large size classes
This extends the size class scheme used for slab allocations to large allocations. This drastically improves performance for many real-world programs that grow allocations via incremental realloc calls instead of using proper growth factors. There are 4 size classes for every doubling in size, resulting in a worst case of ~20% extra virtual memory being reserved, and a huge performance improvement for pathological cases. For example, growing from 4 MiB to 8 MiB by calling realloc in 32 byte increments only needs to do work beyond looking up the size class 4 times, rather than 1024 times with 4096 byte granularity.
parent 7a7126e780
commit e0891c8cfc
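As a standalone sketch (not part of this commit), the program below mirrors the rounding performed by the new get_large_size_class function in h_malloc.c to show which classes the 4 MiB to 8 MiB example passes through. The helper name and the sample sizes are purely illustrative, and like the real code it assumes a 64-bit unsigned long.

```c
#include <stdio.h>

// Round a large size up to its size class: 4 classes per doubling, i.e. the
// spacing between classes is 1/8 of the smallest power of two not below size.
static size_t large_size_class(size_t size) {
    size_t spacing_shift = 64 - __builtin_clzl(size - 1) - 3;
    size_t spacing_class = (size_t)1 << spacing_shift;
    return (size + (spacing_class - 1)) & ~(spacing_class - 1);
}

int main(void) {
    // Between 4 MiB and 8 MiB the spacing is 1 MiB, so the classes are 5 MiB,
    // 6 MiB, 7 MiB and 8 MiB: growing in 32 byte steps crosses a class
    // boundary only 4 times, versus 1024 size changes with 4 KiB granularity.
    const size_t mib = 1024 * 1024;
    size_t samples[] = {4 * mib + 32, 5 * mib + 1, 6 * mib, 8 * mib - 32};
    for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++) {
        printf("%zu -> %zu\n", samples[i], large_size_class(samples[i]));
    }
    return 0;
}
```

The ~20% worst case follows from the same arithmetic: a request just past a class boundary, such as 4 MiB + 1 byte, is rounded up to 5 MiB, so slightly under 1 MiB of the resulting 5 MiB mapping (20%) is extra reserved virtual memory.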
@@ -17,6 +17,7 @@ common_cflags = [
     "-DSLAB_CANARY=true",
     "-DSLAB_QUARANTINE_RANDOM_LENGTH=1",
     "-DSLAB_QUARANTINE_QUEUE_LENGTH=1",
+    "-DCONFIG_LARGE_SIZE_CLASSES=true",
     "-DGUARD_SLABS_INTERVAL=1",
     "-DGUARD_SIZE_DIVISOR=2",
     "-DREGION_QUARANTINE_RANDOM_LENGTH=128",
Makefile
@@ -8,6 +8,7 @@ CONFIG_SLOT_RANDOMIZE := true
 CONFIG_SLAB_CANARY := true
 CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH := 1
 CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH := 1
+CONFIG_LARGE_SIZE_CLASSES := true
 CONFIG_GUARD_SLABS_INTERVAL := 1
 CONFIG_GUARD_SIZE_DIVISOR := 2
 CONFIG_REGION_QUARANTINE_RANDOM_LENGTH := 128
@@ -68,6 +69,10 @@ ifeq (,$(filter $(CONFIG_SLAB_CANARY),true false))
     $(error CONFIG_SLAB_CANARY must be true or false)
 endif
 
+ifeq (,$(filter $(CONFIG_LARGE_SIZE_CLASSES),true false))
+    $(error CONFIG_LARGE_SIZE_CLASSES must be true or false)
+endif
+
 CPPFLAGS += \
     -DZERO_ON_FREE=$(CONFIG_ZERO_ON_FREE) \
     -DWRITE_AFTER_FREE_CHECK=$(CONFIG_WRITE_AFTER_FREE_CHECK) \
@@ -75,6 +80,7 @@ CPPFLAGS += \
     -DSLAB_CANARY=$(CONFIG_SLAB_CANARY) \
     -DSLAB_QUARANTINE_RANDOM_LENGTH=$(CONFIG_SLAB_QUARANTINE_RANDOM_LENGTH) \
     -DSLAB_QUARANTINE_QUEUE_LENGTH=$(CONFIG_SLAB_QUARANTINE_QUEUE_LENGTH) \
+    -DCONFIG_LARGE_SIZE_CLASSES=$(CONFIG_LARGE_SIZE_CLASSES) \
     -DGUARD_SLABS_INTERVAL=$(CONFIG_GUARD_SLABS_INTERVAL) \
     -DGUARD_SIZE_DIVISOR=$(CONFIG_GUARD_SIZE_DIVISOR) \
     -DREGION_QUARANTINE_RANDOM_LENGTH=$(CONFIG_REGION_QUARANTINE_RANDOM_LENGTH) \
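Since CONFIG_LARGE_SIZE_CLASSES is declared and validated like the other boolean options above, it should be possible to override it per build in the usual way, e.g. `make CONFIG_LARGE_SIZE_CLASSES=false` (an assumed invocation based on the Makefile pattern, not something stated in the commit), which reaches the code as the -DCONFIG_LARGE_SIZE_CLASSES define and falls back to page size granularity for large allocations.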
README.md
@@ -167,6 +167,9 @@ for the chosen values are not written yet, so use them at your own peril:
 * `CONFIG_STATS`: `false` (default) to control whether stats on allocation /
   deallocation count and active allocations are tracked. This is currently only
   exposed via the mallinfo APIs on Android.
+* `CONFIG_LARGE_SIZE_CLASSES`: `true` (default) to control whether large
+  allocations use the slab allocation size class scheme instead of page size
+  granularity (see the section on size classes below)
 
 There will be more control over enabled features in the future along with
 control over fairly arbitrarily chosen values like the size of empty slab
@@ -400,6 +403,14 @@ size for 2048 byte spacing and the next spacing class matches the page size of
 classes required to avoid substantial waste from rounding. Further slab
 allocation size classes may be offered as an option in the future.
 
+The `CONFIG_LARGE_SIZE_CLASSES` option controls whether large allocations use
+the same size class scheme, providing 4 size classes for every doubling of size.
+It increases virtual memory consumption but drastically improves performance
+where realloc is used without proper growth factors, which is fairly common and
+destroys performance in some commonly used programs. If large size classes are
+disabled, the granularity is instead the page size, which is currently always
+4096 bytes on supported platforms.
+
 ## Scalability
 
 ### Small (slab) allocations
h_malloc.c
@@ -1123,11 +1123,37 @@ COLD __attribute__((constructor(101))) static void trigger_early_init(void) {
     h_free(h_malloc(16));
 }
 
+// Returns 0 on overflow.
+static size_t get_large_size_class(size_t size) {
+    if (CONFIG_LARGE_SIZE_CLASSES) {
+        // Continue small size class growth pattern of power of 2 spacing classes:
+        //
+        // 4 KiB [20 KiB, 24 KiB, 28 KiB, 32 KiB]
+        // 8 KiB [40 KiB, 48 KiB, 56 KiB, 64 KiB]
+        // 16 KiB [80 KiB, 96 KiB, 112 KiB, 128 KiB]
+        // 32 KiB [160 KiB, 192 KiB, 224 KiB, 256 KiB]
+        // 512 KiB [2560 KiB, 3 MiB, 3584 KiB, 4 MiB]
+        // 1 MiB [5 MiB, 6 MiB, 7 MiB, 8 MiB]
+        // etc.
+        size_t spacing_shift = 64 - __builtin_clzl(size - 1) - 3;
+        size_t spacing_class = 1ULL << spacing_shift;
+        return (size + (spacing_class - 1)) & ~(spacing_class - 1);
+    } else {
+        return PAGE_CEILING(size);
+    }
+}
+
 static size_t get_guard_size(struct random_state *state, size_t size) {
     return (get_random_u64_uniform(state, size / PAGE_SIZE / GUARD_SIZE_DIVISOR) + 1) * PAGE_SIZE;
 }
 
 static void *allocate_large(size_t size) {
+    size = get_large_size_class(size);
+    if (unlikely(!size)) {
+        errno = ENOMEM;
+        return NULL;
+    }
+
     struct region_allocator *ra = ro.region_allocator;
 
     mutex_lock(&ra->lock);
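The `// Returns 0 on overflow.` comment is what the `unlikely(!size)` checks in allocate_large (above) and alloc_aligned (below) rely on: when rounding up would exceed SIZE_MAX, the addition wraps and the mask clears the remaining low bits. A minimal standalone sketch of that edge case, reusing the same expression (the helper name is illustrative and a 64-bit unsigned long is assumed):

```c
#include <stdint.h>
#include <stdio.h>

// Same rounding expression as get_large_size_class with large size classes
// enabled; only the overflow behaviour is of interest here.
static size_t round_to_large_class(size_t size) {
    size_t spacing_shift = 64 - __builtin_clzl(size - 1) - 3;
    size_t spacing_class = (size_t)1 << spacing_shift;
    // If size is too large to round up within SIZE_MAX, the addition wraps
    // around and the mask yields 0, which the callers turn into ENOMEM.
    return (size + (spacing_class - 1)) & ~(spacing_class - 1);
}

int main(void) {
    printf("%zu\n", round_to_large_class((size_t)1 << 20)); // 1048576: already a class
    printf("%zu\n", round_to_large_class(SIZE_MAX - 1));    // 0: overflow, i.e. ENOMEM
    return 0;
}
```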
@@ -1200,6 +1226,11 @@ static int alloc_aligned(void **memptr, size_t alignment, size_t size, size_t min_alignment) {
         return 0;
     }
 
+    size = get_large_size_class(size);
+    if (unlikely(!size)) {
+        return ENOMEM;
+    }
+
     struct region_allocator *ra = ro.region_allocator;
 
     mutex_lock(&ra->lock);
@@ -1277,6 +1308,14 @@ EXPORT void *h_realloc(void *old, size_t size) {
 
     size = adjust_size_for_canaries(size);
 
+    if (size > max_slab_size_class) {
+        size = get_large_size_class(size);
+        if (unlikely(!size)) {
+            errno = ENOMEM;
+            return NULL;
+        }
+    }
+
     size_t old_size;
     if (old >= get_slab_region_start() && old < ro.slab_region_end) {
         old_size = slab_usable_size(old);
@@ -1297,28 +1336,24 @@ EXPORT void *h_realloc(void *old, size_t size) {
         }
         old_size = region->size;
         size_t old_guard_size = region->guard_size;
-        if (PAGE_CEILING(old_size) == PAGE_CEILING(size)) {
-            region->size = size;
+        if (old_size == size) {
             mutex_unlock(&ra->lock);
             thread_seal_metadata();
             return old;
         }
         mutex_unlock(&ra->lock);
 
-        size_t old_rounded_size = PAGE_CEILING(old_size);
-        size_t rounded_size = PAGE_CEILING(size);
-
         if (size > max_slab_size_class) {
             // in-place shrink
             if (size < old_size) {
-                void *new_end = (char *)old + rounded_size;
+                void *new_end = (char *)old + size;
                 if (memory_map_fixed(new_end, old_guard_size)) {
                     thread_seal_metadata();
                     return NULL;
                 }
                 memory_set_name(new_end, old_guard_size, "malloc large");
                 void *new_guard_end = (char *)new_end + old_guard_size;
-                regions_quarantine_deallocate_pages(new_guard_end, old_rounded_size - rounded_size, 0);
+                regions_quarantine_deallocate_pages(new_guard_end, old_size - size, 0);
 
                 mutex_lock(&ra->lock);
                 struct region_metadata *region = regions_find(old);
@@ -1333,10 +1368,10 @@ EXPORT void *h_realloc(void *old, size_t size) {
             }
 
             // in-place growth
-            void *guard_end = (char *)old + old_rounded_size + old_guard_size;
-            size_t extra = rounded_size - old_rounded_size;
-            if (!memory_remap((char *)old + old_rounded_size, old_guard_size, old_guard_size + extra)) {
-                if (memory_protect_rw((char *)old + old_rounded_size, extra)) {
+            void *guard_end = (char *)old + old_size + old_guard_size;
+            size_t extra = size - old_size;
+            if (!memory_remap((char *)old + old_size, old_guard_size, old_guard_size + extra)) {
+                if (memory_protect_rw((char *)old + old_size, extra)) {
                     memory_unmap(guard_end, extra);
                 } else {
                     mutex_lock(&ra->lock);
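From the caller's side, the effect of the h_realloc changes can be seen with the pathological pattern from the commit message. This is a plain libc-level sketch (only standard malloc/realloc/free; the 4 MiB to 8 MiB bounds and the 32 byte step are just the example from the description):

```c
#include <stdlib.h>

// Grow a buffer from 4 MiB to 8 MiB in 32 byte steps. With large size classes
// enabled, only the 4 realloc calls that cross a size class boundary (into the
// 5, 6, 7 and 8 MiB classes) have to remap or move the allocation; every other
// call sees an unchanged size class and returns the old pointer after the
// region lookup.
int main(void) {
    size_t size = 4 * 1024 * 1024;
    char *buf = malloc(size);
    if (buf == NULL) {
        return 1;
    }
    while (size < 8 * 1024 * 1024) {
        size += 32;
        char *grown = realloc(buf, size);
        if (grown == NULL) {
            free(buf);
            return 1;
        }
        buf = grown;
    }
    free(buf);
    return 0;
}
```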