mirror of
https://github.com/GrapheneOS/hardened_malloc.git
synced 2025-11-05 02:06:32 +01:00
Compare commits
No commits in common. "main" and "2024051500-redfin" have entirely different histories.
main
...
2024051500
15 changed files with 183 additions and 420 deletions
14
.github/workflows/build-and-test.yml
vendored
14
.github/workflows/build-and-test.yml
vendored
|
|
@ -11,9 +11,9 @@ jobs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
version: [14]
|
version: [12]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
- name: Setting up gcc version
|
- name: Setting up gcc version
|
||||||
run: |
|
run: |
|
||||||
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100
|
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100
|
||||||
|
|
@ -24,11 +24,9 @@ jobs:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
strategy:
|
strategy:
|
||||||
matrix:
|
matrix:
|
||||||
version: [19, 20]
|
version: [14, 15]
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
- name: Install dependencies
|
|
||||||
run: sudo apt-get update && sudo apt-get install -y --no-install-recommends clang-19 clang-20
|
|
||||||
- name: Setting up clang version
|
- name: Setting up clang version
|
||||||
run: |
|
run: |
|
||||||
sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100
|
sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100
|
||||||
|
|
@ -40,7 +38,7 @@ jobs:
|
||||||
container:
|
container:
|
||||||
image: alpine:latest
|
image: alpine:latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: apk update && apk add build-base python3
|
run: apk update && apk add build-base python3
|
||||||
- name: Build
|
- name: Build
|
||||||
|
|
@ -48,7 +46,7 @@ jobs:
|
||||||
build-ubuntu-gcc-aarch64:
|
build-ubuntu-gcc-aarch64:
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
steps:
|
steps:
|
||||||
- uses: actions/checkout@v5
|
- uses: actions/checkout@v4
|
||||||
- name: Install dependencies
|
- name: Install dependencies
|
||||||
run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgcc-s1-arm64-cross cpp-aarch64-linux-gnu
|
run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgcc-s1-arm64-cross cpp-aarch64-linux-gnu
|
||||||
- name: Build
|
- name: Build
|
||||||
|
|
|
||||||
4
.gitignore
vendored
4
.gitignore
vendored
|
|
@ -1,2 +1,2 @@
|
||||||
/out/
|
out/
|
||||||
/out-light/
|
out-light/
|
||||||
|
|
|
||||||
|
|
@ -5,6 +5,8 @@ common_cflags = [
|
||||||
"-fPIC",
|
"-fPIC",
|
||||||
"-fvisibility=hidden",
|
"-fvisibility=hidden",
|
||||||
//"-fno-plt",
|
//"-fno-plt",
|
||||||
|
"-Wall",
|
||||||
|
"-Wextra",
|
||||||
"-Wcast-align",
|
"-Wcast-align",
|
||||||
"-Wcast-qual",
|
"-Wcast-qual",
|
||||||
"-Wwrite-strings",
|
"-Wwrite-strings",
|
||||||
|
|
@ -72,7 +74,7 @@ cc_library {
|
||||||
cflags: ["-DLABEL_MEMORY"],
|
cflags: ["-DLABEL_MEMORY"],
|
||||||
},
|
},
|
||||||
device_has_arm_mte: {
|
device_has_arm_mte: {
|
||||||
cflags: ["-DHAS_ARM_MTE", "-march=armv8-a+dotprod+memtag"]
|
cflags: ["-DHAS_ARM_MTE", "-march=armv9-a+memtag"]
|
||||||
},
|
},
|
||||||
},
|
},
|
||||||
apex_available: [
|
apex_available: [
|
||||||
|
|
|
||||||
2
LICENSE
2
LICENSE
|
|
@ -1,4 +1,4 @@
|
||||||
Copyright © 2018-2025 GrapheneOS
|
Copyright © 2018-2024 GrapheneOS
|
||||||
|
|
||||||
Permission is hereby granted, free of charge, to any person obtaining a copy
|
Permission is hereby granted, free of charge, to any person obtaining a copy
|
||||||
of this software and associated documentation files (the "Software"), to deal
|
of this software and associated documentation files (the "Software"), to deal
|
||||||
|
|
|
||||||
19
README.md
19
README.md
|
|
@ -65,14 +65,14 @@ used instead as this allocator fundamentally doesn't support that environment.
|
||||||
|
|
||||||
## Dependencies
|
## Dependencies
|
||||||
|
|
||||||
Debian stable (currently Debian 13) determines the most ancient set of
|
Debian stable (currently Debian 12) determines the most ancient set of
|
||||||
supported dependencies:
|
supported dependencies:
|
||||||
|
|
||||||
* glibc 2.41
|
* glibc 2.36
|
||||||
* Linux 6.12
|
* Linux 6.1
|
||||||
* Clang 19.1.7 or GCC 14.2.0
|
* Clang 14.0.6 or GCC 12.2.0
|
||||||
|
|
||||||
For Android, the Linux GKI 6.1, 6.6 and 6.12 branches are supported.
|
For Android, the Linux GKI 5.10, 5.15 and 6.1 branches are supported.
|
||||||
|
|
||||||
However, using more recent releases is highly recommended. Older versions of
|
However, using more recent releases is highly recommended. Older versions of
|
||||||
the dependencies may be compatible at the moment but are not tested and will
|
the dependencies may be compatible at the moment but are not tested and will
|
||||||
|
|
@ -83,7 +83,7 @@ there will be custom integration offering better performance in the future
|
||||||
along with other hardening for the C standard library implementation.
|
along with other hardening for the C standard library implementation.
|
||||||
|
|
||||||
For Android, only the current generation, actively developed maintenance branch of the Android
|
For Android, only the current generation, actively developed maintenance branch of the Android
|
||||||
Open Source Project will be supported, which currently means `android16-release`.
|
Open Source Project will be supported, which currently means `android13-qpr2-release`.
|
||||||
|
|
||||||
## Testing
|
## Testing
|
||||||
|
|
||||||
|
|
@ -159,11 +159,8 @@ line to the `/etc/ld.so.preload` configuration file:
|
||||||
The format of this configuration file is a whitespace-separated list, so it's
|
The format of this configuration file is a whitespace-separated list, so it's
|
||||||
good practice to put each library on a separate line.
|
good practice to put each library on a separate line.
|
||||||
|
|
||||||
For maximum compatibility `libhardened_malloc.so` can be installed into
|
On Debian systems `libhardened_malloc.so` should be installed into `/usr/lib/`
|
||||||
`/usr/lib/` to avoid preload failures caused by AppArmor profiles or systemd
|
to avoid preload failures caused by AppArmor profile restrictions.
|
||||||
ExecPaths= restrictions. Check for logs of the following format:
|
|
||||||
|
|
||||||
ERROR: ld.so: object '/usr/local/lib/libhardened_malloc.so' from /etc/ld.so.preload cannot be preloaded (failed to map segment from shared object): ignored.
|
|
||||||
|
|
||||||
Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis
|
Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis
|
||||||
will not work when `AT_SECURE` is set such as with setuid binaries. It's also
|
will not work when `AT_SECURE` is set such as with setuid binaries. It's also
|
||||||
|
|
|
||||||
|
|
@ -44,7 +44,7 @@ void *set_pointer_tag(void *ptr, u8 tag) {
|
||||||
return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr));
|
return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr));
|
||||||
}
|
}
|
||||||
|
|
||||||
// This test checks that slab slot allocation uses tag that is distinct from tags of its neighbors
|
// This test checks that slab slot allocation uses tag that is distint from tags of its neighbors
|
||||||
// and from the tag of the previous allocation that used the same slot
|
// and from the tag of the previous allocation that used the same slot
|
||||||
void tag_distinctness() {
|
void tag_distinctness() {
|
||||||
// tag 0 is reserved
|
// tag 0 is reserved
|
||||||
|
|
|
||||||
2
chacha.c
2
chacha.c
|
|
@ -41,7 +41,7 @@ static const unsigned rounds = 8;
|
||||||
a = PLUS(a, b); d = ROTATE(XOR(d, a), 8); \
|
a = PLUS(a, b); d = ROTATE(XOR(d, a), 8); \
|
||||||
c = PLUS(c, d); b = ROTATE(XOR(b, c), 7);
|
c = PLUS(c, d); b = ROTATE(XOR(b, c), 7);
|
||||||
|
|
||||||
static const char sigma[16] NONSTRING = "expand 32-byte k";
|
static const char sigma[16] = "expand 32-byte k";
|
||||||
|
|
||||||
void chacha_keysetup(chacha_ctx *x, const u8 *k) {
|
void chacha_keysetup(chacha_ctx *x, const u8 *k) {
|
||||||
x->input[0] = U8TO32_LITTLE(sigma + 0);
|
x->input[0] = U8TO32_LITTLE(sigma + 0);
|
||||||
|
|
|
||||||
48
h_malloc.c
48
h_malloc.c
|
|
@ -94,24 +94,6 @@ static inline bool is_memtag_enabled(void) {
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static void *memory_map_tagged(size_t size) {
|
|
||||||
#ifdef HAS_ARM_MTE
|
|
||||||
if (likely51(is_memtag_enabled())) {
|
|
||||||
return memory_map_mte(size);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
return memory_map(size);
|
|
||||||
}
|
|
||||||
|
|
||||||
static bool memory_map_fixed_tagged(void *ptr, size_t size) {
|
|
||||||
#ifdef HAS_ARM_MTE
|
|
||||||
if (likely51(is_memtag_enabled())) {
|
|
||||||
return memory_map_fixed_mte(ptr, size);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
return memory_map_fixed(ptr, size);
|
|
||||||
}
|
|
||||||
|
|
||||||
#define SLAB_METADATA_COUNT
|
#define SLAB_METADATA_COUNT
|
||||||
|
|
||||||
struct slab_metadata {
|
struct slab_metadata {
|
||||||
|
|
@ -488,7 +470,7 @@ static void write_after_free_check(const char *p, size_t size) {
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef HAS_ARM_MTE
|
#ifdef HAS_ARM_MTE
|
||||||
if (likely51(is_memtag_enabled())) {
|
if (likely(is_memtag_enabled())) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -523,7 +505,7 @@ static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED
|
||||||
static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) {
|
static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) {
|
||||||
#if SLAB_CANARY
|
#if SLAB_CANARY
|
||||||
#ifdef HAS_ARM_MTE
|
#ifdef HAS_ARM_MTE
|
||||||
if (likely51(is_memtag_enabled())) {
|
if (likely(is_memtag_enabled())) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -535,7 +517,7 @@ static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void
|
||||||
static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) {
|
static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) {
|
||||||
#if SLAB_CANARY
|
#if SLAB_CANARY
|
||||||
#ifdef HAS_ARM_MTE
|
#ifdef HAS_ARM_MTE
|
||||||
if (likely51(is_memtag_enabled())) {
|
if (likely(is_memtag_enabled())) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -642,7 +624,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
|
||||||
write_after_free_check(p, size - canary_size);
|
write_after_free_check(p, size - canary_size);
|
||||||
set_canary(metadata, p, size);
|
set_canary(metadata, p, size);
|
||||||
#ifdef HAS_ARM_MTE
|
#ifdef HAS_ARM_MTE
|
||||||
if (likely51(is_memtag_enabled())) {
|
if (likely(is_memtag_enabled())) {
|
||||||
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -679,7 +661,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
|
||||||
if (requested_size) {
|
if (requested_size) {
|
||||||
set_canary(metadata, p, size);
|
set_canary(metadata, p, size);
|
||||||
#ifdef HAS_ARM_MTE
|
#ifdef HAS_ARM_MTE
|
||||||
if (likely51(is_memtag_enabled())) {
|
if (likely(is_memtag_enabled())) {
|
||||||
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -706,7 +688,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
|
||||||
if (requested_size) {
|
if (requested_size) {
|
||||||
set_canary(metadata, p, size);
|
set_canary(metadata, p, size);
|
||||||
#ifdef HAS_ARM_MTE
|
#ifdef HAS_ARM_MTE
|
||||||
if (likely51(is_memtag_enabled())) {
|
if (likely(is_memtag_enabled())) {
|
||||||
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -735,7 +717,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
|
||||||
write_after_free_check(p, size - canary_size);
|
write_after_free_check(p, size - canary_size);
|
||||||
set_canary(metadata, p, size);
|
set_canary(metadata, p, size);
|
||||||
#ifdef HAS_ARM_MTE
|
#ifdef HAS_ARM_MTE
|
||||||
if (likely51(is_memtag_enabled())) {
|
if (likely(is_memtag_enabled())) {
|
||||||
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
p = tag_and_clear_slab_slot(metadata, p, slot, size);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
@ -823,7 +805,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) {
|
||||||
|
|
||||||
bool skip_zero = false;
|
bool skip_zero = false;
|
||||||
#ifdef HAS_ARM_MTE
|
#ifdef HAS_ARM_MTE
|
||||||
if (likely51(is_memtag_enabled())) {
|
if (likely(is_memtag_enabled())) {
|
||||||
arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size);
|
arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size);
|
||||||
// metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot()
|
// metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot()
|
||||||
skip_zero = true;
|
skip_zero = true;
|
||||||
|
|
@ -908,7 +890,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) {
|
||||||
|
|
||||||
if (c->empty_slabs_total + slab_size > max_empty_slabs_total) {
|
if (c->empty_slabs_total + slab_size > max_empty_slabs_total) {
|
||||||
int saved_errno = errno;
|
int saved_errno = errno;
|
||||||
if (!memory_map_fixed_tagged(slab, slab_size)) {
|
if (!memory_map_fixed(slab, slab_size)) {
|
||||||
label_slab(slab, slab_size, class);
|
label_slab(slab, slab_size, class);
|
||||||
stats_slab_deallocate(c, slab_size);
|
stats_slab_deallocate(c, slab_size);
|
||||||
enqueue_free_slab(c, metadata);
|
enqueue_free_slab(c, metadata);
|
||||||
|
|
@ -1260,7 +1242,15 @@ COLD static void init_slow_path(void) {
|
||||||
if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) {
|
if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) {
|
||||||
fatal_error("failed to unprotect memory for regions table");
|
fatal_error("failed to unprotect memory for regions table");
|
||||||
}
|
}
|
||||||
ro.slab_region_start = memory_map_tagged(slab_region_size);
|
#ifdef HAS_ARM_MTE
|
||||||
|
if (likely(is_memtag_enabled())) {
|
||||||
|
ro.slab_region_start = memory_map_mte(slab_region_size);
|
||||||
|
} else {
|
||||||
|
ro.slab_region_start = memory_map(slab_region_size);
|
||||||
|
}
|
||||||
|
#else
|
||||||
|
ro.slab_region_start = memory_map(slab_region_size);
|
||||||
|
#endif
|
||||||
if (unlikely(ro.slab_region_start == NULL)) {
|
if (unlikely(ro.slab_region_start == NULL)) {
|
||||||
fatal_error("failed to allocate slab region");
|
fatal_error("failed to allocate slab region");
|
||||||
}
|
}
|
||||||
|
|
@ -1905,7 +1895,7 @@ EXPORT int h_malloc_trim(UNUSED size_t pad) {
|
||||||
struct slab_metadata *iterator = c->empty_slabs;
|
struct slab_metadata *iterator = c->empty_slabs;
|
||||||
while (iterator) {
|
while (iterator) {
|
||||||
void *slab = get_slab(c, slab_size, iterator);
|
void *slab = get_slab(c, slab_size, iterator);
|
||||||
if (memory_map_fixed_tagged(slab, slab_size)) {
|
if (memory_map_fixed(slab, slab_size)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
label_slab(slab, slab_size, class);
|
label_slab(slab, slab_size, class);
|
||||||
|
|
|
||||||
32
memory.c
32
memory.c
|
|
@ -17,8 +17,8 @@
|
||||||
#include "memory.h"
|
#include "memory.h"
|
||||||
#include "util.h"
|
#include "util.h"
|
||||||
|
|
||||||
static void *memory_map_prot(size_t size, int prot) {
|
void *memory_map(size_t size) {
|
||||||
void *p = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
|
void *p = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
|
||||||
if (unlikely(p == MAP_FAILED)) {
|
if (unlikely(p == MAP_FAILED)) {
|
||||||
if (errno != ENOMEM) {
|
if (errno != ENOMEM) {
|
||||||
fatal_error("non-ENOMEM mmap failure");
|
fatal_error("non-ENOMEM mmap failure");
|
||||||
|
|
@ -28,19 +28,22 @@ static void *memory_map_prot(size_t size, int prot) {
|
||||||
return p;
|
return p;
|
||||||
}
|
}
|
||||||
|
|
||||||
void *memory_map(size_t size) {
|
|
||||||
return memory_map_prot(size, PROT_NONE);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef HAS_ARM_MTE
|
#ifdef HAS_ARM_MTE
|
||||||
// Note that PROT_MTE can't be cleared via mprotect
|
// Note that PROT_MTE can't be cleared via mprotect
|
||||||
void *memory_map_mte(size_t size) {
|
void *memory_map_mte(size_t size) {
|
||||||
return memory_map_prot(size, PROT_MTE);
|
void *p = mmap(NULL, size, PROT_MTE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
|
||||||
|
if (unlikely(p == MAP_FAILED)) {
|
||||||
|
if (errno != ENOMEM) {
|
||||||
|
fatal_error("non-ENOMEM MTE mmap failure");
|
||||||
|
}
|
||||||
|
return NULL;
|
||||||
|
}
|
||||||
|
return p;
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
static bool memory_map_fixed_prot(void *ptr, size_t size, int prot) {
|
bool memory_map_fixed(void *ptr, size_t size) {
|
||||||
void *p = mmap(ptr, size, prot, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
|
void *p = mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
|
||||||
bool ret = p == MAP_FAILED;
|
bool ret = p == MAP_FAILED;
|
||||||
if (unlikely(ret) && errno != ENOMEM) {
|
if (unlikely(ret) && errno != ENOMEM) {
|
||||||
fatal_error("non-ENOMEM MAP_FIXED mmap failure");
|
fatal_error("non-ENOMEM MAP_FIXED mmap failure");
|
||||||
|
|
@ -48,17 +51,6 @@ static bool memory_map_fixed_prot(void *ptr, size_t size, int prot) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool memory_map_fixed(void *ptr, size_t size) {
|
|
||||||
return memory_map_fixed_prot(ptr, size, PROT_NONE);
|
|
||||||
}
|
|
||||||
|
|
||||||
#ifdef HAS_ARM_MTE
|
|
||||||
// Note that PROT_MTE can't be cleared via mprotect
|
|
||||||
bool memory_map_fixed_mte(void *ptr, size_t size) {
|
|
||||||
return memory_map_fixed_prot(ptr, size, PROT_MTE);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
|
|
||||||
bool memory_unmap(void *ptr, size_t size) {
|
bool memory_unmap(void *ptr, size_t size) {
|
||||||
bool ret = munmap(ptr, size);
|
bool ret = munmap(ptr, size);
|
||||||
if (unlikely(ret) && errno != ENOMEM) {
|
if (unlikely(ret) && errno != ENOMEM) {
|
||||||
|
|
|
||||||
3
memory.h
3
memory.h
|
|
@ -15,9 +15,6 @@ void *memory_map(size_t size);
|
||||||
void *memory_map_mte(size_t size);
|
void *memory_map_mte(size_t size);
|
||||||
#endif
|
#endif
|
||||||
bool memory_map_fixed(void *ptr, size_t size);
|
bool memory_map_fixed(void *ptr, size_t size);
|
||||||
#ifdef HAS_ARM_MTE
|
|
||||||
bool memory_map_fixed_mte(void *ptr, size_t size);
|
|
||||||
#endif
|
|
||||||
bool memory_unmap(void *ptr, size_t size);
|
bool memory_unmap(void *ptr, size_t size);
|
||||||
bool memory_protect_ro(void *ptr, size_t size);
|
bool memory_protect_ro(void *ptr, size_t size);
|
||||||
bool memory_protect_rw(void *ptr, size_t size);
|
bool memory_protect_rw(void *ptr, size_t size);
|
||||||
|
|
|
||||||
|
|
@ -1,6 +1,5 @@
|
||||||
#include <pthread.h>
|
#include <pthread.h>
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
|
||||||
|
|
||||||
#if defined(__GLIBC__) || defined(__ANDROID__)
|
#if defined(__GLIBC__) || defined(__ANDROID__)
|
||||||
#include <malloc.h>
|
#include <malloc.h>
|
||||||
|
|
|
||||||
|
|
@ -98,7 +98,7 @@ class TestSimpleMemoryCorruption(unittest.TestCase):
|
||||||
self.assertEqual(stderr.decode("utf-8"),
|
self.assertEqual(stderr.decode("utf-8"),
|
||||||
"fatal allocator error: invalid free\n")
|
"fatal allocator error: invalid free\n")
|
||||||
|
|
||||||
def test_invalid_malloc_usable_size_small_quarantine(self):
|
def test_invalid_malloc_usable_size_small_quarantene(self):
|
||||||
_stdout, stderr, returncode = self.run_test(
|
_stdout, stderr, returncode = self.run_test(
|
||||||
"invalid_malloc_usable_size_small_quarantine")
|
"invalid_malloc_usable_size_small_quarantine")
|
||||||
self.assertEqual(returncode, -6)
|
self.assertEqual(returncode, -6)
|
||||||
|
|
|
||||||
441
third_party/libdivide.h
vendored
441
third_party/libdivide.h
vendored
|
|
@ -1,8 +1,8 @@
|
||||||
// libdivide.h - Optimized integer division
|
// libdivide.h - Optimized integer division
|
||||||
// https://libdivide.com
|
// https://libdivide.com
|
||||||
//
|
//
|
||||||
// Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
|
// Copyright (C) 2010 - 2021 ridiculous_fish, <libdivide@ridiculousfish.com>
|
||||||
// Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
|
// Copyright (C) 2016 - 2021 Kim Walisch, <kim.walisch@gmail.com>
|
||||||
//
|
//
|
||||||
// libdivide is dual-licensed under the Boost or zlib licenses.
|
// libdivide is dual-licensed under the Boost or zlib licenses.
|
||||||
// You may use libdivide under the terms of either of these.
|
// You may use libdivide under the terms of either of these.
|
||||||
|
|
@ -11,14 +11,11 @@
|
||||||
#ifndef LIBDIVIDE_H
|
#ifndef LIBDIVIDE_H
|
||||||
#define LIBDIVIDE_H
|
#define LIBDIVIDE_H
|
||||||
|
|
||||||
// *** Version numbers are auto generated - do not edit ***
|
#define LIBDIVIDE_VERSION "5.0"
|
||||||
#define LIBDIVIDE_VERSION "5.2.0"
|
|
||||||
#define LIBDIVIDE_VERSION_MAJOR 5
|
#define LIBDIVIDE_VERSION_MAJOR 5
|
||||||
#define LIBDIVIDE_VERSION_MINOR 2
|
#define LIBDIVIDE_VERSION_MINOR 0
|
||||||
#define LIBDIVIDE_VERSION_PATCH 0
|
|
||||||
|
|
||||||
#include <stdint.h>
|
#include <stdint.h>
|
||||||
|
|
||||||
#if !defined(__AVR__)
|
#if !defined(__AVR__)
|
||||||
#include <stdio.h>
|
#include <stdio.h>
|
||||||
#include <stdlib.h>
|
#include <stdlib.h>
|
||||||
|
|
@ -27,24 +24,15 @@
|
||||||
#if defined(LIBDIVIDE_SSE2)
|
#if defined(LIBDIVIDE_SSE2)
|
||||||
#include <emmintrin.h>
|
#include <emmintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
|
#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
|
||||||
#include <immintrin.h>
|
#include <immintrin.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(LIBDIVIDE_NEON)
|
#if defined(LIBDIVIDE_NEON)
|
||||||
#include <arm_neon.h>
|
#include <arm_neon.h>
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
|
|
||||||
#if defined(_MSC_VER) && defined(LIBDIVIDE_X86_64) && (!defined(__clang__) || _MSC_VER>1930)
|
|
||||||
#define LIBDIVIDE_X64_INTRINSICS
|
|
||||||
#endif
|
|
||||||
|
|
||||||
#if defined(_MSC_VER)
|
#if defined(_MSC_VER)
|
||||||
#if defined(LIBDIVIDE_X64_INTRINSICS)
|
|
||||||
#include <intrin.h>
|
#include <intrin.h>
|
||||||
#endif
|
|
||||||
#pragma warning(push)
|
#pragma warning(push)
|
||||||
// disable warning C4146: unary minus operator applied
|
// disable warning C4146: unary minus operator applied
|
||||||
// to unsigned type, result still unsigned
|
// to unsigned type, result still unsigned
|
||||||
|
|
@ -250,32 +238,24 @@ static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfr
|
||||||
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
|
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
|
||||||
int16_t numer, int16_t magic, uint8_t more);
|
int16_t numer, int16_t magic, uint8_t more);
|
||||||
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
|
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
|
||||||
int16_t numer, const struct libdivide_s16_t *denom);
|
int16_t numer, const struct libdivide_s16_t* denom);
|
||||||
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
|
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
|
||||||
uint16_t numer, uint16_t magic, uint8_t more);
|
uint16_t numer, uint16_t magic, uint8_t more);
|
||||||
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
|
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
|
||||||
uint16_t numer, const struct libdivide_u16_t *denom);
|
uint16_t numer, const struct libdivide_u16_t* denom);
|
||||||
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
|
|
||||||
int32_t numer, int32_t magic, uint8_t more);
|
|
||||||
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
|
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
|
||||||
int32_t numer, const struct libdivide_s32_t *denom);
|
int32_t numer, const struct libdivide_s32_t *denom);
|
||||||
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
|
|
||||||
uint32_t numer, uint32_t magic, uint8_t more);
|
|
||||||
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
|
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
|
||||||
uint32_t numer, const struct libdivide_u32_t *denom);
|
uint32_t numer, const struct libdivide_u32_t *denom);
|
||||||
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
|
|
||||||
int64_t numer, int64_t magic, uint8_t more);
|
|
||||||
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
|
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
|
||||||
int64_t numer, const struct libdivide_s64_t *denom);
|
int64_t numer, const struct libdivide_s64_t *denom);
|
||||||
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
|
|
||||||
uint64_t numer, uint64_t magic, uint8_t more);
|
|
||||||
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
|
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
|
||||||
uint64_t numer, const struct libdivide_u64_t *denom);
|
uint64_t numer, const struct libdivide_u64_t *denom);
|
||||||
|
|
||||||
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
|
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
|
||||||
int16_t numer, const struct libdivide_s16_branchfree_t *denom);
|
int16_t numer, const struct libdivide_s16_branchfree_t* denom);
|
||||||
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
|
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
|
||||||
uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
|
uint16_t numer, const struct libdivide_u16_branchfree_t* denom);
|
||||||
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
|
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
|
||||||
int32_t numer, const struct libdivide_s32_branchfree_t *denom);
|
int32_t numer, const struct libdivide_s32_branchfree_t *denom);
|
||||||
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
|
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
|
||||||
|
|
@ -285,17 +265,17 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
|
||||||
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
|
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
|
||||||
uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
|
uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
|
||||||
|
|
||||||
static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
|
static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom);
|
||||||
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
|
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom);
|
||||||
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
|
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
|
||||||
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
|
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
|
||||||
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
|
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
|
||||||
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
|
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
|
||||||
|
|
||||||
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
|
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
|
||||||
const struct libdivide_s16_branchfree_t *denom);
|
const struct libdivide_s16_branchfree_t* denom);
|
||||||
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
|
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
|
||||||
const struct libdivide_u16_branchfree_t *denom);
|
const struct libdivide_u16_branchfree_t* denom);
|
||||||
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
|
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
|
||||||
const struct libdivide_s32_branchfree_t *denom);
|
const struct libdivide_s32_branchfree_t *denom);
|
||||||
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
|
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
|
||||||
|
|
@ -334,7 +314,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
|
static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
|
||||||
#if defined(LIBDIVIDE_X64_INTRINSICS)
|
#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
|
||||||
return __umulh(x, y);
|
return __umulh(x, y);
|
||||||
#elif defined(HAS_INT128_T)
|
#elif defined(HAS_INT128_T)
|
||||||
__uint128_t xl = x, yl = y;
|
__uint128_t xl = x, yl = y;
|
||||||
|
|
@ -360,7 +340,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
|
||||||
}
|
}
|
||||||
|
|
||||||
static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
|
static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
|
||||||
#if defined(LIBDIVIDE_X64_INTRINSICS)
|
#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64)
|
||||||
return __mulh(x, y);
|
return __mulh(x, y);
|
||||||
#elif defined(HAS_INT128_T)
|
#elif defined(HAS_INT128_T)
|
||||||
__int128_t xl = x, yl = y;
|
__int128_t xl = x, yl = y;
|
||||||
|
|
@ -462,7 +442,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
|
||||||
// uint {v}. The result must fit in 16 bits.
|
// uint {v}. The result must fit in 16 bits.
|
||||||
// Returns the quotient directly and the remainder in *r
|
// Returns the quotient directly and the remainder in *r
|
||||||
static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
|
static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
|
||||||
uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
|
uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) {
|
||||||
uint32_t n = ((uint32_t)u1 << 16) | u0;
|
uint32_t n = ((uint32_t)u1 << 16) | u0;
|
||||||
uint16_t result = (uint16_t)(n / v);
|
uint16_t result = (uint16_t)(n / v);
|
||||||
*r = (uint16_t)(n - result * (uint32_t)v);
|
*r = (uint16_t)(n - result * (uint32_t)v);
|
||||||
|
|
@ -532,7 +512,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
|
||||||
|
|
||||||
// Check for overflow and divide by 0.
|
// Check for overflow and divide by 0.
|
||||||
if (numhi >= den) {
|
if (numhi >= den) {
|
||||||
if (r) *r = ~0ull;
|
if (r != NULL) *r = ~0ull;
|
||||||
return ~0ull;
|
return ~0ull;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -578,14 +558,11 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
|
||||||
q0 = (uint32_t)qhat;
|
q0 = (uint32_t)qhat;
|
||||||
|
|
||||||
// Return remainder if requested.
|
// Return remainder if requested.
|
||||||
if (r) *r = (rem * b + num0 - q0 * den) >> shift;
|
if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift;
|
||||||
return ((uint64_t)q1 << 32) | q0;
|
return ((uint64_t)q1 << 32) | q0;
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
#if !(defined(HAS_INT128_T) && \
|
|
||||||
defined(HAS_INT128_DIV))
|
|
||||||
|
|
||||||
// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
|
// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
|
||||||
static LIBDIVIDE_INLINE void libdivide_u128_shift(
|
static LIBDIVIDE_INLINE void libdivide_u128_shift(
|
||||||
uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
|
uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
|
||||||
|
|
@ -602,8 +579,6 @@ static LIBDIVIDE_INLINE void libdivide_u128_shift(
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
|
// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
|
||||||
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
|
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
|
||||||
uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
|
uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
|
||||||
|
|
@ -721,7 +696,8 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
|
||||||
// 1 in its recovery algorithm.
|
// 1 in its recovery algorithm.
|
||||||
result.magic = 0;
|
result.magic = 0;
|
||||||
result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
|
result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
uint8_t more;
|
uint8_t more;
|
||||||
uint16_t rem, proposed_m;
|
uint16_t rem, proposed_m;
|
||||||
proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
|
proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
|
||||||
|
|
@ -733,7 +709,8 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
|
||||||
if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
|
if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
|
||||||
// This power works
|
// This power works
|
||||||
more = floor_log_2_d;
|
more = floor_log_2_d;
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
// We have to use the general 17-bit algorithm. We need to compute
|
// We have to use the general 17-bit algorithm. We need to compute
|
||||||
// (2**power) / d. However, we already have (2**(power-1))/d and
|
// (2**power) / d. However, we already have (2**(power-1))/d and
|
||||||
// its remainder. By doubling both, and then correcting the
|
// its remainder. By doubling both, and then correcting the
|
||||||
|
|
@ -765,7 +742,7 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
|
||||||
}
|
}
|
||||||
struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
|
struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
|
||||||
struct libdivide_u16_branchfree_t ret = {
|
struct libdivide_u16_branchfree_t ret = {
|
||||||
tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
|
tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) };
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
@ -775,12 +752,14 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
|
||||||
uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
|
uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
|
||||||
if (!magic) {
|
if (!magic) {
|
||||||
return numer >> more;
|
return numer >> more;
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
uint16_t q = libdivide_mullhi_u16(magic, numer);
|
uint16_t q = libdivide_mullhi_u16(magic, numer);
|
||||||
if (more & LIBDIVIDE_ADD_MARKER) {
|
if (more & LIBDIVIDE_ADD_MARKER) {
|
||||||
uint16_t t = ((numer - q) >> 1) + q;
|
uint16_t t = ((numer - q) >> 1) + q;
|
||||||
return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
|
return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
|
||||||
} else {
|
}
|
||||||
|
else {
|
||||||
// All upper bits are 0,
|
// All upper bits are 0,
|
||||||
// don't need to mask them off.
|
// don't need to mask them off.
|
||||||
return q >> more;
|
return q >> more;
|
||||||
|
|
@ -788,12 +767,12 @@ uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
|
uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
|
||||||
return libdivide_u16_do_raw(numer, denom->magic, denom->more);
|
return libdivide_u16_do_raw(numer, denom->magic, denom->more);
|
||||||
}
|
}
|
||||||
|
|
||||||
uint16_t libdivide_u16_branchfree_do(
|
uint16_t libdivide_u16_branchfree_do(
|
||||||
uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
|
uint16_t numer, const struct libdivide_u16_branchfree_t* denom) {
|
||||||
uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
|
uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
|
||||||
uint16_t t = ((numer - q) >> 1) + q;
|
uint16_t t = ((numer - q) >> 1) + q;
|
||||||
return t >> denom->more;
|
return t >> denom->more;
|
||||||
|
|
@ -821,7 +800,7 @@ uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
|
||||||
// overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
|
// overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
|
||||||
// then double the quotient and remainder.
|
// then double the quotient and remainder.
|
||||||
uint32_t half_n = (uint32_t)1 << (16 + shift);
|
uint32_t half_n = (uint32_t)1 << (16 + shift);
|
||||||
uint32_t d = ((uint32_t)1 << 16) | denom->magic;
|
uint32_t d = ( (uint32_t)1 << 16) | denom->magic;
|
||||||
// Note that the quotient is guaranteed <= 16 bits, but the remainder
|
// Note that the quotient is guaranteed <= 16 bits, but the remainder
|
||||||
// may need 17!
|
// may need 17!
|
||||||
uint16_t half_q = (uint16_t)(half_n / d);
|
uint16_t half_q = (uint16_t)(half_n / d);
|
||||||
|
|
@ -933,11 +912,12 @@ struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
|
uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
|
||||||
if (!magic) {
|
uint8_t more = denom->more;
|
||||||
|
if (!denom->magic) {
|
||||||
return numer >> more;
|
return numer >> more;
|
||||||
} else {
|
} else {
|
||||||
uint32_t q = libdivide_mullhi_u32(magic, numer);
|
uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
|
||||||
if (more & LIBDIVIDE_ADD_MARKER) {
|
if (more & LIBDIVIDE_ADD_MARKER) {
|
||||||
uint32_t t = ((numer - q) >> 1) + q;
|
uint32_t t = ((numer - q) >> 1) + q;
|
||||||
return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
|
return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
|
||||||
|
|
@ -949,10 +929,6 @@ uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
|
|
||||||
return libdivide_u32_do_raw(numer, denom->magic, denom->more);
|
|
||||||
}
|
|
||||||
|
|
||||||
uint32_t libdivide_u32_branchfree_do(
|
uint32_t libdivide_u32_branchfree_do(
|
||||||
uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
|
uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
|
||||||
uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
|
uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
|
||||||
|
|
@ -1096,11 +1072,12 @@ struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
|
uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
|
||||||
if (!magic) {
|
uint8_t more = denom->more;
|
||||||
|
if (!denom->magic) {
|
||||||
return numer >> more;
|
return numer >> more;
|
||||||
} else {
|
} else {
|
||||||
uint64_t q = libdivide_mullhi_u64(magic, numer);
|
uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
|
||||||
if (more & LIBDIVIDE_ADD_MARKER) {
|
if (more & LIBDIVIDE_ADD_MARKER) {
|
||||||
uint64_t t = ((numer - q) >> 1) + q;
|
uint64_t t = ((numer - q) >> 1) + q;
|
||||||
return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
|
return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
|
||||||
|
|
@ -1112,10 +1089,6 @@ uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
|
|
||||||
return libdivide_u64_do_raw(numer, denom->magic, denom->more);
|
|
||||||
}
|
|
||||||
|
|
||||||
uint64_t libdivide_u64_branchfree_do(
|
uint64_t libdivide_u64_branchfree_do(
|
||||||
uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
|
uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
|
||||||
uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
|
uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
|
||||||
|
|
@ -1455,10 +1428,11 @@ struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
|
||||||
return result;
|
return result;
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
|
int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
|
||||||
|
uint8_t more = denom->more;
|
||||||
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
|
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
|
||||||
|
|
||||||
if (!magic) {
|
if (!denom->magic) {
|
||||||
uint32_t sign = (int8_t)more >> 7;
|
uint32_t sign = (int8_t)more >> 7;
|
||||||
uint32_t mask = ((uint32_t)1 << shift) - 1;
|
uint32_t mask = ((uint32_t)1 << shift) - 1;
|
||||||
uint32_t uq = numer + ((numer >> 31) & mask);
|
uint32_t uq = numer + ((numer >> 31) & mask);
|
||||||
|
|
@ -1467,7 +1441,7 @@ int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
|
||||||
q = (q ^ sign) - sign;
|
q = (q ^ sign) - sign;
|
||||||
return q;
|
return q;
|
||||||
} else {
|
} else {
|
||||||
uint32_t uq = (uint32_t)libdivide_mullhi_s32(magic, numer);
|
uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer);
|
||||||
if (more & LIBDIVIDE_ADD_MARKER) {
|
if (more & LIBDIVIDE_ADD_MARKER) {
|
||||||
// must be arithmetic shift and then sign extend
|
// must be arithmetic shift and then sign extend
|
||||||
int32_t sign = (int8_t)more >> 7;
|
int32_t sign = (int8_t)more >> 7;
|
||||||
|
|
@ -1482,10 +1456,6 @@ int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
|
|
||||||
return libdivide_s32_do_raw(numer, denom->magic, denom->more);
|
|
||||||
}
|
|
||||||
|
|
||||||
int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
|
int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
|
||||||
uint8_t more = denom->more;
|
uint8_t more = denom->more;
|
||||||
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
|
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
|
||||||
|
|
@ -1627,10 +1597,11 @@ struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
|
||||||
return ret;
|
return ret;
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
|
int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
|
||||||
|
uint8_t more = denom->more;
|
||||||
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
|
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
|
||||||
|
|
||||||
if (!magic) { // shift path
|
if (!denom->magic) { // shift path
|
||||||
uint64_t mask = ((uint64_t)1 << shift) - 1;
|
uint64_t mask = ((uint64_t)1 << shift) - 1;
|
||||||
uint64_t uq = numer + ((numer >> 63) & mask);
|
uint64_t uq = numer + ((numer >> 63) & mask);
|
||||||
int64_t q = (int64_t)uq;
|
int64_t q = (int64_t)uq;
|
||||||
|
|
@ -1640,7 +1611,7 @@ int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
|
||||||
q = (q ^ sign) - sign;
|
q = (q ^ sign) - sign;
|
||||||
return q;
|
return q;
|
||||||
} else {
|
} else {
|
||||||
uint64_t uq = (uint64_t)libdivide_mullhi_s64(magic, numer);
|
uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer);
|
||||||
if (more & LIBDIVIDE_ADD_MARKER) {
|
if (more & LIBDIVIDE_ADD_MARKER) {
|
||||||
// must be arithmetic shift and then sign extend
|
// must be arithmetic shift and then sign extend
|
||||||
int64_t sign = (int8_t)more >> 7;
|
int64_t sign = (int8_t)more >> 7;
|
||||||
|
|
@ -1655,10 +1626,6 @@ int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
|
|
||||||
return libdivide_s64_do_raw(numer, denom->magic, denom->more);
|
|
||||||
}
|
|
||||||
|
|
||||||
int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
|
int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
|
||||||
uint8_t more = denom->more;
|
uint8_t more = denom->more;
|
||||||
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
|
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
|
||||||
|
|
@ -1715,22 +1682,15 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t
|
||||||
|
|
||||||
// Simplest possible vector type division: treat the vector type as an array
|
// Simplest possible vector type division: treat the vector type as an array
|
||||||
// of underlying native type.
|
// of underlying native type.
|
||||||
//
|
|
||||||
// Use a union to read a vector via pointer-to-integer, without violating strict
|
|
||||||
// aliasing.
|
|
||||||
#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
|
#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
|
||||||
const size_t count = sizeof(VecT) / sizeof(IntT); \
|
const size_t count = sizeof(VecT) / sizeof(IntT); \
|
||||||
union type_pun_vec { \
|
VecT result; \
|
||||||
VecT vec; \
|
IntT *pSource = (IntT *)&numers; \
|
||||||
IntT arr[sizeof(VecT) / sizeof(IntT)]; \
|
IntT *pTarget = (IntT *)&result; \
|
||||||
}; \
|
for (size_t loop=0; loop<count; ++loop) { \
|
||||||
union type_pun_vec result; \
|
pTarget[loop] = libdivide_##Algo##_do(pSource[loop], denom); \
|
||||||
union type_pun_vec input; \
|
|
||||||
input.vec = numers; \
|
|
||||||
for (size_t loop = 0; loop < count; ++loop) { \
|
|
||||||
result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
|
|
||||||
} \
|
} \
|
||||||
return result.vec;
|
return result; \
|
||||||
|
|
||||||
#if defined(LIBDIVIDE_NEON)
|
#if defined(LIBDIVIDE_NEON)
|
||||||
|
|
||||||
|
|
@ -1844,12 +1804,13 @@ static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64
|
||||||
|
|
||||||
////////// UINT16
|
////////// UINT16
|
||||||
|
|
||||||
uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){
|
uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) {
|
||||||
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)}
|
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)
|
||||||
|
}
|
||||||
|
|
||||||
uint16x8_t libdivide_u16_branchfree_do_vec128(
|
uint16x8_t libdivide_u16_branchfree_do_vec128(uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) {
|
||||||
uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){
|
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)
|
||||||
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)}
|
}
|
||||||
|
|
||||||
////////// UINT32
|
////////// UINT32
|
||||||
|
|
||||||
|
|
@ -1909,12 +1870,13 @@ uint64x2_t libdivide_u64_branchfree_do_vec128(
|
||||||
|
|
||||||
////////// SINT16
|
////////// SINT16
|
||||||
|
|
||||||
int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){
|
int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) {
|
||||||
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)}
|
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)
|
||||||
|
}
|
||||||
|
|
||||||
int16x8_t libdivide_s16_branchfree_do_vec128(
|
int16x8_t libdivide_s16_branchfree_do_vec128(int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) {
|
||||||
int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){
|
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)
|
||||||
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)}
|
}
|
||||||
|
|
||||||
////////// SINT32
|
////////// SINT32
|
||||||
|
|
||||||
|
|
@ -2120,12 +2082,13 @@ static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y
|
||||||
|
|
||||||
////////// UINT16
|
////////// UINT16
|
||||||
|
|
||||||
__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){
|
__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) {
|
||||||
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)}
|
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)
|
||||||
|
}
|
||||||
|
|
||||||
__m512i libdivide_u16_branchfree_do_vec512(
|
__m512i libdivide_u16_branchfree_do_vec512(__m512i numers, const struct libdivide_u16_branchfree_t *denom) {
|
||||||
__m512i numers, const struct libdivide_u16_branchfree_t *denom){
|
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)
|
||||||
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)}
|
}
|
||||||
|
|
||||||
////////// UINT32
|
////////// UINT32
|
||||||
|
|
||||||
|
|
@ -2183,12 +2146,13 @@ __m512i libdivide_u64_branchfree_do_vec512(
|
||||||
|
|
||||||
////////// SINT16
|
////////// SINT16
|
||||||
|
|
||||||
__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){
|
__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) {
|
||||||
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)}
|
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)
|
||||||
|
}
|
||||||
|
|
||||||
__m512i libdivide_s16_branchfree_do_vec512(
|
__m512i libdivide_s16_branchfree_do_vec512(__m512i numers, const struct libdivide_s16_branchfree_t *denom) {
|
||||||
__m512i numers, const struct libdivide_s16_branchfree_t *denom){
|
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)
|
||||||
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)}
|
}
|
||||||
|
|
||||||
////////// SINT32
|
////////// SINT32
|
||||||
|
|
||||||
|
|
@ -2401,25 +2365,11 @@ static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y
|
||||||
////////// UINT16
|
////////// UINT16
|
||||||
|
|
||||||
__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
|
__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
|
||||||
uint8_t more = denom->more;
|
SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16)
|
||||||
if (!denom->magic) {
|
|
||||||
return _mm256_srli_epi16(numers, more);
|
|
||||||
} else {
|
|
||||||
__m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
|
|
||||||
if (more & LIBDIVIDE_ADD_MARKER) {
|
|
||||||
__m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
|
|
||||||
return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
|
|
||||||
} else {
|
|
||||||
return _mm256_srli_epi16(q, more);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__m256i libdivide_u16_branchfree_do_vec256(
|
__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) {
|
||||||
__m256i numers, const struct libdivide_u16_branchfree_t *denom) {
|
SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree)
|
||||||
__m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
|
|
||||||
__m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
|
|
||||||
return _mm256_srli_epi16(t, denom->more);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
////////// UINT32
|
////////// UINT32
|
||||||
|
|
@ -2479,54 +2429,11 @@ __m256i libdivide_u64_branchfree_do_vec256(
|
||||||
////////// SINT16
|
////////// SINT16
|
||||||
|
|
||||||
__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
|
__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
|
||||||
uint8_t more = denom->more;
|
SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16)
|
||||||
if (!denom->magic) {
|
|
||||||
uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
|
|
||||||
uint16_t mask = ((uint16_t)1 << shift) - 1;
|
|
||||||
__m256i roundToZeroTweak = _mm256_set1_epi16(mask);
|
|
||||||
// q = numer + ((numer >> 15) & roundToZeroTweak);
|
|
||||||
__m256i q = _mm256_add_epi16(
|
|
||||||
numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
|
|
||||||
q = _mm256_srai_epi16(q, shift);
|
|
||||||
__m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
|
|
||||||
// q = (q ^ sign) - sign;
|
|
||||||
q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
|
|
||||||
return q;
|
|
||||||
} else {
|
|
||||||
__m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
|
|
||||||
if (more & LIBDIVIDE_ADD_MARKER) {
|
|
||||||
// must be arithmetic shift
|
|
||||||
__m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
|
|
||||||
// q += ((numer ^ sign) - sign);
|
|
||||||
q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
|
|
||||||
}
|
|
||||||
// q >>= shift
|
|
||||||
q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
|
|
||||||
q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0)
|
|
||||||
return q;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__m256i libdivide_s16_branchfree_do_vec256(
|
__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) {
|
||||||
__m256i numers, const struct libdivide_s16_branchfree_t *denom) {
|
SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree)
|
||||||
int16_t magic = denom->magic;
|
|
||||||
uint8_t more = denom->more;
|
|
||||||
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
|
|
||||||
// must be arithmetic shift
|
|
||||||
__m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
|
|
||||||
__m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
|
|
||||||
q = _mm256_add_epi16(q, numers); // q += numers
|
|
||||||
|
|
||||||
// If q is non-negative, we have nothing to do
|
|
||||||
// If q is negative, we want to add either (2**shift)-1 if d is
|
|
||||||
// a power of 2, or (2**shift) if it is not a power of 2
|
|
||||||
uint16_t is_power_of_2 = (magic == 0);
|
|
||||||
__m256i q_sign = _mm256_srai_epi16(q, 15); // q_sign = q >> 15
|
|
||||||
__m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
|
|
||||||
q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
|
|
||||||
q = _mm256_srai_epi16(q, shift); // q >>= shift
|
|
||||||
q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
|
|
||||||
return q;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
////////// SINT32
|
////////// SINT32
|
||||||
|
|
@ -2754,25 +2661,11 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y
|
||||||
////////// UINT26
|
////////// UINT26
|
||||||
|
|
||||||
__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
|
__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
|
||||||
uint8_t more = denom->more;
|
SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16)
|
||||||
if (!denom->magic) {
|
|
||||||
return _mm_srli_epi16(numers, more);
|
|
||||||
} else {
|
|
||||||
__m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
|
|
||||||
if (more & LIBDIVIDE_ADD_MARKER) {
|
|
||||||
__m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
|
|
||||||
return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
|
|
||||||
} else {
|
|
||||||
return _mm_srli_epi16(q, more);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__m128i libdivide_u16_branchfree_do_vec128(
|
__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) {
|
||||||
__m128i numers, const struct libdivide_u16_branchfree_t *denom) {
|
SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree)
|
||||||
__m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
|
|
||||||
__m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
|
|
||||||
return _mm_srli_epi16(t, denom->more);
|
|
||||||
}
|
}
|
||||||
|
|
||||||
////////// UINT32
|
////////// UINT32
|
||||||
|
|
@ -2832,54 +2725,11 @@ __m128i libdivide_u64_branchfree_do_vec128(
|
||||||
////////// SINT16
|
////////// SINT16
|
||||||
|
|
||||||
__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
|
__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
|
||||||
uint8_t more = denom->more;
|
SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16)
|
||||||
if (!denom->magic) {
|
|
||||||
uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
|
|
||||||
uint16_t mask = ((uint16_t)1 << shift) - 1;
|
|
||||||
__m128i roundToZeroTweak = _mm_set1_epi16(mask);
|
|
||||||
// q = numer + ((numer >> 15) & roundToZeroTweak);
|
|
||||||
__m128i q =
|
|
||||||
_mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
|
|
||||||
q = _mm_srai_epi16(q, shift);
|
|
||||||
__m128i sign = _mm_set1_epi16((int8_t)more >> 7);
|
|
||||||
// q = (q ^ sign) - sign;
|
|
||||||
q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
|
|
||||||
return q;
|
|
||||||
} else {
|
|
||||||
__m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
|
|
||||||
if (more & LIBDIVIDE_ADD_MARKER) {
|
|
||||||
// must be arithmetic shift
|
|
||||||
__m128i sign = _mm_set1_epi16((int8_t)more >> 7);
|
|
||||||
// q += ((numer ^ sign) - sign);
|
|
||||||
q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
|
|
||||||
}
|
|
||||||
// q >>= shift
|
|
||||||
q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
|
|
||||||
q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0)
|
|
||||||
return q;
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
__m128i libdivide_s16_branchfree_do_vec128(
|
__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) {
|
||||||
__m128i numers, const struct libdivide_s16_branchfree_t *denom) {
|
SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree)
|
||||||
int16_t magic = denom->magic;
|
|
||||||
uint8_t more = denom->more;
|
|
||||||
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
|
|
||||||
// must be arithmetic shift
|
|
||||||
__m128i sign = _mm_set1_epi16((int8_t)more >> 7);
|
|
||||||
__m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
|
|
||||||
q = _mm_add_epi16(q, numers); // q += numers
|
|
||||||
|
|
||||||
// If q is non-negative, we have nothing to do
|
|
||||||
// If q is negative, we want to add either (2**shift)-1 if d is
|
|
||||||
// a power of 2, or (2**shift) if it is not a power of 2
|
|
||||||
uint16_t is_power_of_2 = (magic == 0);
|
|
||||||
__m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15
|
|
||||||
__m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
|
|
||||||
q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
|
|
||||||
q = _mm_srai_epi16(q, shift); // q >>= shift
|
|
||||||
q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
|
|
||||||
return q;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
////////// SINT32
|
////////// SINT32
|
||||||
|
|
@ -2945,8 +2795,8 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de
|
||||||
uint64_t mask = ((uint64_t)1 << shift) - 1;
|
uint64_t mask = ((uint64_t)1 << shift) - 1;
|
||||||
__m128i roundToZeroTweak = _mm_set1_epi64x(mask);
|
__m128i roundToZeroTweak = _mm_set1_epi64x(mask);
|
||||||
// q = numer + ((numer >> 63) & roundToZeroTweak);
|
// q = numer + ((numer >> 63) & roundToZeroTweak);
|
||||||
__m128i q = _mm_add_epi64(
|
__m128i q =
|
||||||
numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
|
_mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
|
||||||
q = libdivide_s64_shift_right_vec128(q, shift);
|
q = libdivide_s64_shift_right_vec128(q, shift);
|
||||||
__m128i sign = _mm_set1_epi32((int8_t)more >> 7);
|
__m128i sign = _mm_set1_epi32((int8_t)more >> 7);
|
||||||
// q = (q ^ sign) - sign;
|
// q = (q ^ sign) - sign;
|
||||||
|
|
@ -2997,80 +2847,49 @@ __m128i libdivide_s64_branchfree_do_vec128(
|
||||||
|
|
||||||
#ifdef __cplusplus
|
#ifdef __cplusplus
|
||||||
|
|
||||||
//for constexpr zero initialization,
|
|
||||||
//c++11 might handle things ok,
|
|
||||||
//but just limit to at least c++14 to ensure
|
|
||||||
//we don't break anyone's code:
|
|
||||||
|
|
||||||
// for gcc and clang, use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
|
|
||||||
#if (defined(__GNUC__) || defined(__clang__)) && (__cpp_constexpr >= 201304L)
|
|
||||||
#define LIBDIVIDE_CONSTEXPR constexpr
|
|
||||||
|
|
||||||
// supposedly, MSVC might not implement feature test macros right (https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c)
|
|
||||||
// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS 2017 15.0 (for extended constexpr support https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
|
|
||||||
#elif defined(_MSC_VER) && _MSC_VER >= 1910 && defined(_MSVC_LANG) && _MSVC_LANG >=201402L
|
|
||||||
#define LIBDIVIDE_CONSTEXPR constexpr
|
|
||||||
|
|
||||||
// in case some other obscure compiler has the right __cpp_constexpr :
|
|
||||||
#elif defined(__cpp_constexpr) && __cpp_constexpr >= 201304L
|
|
||||||
#define LIBDIVIDE_CONSTEXPR constexpr
|
|
||||||
|
|
||||||
#else
|
|
||||||
#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
|
|
||||||
#endif
|
|
||||||
|
|
||||||
enum Branching {
|
enum Branching {
|
||||||
BRANCHFULL, // use branching algorithms
|
BRANCHFULL, // use branching algorithms
|
||||||
BRANCHFREE // use branchfree algorithms
|
BRANCHFREE // use branchfree algorithms
|
||||||
};
|
};
|
||||||
|
|
||||||
namespace detail {
|
|
||||||
enum Signedness {
|
|
||||||
SIGNED,
|
|
||||||
UNSIGNED,
|
|
||||||
};
|
|
||||||
|
|
||||||
#if defined(LIBDIVIDE_NEON)
|
#if defined(LIBDIVIDE_NEON)
|
||||||
// Helper to deduce NEON vector type for integral type.
|
// Helper to deduce NEON vector type for integral type.
|
||||||
template <int _WIDTH, Signedness _SIGN>
|
template <typename T>
|
||||||
struct NeonVec {};
|
struct NeonVecFor {};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct NeonVec<16, UNSIGNED> {
|
struct NeonVecFor<uint16_t> {
|
||||||
typedef uint16x8_t type;
|
typedef uint16x8_t type;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct NeonVec<16, SIGNED> {
|
struct NeonVecFor<int16_t> {
|
||||||
typedef int16x8_t type;
|
typedef int16x8_t type;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct NeonVec<32, UNSIGNED> {
|
struct NeonVecFor<uint32_t> {
|
||||||
typedef uint32x4_t type;
|
typedef uint32x4_t type;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct NeonVec<32, SIGNED> {
|
struct NeonVecFor<int32_t> {
|
||||||
typedef int32x4_t type;
|
typedef int32x4_t type;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct NeonVec<64, UNSIGNED> {
|
struct NeonVecFor<uint64_t> {
|
||||||
typedef uint64x2_t type;
|
typedef uint64x2_t type;
|
||||||
};
|
};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct NeonVec<64, SIGNED> {
|
struct NeonVecFor<int64_t> {
|
||||||
typedef int64x2_t type;
|
typedef int64x2_t type;
|
||||||
};
|
};
|
||||||
|
#endif
|
||||||
|
|
||||||
template <typename T>
|
// Versions of our algorithms for SIMD.
|
||||||
struct NeonVecFor {
|
#if defined(LIBDIVIDE_NEON)
|
||||||
// See 'class divider' for an explanation of these template parameters.
|
|
||||||
typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
|
|
||||||
};
|
|
||||||
|
|
||||||
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \
|
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \
|
||||||
LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
|
LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
|
||||||
typename NeonVecFor<INT_TYPE>::type n) const { \
|
typename NeonVecFor<INT_TYPE>::type n) const { \
|
||||||
|
|
@ -3079,7 +2898,6 @@ struct NeonVecFor {
|
||||||
#else
|
#else
|
||||||
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
|
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if defined(LIBDIVIDE_SSE2)
|
#if defined(LIBDIVIDE_SSE2)
|
||||||
#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \
|
#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \
|
||||||
LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \
|
LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \
|
||||||
|
|
@ -3112,7 +2930,6 @@ struct NeonVecFor {
|
||||||
#define DISPATCHER_GEN(T, ALGO) \
|
#define DISPATCHER_GEN(T, ALGO) \
|
||||||
libdivide_##ALGO##_t denom; \
|
libdivide_##ALGO##_t denom; \
|
||||||
LIBDIVIDE_INLINE dispatcher() {} \
|
LIBDIVIDE_INLINE dispatcher() {} \
|
||||||
explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \
|
|
||||||
LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \
|
LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \
|
||||||
LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
|
LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
|
||||||
LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
|
LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
|
||||||
|
|
@ -3122,81 +2939,66 @@ struct NeonVecFor {
|
||||||
LIBDIVIDE_DIVIDE_AVX512(ALGO)
|
LIBDIVIDE_DIVIDE_AVX512(ALGO)
|
||||||
|
|
||||||
// The dispatcher selects a specific division algorithm for a given
|
// The dispatcher selects a specific division algorithm for a given
|
||||||
// width, signedness, and ALGO using partial template specialization.
|
// type and ALGO using partial template specialization.
|
||||||
template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
|
template <typename _IntT, Branching ALGO>
|
||||||
struct dispatcher {};
|
struct dispatcher {};
|
||||||
|
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<16, SIGNED, BRANCHFULL> {
|
struct dispatcher<int16_t, BRANCHFULL> {
|
||||||
DISPATCHER_GEN(int16_t, s16)
|
DISPATCHER_GEN(int16_t, s16)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<16, SIGNED, BRANCHFREE> {
|
struct dispatcher<int16_t, BRANCHFREE> {
|
||||||
DISPATCHER_GEN(int16_t, s16_branchfree)
|
DISPATCHER_GEN(int16_t, s16_branchfree)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<16, UNSIGNED, BRANCHFULL> {
|
struct dispatcher<uint16_t, BRANCHFULL> {
|
||||||
DISPATCHER_GEN(uint16_t, u16)
|
DISPATCHER_GEN(uint16_t, u16)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<16, UNSIGNED, BRANCHFREE> {
|
struct dispatcher<uint16_t, BRANCHFREE> {
|
||||||
DISPATCHER_GEN(uint16_t, u16_branchfree)
|
DISPATCHER_GEN(uint16_t, u16_branchfree)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<32, SIGNED, BRANCHFULL> {
|
struct dispatcher<int32_t, BRANCHFULL> {
|
||||||
DISPATCHER_GEN(int32_t, s32)
|
DISPATCHER_GEN(int32_t, s32)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<32, SIGNED, BRANCHFREE> {
|
struct dispatcher<int32_t, BRANCHFREE> {
|
||||||
DISPATCHER_GEN(int32_t, s32_branchfree)
|
DISPATCHER_GEN(int32_t, s32_branchfree)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<32, UNSIGNED, BRANCHFULL> {
|
struct dispatcher<uint32_t, BRANCHFULL> {
|
||||||
DISPATCHER_GEN(uint32_t, u32)
|
DISPATCHER_GEN(uint32_t, u32)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<32, UNSIGNED, BRANCHFREE> {
|
struct dispatcher<uint32_t, BRANCHFREE> {
|
||||||
DISPATCHER_GEN(uint32_t, u32_branchfree)
|
DISPATCHER_GEN(uint32_t, u32_branchfree)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<64, SIGNED, BRANCHFULL> {
|
struct dispatcher<int64_t, BRANCHFULL> {
|
||||||
DISPATCHER_GEN(int64_t, s64)
|
DISPATCHER_GEN(int64_t, s64)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<64, SIGNED, BRANCHFREE> {
|
struct dispatcher<int64_t, BRANCHFREE> {
|
||||||
DISPATCHER_GEN(int64_t, s64_branchfree)
|
DISPATCHER_GEN(int64_t, s64_branchfree)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<64, UNSIGNED, BRANCHFULL> {
|
struct dispatcher<uint64_t, BRANCHFULL> {
|
||||||
DISPATCHER_GEN(uint64_t, u64)
|
DISPATCHER_GEN(uint64_t, u64)
|
||||||
};
|
};
|
||||||
template <>
|
template <>
|
||||||
struct dispatcher<64, UNSIGNED, BRANCHFREE> {
|
struct dispatcher<uint64_t, BRANCHFREE> {
|
||||||
DISPATCHER_GEN(uint64_t, u64_branchfree)
|
DISPATCHER_GEN(uint64_t, u64_branchfree)
|
||||||
};
|
};
|
||||||
} // namespace detail
|
|
||||||
|
|
||||||
#if defined(LIBDIVIDE_NEON)
|
|
||||||
// Allow NeonVecFor outside of detail namespace.
|
|
||||||
template <typename T>
|
|
||||||
struct NeonVecFor {
|
|
||||||
typedef typename detail::NeonVecFor<T>::type type;
|
|
||||||
};
|
|
||||||
#endif
|
|
||||||
|
|
||||||
// This is the main divider class for use by the user (C++ API).
|
// This is the main divider class for use by the user (C++ API).
|
||||||
// The actual division algorithm is selected using the dispatcher struct
|
// The actual division algorithm is selected using the dispatcher struct
|
||||||
// based on the integer width and algorithm template parameters.
|
// based on the integer and algorithm template parameters.
|
||||||
template <typename T, Branching ALGO = BRANCHFULL>
|
template <typename T, Branching ALGO = BRANCHFULL>
|
||||||
class divider {
|
class divider {
|
||||||
private:
|
private:
|
||||||
// Dispatch based on the size and signedness.
|
typedef dispatcher<T, ALGO> dispatcher_t;
|
||||||
// We avoid using type_traits as it's not available in AVR.
|
|
||||||
// Detect signedness by checking if T(-1) is less than T(0).
|
|
||||||
// Also throw in a shift by 0, which prevents floating point types from being passed.
|
|
||||||
typedef detail::dispatcher<sizeof(T) * 8,
|
|
||||||
(((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
|
|
||||||
dispatcher_t;
|
|
||||||
|
|
||||||
public:
|
public:
|
||||||
// We leave the default constructor empty so that creating
|
// We leave the default constructor empty so that creating
|
||||||
|
|
@ -3204,9 +3006,6 @@ class divider {
|
||||||
// later doesn't slow us down.
|
// later doesn't slow us down.
|
||||||
divider() {}
|
divider() {}
|
||||||
|
|
||||||
// constexpr zero-initialization to allow for use w/ static constinit
|
|
||||||
explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}
|
|
||||||
|
|
||||||
// Constructor that takes the divisor as a parameter
|
// Constructor that takes the divisor as a parameter
|
||||||
LIBDIVIDE_INLINE divider(T d) : div(d) {}
|
LIBDIVIDE_INLINE divider(T d) : div(d) {}
|
||||||
|
|
||||||
|
|
@ -3218,7 +3017,7 @@ class divider {
|
||||||
T recover() const { return div.recover(); }
|
T recover() const { return div.recover(); }
|
||||||
|
|
||||||
bool operator==(const divider<T, ALGO> &other) const {
|
bool operator==(const divider<T, ALGO> &other) const {
|
||||||
return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
|
return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }
|
bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }
|
||||||
|
|
@ -3299,14 +3098,12 @@ LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
|
||||||
|
|
||||||
#if defined(LIBDIVIDE_NEON)
|
#if defined(LIBDIVIDE_NEON)
|
||||||
template <typename T, Branching ALGO>
|
template <typename T, Branching ALGO>
|
||||||
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
|
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
|
||||||
typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
|
|
||||||
return div.divide(n);
|
return div.divide(n);
|
||||||
}
|
}
|
||||||
|
|
||||||
template <typename T, Branching ALGO>
|
template <typename T, Branching ALGO>
|
||||||
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
|
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
|
||||||
typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
|
|
||||||
n = div.divide(n);
|
n = div.divide(n);
|
||||||
return n;
|
return n;
|
||||||
}
|
}
|
||||||
|
|
|
||||||
9
util.h
9
util.h
|
|
@ -9,9 +9,7 @@
|
||||||
#define noreturn __attribute__((noreturn))
|
#define noreturn __attribute__((noreturn))
|
||||||
|
|
||||||
#define likely(x) __builtin_expect(!!(x), 1)
|
#define likely(x) __builtin_expect(!!(x), 1)
|
||||||
#define likely51(x) __builtin_expect_with_probability(!!(x), 1, 0.51)
|
|
||||||
#define unlikely(x) __builtin_expect(!!(x), 0)
|
#define unlikely(x) __builtin_expect(!!(x), 0)
|
||||||
#define unlikely51(x) __builtin_expect_with_probability(!!(x), 0, 0.51)
|
|
||||||
|
|
||||||
#define min(x, y) ({ \
|
#define min(x, y) ({ \
|
||||||
__typeof__(x) _x = (x); \
|
__typeof__(x) _x = (x); \
|
||||||
|
|
@ -32,13 +30,6 @@
|
||||||
#define STRINGIFY(s) #s
|
#define STRINGIFY(s) #s
|
||||||
#define ALIAS(f) __attribute__((alias(STRINGIFY(f))))
|
#define ALIAS(f) __attribute__((alias(STRINGIFY(f))))
|
||||||
|
|
||||||
// supported since GCC 15
|
|
||||||
#if __has_attribute (nonstring)
|
|
||||||
# define NONSTRING __attribute__ ((nonstring))
|
|
||||||
#else
|
|
||||||
# define NONSTRING
|
|
||||||
#endif
|
|
||||||
|
|
||||||
typedef uint8_t u8;
|
typedef uint8_t u8;
|
||||||
typedef uint16_t u16;
|
typedef uint16_t u16;
|
||||||
typedef uint32_t u32;
|
typedef uint32_t u32;
|
||||||
|
|
|
||||||
Loading…
Add table
Reference in a new issue