Compare commits

...

23 commits

Author SHA1 Message Date
Charles
5cb0ff9f4d gitignore: use exact matches 2025-10-29 16:26:38 -04:00
Daniel Micay
e371736b17 drop legacy compiler versions from GitHub workflow 2025-09-23 18:12:57 -04:00
Daniel Micay
c46d3cab33 add newer Clang versions for GitHub workflow 2025-09-23 18:12:39 -04:00
Christian Göttsche
33ed3027ab Fix two typos 2025-09-21 12:35:28 -04:00
Christian Göttsche
86dde60fcf ReadMe: adjust section about library location 2025-09-21 12:35:28 -04:00
charles25565
ff99511eb4 Update dependencies in README
Update from bookworm to trixie, updating GKIs, and changing to Android 16.
2025-09-17 11:03:53 -04:00
Daniel Micay
c392d40843 update GitHub actions/checkout to 5 2025-08-12 00:28:58 -04:00
Віктор Дуйко
7481c8857f docs: updated the license date 2025-04-05 13:13:18 -04:00
Christian Göttsche
1d7fc7ffe0 support GCC15
GCC 15 starts warning about non NUL-terminated string literals:

    chacha.c:44:31: error: initializer-string for array of ‘char’ truncates NUL terminator but destination lacks ‘nonstring’ attribute (17 chars into 16 available) [-Werror=unterminated-string-initialization]
       44 | static const char sigma[16] = "expand 32-byte k";
          |                               ^~~~~~~~~~~~~~~~~~
2025-04-03 18:31:55 -04:00
Daniel Micay
4fe9018b6f rename calculate_waste.py to calculate-waste 2025-02-17 12:47:30 -05:00
Daniel Micay
3ab23f7ebf update libdivide to 5.2.0 2025-01-25 16:13:22 -05:00
Daniel Micay
c894f3ec1d add newer compiler versions for GitHub workflow 2024-12-15 22:20:01 -05:00
Daniel Micay
c97263ef0c handle GitHub runner image updates
clang-14 and clang-15 are no longer installed by default.
2024-12-15 22:18:40 -05:00
Daniel Micay
a7302add63 update outdated branch in README 2024-10-23 06:36:02 -04:00
Daniel Micay
b1d9571fec remove trailing whitespace 2024-10-12 03:23:52 -04:00
Daniel Micay
e03579253a preserve PROT_MTE when releasing memory 2024-10-12 03:19:16 -04:00
Daniel Micay
9739cb4690 use wrapper for calling memory_map_mte 2024-10-12 03:19:03 -04:00
Daniel Micay
aa950244f8 reuse code for memory_map_mte
This drops the separate error message since that doesn't seem useful.
2024-10-12 03:18:36 -04:00
Daniel Micay
6402e2b0d4 reduce probability hint for is_memtag_enabled 2024-10-12 03:17:44 -04:00
Daniel Micay
e86192e7fe remove redundant warning switches for Android
Android already enables -Wall and -Wextra in the global soong build
settings.
2024-10-09 19:57:15 -04:00
Julien Voisin
6ce663a8bd Fix -Wimplicit-function-declaration error with gcc 14.
```
malloc_info.c: In function 'leak_memory':
malloc_info.c:12:12: error: implicit declaration of function 'malloc' [-Wimplicit-function-declaration]
   12 |     (void)!malloc(1024 * 1024 * 1024);
      |            ^~~~~~
malloc_info.c:10:1: note: include '<stdlib.h>' or provide a declaration of 'malloc'
    9 | #include "../util.h"
  +++ |+#include <stdlib.h>
   10 |
malloc_info.c:12:12: warning: incompatible implicit declaration of built-in function 'malloc' [-Wbuiltin-declaration-mismatch]
   12 |     (void)!malloc(1024 * 1024 * 1024);
      |            ^~~~~~
```

Taken from https://gitlab.alpinelinux.org/alpine/aports/-/merge_requests/72971/

Co-authored-by: @mio
2024-10-03 23:44:15 -04:00
maade93791
9ca9d2d925 android: use more basic CPU target for memtag
This is required for hardened_malloc to work in microdroid on MTE-enabled devices (currently, 8th
and 9th generation Pixels) since PVMFW only supports ARMv8 cores.

https://android.googlesource.com/platform/packages/modules/Virtualization/+/refs/tags/android-15.0.0_r1/pvmfw/platform.dts#100
2024-09-09 19:22:23 -04:00
Daniel Micay
3f07acfab1 update libdivide to 5.1 2024-08-05 02:25:55 -04:00
15 changed files with 420 additions and 183 deletions

View file

@ -11,9 +11,9 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
version: [12] version: [14]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v5
- name: Setting up gcc version - name: Setting up gcc version
run: | run: |
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100 sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100
@ -24,9 +24,11 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
version: [14, 15] version: [19, 20]
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v5
- name: Install dependencies
run: sudo apt-get update && sudo apt-get install -y --no-install-recommends clang-19 clang-20
- name: Setting up clang version - name: Setting up clang version
run: | run: |
sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100 sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100
@ -38,7 +40,7 @@ jobs:
container: container:
image: alpine:latest image: alpine:latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v5
- name: Install dependencies - name: Install dependencies
run: apk update && apk add build-base python3 run: apk update && apk add build-base python3
- name: Build - name: Build
@ -46,7 +48,7 @@ jobs:
build-ubuntu-gcc-aarch64: build-ubuntu-gcc-aarch64:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v4 - uses: actions/checkout@v5
- name: Install dependencies - name: Install dependencies
run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgcc-s1-arm64-cross cpp-aarch64-linux-gnu run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgcc-s1-arm64-cross cpp-aarch64-linux-gnu
- name: Build - name: Build

4
.gitignore vendored
View file

@ -1,2 +1,2 @@
out/ /out/
out-light/ /out-light/

View file

@ -5,8 +5,6 @@ common_cflags = [
"-fPIC", "-fPIC",
"-fvisibility=hidden", "-fvisibility=hidden",
//"-fno-plt", //"-fno-plt",
"-Wall",
"-Wextra",
"-Wcast-align", "-Wcast-align",
"-Wcast-qual", "-Wcast-qual",
"-Wwrite-strings", "-Wwrite-strings",
@ -74,7 +72,7 @@ cc_library {
cflags: ["-DLABEL_MEMORY"], cflags: ["-DLABEL_MEMORY"],
}, },
device_has_arm_mte: { device_has_arm_mte: {
cflags: ["-DHAS_ARM_MTE", "-march=armv9-a+memtag"] cflags: ["-DHAS_ARM_MTE", "-march=armv8-a+dotprod+memtag"]
}, },
}, },
apex_available: [ apex_available: [

View file

@ -1,4 +1,4 @@
Copyright © 2018-2024 GrapheneOS Copyright © 2018-2025 GrapheneOS
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View file

@ -65,14 +65,14 @@ used instead as this allocator fundamentally doesn't support that environment.
## Dependencies ## Dependencies
Debian stable (currently Debian 12) determines the most ancient set of Debian stable (currently Debian 13) determines the most ancient set of
supported dependencies: supported dependencies:
* glibc 2.36 * glibc 2.41
* Linux 6.1 * Linux 6.12
* Clang 14.0.6 or GCC 12.2.0 * Clang 19.1.7 or GCC 14.2.0
For Android, the Linux GKI 5.10, 5.15 and 6.1 branches are supported. For Android, the Linux GKI 6.1, 6.6 and 6.12 branches are supported.
However, using more recent releases is highly recommended. Older versions of However, using more recent releases is highly recommended. Older versions of
the dependencies may be compatible at the moment but are not tested and will the dependencies may be compatible at the moment but are not tested and will
@ -83,7 +83,7 @@ there will be custom integration offering better performance in the future
along with other hardening for the C standard library implementation. along with other hardening for the C standard library implementation.
For Android, only the current generation, actively developed maintenance branch of the Android For Android, only the current generation, actively developed maintenance branch of the Android
Open Source Project will be supported, which currently means `android13-qpr2-release`. Open Source Project will be supported, which currently means `android16-release`.
## Testing ## Testing
@ -159,8 +159,11 @@ line to the `/etc/ld.so.preload` configuration file:
The format of this configuration file is a whitespace-separated list, so it's The format of this configuration file is a whitespace-separated list, so it's
good practice to put each library on a separate line. good practice to put each library on a separate line.
On Debian systems `libhardened_malloc.so` should be installed into `/usr/lib/` For maximum compatibility `libhardened_malloc.so` can be installed into
to avoid preload failures caused by AppArmor profile restrictions. `/usr/lib/` to avoid preload failures caused by AppArmor profiles or systemd
ExecPaths= restrictions. Check for logs of the following format:
ERROR: ld.so: object '/usr/local/lib/libhardened_malloc.so' from /etc/ld.so.preload cannot be preloaded (failed to map segment from shared object): ignored.
Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis
will not work when `AT_SECURE` is set such as with setuid binaries. It's also will not work when `AT_SECURE` is set such as with setuid binaries. It's also

View file

@ -44,7 +44,7 @@ void *set_pointer_tag(void *ptr, u8 tag) {
return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr)); return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr));
} }
// This test checks that slab slot allocation uses tag that is distint from tags of its neighbors // This test checks that slab slot allocation uses tag that is distinct from tags of its neighbors
// and from the tag of the previous allocation that used the same slot // and from the tag of the previous allocation that used the same slot
void tag_distinctness() { void tag_distinctness() {
// tag 0 is reserved // tag 0 is reserved

View file

@ -41,7 +41,7 @@ static const unsigned rounds = 8;
a = PLUS(a, b); d = ROTATE(XOR(d, a), 8); \ a = PLUS(a, b); d = ROTATE(XOR(d, a), 8); \
c = PLUS(c, d); b = ROTATE(XOR(b, c), 7); c = PLUS(c, d); b = ROTATE(XOR(b, c), 7);
static const char sigma[16] = "expand 32-byte k"; static const char sigma[16] NONSTRING = "expand 32-byte k";
void chacha_keysetup(chacha_ctx *x, const u8 *k) { void chacha_keysetup(chacha_ctx *x, const u8 *k) {
x->input[0] = U8TO32_LITTLE(sigma + 0); x->input[0] = U8TO32_LITTLE(sigma + 0);

View file

@ -94,6 +94,24 @@ static inline bool is_memtag_enabled(void) {
} }
#endif #endif
static void *memory_map_tagged(size_t size) {
#ifdef HAS_ARM_MTE
if (likely51(is_memtag_enabled())) {
return memory_map_mte(size);
}
#endif
return memory_map(size);
}
static bool memory_map_fixed_tagged(void *ptr, size_t size) {
#ifdef HAS_ARM_MTE
if (likely51(is_memtag_enabled())) {
return memory_map_fixed_mte(ptr, size);
}
#endif
return memory_map_fixed(ptr, size);
}
#define SLAB_METADATA_COUNT #define SLAB_METADATA_COUNT
struct slab_metadata { struct slab_metadata {
@ -470,7 +488,7 @@ static void write_after_free_check(const char *p, size_t size) {
} }
#ifdef HAS_ARM_MTE #ifdef HAS_ARM_MTE
if (likely(is_memtag_enabled())) { if (likely51(is_memtag_enabled())) {
return; return;
} }
#endif #endif
@ -505,7 +523,7 @@ static void set_slab_canary_value(UNUSED struct slab_metadata *metadata, UNUSED
static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) { static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void *p, UNUSED size_t size) {
#if SLAB_CANARY #if SLAB_CANARY
#ifdef HAS_ARM_MTE #ifdef HAS_ARM_MTE
if (likely(is_memtag_enabled())) { if (likely51(is_memtag_enabled())) {
return; return;
} }
#endif #endif
@ -517,7 +535,7 @@ static void set_canary(UNUSED const struct slab_metadata *metadata, UNUSED void
static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) { static void check_canary(UNUSED const struct slab_metadata *metadata, UNUSED const void *p, UNUSED size_t size) {
#if SLAB_CANARY #if SLAB_CANARY
#ifdef HAS_ARM_MTE #ifdef HAS_ARM_MTE
if (likely(is_memtag_enabled())) { if (likely51(is_memtag_enabled())) {
return; return;
} }
#endif #endif
@ -624,7 +642,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
write_after_free_check(p, size - canary_size); write_after_free_check(p, size - canary_size);
set_canary(metadata, p, size); set_canary(metadata, p, size);
#ifdef HAS_ARM_MTE #ifdef HAS_ARM_MTE
if (likely(is_memtag_enabled())) { if (likely51(is_memtag_enabled())) {
p = tag_and_clear_slab_slot(metadata, p, slot, size); p = tag_and_clear_slab_slot(metadata, p, slot, size);
} }
#endif #endif
@ -661,7 +679,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
if (requested_size) { if (requested_size) {
set_canary(metadata, p, size); set_canary(metadata, p, size);
#ifdef HAS_ARM_MTE #ifdef HAS_ARM_MTE
if (likely(is_memtag_enabled())) { if (likely51(is_memtag_enabled())) {
p = tag_and_clear_slab_slot(metadata, p, slot, size); p = tag_and_clear_slab_slot(metadata, p, slot, size);
} }
#endif #endif
@ -688,7 +706,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
if (requested_size) { if (requested_size) {
set_canary(metadata, p, size); set_canary(metadata, p, size);
#ifdef HAS_ARM_MTE #ifdef HAS_ARM_MTE
if (likely(is_memtag_enabled())) { if (likely51(is_memtag_enabled())) {
p = tag_and_clear_slab_slot(metadata, p, slot, size); p = tag_and_clear_slab_slot(metadata, p, slot, size);
} }
#endif #endif
@ -717,7 +735,7 @@ static inline void *allocate_small(unsigned arena, size_t requested_size) {
write_after_free_check(p, size - canary_size); write_after_free_check(p, size - canary_size);
set_canary(metadata, p, size); set_canary(metadata, p, size);
#ifdef HAS_ARM_MTE #ifdef HAS_ARM_MTE
if (likely(is_memtag_enabled())) { if (likely51(is_memtag_enabled())) {
p = tag_and_clear_slab_slot(metadata, p, slot, size); p = tag_and_clear_slab_slot(metadata, p, slot, size);
} }
#endif #endif
@ -805,7 +823,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) {
bool skip_zero = false; bool skip_zero = false;
#ifdef HAS_ARM_MTE #ifdef HAS_ARM_MTE
if (likely(is_memtag_enabled())) { if (likely51(is_memtag_enabled())) {
arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size); arm_mte_tag_and_clear_mem(set_pointer_tag(p, RESERVED_TAG), size);
// metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot() // metadata->arm_mte_tags is intentionally not updated, see tag_and_clear_slab_slot()
skip_zero = true; skip_zero = true;
@ -890,7 +908,7 @@ static inline void deallocate_small(void *p, const size_t *expected_size) {
if (c->empty_slabs_total + slab_size > max_empty_slabs_total) { if (c->empty_slabs_total + slab_size > max_empty_slabs_total) {
int saved_errno = errno; int saved_errno = errno;
if (!memory_map_fixed(slab, slab_size)) { if (!memory_map_fixed_tagged(slab, slab_size)) {
label_slab(slab, slab_size, class); label_slab(slab, slab_size, class);
stats_slab_deallocate(c, slab_size); stats_slab_deallocate(c, slab_size);
enqueue_free_slab(c, metadata); enqueue_free_slab(c, metadata);
@ -1242,15 +1260,7 @@ COLD static void init_slow_path(void) {
if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) { if (unlikely(memory_protect_rw_metadata(ra->regions, ra->total * sizeof(struct region_metadata)))) {
fatal_error("failed to unprotect memory for regions table"); fatal_error("failed to unprotect memory for regions table");
} }
#ifdef HAS_ARM_MTE ro.slab_region_start = memory_map_tagged(slab_region_size);
if (likely(is_memtag_enabled())) {
ro.slab_region_start = memory_map_mte(slab_region_size);
} else {
ro.slab_region_start = memory_map(slab_region_size);
}
#else
ro.slab_region_start = memory_map(slab_region_size);
#endif
if (unlikely(ro.slab_region_start == NULL)) { if (unlikely(ro.slab_region_start == NULL)) {
fatal_error("failed to allocate slab region"); fatal_error("failed to allocate slab region");
} }
@ -1895,7 +1905,7 @@ EXPORT int h_malloc_trim(UNUSED size_t pad) {
struct slab_metadata *iterator = c->empty_slabs; struct slab_metadata *iterator = c->empty_slabs;
while (iterator) { while (iterator) {
void *slab = get_slab(c, slab_size, iterator); void *slab = get_slab(c, slab_size, iterator);
if (memory_map_fixed(slab, slab_size)) { if (memory_map_fixed_tagged(slab, slab_size)) {
break; break;
} }
label_slab(slab, slab_size, class); label_slab(slab, slab_size, class);

View file

@ -17,8 +17,8 @@
#include "memory.h" #include "memory.h"
#include "util.h" #include "util.h"
void *memory_map(size_t size) { static void *memory_map_prot(size_t size, int prot) {
void *p = mmap(NULL, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); void *p = mmap(NULL, size, prot, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0);
if (unlikely(p == MAP_FAILED)) { if (unlikely(p == MAP_FAILED)) {
if (errno != ENOMEM) { if (errno != ENOMEM) {
fatal_error("non-ENOMEM mmap failure"); fatal_error("non-ENOMEM mmap failure");
@ -28,22 +28,19 @@ void *memory_map(size_t size) {
return p; return p;
} }
void *memory_map(size_t size) {
return memory_map_prot(size, PROT_NONE);
}
#ifdef HAS_ARM_MTE #ifdef HAS_ARM_MTE
// Note that PROT_MTE can't be cleared via mprotect // Note that PROT_MTE can't be cleared via mprotect
void *memory_map_mte(size_t size) { void *memory_map_mte(size_t size) {
void *p = mmap(NULL, size, PROT_MTE, MAP_ANONYMOUS|MAP_PRIVATE, -1, 0); return memory_map_prot(size, PROT_MTE);
if (unlikely(p == MAP_FAILED)) {
if (errno != ENOMEM) {
fatal_error("non-ENOMEM MTE mmap failure");
}
return NULL;
}
return p;
} }
#endif #endif
bool memory_map_fixed(void *ptr, size_t size) { static bool memory_map_fixed_prot(void *ptr, size_t size, int prot) {
void *p = mmap(ptr, size, PROT_NONE, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0); void *p = mmap(ptr, size, prot, MAP_ANONYMOUS|MAP_PRIVATE|MAP_FIXED, -1, 0);
bool ret = p == MAP_FAILED; bool ret = p == MAP_FAILED;
if (unlikely(ret) && errno != ENOMEM) { if (unlikely(ret) && errno != ENOMEM) {
fatal_error("non-ENOMEM MAP_FIXED mmap failure"); fatal_error("non-ENOMEM MAP_FIXED mmap failure");
@ -51,6 +48,17 @@ bool memory_map_fixed(void *ptr, size_t size) {
return ret; return ret;
} }
bool memory_map_fixed(void *ptr, size_t size) {
return memory_map_fixed_prot(ptr, size, PROT_NONE);
}
#ifdef HAS_ARM_MTE
// Note that PROT_MTE can't be cleared via mprotect
bool memory_map_fixed_mte(void *ptr, size_t size) {
return memory_map_fixed_prot(ptr, size, PROT_MTE);
}
#endif
bool memory_unmap(void *ptr, size_t size) { bool memory_unmap(void *ptr, size_t size) {
bool ret = munmap(ptr, size); bool ret = munmap(ptr, size);
if (unlikely(ret) && errno != ENOMEM) { if (unlikely(ret) && errno != ENOMEM) {

View file

@ -15,6 +15,9 @@ void *memory_map(size_t size);
void *memory_map_mte(size_t size); void *memory_map_mte(size_t size);
#endif #endif
bool memory_map_fixed(void *ptr, size_t size); bool memory_map_fixed(void *ptr, size_t size);
#ifdef HAS_ARM_MTE
bool memory_map_fixed_mte(void *ptr, size_t size);
#endif
bool memory_unmap(void *ptr, size_t size); bool memory_unmap(void *ptr, size_t size);
bool memory_protect_ro(void *ptr, size_t size); bool memory_protect_ro(void *ptr, size_t size);
bool memory_protect_rw(void *ptr, size_t size); bool memory_protect_rw(void *ptr, size_t size);

View file

@ -1,5 +1,6 @@
#include <pthread.h> #include <pthread.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h>
#if defined(__GLIBC__) || defined(__ANDROID__) #if defined(__GLIBC__) || defined(__ANDROID__)
#include <malloc.h> #include <malloc.h>

View file

@ -98,7 +98,7 @@ class TestSimpleMemoryCorruption(unittest.TestCase):
self.assertEqual(stderr.decode("utf-8"), self.assertEqual(stderr.decode("utf-8"),
"fatal allocator error: invalid free\n") "fatal allocator error: invalid free\n")
def test_invalid_malloc_usable_size_small_quarantene(self): def test_invalid_malloc_usable_size_small_quarantine(self):
_stdout, stderr, returncode = self.run_test( _stdout, stderr, returncode = self.run_test(
"invalid_malloc_usable_size_small_quarantine") "invalid_malloc_usable_size_small_quarantine")
self.assertEqual(returncode, -6) self.assertEqual(returncode, -6)

View file

@ -1,8 +1,8 @@
// libdivide.h - Optimized integer division // libdivide.h - Optimized integer division
// https://libdivide.com // https://libdivide.com
// //
// Copyright (C) 2010 - 2021 ridiculous_fish, <libdivide@ridiculousfish.com> // Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
// Copyright (C) 2016 - 2021 Kim Walisch, <kim.walisch@gmail.com> // Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
// //
// libdivide is dual-licensed under the Boost or zlib licenses. // libdivide is dual-licensed under the Boost or zlib licenses.
// You may use libdivide under the terms of either of these. // You may use libdivide under the terms of either of these.
@ -11,11 +11,14 @@
#ifndef LIBDIVIDE_H #ifndef LIBDIVIDE_H
#define LIBDIVIDE_H #define LIBDIVIDE_H
#define LIBDIVIDE_VERSION "5.0" // *** Version numbers are auto generated - do not edit ***
#define LIBDIVIDE_VERSION "5.2.0"
#define LIBDIVIDE_VERSION_MAJOR 5 #define LIBDIVIDE_VERSION_MAJOR 5
#define LIBDIVIDE_VERSION_MINOR 0 #define LIBDIVIDE_VERSION_MINOR 2
#define LIBDIVIDE_VERSION_PATCH 0
#include <stdint.h> #include <stdint.h>
#if !defined(__AVR__) #if !defined(__AVR__)
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
@ -24,15 +27,24 @@
#if defined(LIBDIVIDE_SSE2) #if defined(LIBDIVIDE_SSE2)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
#if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512) #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
#include <immintrin.h> #include <immintrin.h>
#endif #endif
#if defined(LIBDIVIDE_NEON) #if defined(LIBDIVIDE_NEON)
#include <arm_neon.h> #include <arm_neon.h>
#endif #endif
// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
#if defined(_MSC_VER) && defined(LIBDIVIDE_X86_64) && (!defined(__clang__) || _MSC_VER>1930)
#define LIBDIVIDE_X64_INTRINSICS
#endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
#if defined(LIBDIVIDE_X64_INTRINSICS)
#include <intrin.h> #include <intrin.h>
#endif
#pragma warning(push) #pragma warning(push)
// disable warning C4146: unary minus operator applied // disable warning C4146: unary minus operator applied
// to unsigned type, result still unsigned // to unsigned type, result still unsigned
@ -238,24 +250,32 @@ static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfr
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw( static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
int16_t numer, int16_t magic, uint8_t more); int16_t numer, int16_t magic, uint8_t more);
static LIBDIVIDE_INLINE int16_t libdivide_s16_do( static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
int16_t numer, const struct libdivide_s16_t* denom); int16_t numer, const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw( static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
uint16_t numer, uint16_t magic, uint8_t more); uint16_t numer, uint16_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do( static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
uint16_t numer, const struct libdivide_u16_t* denom); uint16_t numer, const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(
int32_t numer, int32_t magic, uint8_t more);
static LIBDIVIDE_INLINE int32_t libdivide_s32_do( static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
int32_t numer, const struct libdivide_s32_t *denom); int32_t numer, const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(
uint32_t numer, uint32_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do( static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
uint32_t numer, const struct libdivide_u32_t *denom); uint32_t numer, const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(
int64_t numer, int64_t magic, uint8_t more);
static LIBDIVIDE_INLINE int64_t libdivide_s64_do( static LIBDIVIDE_INLINE int64_t libdivide_s64_do(
int64_t numer, const struct libdivide_s64_t *denom); int64_t numer, const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(
uint64_t numer, uint64_t magic, uint8_t more);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do( static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
uint64_t numer, const struct libdivide_u64_t *denom); uint64_t numer, const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do( static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
int16_t numer, const struct libdivide_s16_branchfree_t* denom); int16_t numer, const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
uint16_t numer, const struct libdivide_u16_branchfree_t* denom); uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do( static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
int32_t numer, const struct libdivide_s32_branchfree_t *denom); int32_t numer, const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
@ -265,17 +285,17 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
uint64_t numer, const struct libdivide_u64_branchfree_t *denom); uint64_t numer, const struct libdivide_u64_branchfree_t *denom);
static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom); static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom); static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover( static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
const struct libdivide_s16_branchfree_t* denom); const struct libdivide_s16_branchfree_t *denom);
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover( static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
const struct libdivide_u16_branchfree_t* denom); const struct libdivide_u16_branchfree_t *denom);
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover( static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
const struct libdivide_s32_branchfree_t *denom); const struct libdivide_s32_branchfree_t *denom);
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover( static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
@ -314,7 +334,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
} }
static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) #if defined(LIBDIVIDE_X64_INTRINSICS)
return __umulh(x, y); return __umulh(x, y);
#elif defined(HAS_INT128_T) #elif defined(HAS_INT128_T)
__uint128_t xl = x, yl = y; __uint128_t xl = x, yl = y;
@ -340,7 +360,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
} }
static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
#if defined(LIBDIVIDE_VC) && defined(LIBDIVIDE_X86_64) #if defined(LIBDIVIDE_X64_INTRINSICS)
return __mulh(x, y); return __mulh(x, y);
#elif defined(HAS_INT128_T) #elif defined(HAS_INT128_T)
__int128_t xl = x, yl = y; __int128_t xl = x, yl = y;
@ -393,7 +413,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) { static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
#if defined(__AVR__) #if defined(__AVR__)
// Fast way to count leading zeros // Fast way to count leading zeros
return __builtin_clzl(val); return __builtin_clzl(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz) #elif defined(__GNUC__) || __has_builtin(__builtin_clz)
// Fast way to count leading zeros // Fast way to count leading zeros
@ -442,7 +462,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
// uint {v}. The result must fit in 16 bits. // uint {v}. The result must fit in 16 bits.
// Returns the quotient directly and the remainder in *r // Returns the quotient directly and the remainder in *r
static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16( static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) { uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
uint32_t n = ((uint32_t)u1 << 16) | u0; uint32_t n = ((uint32_t)u1 << 16) | u0;
uint16_t result = (uint16_t)(n / v); uint16_t result = (uint16_t)(n / v);
*r = (uint16_t)(n - result * (uint32_t)v); *r = (uint16_t)(n - result * (uint32_t)v);
@ -512,7 +532,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
// Check for overflow and divide by 0. // Check for overflow and divide by 0.
if (numhi >= den) { if (numhi >= den) {
if (r != NULL) *r = ~0ull; if (r) *r = ~0ull;
return ~0ull; return ~0ull;
} }
@ -558,11 +578,14 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
q0 = (uint32_t)qhat; q0 = (uint32_t)qhat;
// Return remainder if requested. // Return remainder if requested.
if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift; if (r) *r = (rem * b + num0 - q0 * den) >> shift;
return ((uint64_t)q1 << 32) | q0; return ((uint64_t)q1 << 32) | q0;
#endif #endif
} }
#if !(defined(HAS_INT128_T) && \
defined(HAS_INT128_DIV))
// Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
static LIBDIVIDE_INLINE void libdivide_u128_shift( static LIBDIVIDE_INLINE void libdivide_u128_shift(
uint64_t *u1, uint64_t *u0, int32_t signed_shift) { uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
@ -579,6 +602,8 @@ static LIBDIVIDE_INLINE void libdivide_u128_shift(
} }
} }
#endif
// Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64( static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
@ -696,8 +721,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
// 1 in its recovery algorithm. // 1 in its recovery algorithm.
result.magic = 0; result.magic = 0;
result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
} } else {
else {
uint8_t more; uint8_t more;
uint16_t rem, proposed_m; uint16_t rem, proposed_m;
proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem); proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
@ -709,8 +733,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) { if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
// This power works // This power works
more = floor_log_2_d; more = floor_log_2_d;
} } else {
else {
// We have to use the general 17-bit algorithm. We need to compute // We have to use the general 17-bit algorithm. We need to compute
// (2**power) / d. However, we already have (2**(power-1))/d and // (2**power) / d. However, we already have (2**(power-1))/d and
// its remainder. By doubling both, and then correcting the // its remainder. By doubling both, and then correcting the
@ -742,7 +765,7 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
} }
struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1); struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
struct libdivide_u16_branchfree_t ret = { struct libdivide_u16_branchfree_t ret = {
tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) }; tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
return ret; return ret;
} }
@ -752,14 +775,12 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
if (!magic) { if (!magic) {
return numer >> more; return numer >> more;
} } else {
else {
uint16_t q = libdivide_mullhi_u16(magic, numer); uint16_t q = libdivide_mullhi_u16(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
uint16_t t = ((numer - q) >> 1) + q; uint16_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_16_SHIFT_MASK); return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
} } else {
else {
// All upper bits are 0, // All upper bits are 0,
// don't need to mask them off. // don't need to mask them off.
return q >> more; return q >> more;
@ -767,12 +788,12 @@ uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
} }
} }
uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) { uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
return libdivide_u16_do_raw(numer, denom->magic, denom->more); return libdivide_u16_do_raw(numer, denom->magic, denom->more);
} }
uint16_t libdivide_u16_branchfree_do( uint16_t libdivide_u16_branchfree_do(
uint16_t numer, const struct libdivide_u16_branchfree_t* denom) { uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
uint16_t q = libdivide_mullhi_u16(denom->magic, numer); uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
uint16_t t = ((numer - q) >> 1) + q; uint16_t t = ((numer - q) >> 1) + q;
return t >> denom->more; return t >> denom->more;
@ -800,7 +821,7 @@ uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
// overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
// then double the quotient and remainder. // then double the quotient and remainder.
uint32_t half_n = (uint32_t)1 << (16 + shift); uint32_t half_n = (uint32_t)1 << (16 + shift);
uint32_t d = ( (uint32_t)1 << 16) | denom->magic; uint32_t d = ((uint32_t)1 << 16) | denom->magic;
// Note that the quotient is guaranteed <= 16 bits, but the remainder // Note that the quotient is guaranteed <= 16 bits, but the remainder
// may need 17! // may need 17!
uint16_t half_q = (uint16_t)(half_n / d); uint16_t half_q = (uint16_t)(half_n / d);
@ -912,12 +933,11 @@ struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
return ret; return ret;
} }
uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
uint8_t more = denom->more; if (!magic) {
if (!denom->magic) {
return numer >> more; return numer >> more;
} else { } else {
uint32_t q = libdivide_mullhi_u32(denom->magic, numer); uint32_t q = libdivide_mullhi_u32(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
uint32_t t = ((numer - q) >> 1) + q; uint32_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_32_SHIFT_MASK); return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
@ -929,6 +949,10 @@ uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
} }
} }
uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
return libdivide_u32_do_raw(numer, denom->magic, denom->more);
}
uint32_t libdivide_u32_branchfree_do( uint32_t libdivide_u32_branchfree_do(
uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
uint32_t q = libdivide_mullhi_u32(denom->magic, numer); uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
@ -1072,12 +1096,11 @@ struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
return ret; return ret;
} }
uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
uint8_t more = denom->more; if (!magic) {
if (!denom->magic) {
return numer >> more; return numer >> more;
} else { } else {
uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t q = libdivide_mullhi_u64(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
uint64_t t = ((numer - q) >> 1) + q; uint64_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_64_SHIFT_MASK); return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
@ -1089,6 +1112,10 @@ uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
} }
} }
uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
return libdivide_u64_do_raw(numer, denom->magic, denom->more);
}
uint64_t libdivide_u64_branchfree_do( uint64_t libdivide_u64_branchfree_do(
uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
@ -1428,11 +1455,10 @@ struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
return result; return result;
} }
int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
if (!denom->magic) { if (!magic) {
uint32_t sign = (int8_t)more >> 7; uint32_t sign = (int8_t)more >> 7;
uint32_t mask = ((uint32_t)1 << shift) - 1; uint32_t mask = ((uint32_t)1 << shift) - 1;
uint32_t uq = numer + ((numer >> 31) & mask); uint32_t uq = numer + ((numer >> 31) & mask);
@ -1441,7 +1467,7 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
q = (q ^ sign) - sign; q = (q ^ sign) - sign;
return q; return q;
} else { } else {
uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer); uint32_t uq = (uint32_t)libdivide_mullhi_s32(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift and then sign extend // must be arithmetic shift and then sign extend
int32_t sign = (int8_t)more >> 7; int32_t sign = (int8_t)more >> 7;
@ -1456,6 +1482,10 @@ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
} }
} }
int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
return libdivide_s32_do_raw(numer, denom->magic, denom->more);
}
int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) { int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
@ -1597,11 +1627,10 @@ struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
return ret; return ret;
} }
int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
if (!denom->magic) { // shift path if (!magic) { // shift path
uint64_t mask = ((uint64_t)1 << shift) - 1; uint64_t mask = ((uint64_t)1 << shift) - 1;
uint64_t uq = numer + ((numer >> 63) & mask); uint64_t uq = numer + ((numer >> 63) & mask);
int64_t q = (int64_t)uq; int64_t q = (int64_t)uq;
@ -1611,7 +1640,7 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
q = (q ^ sign) - sign; q = (q ^ sign) - sign;
return q; return q;
} else { } else {
uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer); uint64_t uq = (uint64_t)libdivide_mullhi_s64(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift and then sign extend // must be arithmetic shift and then sign extend
int64_t sign = (int8_t)more >> 7; int64_t sign = (int8_t)more >> 7;
@ -1626,6 +1655,10 @@ int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
} }
} }
int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
return libdivide_s64_do_raw(numer, denom->magic, denom->more);
}
int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) { int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
@ -1682,15 +1715,22 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t
// Simplest possible vector type division: treat the vector type as an array // Simplest possible vector type division: treat the vector type as an array
// of underlying native type. // of underlying native type.
#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \ //
const size_t count = sizeof(VecT) / sizeof(IntT); \ // Use a union to read a vector via pointer-to-integer, without violating strict
VecT result; \ // aliasing.
IntT *pSource = (IntT *)&numers; \ #define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
IntT *pTarget = (IntT *)&result; \ const size_t count = sizeof(VecT) / sizeof(IntT); \
for (size_t loop=0; loop<count; ++loop) { \ union type_pun_vec { \
pTarget[loop] = libdivide_##Algo##_do(pSource[loop], denom); \ VecT vec; \
} \ IntT arr[sizeof(VecT) / sizeof(IntT)]; \
return result; \ }; \
union type_pun_vec result; \
union type_pun_vec input; \
input.vec = numers; \
for (size_t loop = 0; loop < count; ++loop) { \
result.arr[loop] = libdivide_##Algo##_do(input.arr[loop], denom); \
} \
return result.vec;
#if defined(LIBDIVIDE_NEON) #if defined(LIBDIVIDE_NEON)
@ -1804,13 +1844,12 @@ static LIBDIVIDE_INLINE int64x2_t libdivide_mullhi_s64_vec128(int64x2_t x, int64
////////// UINT16 ////////// UINT16
uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom) { uint16x8_t libdivide_u16_do_vec128(uint16x8_t numers, const struct libdivide_u16_t *denom){
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16) SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16)}
}
uint16x8_t libdivide_u16_branchfree_do_vec128(uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom) { uint16x8_t libdivide_u16_branchfree_do_vec128(
SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree) uint16x8_t numers, const struct libdivide_u16_branchfree_t *denom){
} SIMPLE_VECTOR_DIVISION(uint16_t, uint16x8_t, u16_branchfree)}
////////// UINT32 ////////// UINT32
@ -1870,13 +1909,12 @@ uint64x2_t libdivide_u64_branchfree_do_vec128(
////////// SINT16 ////////// SINT16
int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom) { int16x8_t libdivide_s16_do_vec128(int16x8_t numers, const struct libdivide_s16_t *denom){
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16) SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16)}
}
int16x8_t libdivide_s16_branchfree_do_vec128(int16x8_t numers, const struct libdivide_s16_branchfree_t *denom) { int16x8_t libdivide_s16_branchfree_do_vec128(
SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree) int16x8_t numers, const struct libdivide_s16_branchfree_t *denom){
} SIMPLE_VECTOR_DIVISION(int16_t, int16x8_t, s16_branchfree)}
////////// SINT32 ////////// SINT32
@ -2082,13 +2120,12 @@ static LIBDIVIDE_INLINE __m512i libdivide_mullhi_s64_vec512(__m512i x, __m512i y
////////// UINT16 ////////// UINT16
__m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom) { __m512i libdivide_u16_do_vec512(__m512i numers, const struct libdivide_u16_t *denom){
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16) SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16)}
}
__m512i libdivide_u16_branchfree_do_vec512(__m512i numers, const struct libdivide_u16_branchfree_t *denom) { __m512i libdivide_u16_branchfree_do_vec512(
SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree) __m512i numers, const struct libdivide_u16_branchfree_t *denom){
} SIMPLE_VECTOR_DIVISION(uint16_t, __m512i, u16_branchfree)}
////////// UINT32 ////////// UINT32
@ -2146,13 +2183,12 @@ __m512i libdivide_u64_branchfree_do_vec512(
////////// SINT16 ////////// SINT16
__m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom) { __m512i libdivide_s16_do_vec512(__m512i numers, const struct libdivide_s16_t *denom){
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16) SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16)}
}
__m512i libdivide_s16_branchfree_do_vec512(__m512i numers, const struct libdivide_s16_branchfree_t *denom) { __m512i libdivide_s16_branchfree_do_vec512(
SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree) __m512i numers, const struct libdivide_s16_branchfree_t *denom){
} SIMPLE_VECTOR_DIVISION(int16_t, __m512i, s16_branchfree)}
////////// SINT32 ////////// SINT32
@ -2365,11 +2401,25 @@ static LIBDIVIDE_INLINE __m256i libdivide_mullhi_s64_vec256(__m256i x, __m256i y
////////// UINT16 ////////// UINT16
__m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) { __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16) uint8_t more = denom->more;
if (!denom->magic) {
return _mm256_srli_epi16(numers, more);
} else {
__m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
if (more & LIBDIVIDE_ADD_MARKER) {
__m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
} else {
return _mm256_srli_epi16(q, more);
}
}
} }
__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) { __m256i libdivide_u16_branchfree_do_vec256(
SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree) __m256i numers, const struct libdivide_u16_branchfree_t *denom) {
__m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
__m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
return _mm256_srli_epi16(t, denom->more);
} }
////////// UINT32 ////////// UINT32
@ -2429,11 +2479,54 @@ __m256i libdivide_u64_branchfree_do_vec256(
////////// SINT16 ////////// SINT16
__m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) { __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16) uint8_t more = denom->more;
if (!denom->magic) {
uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
uint16_t mask = ((uint16_t)1 << shift) - 1;
__m256i roundToZeroTweak = _mm256_set1_epi16(mask);
// q = numer + ((numer >> 15) & roundToZeroTweak);
__m256i q = _mm256_add_epi16(
numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
q = _mm256_srai_epi16(q, shift);
__m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
// q = (q ^ sign) - sign;
q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
return q;
} else {
__m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift
__m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
// q += ((numer ^ sign) - sign);
q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
}
// q >>= shift
q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15)); // q += (q < 0)
return q;
}
} }
__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) { __m256i libdivide_s16_branchfree_do_vec256(
SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree) __m256i numers, const struct libdivide_s16_branchfree_t *denom) {
int16_t magic = denom->magic;
uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
// must be arithmetic shift
__m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
__m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
q = _mm256_add_epi16(q, numers); // q += numers
// If q is non-negative, we have nothing to do
// If q is negative, we want to add either (2**shift)-1 if d is
// a power of 2, or (2**shift) if it is not a power of 2
uint16_t is_power_of_2 = (magic == 0);
__m256i q_sign = _mm256_srai_epi16(q, 15); // q_sign = q >> 15
__m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask)
q = _mm256_srai_epi16(q, shift); // q >>= shift
q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign
return q;
} }
////////// SINT32 ////////// SINT32
@ -2661,11 +2754,25 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y
////////// UINT26 ////////// UINT26
__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16) uint8_t more = denom->more;
if (!denom->magic) {
return _mm_srli_epi16(numers, more);
} else {
__m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
if (more & LIBDIVIDE_ADD_MARKER) {
__m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
} else {
return _mm_srli_epi16(q, more);
}
}
} }
__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) { __m128i libdivide_u16_branchfree_do_vec128(
SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree) __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
__m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
__m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
return _mm_srli_epi16(t, denom->more);
} }
////////// UINT32 ////////// UINT32
@ -2725,11 +2832,54 @@ __m128i libdivide_u64_branchfree_do_vec128(
////////// SINT16 ////////// SINT16
__m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) { __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16) uint8_t more = denom->more;
if (!denom->magic) {
uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
uint16_t mask = ((uint16_t)1 << shift) - 1;
__m128i roundToZeroTweak = _mm_set1_epi16(mask);
// q = numer + ((numer >> 15) & roundToZeroTweak);
__m128i q =
_mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
q = _mm_srai_epi16(q, shift);
__m128i sign = _mm_set1_epi16((int8_t)more >> 7);
// q = (q ^ sign) - sign;
q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
return q;
} else {
__m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift
__m128i sign = _mm_set1_epi16((int8_t)more >> 7);
// q += ((numer ^ sign) - sign);
q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
}
// q >>= shift
q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
q = _mm_add_epi16(q, _mm_srli_epi16(q, 15)); // q += (q < 0)
return q;
}
} }
__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) { __m128i libdivide_s16_branchfree_do_vec128(
SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree) __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
int16_t magic = denom->magic;
uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
// must be arithmetic shift
__m128i sign = _mm_set1_epi16((int8_t)more >> 7);
__m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
q = _mm_add_epi16(q, numers); // q += numers
// If q is non-negative, we have nothing to do
// If q is negative, we want to add either (2**shift)-1 if d is
// a power of 2, or (2**shift) if it is not a power of 2
uint16_t is_power_of_2 = (magic == 0);
__m128i q_sign = _mm_srai_epi16(q, 15); // q_sign = q >> 15
__m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
q = _mm_srai_epi16(q, shift); // q >>= shift
q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
return q;
} }
////////// SINT32 ////////// SINT32
@ -2795,8 +2945,8 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de
uint64_t mask = ((uint64_t)1 << shift) - 1; uint64_t mask = ((uint64_t)1 << shift) - 1;
__m128i roundToZeroTweak = _mm_set1_epi64x(mask); __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
// q = numer + ((numer >> 63) & roundToZeroTweak); // q = numer + ((numer >> 63) & roundToZeroTweak);
__m128i q = __m128i q = _mm_add_epi64(
_mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak)); numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
q = libdivide_s64_shift_right_vec128(q, shift); q = libdivide_s64_shift_right_vec128(q, shift);
__m128i sign = _mm_set1_epi32((int8_t)more >> 7); __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
// q = (q ^ sign) - sign; // q = (q ^ sign) - sign;
@ -2847,49 +2997,80 @@ __m128i libdivide_s64_branchfree_do_vec128(
#ifdef __cplusplus #ifdef __cplusplus
//for constexpr zero initialization,
//c++11 might handle things ok,
//but just limit to at least c++14 to ensure
//we don't break anyone's code:
// for gcc and clang, use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
#if (defined(__GNUC__) || defined(__clang__)) && (__cpp_constexpr >= 201304L)
#define LIBDIVIDE_CONSTEXPR constexpr
// supposedly, MSVC might not implement feature test macros right (https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c)
// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS 2017 15.0 (for extended constexpr support https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
#elif defined(_MSC_VER) && _MSC_VER >= 1910 && defined(_MSVC_LANG) && _MSVC_LANG >=201402L
#define LIBDIVIDE_CONSTEXPR constexpr
// in case some other obscure compiler has the right __cpp_constexpr :
#elif defined(__cpp_constexpr) && __cpp_constexpr >= 201304L
#define LIBDIVIDE_CONSTEXPR constexpr
#else
#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
#endif
enum Branching { enum Branching {
BRANCHFULL, // use branching algorithms BRANCHFULL, // use branching algorithms
BRANCHFREE // use branchfree algorithms BRANCHFREE // use branchfree algorithms
}; };
namespace detail {
enum Signedness {
SIGNED,
UNSIGNED,
};
#if defined(LIBDIVIDE_NEON) #if defined(LIBDIVIDE_NEON)
// Helper to deduce NEON vector type for integral type. // Helper to deduce NEON vector type for integral type.
template <typename T> template <int _WIDTH, Signedness _SIGN>
struct NeonVecFor {}; struct NeonVec {};
template <> template <>
struct NeonVecFor<uint16_t> { struct NeonVec<16, UNSIGNED> {
typedef uint16x8_t type; typedef uint16x8_t type;
}; };
template <> template <>
struct NeonVecFor<int16_t> { struct NeonVec<16, SIGNED> {
typedef int16x8_t type; typedef int16x8_t type;
}; };
template <> template <>
struct NeonVecFor<uint32_t> { struct NeonVec<32, UNSIGNED> {
typedef uint32x4_t type; typedef uint32x4_t type;
}; };
template <> template <>
struct NeonVecFor<int32_t> { struct NeonVec<32, SIGNED> {
typedef int32x4_t type; typedef int32x4_t type;
}; };
template <> template <>
struct NeonVecFor<uint64_t> { struct NeonVec<64, UNSIGNED> {
typedef uint64x2_t type; typedef uint64x2_t type;
}; };
template <> template <>
struct NeonVecFor<int64_t> { struct NeonVec<64, SIGNED> {
typedef int64x2_t type; typedef int64x2_t type;
}; };
#endif
// Versions of our algorithms for SIMD. template <typename T>
#if defined(LIBDIVIDE_NEON) struct NeonVecFor {
// See 'class divider' for an explanation of these template parameters.
typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
};
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \ #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \
LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \ LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
typename NeonVecFor<INT_TYPE>::type n) const { \ typename NeonVecFor<INT_TYPE>::type n) const { \
@ -2898,6 +3079,7 @@ struct NeonVecFor<int64_t> {
#else #else
#define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
#endif #endif
#if defined(LIBDIVIDE_SSE2) #if defined(LIBDIVIDE_SSE2)
#define LIBDIVIDE_DIVIDE_SSE2(ALGO) \ #define LIBDIVIDE_DIVIDE_SSE2(ALGO) \
LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \ LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \
@ -2930,6 +3112,7 @@ struct NeonVecFor<int64_t> {
#define DISPATCHER_GEN(T, ALGO) \ #define DISPATCHER_GEN(T, ALGO) \
libdivide_##ALGO##_t denom; \ libdivide_##ALGO##_t denom; \
LIBDIVIDE_INLINE dispatcher() {} \ LIBDIVIDE_INLINE dispatcher() {} \
explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \
LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \
LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
@ -2939,66 +3122,81 @@ struct NeonVecFor<int64_t> {
LIBDIVIDE_DIVIDE_AVX512(ALGO) LIBDIVIDE_DIVIDE_AVX512(ALGO)
// The dispatcher selects a specific division algorithm for a given // The dispatcher selects a specific division algorithm for a given
// type and ALGO using partial template specialization. // width, signedness, and ALGO using partial template specialization.
template <typename _IntT, Branching ALGO> template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
struct dispatcher {}; struct dispatcher {};
template <> template <>
struct dispatcher<int16_t, BRANCHFULL> { struct dispatcher<16, SIGNED, BRANCHFULL> {
DISPATCHER_GEN(int16_t, s16) DISPATCHER_GEN(int16_t, s16)
}; };
template <> template <>
struct dispatcher<int16_t, BRANCHFREE> { struct dispatcher<16, SIGNED, BRANCHFREE> {
DISPATCHER_GEN(int16_t, s16_branchfree) DISPATCHER_GEN(int16_t, s16_branchfree)
}; };
template <> template <>
struct dispatcher<uint16_t, BRANCHFULL> { struct dispatcher<16, UNSIGNED, BRANCHFULL> {
DISPATCHER_GEN(uint16_t, u16) DISPATCHER_GEN(uint16_t, u16)
}; };
template <> template <>
struct dispatcher<uint16_t, BRANCHFREE> { struct dispatcher<16, UNSIGNED, BRANCHFREE> {
DISPATCHER_GEN(uint16_t, u16_branchfree) DISPATCHER_GEN(uint16_t, u16_branchfree)
}; };
template <> template <>
struct dispatcher<int32_t, BRANCHFULL> { struct dispatcher<32, SIGNED, BRANCHFULL> {
DISPATCHER_GEN(int32_t, s32) DISPATCHER_GEN(int32_t, s32)
}; };
template <> template <>
struct dispatcher<int32_t, BRANCHFREE> { struct dispatcher<32, SIGNED, BRANCHFREE> {
DISPATCHER_GEN(int32_t, s32_branchfree) DISPATCHER_GEN(int32_t, s32_branchfree)
}; };
template <> template <>
struct dispatcher<uint32_t, BRANCHFULL> { struct dispatcher<32, UNSIGNED, BRANCHFULL> {
DISPATCHER_GEN(uint32_t, u32) DISPATCHER_GEN(uint32_t, u32)
}; };
template <> template <>
struct dispatcher<uint32_t, BRANCHFREE> { struct dispatcher<32, UNSIGNED, BRANCHFREE> {
DISPATCHER_GEN(uint32_t, u32_branchfree) DISPATCHER_GEN(uint32_t, u32_branchfree)
}; };
template <> template <>
struct dispatcher<int64_t, BRANCHFULL> { struct dispatcher<64, SIGNED, BRANCHFULL> {
DISPATCHER_GEN(int64_t, s64) DISPATCHER_GEN(int64_t, s64)
}; };
template <> template <>
struct dispatcher<int64_t, BRANCHFREE> { struct dispatcher<64, SIGNED, BRANCHFREE> {
DISPATCHER_GEN(int64_t, s64_branchfree) DISPATCHER_GEN(int64_t, s64_branchfree)
}; };
template <> template <>
struct dispatcher<uint64_t, BRANCHFULL> { struct dispatcher<64, UNSIGNED, BRANCHFULL> {
DISPATCHER_GEN(uint64_t, u64) DISPATCHER_GEN(uint64_t, u64)
}; };
template <> template <>
struct dispatcher<uint64_t, BRANCHFREE> { struct dispatcher<64, UNSIGNED, BRANCHFREE> {
DISPATCHER_GEN(uint64_t, u64_branchfree) DISPATCHER_GEN(uint64_t, u64_branchfree)
}; };
} // namespace detail
#if defined(LIBDIVIDE_NEON)
// Allow NeonVecFor outside of detail namespace.
template <typename T>
struct NeonVecFor {
typedef typename detail::NeonVecFor<T>::type type;
};
#endif
// This is the main divider class for use by the user (C++ API). // This is the main divider class for use by the user (C++ API).
// The actual division algorithm is selected using the dispatcher struct // The actual division algorithm is selected using the dispatcher struct
// based on the integer and algorithm template parameters. // based on the integer width and algorithm template parameters.
template <typename T, Branching ALGO = BRANCHFULL> template <typename T, Branching ALGO = BRANCHFULL>
class divider { class divider {
private: private:
typedef dispatcher<T, ALGO> dispatcher_t; // Dispatch based on the size and signedness.
// We avoid using type_traits as it's not available in AVR.
// Detect signedness by checking if T(-1) is less than T(0).
// Also throw in a shift by 0, which prevents floating point types from being passed.
typedef detail::dispatcher<sizeof(T) * 8,
(((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
dispatcher_t;
public: public:
// We leave the default constructor empty so that creating // We leave the default constructor empty so that creating
@ -3006,6 +3204,9 @@ class divider {
// later doesn't slow us down. // later doesn't slow us down.
divider() {} divider() {}
// constexpr zero-initialization to allow for use w/ static constinit
explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}
// Constructor that takes the divisor as a parameter // Constructor that takes the divisor as a parameter
LIBDIVIDE_INLINE divider(T d) : div(d) {} LIBDIVIDE_INLINE divider(T d) : div(d) {}
@ -3017,7 +3218,7 @@ class divider {
T recover() const { return div.recover(); } T recover() const { return div.recover(); }
bool operator==(const divider<T, ALGO> &other) const { bool operator==(const divider<T, ALGO> &other) const {
return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more; return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
} }
bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); } bool operator!=(const divider<T, ALGO> &other) const { return !(*this == other); }
@ -3098,12 +3299,14 @@ LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider<T, ALGO> &div) {
#if defined(LIBDIVIDE_NEON) #if defined(LIBDIVIDE_NEON)
template <typename T, Branching ALGO> template <typename T, Branching ALGO>
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) { LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
return div.divide(n); return div.divide(n);
} }
template <typename T, Branching ALGO> template <typename T, Branching ALGO>
LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) { LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
n = div.divide(n); n = div.divide(n);
return n; return n;
} }

9
util.h
View file

@ -9,7 +9,9 @@
#define noreturn __attribute__((noreturn)) #define noreturn __attribute__((noreturn))
#define likely(x) __builtin_expect(!!(x), 1) #define likely(x) __builtin_expect(!!(x), 1)
#define likely51(x) __builtin_expect_with_probability(!!(x), 1, 0.51)
#define unlikely(x) __builtin_expect(!!(x), 0) #define unlikely(x) __builtin_expect(!!(x), 0)
#define unlikely51(x) __builtin_expect_with_probability(!!(x), 0, 0.51)
#define min(x, y) ({ \ #define min(x, y) ({ \
__typeof__(x) _x = (x); \ __typeof__(x) _x = (x); \
@ -30,6 +32,13 @@
#define STRINGIFY(s) #s #define STRINGIFY(s) #s
#define ALIAS(f) __attribute__((alias(STRINGIFY(f)))) #define ALIAS(f) __attribute__((alias(STRINGIFY(f))))
// supported since GCC 15
#if __has_attribute (nonstring)
# define NONSTRING __attribute__ ((nonstring))
#else
# define NONSTRING
#endif
typedef uint8_t u8; typedef uint8_t u8;
typedef uint16_t u16; typedef uint16_t u16;
typedef uint32_t u32; typedef uint32_t u32;