Compare commits

..

No commits in common. "main" and "14" have entirely different histories.
main ... 14

8 changed files with 164 additions and 201 deletions

View file

@ -11,9 +11,9 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
version: [14] version: [12, 13, 14]
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v4
- name: Setting up gcc version - name: Setting up gcc version
run: | run: |
sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100 sudo update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-${{ matrix.version }} 100
@ -24,11 +24,11 @@ jobs:
runs-on: ubuntu-latest runs-on: ubuntu-latest
strategy: strategy:
matrix: matrix:
version: [19, 20] version: [14, 15, 16, 17, 18]
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v4
- name: Install dependencies - name: Install dependencies
run: sudo apt-get update && sudo apt-get install -y --no-install-recommends clang-19 clang-20 run: sudo apt-get update && sudo apt-get install -y --no-install-recommends clang-14 clang-15
- name: Setting up clang version - name: Setting up clang version
run: | run: |
sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100 sudo update-alternatives --install /usr/bin/clang++ clang++ /usr/bin/clang++-${{ matrix.version }} 100
@ -40,7 +40,7 @@ jobs:
container: container:
image: alpine:latest image: alpine:latest
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v4
- name: Install dependencies - name: Install dependencies
run: apk update && apk add build-base python3 run: apk update && apk add build-base python3
- name: Build - name: Build
@ -48,7 +48,7 @@ jobs:
build-ubuntu-gcc-aarch64: build-ubuntu-gcc-aarch64:
runs-on: ubuntu-latest runs-on: ubuntu-latest
steps: steps:
- uses: actions/checkout@v6 - uses: actions/checkout@v4
- name: Install dependencies - name: Install dependencies
run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgcc-s1-arm64-cross cpp-aarch64-linux-gnu run: sudo apt-get update && sudo apt-get install -y --no-install-recommends gcc-aarch64-linux-gnu g++-aarch64-linux-gnu libgcc-s1-arm64-cross cpp-aarch64-linux-gnu
- name: Build - name: Build

4
.gitignore vendored
View file

@ -1,2 +1,2 @@
/out/ out/
/out-light/ out-light/

View file

@ -1,4 +1,4 @@
Copyright © 2018-2026 GrapheneOS Copyright © 2018-2024 GrapheneOS
Permission is hereby granted, free of charge, to any person obtaining a copy Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal of this software and associated documentation files (the "Software"), to deal

View file

@ -1,4 +1,4 @@
# hardened_malloc # Hardened malloc
* [Introduction](#introduction) * [Introduction](#introduction)
* [Dependencies](#dependencies) * [Dependencies](#dependencies)
@ -65,14 +65,14 @@ used instead as this allocator fundamentally doesn't support that environment.
## Dependencies ## Dependencies
Debian stable (currently Debian 13) determines the most ancient set of Debian stable (currently Debian 12) determines the most ancient set of
supported dependencies: supported dependencies:
* glibc 2.41 * glibc 2.36
* Linux 6.12 * Linux 6.1
* Clang 19.1.7 or GCC 14.2.0 * Clang 14.0.6 or GCC 12.2.0
For Android, the Linux GKI 6.1, 6.6 and 6.12 branches are supported. For Android, the Linux GKI 5.10, 5.15 and 6.1 branches are supported.
However, using more recent releases is highly recommended. Older versions of However, using more recent releases is highly recommended. Older versions of
the dependencies may be compatible at the moment but are not tested and will the dependencies may be compatible at the moment but are not tested and will
@ -83,7 +83,7 @@ there will be custom integration offering better performance in the future
along with other hardening for the C standard library implementation. along with other hardening for the C standard library implementation.
For Android, only the current generation, actively developed maintenance branch of the Android For Android, only the current generation, actively developed maintenance branch of the Android
Open Source Project will be supported, which currently means `android16-qpr2-release`. Open Source Project will be supported, which currently means `android15-release`.
## Testing ## Testing
@ -159,11 +159,8 @@ line to the `/etc/ld.so.preload` configuration file:
The format of this configuration file is a whitespace-separated list, so it's The format of this configuration file is a whitespace-separated list, so it's
good practice to put each library on a separate line. good practice to put each library on a separate line.
For maximum compatibility `libhardened_malloc.so` can be installed into On Debian systems `libhardened_malloc.so` should be installed into `/usr/lib/`
`/usr/lib/` to avoid preload failures caused by AppArmor profiles or systemd to avoid preload failures caused by AppArmor profile restrictions.
ExecPaths= restrictions. Check for logs of the following format:
ERROR: ld.so: object '/usr/local/lib/libhardened_malloc.so' from /etc/ld.so.preload cannot be preloaded (failed to map segment from shared object): ignored.
Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis Using the `LD_PRELOAD` environment variable to load it on a case-by-case basis
will not work when `AT_SECURE` is set such as with setuid binaries. It's also will not work when `AT_SECURE` is set such as with setuid binaries. It's also
@ -414,7 +411,7 @@ was a bit less important and if a core goal was finding latent bugs.
randomly sized guard regions around it randomly sized guard regions around it
* Protection via Memory Protection Keys (MPK) on x86\_64 (disabled by * Protection via Memory Protection Keys (MPK) on x86\_64 (disabled by
default due to low benefit-cost ratio on top of baseline protections) default due to low benefit-cost ratio on top of baseline protections)
* Protection via MTE on ARMv8.5+ * [future] Protection via MTE on ARMv8.5+
* Deterministic detection of any invalid free (unallocated, unaligned, etc.) * Deterministic detection of any invalid free (unallocated, unaligned, etc.)
* Validation of the size passed for C++14 sized deallocation by `delete` * Validation of the size passed for C++14 sized deallocation by `delete`
even for code compiled with earlier standards (detects type confusion if even for code compiled with earlier standards (detects type confusion if

View file

@ -44,7 +44,7 @@ void *set_pointer_tag(void *ptr, u8 tag) {
return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr)); return (void *) (((uintptr_t) tag << 56) | (uintptr_t) untag_pointer(ptr));
} }
// This test checks that slab slot allocation uses tag that is distinct from tags of its neighbors // This test checks that slab slot allocation uses tag that is distint from tags of its neighbors
// and from the tag of the previous allocation that used the same slot // and from the tag of the previous allocation that used the same slot
void tag_distinctness() { void tag_distinctness() {
// tag 0 is reserved // tag 0 is reserved

View file

@ -1530,8 +1530,7 @@ EXPORT void *h_realloc(void *old, size_t size) {
old = untag_pointer(old); old = untag_pointer(old);
size_t old_size; size_t old_size;
bool old_in_slab_region = old < get_slab_region_end() && old >= ro.slab_region_start; if (old < get_slab_region_end() && old >= ro.slab_region_start) {
if (old_in_slab_region) {
old_size = slab_usable_size(old); old_size = slab_usable_size(old);
if (size <= max_slab_size_class && get_size_info(size).size == old_size) { if (size <= max_slab_size_class && get_size_info(size).size == old_size) {
return old_orig; return old_orig;
@ -1648,7 +1647,7 @@ EXPORT void *h_realloc(void *old, size_t size) {
copy_size -= canary_size; copy_size -= canary_size;
} }
memcpy(new, old_orig, copy_size); memcpy(new, old_orig, copy_size);
if (old_in_slab_region) { if (old_size <= max_slab_size_class) {
deallocate_small(old, NULL); deallocate_small(old, NULL);
} else { } else {
deallocate_large(old, NULL); deallocate_large(old, NULL);

View file

@ -98,7 +98,7 @@ class TestSimpleMemoryCorruption(unittest.TestCase):
self.assertEqual(stderr.decode("utf-8"), self.assertEqual(stderr.decode("utf-8"),
"fatal allocator error: invalid free\n") "fatal allocator error: invalid free\n")
def test_invalid_malloc_usable_size_small_quarantine(self): def test_invalid_malloc_usable_size_small_quarantene(self):
_stdout, stderr, returncode = self.run_test( _stdout, stderr, returncode = self.run_test(
"invalid_malloc_usable_size_small_quarantine") "invalid_malloc_usable_size_small_quarantine")
self.assertEqual(returncode, -6) self.assertEqual(returncode, -6)

View file

@ -2,7 +2,7 @@
// https://libdivide.com // https://libdivide.com
// //
// Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com> // Copyright (C) 2010 - 2022 ridiculous_fish, <libdivide@ridiculousfish.com>
// Copyright (C) 2016 - 2026 Kim Walisch, <kim.walisch@gmail.com> // Copyright (C) 2016 - 2022 Kim Walisch, <kim.walisch@gmail.com>
// //
// libdivide is dual-licensed under the Boost or zlib licenses. // libdivide is dual-licensed under the Boost or zlib licenses.
// You may use libdivide under the terms of either of these. // You may use libdivide under the terms of either of these.
@ -12,26 +12,18 @@
#define LIBDIVIDE_H #define LIBDIVIDE_H
// *** Version numbers are auto generated - do not edit *** // *** Version numbers are auto generated - do not edit ***
#define LIBDIVIDE_VERSION "5.3.0" #define LIBDIVIDE_VERSION "5.2.0"
#define LIBDIVIDE_VERSION_MAJOR 5 #define LIBDIVIDE_VERSION_MAJOR 5
#define LIBDIVIDE_VERSION_MINOR 3 #define LIBDIVIDE_VERSION_MINOR 2
#define LIBDIVIDE_VERSION_PATCH 0 #define LIBDIVIDE_VERSION_PATCH 0
#include <stdint.h> #include <stdint.h>
#if !defined(__AVR__) && __STDC_HOSTED__ != 0 #if !defined(__AVR__)
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#endif #endif
#if defined(_MSC_VER) && (defined(__cplusplus) && (__cplusplus >= 202002L)) || \
(defined(_MSVC_LANG) && (_MSVC_LANG >= 202002L))
#if __has_include(<bit>)
#include <bit>
#define LIBDIVIDE_VC_CXX20
#endif
#endif
#if defined(LIBDIVIDE_SSE2) #if defined(LIBDIVIDE_SSE2)
#include <emmintrin.h> #include <emmintrin.h>
#endif #endif
@ -45,23 +37,23 @@
#endif #endif
// Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics // Clang-cl prior to Visual Studio 2022 doesn't include __umulh/__mulh intrinsics
#if defined(_MSC_VER) && (!defined(__clang__) || _MSC_VER > 1930) && \ #if defined(_MSC_VER) && defined(LIBDIVIDE_X86_64) && (!defined(__clang__) || _MSC_VER>1930)
(defined(_M_X64) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)) #define LIBDIVIDE_X64_INTRINSICS
#define LIBDIVIDE_MULH_INTRINSICS
#endif #endif
#if defined(_MSC_VER) #if defined(_MSC_VER)
#if defined(LIBDIVIDE_MULH_INTRINSICS) || !defined(__clang__) #if defined(LIBDIVIDE_X64_INTRINSICS)
#include <intrin.h> #include <intrin.h>
#endif #endif
#ifndef __clang__
#pragma warning(push) #pragma warning(push)
// 4146: unary minus operator applied to unsigned type, result still unsigned // disable warning C4146: unary minus operator applied
// to unsigned type, result still unsigned
#pragma warning(disable : 4146) #pragma warning(disable : 4146)
// disable warning C4204: nonstandard extension used : non-constant aggregate
// 4204: nonstandard extension used : non-constant aggregate initializer // initializer
//
// It's valid C99
#pragma warning(disable : 4204) #pragma warning(disable : 4204)
#endif
#define LIBDIVIDE_VC #define LIBDIVIDE_VC
#endif #endif
@ -103,14 +95,10 @@
#endif #endif
#endif #endif
#ifndef LIBDIVIDE_INLINE #ifndef LIBDIVIDE_INLINE
#ifdef _MSC_VER
#define LIBDIVIDE_INLINE __forceinline
#else
#define LIBDIVIDE_INLINE inline #define LIBDIVIDE_INLINE inline
#endif #endif
#endif
#if defined(__AVR__) || __STDC_HOSTED__ == 0 #if defined(__AVR__)
#define LIBDIVIDE_ERROR(msg) #define LIBDIVIDE_ERROR(msg)
#else #else
#define LIBDIVIDE_ERROR(msg) \ #define LIBDIVIDE_ERROR(msg) \
@ -120,7 +108,7 @@
} while (0) } while (0)
#endif #endif
#if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__) && __STDC_HOSTED__ != 0 #if defined(LIBDIVIDE_ASSERTIONS_ON) && !defined(__AVR__)
#define LIBDIVIDE_ASSERT(x) \ #define LIBDIVIDE_ASSERT(x) \
do { \ do { \
if (!(x)) { \ if (!(x)) { \
@ -134,67 +122,9 @@
#endif #endif
#ifdef __cplusplus #ifdef __cplusplus
// Our __builtin_clz() implementation for the MSVC compiler
// requires C++20 or later for constexpr support.
#if defined(LIBDIVIDE_VC_CXX20)
#define LIBDIVIDE_CONSTEXPR_INLINE constexpr LIBDIVIDE_INLINE
// Use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
// For constexpr zero initialization, c++11 might handle things ok,
// but just limit to at least c++14 to ensure we don't break anyone's code:
#elif (!defined(_MSC_VER) || defined(__clang__)) && \
defined(__cpp_constexpr) && __cpp_constexpr >= 201304L
#define LIBDIVIDE_CONSTEXPR_INLINE constexpr LIBDIVIDE_INLINE
#else
#define LIBDIVIDE_CONSTEXPR_INLINE LIBDIVIDE_INLINE
#endif
namespace libdivide { namespace libdivide {
#endif #endif
#if defined(_MSC_VER) && !defined(__clang__)
// Required for C programming language
#ifndef LIBDIVIDE_CONSTEXPR_INLINE
#define LIBDIVIDE_CONSTEXPR_INLINE LIBDIVIDE_INLINE
#endif
static LIBDIVIDE_CONSTEXPR_INLINE int __builtin_clz(unsigned x) {
#if defined(LIBDIVIDE_VC_CXX20)
return std::countl_zero(x);
#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
return (int)_CountLeadingZeros(x);
#elif defined(__AVX2__) || defined(__LZCNT__)
return (int)_lzcnt_u32(x);
#else
unsigned long r;
_BitScanReverse(&r, x);
return (int)(r ^ 31);
#endif
}
static LIBDIVIDE_CONSTEXPR_INLINE int __builtin_clzll(unsigned long long x) {
#if defined(LIBDIVIDE_VC_CXX20)
return std::countl_zero(x);
#elif defined(_M_ARM) || defined(_M_ARM64) || defined(_M_HYBRID_X86_ARM64) || defined(_M_ARM64EC)
return (int)_CountLeadingZeros64(x);
#elif defined(_WIN64)
#if defined(__AVX2__) || defined(__LZCNT__)
return (int)_lzcnt_u64(x);
#else
unsigned long r;
_BitScanReverse64(&r, x);
return (int)(r ^ 63);
#endif
#else
int l = __builtin_clz((unsigned)x) + 32;
int h = __builtin_clz((unsigned)(x >> 32));
return !!((unsigned)(x >> 32)) ? h : l;
#endif
}
#endif // MSVC __builtin_clz()
// pack divider structs to prevent compilers from padding. // pack divider structs to prevent compilers from padding.
// This reduces memory usage by up to 43% when using a large // This reduces memory usage by up to 43% when using a large
// array of libdivide dividers and improves performance // array of libdivide dividers and improves performance
@ -404,7 +334,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_mullhi_s32(int32_t x, int32_t y) {
} }
static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
#if defined(LIBDIVIDE_MULH_INTRINSICS) #if defined(LIBDIVIDE_X64_INTRINSICS)
return __umulh(x, y); return __umulh(x, y);
#elif defined(HAS_INT128_T) #elif defined(HAS_INT128_T)
__uint128_t xl = x, yl = y; __uint128_t xl = x, yl = y;
@ -430,7 +360,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) {
} }
static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) { static LIBDIVIDE_INLINE int64_t libdivide_mullhi_s64(int64_t x, int64_t y) {
#if defined(LIBDIVIDE_MULH_INTRINSICS) #if defined(LIBDIVIDE_X64_INTRINSICS)
return __mulh(x, y); return __mulh(x, y);
#elif defined(HAS_INT128_T) #elif defined(HAS_INT128_T)
__int128_t xl = x, yl = y; __int128_t xl = x, yl = y;
@ -456,9 +386,15 @@ static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {
// Fast way to count leading zeros // Fast way to count leading zeros
// On the AVR 8-bit architecture __builtin_clz() works on a int16_t. // On the AVR 8-bit architecture __builtin_clz() works on a int16_t.
return __builtin_clz(val); return __builtin_clz(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER) #elif defined(__GNUC__) || __has_builtin(__builtin_clz)
// Fast way to count leading zeros // Fast way to count leading zeros
return (int16_t)(__builtin_clz(val) - 16); return __builtin_clz(val) - 16;
#elif defined(LIBDIVIDE_VC)
unsigned long result;
if (_BitScanReverse(&result, (unsigned long)val)) {
return (int16_t)(15 - result);
}
return 0;
#else #else
if (val == 0) return 16; if (val == 0) return 16;
int16_t result = 4; int16_t result = 4;
@ -479,9 +415,15 @@ static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
#if defined(__AVR__) #if defined(__AVR__)
// Fast way to count leading zeros // Fast way to count leading zeros
return __builtin_clzl(val); return __builtin_clzl(val);
#elif defined(__GNUC__) || __has_builtin(__builtin_clz) || defined(_MSC_VER) #elif defined(__GNUC__) || __has_builtin(__builtin_clz)
// Fast way to count leading zeros // Fast way to count leading zeros
return __builtin_clz(val); return __builtin_clz(val);
#elif defined(LIBDIVIDE_VC)
unsigned long result;
if (_BitScanReverse(&result, val)) {
return 31 - result;
}
return 0;
#else #else
if (val == 0) return 32; if (val == 0) return 32;
int32_t result = 8; int32_t result = 8;
@ -499,9 +441,15 @@ static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
} }
static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) { static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
#if defined(__GNUC__) || __has_builtin(__builtin_clzll) || defined(_MSC_VER) #if defined(__GNUC__) || __has_builtin(__builtin_clzll)
// Fast way to count leading zeros // Fast way to count leading zeros
return __builtin_clzll(val); return __builtin_clzll(val);
#elif defined(LIBDIVIDE_VC) && defined(_WIN64)
unsigned long result;
if (_BitScanReverse64(&result, val)) {
return 63 - result;
}
return 0;
#else #else
uint32_t hi = val >> 32; uint32_t hi = val >> 32;
uint32_t lo = val & 0xFFFFFFFF; uint32_t lo = val & 0xFFFFFFFF;
@ -548,7 +496,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
// it's not LIBDIVIDE_INLINEd. // it's not LIBDIVIDE_INLINEd.
#if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM) #if defined(LIBDIVIDE_X86_64) && defined(LIBDIVIDE_GCC_STYLE_ASM)
uint64_t result; uint64_t result;
__asm__("div %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi)); __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(den), "a"(numlo), "d"(numhi));
return result; return result;
#else #else
// We work in base 2**32. // We work in base 2**32.
@ -598,7 +546,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
shift = libdivide_count_leading_zeros64(den); shift = libdivide_count_leading_zeros64(den);
den <<= shift; den <<= shift;
numhi <<= shift; numhi <<= shift;
numhi |= (numlo >> (-shift & 63)) & (uint64_t)(-(int64_t)shift >> 63); numhi |= (numlo >> (-shift & 63)) & (-(int64_t)shift >> 63);
numlo <<= shift; numlo <<= shift;
// Extract the low digits of the numerator and both digits of the denominator. // Extract the low digits of the numerator and both digits of the denominator.
@ -807,11 +755,11 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
return result; return result;
} }
static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_u16_gen(uint16_t d) { struct libdivide_u16_t libdivide_u16_gen(uint16_t d) {
return libdivide_internal_u16_gen(d, 0); return libdivide_internal_u16_gen(d, 0);
} }
static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) { struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
if (d == 1) { if (d == 1) {
LIBDIVIDE_ERROR("branchfree divider must be != 1"); LIBDIVIDE_ERROR("branchfree divider must be != 1");
} }
@ -824,11 +772,11 @@ static LIBDIVIDE_INLINE struct libdivide_u16_branchfree_t libdivide_u16_branchfr
// The original libdivide_u16_do takes a const pointer. However, this cannot be used // The original libdivide_u16_do takes a const pointer. However, this cannot be used
// with a compile time constant libdivide_u16_t: it will generate a warning about // with a compile time constant libdivide_u16_t: it will generate a warning about
// taking the address of a temporary. Hence this overload. // taking the address of a temporary. Hence this overload.
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) { uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
if (!magic) { if (!magic) {
return numer >> more; return numer >> more;
} else { } else {
uint16_t q = libdivide_mullhi_u16(numer, magic); uint16_t q = libdivide_mullhi_u16(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
uint16_t t = ((numer - q) >> 1) + q; uint16_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_16_SHIFT_MASK); return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
@ -840,18 +788,18 @@ static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t m
} }
} }
static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) { uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
return libdivide_u16_do_raw(numer, denom->magic, denom->more); return libdivide_u16_do_raw(numer, denom->magic, denom->more);
} }
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do( uint16_t libdivide_u16_branchfree_do(
uint16_t numer, const struct libdivide_u16_branchfree_t *denom) { uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
uint16_t q = libdivide_mullhi_u16(numer, denom->magic); uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
uint16_t t = ((numer - q) >> 1) + q; uint16_t t = ((numer - q) >> 1) + q;
return t >> denom->more; return t >> denom->more;
} }
static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) { uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
@ -889,7 +837,7 @@ static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u1
} }
} }
static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) { uint16_t libdivide_u16_branchfree_recover(const struct libdivide_u16_branchfree_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
@ -971,11 +919,11 @@ static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_internal_u32_gen(
return result; return result;
} }
static LIBDIVIDE_INLINE struct libdivide_u32_t libdivide_u32_gen(uint32_t d) { struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
return libdivide_internal_u32_gen(d, 0); return libdivide_internal_u32_gen(d, 0);
} }
static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) { struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
if (d == 1) { if (d == 1) {
LIBDIVIDE_ERROR("branchfree divider must be != 1"); LIBDIVIDE_ERROR("branchfree divider must be != 1");
} }
@ -985,11 +933,11 @@ static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfr
return ret; return ret;
} }
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) { uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t magic, uint8_t more) {
if (!magic) { if (!magic) {
return numer >> more; return numer >> more;
} else { } else {
uint32_t q = libdivide_mullhi_u32(numer, magic); uint32_t q = libdivide_mullhi_u32(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
uint32_t t = ((numer - q) >> 1) + q; uint32_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_32_SHIFT_MASK); return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
@ -1001,18 +949,18 @@ static LIBDIVIDE_INLINE uint32_t libdivide_u32_do_raw(uint32_t numer, uint32_t m
} }
} }
static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) { uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
return libdivide_u32_do_raw(numer, denom->magic, denom->more); return libdivide_u32_do_raw(numer, denom->magic, denom->more);
} }
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do( uint32_t libdivide_u32_branchfree_do(
uint32_t numer, const struct libdivide_u32_branchfree_t *denom) { uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
uint32_t q = libdivide_mullhi_u32(numer, denom->magic); uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
uint32_t t = ((numer - q) >> 1) + q; uint32_t t = ((numer - q) >> 1) + q;
return t >> denom->more; return t >> denom->more;
} }
static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) { uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
@ -1050,7 +998,7 @@ static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u3
} }
} }
static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) { uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
@ -1079,7 +1027,7 @@ static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(const struct l
} }
} }
////////// UINT64 /////////// UINT64
static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen( static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen(
uint64_t d, int branchfree) { uint64_t d, int branchfree) {
@ -1134,11 +1082,11 @@ static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_internal_u64_gen(
return result; return result;
} }
static LIBDIVIDE_INLINE struct libdivide_u64_t libdivide_u64_gen(uint64_t d) { struct libdivide_u64_t libdivide_u64_gen(uint64_t d) {
return libdivide_internal_u64_gen(d, 0); return libdivide_internal_u64_gen(d, 0);
} }
static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) {
if (d == 1) { if (d == 1) {
LIBDIVIDE_ERROR("branchfree divider must be != 1"); LIBDIVIDE_ERROR("branchfree divider must be != 1");
} }
@ -1148,11 +1096,11 @@ static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfr
return ret; return ret;
} }
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) { uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t magic, uint8_t more) {
if (!magic) { if (!magic) {
return numer >> more; return numer >> more;
} else { } else {
uint64_t q = libdivide_mullhi_u64(numer, magic); uint64_t q = libdivide_mullhi_u64(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
uint64_t t = ((numer - q) >> 1) + q; uint64_t t = ((numer - q) >> 1) + q;
return t >> (more & LIBDIVIDE_64_SHIFT_MASK); return t >> (more & LIBDIVIDE_64_SHIFT_MASK);
@ -1164,18 +1112,18 @@ static LIBDIVIDE_INLINE uint64_t libdivide_u64_do_raw(uint64_t numer, uint64_t m
} }
} }
static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) {
return libdivide_u64_do_raw(numer, denom->magic, denom->more); return libdivide_u64_do_raw(numer, denom->magic, denom->more);
} }
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do( uint64_t libdivide_u64_branchfree_do(
uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { uint64_t numer, const struct libdivide_u64_branchfree_t *denom) {
uint64_t q = libdivide_mullhi_u64(numer, denom->magic); uint64_t q = libdivide_mullhi_u64(denom->magic, numer);
uint64_t t = ((numer - q) >> 1) + q; uint64_t t = ((numer - q) >> 1) + q;
return t >> denom->more; return t >> denom->more;
} }
static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
@ -1219,7 +1167,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u6
} }
} }
static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) { uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
@ -1254,7 +1202,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_recover(const struct l
} }
} }
////////// SINT16 /////////// SINT16
static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen( static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
int16_t d, int branchfree) { int16_t d, int branchfree) {
@ -1322,11 +1270,11 @@ static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_internal_s16_gen(
return result; return result;
} }
static LIBDIVIDE_INLINE struct libdivide_s16_t libdivide_s16_gen(int16_t d) { struct libdivide_s16_t libdivide_s16_gen(int16_t d) {
return libdivide_internal_s16_gen(d, 0); return libdivide_internal_s16_gen(d, 0);
} }
static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) { struct libdivide_s16_branchfree_t libdivide_s16_branchfree_gen(int16_t d) {
struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1); struct libdivide_s16_t tmp = libdivide_internal_s16_gen(d, 1);
struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more}; struct libdivide_s16_branchfree_t result = {tmp.magic, tmp.more};
return result; return result;
@ -1335,7 +1283,7 @@ static LIBDIVIDE_INLINE struct libdivide_s16_branchfree_t libdivide_s16_branchfr
// The original libdivide_s16_do takes a const pointer. However, this cannot be used // The original libdivide_s16_do takes a const pointer. However, this cannot be used
// with a compile time constant libdivide_s16_t: it will generate a warning about // with a compile time constant libdivide_s16_t: it will generate a warning about
// taking the address of a temporary. Hence this overload. // taking the address of a temporary. Hence this overload.
static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) { int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more) {
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
if (!magic) { if (!magic) {
@ -1347,7 +1295,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magi
q = (q ^ sign) - sign; q = (q ^ sign) - sign;
return q; return q;
} else { } else {
uint16_t uq = (uint16_t)libdivide_mullhi_s16(numer, magic); uint16_t uq = (uint16_t)libdivide_mullhi_s16(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift and then sign extend // must be arithmetic shift and then sign extend
int16_t sign = (int8_t)more >> 7; int16_t sign = (int8_t)more >> 7;
@ -1362,17 +1310,17 @@ static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magi
} }
} }
static LIBDIVIDE_INLINE int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) { int16_t libdivide_s16_do(int16_t numer, const struct libdivide_s16_t *denom) {
return libdivide_s16_do_raw(numer, denom->magic, denom->more); return libdivide_s16_do_raw(numer, denom->magic, denom->more);
} }
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) { int16_t libdivide_s16_branchfree_do(int16_t numer, const struct libdivide_s16_branchfree_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
// must be arithmetic shift and then sign extend // must be arithmetic shift and then sign extend
int16_t sign = (int8_t)more >> 7; int16_t sign = (int8_t)more >> 7;
int16_t magic = denom->magic; int16_t magic = denom->magic;
int16_t q = libdivide_mullhi_s16(numer, magic); int16_t q = libdivide_mullhi_s16(magic, numer);
q += numer; q += numer;
// If q is non-negative, we have nothing to do // If q is non-negative, we have nothing to do
@ -1390,7 +1338,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(int16_t numer, const
return q; return q;
} }
static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) { int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
if (!denom->magic) { if (!denom->magic) {
@ -1425,12 +1373,11 @@ static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16
} }
} }
static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) { int16_t libdivide_s16_branchfree_recover(const struct libdivide_s16_branchfree_t *denom) {
const struct libdivide_s16_t den = {denom->magic, denom->more}; return libdivide_s16_recover((const struct libdivide_s16_t *)denom);
return libdivide_s16_recover(&den);
} }
////////// SINT32 /////////// SINT32
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen( static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(
int32_t d, int branchfree) { int32_t d, int branchfree) {
@ -1498,17 +1445,17 @@ static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_internal_s32_gen(
return result; return result;
} }
static LIBDIVIDE_INLINE struct libdivide_s32_t libdivide_s32_gen(int32_t d) { struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
return libdivide_internal_s32_gen(d, 0); return libdivide_internal_s32_gen(d, 0);
} }
static LIBDIVIDE_INLINE struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) { struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1); struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more}; struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
return result; return result;
} }
static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) { int32_t libdivide_s32_do_raw(int32_t numer, int32_t magic, uint8_t more) {
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
if (!magic) { if (!magic) {
@ -1520,7 +1467,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magi
q = (q ^ sign) - sign; q = (q ^ sign) - sign;
return q; return q;
} else { } else {
uint32_t uq = (uint32_t)libdivide_mullhi_s32(numer, magic); uint32_t uq = (uint32_t)libdivide_mullhi_s32(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift and then sign extend // must be arithmetic shift and then sign extend
int32_t sign = (int8_t)more >> 7; int32_t sign = (int8_t)more >> 7;
@ -1535,17 +1482,17 @@ static LIBDIVIDE_INLINE int32_t libdivide_s32_do_raw(int32_t numer, int32_t magi
} }
} }
static LIBDIVIDE_INLINE int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) { int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
return libdivide_s32_do_raw(numer, denom->magic, denom->more); return libdivide_s32_do_raw(numer, denom->magic, denom->more);
} }
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) { int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
// must be arithmetic shift and then sign extend // must be arithmetic shift and then sign extend
int32_t sign = (int8_t)more >> 7; int32_t sign = (int8_t)more >> 7;
int32_t magic = denom->magic; int32_t magic = denom->magic;
int32_t q = libdivide_mullhi_s32(numer, magic); int32_t q = libdivide_mullhi_s32(magic, numer);
q += numer; q += numer;
// If q is non-negative, we have nothing to do // If q is non-negative, we have nothing to do
@ -1563,7 +1510,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(int32_t numer, const
return q; return q;
} }
static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;
if (!denom->magic) { if (!denom->magic) {
@ -1598,12 +1545,11 @@ static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32
} }
} }
static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) { int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) {
const struct libdivide_s32_t den = {denom->magic, denom->more}; return libdivide_s32_recover((const struct libdivide_s32_t *)denom);
return libdivide_s32_recover(&den);
} }
////////// SINT64 ///////////// SINT64
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen( static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(
int64_t d, int branchfree) { int64_t d, int branchfree) {
@ -1671,17 +1617,17 @@ static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_internal_s64_gen(
return result; return result;
} }
static LIBDIVIDE_INLINE struct libdivide_s64_t libdivide_s64_gen(int64_t d) { struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
return libdivide_internal_s64_gen(d, 0); return libdivide_internal_s64_gen(d, 0);
} }
static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) { struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1); struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more}; struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
return ret; return ret;
} }
static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) { int64_t libdivide_s64_do_raw(int64_t numer, int64_t magic, uint8_t more) {
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
if (!magic) { // shift path if (!magic) { // shift path
@ -1694,7 +1640,7 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magi
q = (q ^ sign) - sign; q = (q ^ sign) - sign;
return q; return q;
} else { } else {
uint64_t uq = (uint64_t)libdivide_mullhi_s64(numer, magic); uint64_t uq = (uint64_t)libdivide_mullhi_s64(magic, numer);
if (more & LIBDIVIDE_ADD_MARKER) { if (more & LIBDIVIDE_ADD_MARKER) {
// must be arithmetic shift and then sign extend // must be arithmetic shift and then sign extend
int64_t sign = (int8_t)more >> 7; int64_t sign = (int8_t)more >> 7;
@ -1709,17 +1655,17 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_do_raw(int64_t numer, int64_t magi
} }
} }
static LIBDIVIDE_INLINE int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) { int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
return libdivide_s64_do_raw(numer, denom->magic, denom->more); return libdivide_s64_do_raw(numer, denom->magic, denom->more);
} }
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) { int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
// must be arithmetic shift and then sign extend // must be arithmetic shift and then sign extend
int64_t sign = (int8_t)more >> 7; int64_t sign = (int8_t)more >> 7;
int64_t magic = denom->magic; int64_t magic = denom->magic;
int64_t q = libdivide_mullhi_s64(numer, magic); int64_t q = libdivide_mullhi_s64(magic, numer);
q += numer; q += numer;
// If q is non-negative, we have nothing to do. // If q is non-negative, we have nothing to do.
@ -1737,7 +1683,7 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(int64_t numer, const
return q; return q;
} }
static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
if (denom->magic == 0) { // shift path if (denom->magic == 0) { // shift path
@ -1763,9 +1709,8 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64
} }
} }
static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) { int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) {
const struct libdivide_s64_t den = {denom->magic, denom->more}; return libdivide_s64_recover((const struct libdivide_s64_t *)denom);
return libdivide_s64_recover(&den);
} }
// Simplest possible vector type division: treat the vector type as an array // Simplest possible vector type division: treat the vector type as an array
@ -2806,7 +2751,7 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y
return p; return p;
} }
////////// UINT16 ////////// UINT26
__m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) { __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
uint8_t more = denom->more; uint8_t more = denom->more;
@ -3048,10 +2993,32 @@ __m128i libdivide_s64_branchfree_do_vec128(
#endif #endif
////////// C++ stuff /////////// C++ stuff
#ifdef __cplusplus #ifdef __cplusplus
// LIBDIVIDE_CONSTEXPR marks the zero-initializing constructors as constexpr
// so that they can be used for constant initialization (e.g. with static
// constinit). C++11 constexpr might already be enough, but the check is
// limited to C++14 (relaxed constexpr) to make sure nobody's code breaks.
//
// Preferred check: the standard feature-test macro, see
// https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
// It is guarded with defined() so that compilers which do not provide the
// macro never evaluate an undefined identifier (no -Wundef diagnostics).
// This one test covers GCC, Clang and any other compiler that implements
// __cpp_constexpr.
#if defined(__cpp_constexpr) && __cpp_constexpr >= 201304L
#define LIBDIVIDE_CONSTEXPR constexpr
// MSVC might not implement feature-test macros correctly
// (https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c),
// so check that _MSVC_LANG corresponds to at least C++14 and that _MSC_VER
// corresponds to at least VS 2017 15.0, which has extended constexpr support
// (https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170).
#elif defined(_MSC_VER) && _MSC_VER >= 1910 && defined(_MSVC_LANG) && _MSVC_LANG >= 201402L
#define LIBDIVIDE_CONSTEXPR constexpr
// No usable constexpr support detected: fall back to a plain inline function.
#else
#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
#endif
enum Branching { enum Branching {
BRANCHFULL, // use branching algorithms BRANCHFULL, // use branching algorithms
BRANCHFREE // use branchfree algorithms BRANCHFREE // use branchfree algorithms
@ -3145,7 +3112,7 @@ struct NeonVecFor {
#define DISPATCHER_GEN(T, ALGO) \ #define DISPATCHER_GEN(T, ALGO) \
libdivide_##ALGO##_t denom; \ libdivide_##ALGO##_t denom; \
LIBDIVIDE_INLINE dispatcher() {} \ LIBDIVIDE_INLINE dispatcher() {} \
explicit LIBDIVIDE_CONSTEXPR_INLINE dispatcher(decltype(nullptr)) : denom{} {} \ explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \
LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \ LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \
LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \ LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \ LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
@ -3238,7 +3205,7 @@ class divider {
divider() {} divider() {}
// constexpr zero-initialization to allow for use w/ static constinit // constexpr zero-initialization to allow for use w/ static constinit
explicit LIBDIVIDE_CONSTEXPR_INLINE divider(decltype(nullptr)) : div(nullptr) {} explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}
// Constructor that takes the divisor as a parameter // Constructor that takes the divisor as a parameter
LIBDIVIDE_INLINE divider(T d) : div(d) {} LIBDIVIDE_INLINE divider(T d) : div(d) {}
@ -3355,7 +3322,7 @@ using branchfree_divider = divider<T, BRANCHFREE>;
#endif // __cplusplus #endif // __cplusplus
#if defined(_MSC_VER) && !defined(__clang__) #if defined(_MSC_VER)
#pragma warning(pop) #pragma warning(pop)
#endif #endif