diff --git a/third_party/libdivide.h b/third_party/libdivide.h
index e9a31d1..4421888 100644
--- a/third_party/libdivide.h
+++ b/third_party/libdivide.h
@@ -1,8 +1,8 @@
 // libdivide.h - Optimized integer division
 // https://libdivide.com
 //
-// Copyright (C) 2010 - 2021 ridiculous_fish,
-// Copyright (C) 2016 - 2021 Kim Walisch,
+// Copyright (C) 2010 - 2022 ridiculous_fish,
+// Copyright (C) 2016 - 2022 Kim Walisch,
 //
 // libdivide is dual-licensed under the Boost or zlib licenses.
 // You may use libdivide under the terms of either of these.
@@ -11,11 +11,12 @@
 #ifndef LIBDIVIDE_H
 #define LIBDIVIDE_H

-#define LIBDIVIDE_VERSION "5.0"
+#define LIBDIVIDE_VERSION "5.1"
 #define LIBDIVIDE_VERSION_MAJOR 5
-#define LIBDIVIDE_VERSION_MINOR 0
+#define LIBDIVIDE_VERSION_MINOR 1

 #include <stdint.h>
+
 #if !defined(__AVR__)
 #include <stdio.h>
 #include <stdlib.h>
@@ -24,9 +25,11 @@
 #if defined(LIBDIVIDE_SSE2)
 #include <emmintrin.h>
 #endif
+
 #if defined(LIBDIVIDE_AVX2) || defined(LIBDIVIDE_AVX512)
 #include <immintrin.h>
 #endif
+
 #if defined(LIBDIVIDE_NEON)
 #include <arm_neon.h>
 #endif
@@ -37,7 +40,7 @@
 // disable warning C4146: unary minus operator applied
 // to unsigned type, result still unsigned
 #pragma warning(disable : 4146)
-// disable warning C4204: nonstandard extension used : non-constant aggregate
+// disable warning C4204: nonstandard extension used : non-constant aggregate
 // initializer
 //
 // It's valid C99
@@ -235,14 +238,12 @@ static LIBDIVIDE_INLINE struct libdivide_u32_branchfree_t libdivide_u32_branchfr
 static LIBDIVIDE_INLINE struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d);
 static LIBDIVIDE_INLINE struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d);

-static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(
-    int16_t numer, int16_t magic, uint8_t more);
+static LIBDIVIDE_INLINE int16_t libdivide_s16_do_raw(int16_t numer, int16_t magic, uint8_t more);
 static LIBDIVIDE_INLINE int16_t libdivide_s16_do(
-    int16_t numer, const struct libdivide_s16_t* denom);
-static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(
-    uint16_t numer, uint16_t magic, uint8_t more);
+    int16_t numer, const struct libdivide_s16_t *denom);
+static LIBDIVIDE_INLINE uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_do(
-    uint16_t numer, const struct libdivide_u16_t* denom);
+    uint16_t numer, const struct libdivide_u16_t *denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_do(
     int32_t numer, const struct libdivide_s32_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_do(
@@ -253,9 +254,9 @@ static LIBDIVIDE_INLINE uint64_t libdivide_u64_do(
     uint64_t numer, const struct libdivide_u64_t *denom);

 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_do(
-    int16_t numer, const struct libdivide_s16_branchfree_t* denom);
+    int16_t numer, const struct libdivide_s16_branchfree_t *denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_do(
-    uint16_t numer, const struct libdivide_u16_branchfree_t* denom);
+    uint16_t numer, const struct libdivide_u16_branchfree_t *denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_do(
     int32_t numer, const struct libdivide_s32_branchfree_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_do(
@@ -265,17 +266,17 @@ static LIBDIVIDE_INLINE int64_t libdivide_s64_branchfree_do(
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_branchfree_do(
     uint64_t numer, const struct libdivide_u64_branchfree_t *denom);

-static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t* denom);
-static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t* denom);
+static LIBDIVIDE_INLINE int16_t libdivide_s16_recover(const struct libdivide_s16_t *denom);
+static LIBDIVIDE_INLINE uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
 static LIBDIVIDE_INLINE int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
 static LIBDIVIDE_INLINE uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);

 static LIBDIVIDE_INLINE int16_t libdivide_s16_branchfree_recover(
-    const struct libdivide_s16_branchfree_t* denom);
+    const struct libdivide_s16_branchfree_t *denom);
 static LIBDIVIDE_INLINE uint16_t libdivide_u16_branchfree_recover(
-    const struct libdivide_u16_branchfree_t* denom);
+    const struct libdivide_u16_branchfree_t *denom);
 static LIBDIVIDE_INLINE int32_t libdivide_s32_branchfree_recover(
     const struct libdivide_s32_branchfree_t *denom);
 static LIBDIVIDE_INLINE uint32_t libdivide_u32_branchfree_recover(
@@ -393,7 +394,7 @@ static LIBDIVIDE_INLINE int16_t libdivide_count_leading_zeros16(uint16_t val) {

 static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros32(uint32_t val) {
 #if defined(__AVR__)
-    // Fast way to count leading zeros
+    // Fast way to count leading zeros
     return __builtin_clzl(val);
 #elif defined(__GNUC__) || __has_builtin(__builtin_clz)
     // Fast way to count leading zeros
@@ -442,7 +443,7 @@ static LIBDIVIDE_INLINE int32_t libdivide_count_leading_zeros64(uint64_t val) {
 // uint {v}. The result must fit in 16 bits.
 // Returns the quotient directly and the remainder in *r
 static LIBDIVIDE_INLINE uint16_t libdivide_32_div_16_to_16(
-    uint16_t u1, uint16_t u0, uint16_t v, uint16_t* r) {
+    uint16_t u1, uint16_t u0, uint16_t v, uint16_t *r) {
     uint32_t n = ((uint32_t)u1 << 16) | u0;
     uint16_t result = (uint16_t)(n / v);
     *r = (uint16_t)(n - result * (uint32_t)v);
@@ -512,7 +513,7 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(

     // Check for overflow and divide by 0.
     if (numhi >= den) {
-        if (r != NULL) *r = ~0ull;
+        if (r) *r = ~0ull;
         return ~0ull;
     }

@@ -558,11 +559,14 @@ static LIBDIVIDE_INLINE uint64_t libdivide_128_div_64_to_64(
     q0 = (uint32_t)qhat;

     // Return remainder if requested.
-    if (r != NULL) *r = (rem * b + num0 - q0 * den) >> shift;
+    if (r) *r = (rem * b + num0 - q0 * den) >> shift;
     return ((uint64_t)q1 << 32) | q0;
 #endif
 }

+#if !(defined(HAS_INT128_T) && \
+      defined(HAS_INT128_DIV))
+
 // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0)
 static LIBDIVIDE_INLINE void libdivide_u128_shift(
     uint64_t *u1, uint64_t *u0, int32_t signed_shift) {
@@ -579,6 +583,8 @@ static LIBDIVIDE_INLINE void libdivide_u128_shift(
     }
 }

+#endif
+
 // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder.
 static LIBDIVIDE_INLINE uint64_t libdivide_128_div_128_to_64(
     uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) {
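For orientation, libdivide_32_div_16_to_16 (touched in the hunks above) divides the 32-bit value ((u1 << 16) | u0) by v and returns the remainder through *r, with the quotient required to fit in 16 bits. A minimal sketch of that contract; the helper name and the values are illustrative only and are not part of the patch:

    // Illustrative sketch: exercises libdivide_32_div_16_to_16 as documented above.
    // Numerator is (0x0001 << 16) | 0x2345 = 74565; 74565 / 7 = 10652 remainder 1,
    // and the quotient fits in 16 bits as the comment above requires.
    #include <assert.h>
    #include <stdint.h>
    #include "libdivide.h"

    static void demo_32_div_16(void) {
        uint16_t rem = 0;
        uint16_t quot = libdivide_32_div_16_to_16(0x0001, 0x2345, 7, &rem);
        assert(quot == 10652 && rem == 1);
    }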
@@ -696,8 +702,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
         // 1 in its recovery algorithm.
         result.magic = 0;
         result.more = (uint8_t)(floor_log_2_d - (branchfree != 0));
-    }
-    else {
+    } else {
         uint8_t more;
         uint16_t rem, proposed_m;
         proposed_m = libdivide_32_div_16_to_16((uint16_t)1 << floor_log_2_d, 0, d, &rem);
@@ -709,8 +714,7 @@ static LIBDIVIDE_INLINE struct libdivide_u16_t libdivide_internal_u16_gen(
         if (!branchfree && (e < ((uint16_t)1 << floor_log_2_d))) {
             // This power works
             more = floor_log_2_d;
-        }
-        else {
+        } else {
             // We have to use the general 17-bit algorithm. We need to compute
             // (2**power) / d. However, we already have (2**(power-1))/d and
             // its remainder. By doubling both, and then correcting the
@@ -742,7 +746,7 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
     }
     struct libdivide_u16_t tmp = libdivide_internal_u16_gen(d, 1);
     struct libdivide_u16_branchfree_t ret = {
-        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK) };
+        tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_16_SHIFT_MASK)};
     return ret;
 }

@@ -752,27 +756,25 @@ struct libdivide_u16_branchfree_t libdivide_u16_branchfree_gen(uint16_t d) {
 uint16_t libdivide_u16_do_raw(uint16_t numer, uint16_t magic, uint8_t more) {
     if (!magic) {
         return numer >> more;
-    }
-    else {
+    } else {
         uint16_t q = libdivide_mullhi_u16(magic, numer);
         if (more & LIBDIVIDE_ADD_MARKER) {
             uint16_t t = ((numer - q) >> 1) + q;
             return t >> (more & LIBDIVIDE_16_SHIFT_MASK);
-        }
-        else {
+        } else {
             // All upper bits are 0,
             // don't need to mask them off.
             return q >> more;
         }
-    }
+    }
 }

-uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t* denom) {
+uint16_t libdivide_u16_do(uint16_t numer, const struct libdivide_u16_t *denom) {
     return libdivide_u16_do_raw(numer, denom->magic, denom->more);
 }

 uint16_t libdivide_u16_branchfree_do(
-    uint16_t numer, const struct libdivide_u16_branchfree_t* denom) {
+    uint16_t numer, const struct libdivide_u16_branchfree_t *denom) {
     uint16_t q = libdivide_mullhi_u16(denom->magic, numer);
     uint16_t t = ((numer - q) >> 1) + q;
     return t >> denom->more;
@@ -800,7 +802,7 @@ uint16_t libdivide_u16_recover(const struct libdivide_u16_t *denom) {
         // overflow. So we have to compute it as 2^(16+shift)/(m+2^16), and
         // then double the quotient and remainder.
         uint32_t half_n = (uint32_t)1 << (16 + shift);
-        uint32_t d = ( (uint32_t)1 << 16) | denom->magic;
+        uint32_t d = ((uint32_t)1 << 16) | denom->magic;
         // Note that the quotient is guaranteed <= 16 bits, but the remainder
         // may need 17!
         uint16_t half_q = (uint16_t)(half_n / d);
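The hunks above touch the scalar 16-bit unsigned path: libdivide_u16_gen (declared earlier in this header) builds the magic/more pair once, libdivide_u16_do applies it per numerator, and libdivide_u16_recover returns the original divisor. A small sketch of that flow; the function name is hypothetical and error handling is omitted:

    // Illustrative sketch: generate the magic numbers once, reuse them per element.
    #include <stddef.h>
    #include <stdint.h>
    #include "libdivide.h"

    static uint16_t sum_of_quotients(const uint16_t *data, size_t len, uint16_t d) {
        struct libdivide_u16_t div = libdivide_u16_gen(d);  // d must not be 0
        uint16_t sum = 0;
        for (size_t i = 0; i < len; ++i) {
            sum = (uint16_t)(sum + libdivide_u16_do(data[i], &div));  // == data[i] / d
        }
        // libdivide_u16_recover(&div) would give back the original d.
        return sum;
    }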
@@ -1682,15 +1684,22 @@ int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t

 // Simplest possible vector type division: treat the vector type as an array
 // of underlying native type.
-#define SIMPLE_VECTOR_DIVISION(IntT, VecT, Algo) \
-    const size_t count = sizeof(VecT) / sizeof(IntT); \
-    VecT result; \
-    IntT *pSource = (IntT *)&numers; \
-    IntT *pTarget = (IntT *)&result; \
-    for (size_t loop=0; loop<count; ++loop) { \
-        pTarget[loop] = libdivide_##Algo##_do(pSource[loop], denom); \
-    } \
-    return result; \

 ////////// UINT16

 __m256i libdivide_u16_do_vec256(__m256i numers, const struct libdivide_u16_t *denom) {
-    SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16)
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        return _mm256_srli_epi16(numers, more);
+    } else {
+        __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
+            return _mm256_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
+        } else {
+            return _mm256_srli_epi16(q, more);
+        }
+    }
 }

-__m256i libdivide_u16_branchfree_do_vec256(__m256i numers, const struct libdivide_u16_branchfree_t *denom) {
-    SIMPLE_VECTOR_DIVISION(uint16_t, __m256i, u16_branchfree)
+__m256i libdivide_u16_branchfree_do_vec256(
+    __m256i numers, const struct libdivide_u16_branchfree_t *denom) {
+    __m256i q = _mm256_mulhi_epu16(numers, _mm256_set1_epi16(denom->magic));
+    __m256i t = _mm256_adds_epu16(_mm256_srli_epi16(_mm256_subs_epu16(numers, q), 1), q);
+    return _mm256_srli_epi16(t, denom->more);
 }

 ////////// UINT32

@@ -2429,11 +2448,54 @@ __m256i libdivide_u64_branchfree_do_vec256(
 ////////// SINT16

 __m256i libdivide_s16_do_vec256(__m256i numers, const struct libdivide_s16_t *denom) {
-    SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16)
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
+        uint16_t mask = ((uint16_t)1 << shift) - 1;
+        __m256i roundToZeroTweak = _mm256_set1_epi16(mask);
+        // q = numer + ((numer >> 15) & roundToZeroTweak);
+        __m256i q = _mm256_add_epi16(
+            numers, _mm256_and_si256(_mm256_srai_epi16(numers, 15), roundToZeroTweak));
+        q = _mm256_srai_epi16(q, shift);
+        __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
+        // q = (q ^ sign) - sign;
+        q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);
+        return q;
+    } else {
+        __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // must be arithmetic shift
+            __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
+            // q += ((numer ^ sign) - sign);
+            q = _mm256_add_epi16(q, _mm256_sub_epi16(_mm256_xor_si256(numers, sign), sign));
+        }
+        // q >>= shift
+        q = _mm256_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
+        q = _mm256_add_epi16(q, _mm256_srli_epi16(q, 15));  // q += (q < 0)
+        return q;
+    }
 }

-__m256i libdivide_s16_branchfree_do_vec256(__m256i numers, const struct libdivide_s16_branchfree_t *denom) {
-    SIMPLE_VECTOR_DIVISION(int16_t, __m256i, s16_branchfree)
+__m256i libdivide_s16_branchfree_do_vec256(
+    __m256i numers, const struct libdivide_s16_branchfree_t *denom) {
+    int16_t magic = denom->magic;
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
+    // must be arithmetic shift
+    __m256i sign = _mm256_set1_epi16((int8_t)more >> 7);
+    __m256i q = _mm256_mulhi_epi16(numers, _mm256_set1_epi16(magic));
+    q = _mm256_add_epi16(q, numers);  // q += numers
+
+    // If q is non-negative, we have nothing to do
+    // If q is negative, we want to add either (2**shift)-1 if d is
+    // a power of 2, or (2**shift) if it is not a power of 2
+    uint16_t is_power_of_2 = (magic == 0);
+    __m256i q_sign = _mm256_srai_epi16(q, 15);  // q_sign = q >> 15
+    __m256i mask = _mm256_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
+    q = _mm256_add_epi16(q, _mm256_and_si256(q_sign, mask));  // q = q + (q_sign & mask)
+    q = _mm256_srai_epi16(q, shift);  // q >>= shift
+    q = _mm256_sub_epi16(_mm256_xor_si256(q, sign), sign);  // q = (q ^ sign) - sign
+    return q;
 }

 ////////// SINT32
@@ -2661,11 +2723,25 @@ static LIBDIVIDE_INLINE __m128i libdivide_mullhi_s64_vec128(__m128i x, __m128i y
 ////////// UINT26

 __m128i libdivide_u16_do_vec128(__m128i numers, const struct libdivide_u16_t *denom) {
-    SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16)
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        return _mm_srli_epi16(numers, more);
+    } else {
+        __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
+            return _mm_srli_epi16(t, (more & LIBDIVIDE_16_SHIFT_MASK));
+        } else {
+            return _mm_srli_epi16(q, more);
+        }
+    }
 }

-__m128i libdivide_u16_branchfree_do_vec128(__m128i numers, const struct libdivide_u16_branchfree_t *denom) {
-    SIMPLE_VECTOR_DIVISION(uint16_t, __m128i, u16_branchfree)
+__m128i libdivide_u16_branchfree_do_vec128(
+    __m128i numers, const struct libdivide_u16_branchfree_t *denom) {
+    __m128i q = _mm_mulhi_epu16(numers, _mm_set1_epi16(denom->magic));
+    __m128i t = _mm_adds_epu16(_mm_srli_epi16(_mm_subs_epu16(numers, q), 1), q);
+    return _mm_srli_epi16(t, denom->more);
 }

 ////////// UINT32

@@ -2725,11 +2801,54 @@ __m128i libdivide_u64_branchfree_do_vec128(
 ////////// SINT16

 __m128i libdivide_s16_do_vec128(__m128i numers, const struct libdivide_s16_t *denom) {
-    SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16)
+    uint8_t more = denom->more;
+    if (!denom->magic) {
+        uint16_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
+        uint16_t mask = ((uint16_t)1 << shift) - 1;
+        __m128i roundToZeroTweak = _mm_set1_epi16(mask);
+        // q = numer + ((numer >> 15) & roundToZeroTweak);
+        __m128i q =
+            _mm_add_epi16(numers, _mm_and_si128(_mm_srai_epi16(numers, 15), roundToZeroTweak));
+        q = _mm_srai_epi16(q, shift);
+        __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
+        // q = (q ^ sign) - sign;
+        q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);
+        return q;
+    } else {
+        __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(denom->magic));
+        if (more & LIBDIVIDE_ADD_MARKER) {
+            // must be arithmetic shift
+            __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
+            // q += ((numer ^ sign) - sign);
+            q = _mm_add_epi16(q, _mm_sub_epi16(_mm_xor_si128(numers, sign), sign));
+        }
+        // q >>= shift
+        q = _mm_srai_epi16(q, more & LIBDIVIDE_16_SHIFT_MASK);
+        q = _mm_add_epi16(q, _mm_srli_epi16(q, 15));  // q += (q < 0)
+        return q;
+    }
 }

-__m128i libdivide_s16_branchfree_do_vec128(__m128i numers, const struct libdivide_s16_branchfree_t *denom) {
-    SIMPLE_VECTOR_DIVISION(int16_t, __m128i, s16_branchfree)
+__m128i libdivide_s16_branchfree_do_vec128(
+    __m128i numers, const struct libdivide_s16_branchfree_t *denom) {
+    int16_t magic = denom->magic;
+    uint8_t more = denom->more;
+    uint8_t shift = more & LIBDIVIDE_16_SHIFT_MASK;
+    // must be arithmetic shift
+    __m128i sign = _mm_set1_epi16((int8_t)more >> 7);
+    __m128i q = _mm_mulhi_epi16(numers, _mm_set1_epi16(magic));
+    q = _mm_add_epi16(q, numers);  // q += numers
+
+    // If q is non-negative, we have nothing to do
+    // If q is negative, we want to add either (2**shift)-1 if d is
+    // a power of 2, or (2**shift) if it is not a power of 2
+    uint16_t is_power_of_2 = (magic == 0);
+    __m128i q_sign = _mm_srai_epi16(q, 15);  // q_sign = q >> 15
+    __m128i mask = _mm_set1_epi16(((uint16_t)1 << shift) - is_power_of_2);
+    q = _mm_add_epi16(q, _mm_and_si128(q_sign, mask));  // q = q + (q_sign & mask)
+    q = _mm_srai_epi16(q, shift);  // q >>= shift
+    q = _mm_sub_epi16(_mm_xor_si128(q, sign), sign);  // q = (q ^ sign) - sign
+    return q;
 }

 ////////// SINT32
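The SSE2 hunks above replace the old per-lane loop with real vector code. A sketch of how such a kernel is typically driven, assuming the including translation unit defines LIBDIVIDE_SSE2 before including libdivide.h; the function name is illustrative:

    // Illustrative sketch: eight uint16_t lanes divided per iteration.
    #include <emmintrin.h>
    #include <stddef.h>
    #include <stdint.h>
    #include "libdivide.h"

    static void divide_all_u16(uint16_t *data, size_t n, uint16_t d) {
        struct libdivide_u16_t div = libdivide_u16_gen(d);
        for (size_t i = 0; i + 8 <= n; i += 8) {
            __m128i v = _mm_loadu_si128((const __m128i *)&data[i]);
            v = libdivide_u16_do_vec128(v, &div);  // same results as scalar data[i] / d
            _mm_storeu_si128((__m128i *)&data[i], v);
        }
        // A scalar tail loop using libdivide_u16_do would handle the n % 8 leftovers.
    }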
@@ -2795,8 +2914,8 @@ __m128i libdivide_s64_do_vec128(__m128i numers, const struct libdivide_s64_t *de
         uint64_t mask = ((uint64_t)1 << shift) - 1;
         __m128i roundToZeroTweak = _mm_set1_epi64x(mask);
         // q = numer + ((numer >> 63) & roundToZeroTweak);
-        __m128i q =
-            _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
+        __m128i q = _mm_add_epi64(
+            numers, _mm_and_si128(libdivide_s64_signbits_vec128(numers), roundToZeroTweak));
         q = libdivide_s64_shift_right_vec128(q, shift);
         __m128i sign = _mm_set1_epi32((int8_t)more >> 7);
         // q = (q ^ sign) - sign;
@@ -2847,49 +2966,80 @@ __m128i libdivide_s64_branchfree_do_vec128(

 #ifdef __cplusplus

+//for constexpr zero initialization,
+//c++11 might handle things ok,
+//but just limit to at least c++14 to ensure
+//we don't break anyone's code:
+
+// for gcc and clang, use https://en.cppreference.com/w/cpp/feature_test#cpp_constexpr
+#if (defined(__GNUC__) || defined(__clang__)) && (__cpp_constexpr >= 201304L)
+#define LIBDIVIDE_CONSTEXPR constexpr
+
+// supposedly, MSVC might not implement feature test macros right (https://stackoverflow.com/questions/49316752/feature-test-macros-not-working-properly-in-visual-c)
+// so check that _MSVC_LANG corresponds to at least c++14, and _MSC_VER corresponds to at least VS 2017 15.0 (for extended constexpr support https://learn.microsoft.com/en-us/cpp/overview/visual-cpp-language-conformance?view=msvc-170)
+#elif defined(_MSC_VER) && _MSC_VER >= 1910 && defined(_MSVC_LANG) && _MSVC_LANG >=201402L
+#define LIBDIVIDE_CONSTEXPR constexpr
+
+// in case some other obscure compiler has the right __cpp_constexpr :
+#elif defined(__cpp_constexpr) && __cpp_constexpr >= 201304L
+#define LIBDIVIDE_CONSTEXPR constexpr
+
+#else
+#define LIBDIVIDE_CONSTEXPR LIBDIVIDE_INLINE
+#endif
+
 enum Branching {
     BRANCHFULL,  // use branching algorithms
     BRANCHFREE   // use branchfree algorithms
 };

+namespace detail {
+enum Signedness {
+    SIGNED,
+    UNSIGNED,
+};
+
 #if defined(LIBDIVIDE_NEON)
 // Helper to deduce NEON vector type for integral type.
-template <typename T>
-struct NeonVecFor {};
+template <int _WIDTH, Signedness _SIGN>
+struct NeonVec {};

 template <>
-struct NeonVecFor<uint16_t> {
+struct NeonVec<16, UNSIGNED> {
     typedef uint16x8_t type;
 };

 template <>
-struct NeonVecFor<int16_t> {
+struct NeonVec<16, SIGNED> {
     typedef int16x8_t type;
 };

 template <>
-struct NeonVecFor<uint32_t> {
+struct NeonVec<32, UNSIGNED> {
     typedef uint32x4_t type;
 };

 template <>
-struct NeonVecFor<int32_t> {
+struct NeonVec<32, SIGNED> {
     typedef int32x4_t type;
 };

 template <>
-struct NeonVecFor<uint64_t> {
+struct NeonVec<64, UNSIGNED> {
     typedef uint64x2_t type;
 };

 template <>
-struct NeonVecFor<int64_t> {
+struct NeonVec<64, SIGNED> {
     typedef int64x2_t type;
 };
-#endif

-// Versions of our algorithms for SIMD.
-#if defined(LIBDIVIDE_NEON)
+template <typename T>
+struct NeonVecFor {
+    // See 'class divider' for an explanation of these template parameters.
+    typedef typename NeonVec<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? SIGNED : UNSIGNED)>::type type;
+};
+
 #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE) \
     LIBDIVIDE_INLINE typename NeonVecFor<INT_TYPE>::type divide( \
         typename NeonVecFor<INT_TYPE>::type n) const { \
@@ -2898,6 +3048,7 @@ struct NeonVecFor {
 #else
 #define LIBDIVIDE_DIVIDE_NEON(ALGO, INT_TYPE)
 #endif
+
 #if defined(LIBDIVIDE_SSE2)
 #define LIBDIVIDE_DIVIDE_SSE2(ALGO) \
     LIBDIVIDE_INLINE __m128i divide(__m128i n) const { \
@@ -2930,6 +3081,7 @@ struct NeonVecFor {
 #define DISPATCHER_GEN(T, ALGO) \
     libdivide_##ALGO##_t denom; \
     LIBDIVIDE_INLINE dispatcher() {} \
+    explicit LIBDIVIDE_CONSTEXPR dispatcher(decltype(nullptr)) : denom{} {} \
     LIBDIVIDE_INLINE dispatcher(T d) : denom(libdivide_##ALGO##_gen(d)) {} \
     LIBDIVIDE_INLINE T divide(T n) const { return libdivide_##ALGO##_do(n, &denom); } \
     LIBDIVIDE_INLINE T recover() const { return libdivide_##ALGO##_recover(&denom); } \
@@ -2939,66 +3091,81 @@ struct NeonVecFor {
     LIBDIVIDE_DIVIDE_AVX512(ALGO)

 // The dispatcher selects a specific division algorithm for a given
-// type and ALGO using partial template specialization.
-template <typename _IntT, Branching ALGO>
+// width, signedness, and ALGO using partial template specialization.
+template <int _WIDTH, Signedness _SIGN, Branching _ALGO>
 struct dispatcher {};

 template <>
-struct dispatcher<int16_t, BRANCHFULL> {
+struct dispatcher<16, SIGNED, BRANCHFULL> {
     DISPATCHER_GEN(int16_t, s16)
 };
 template <>
-struct dispatcher<int16_t, BRANCHFREE> {
+struct dispatcher<16, SIGNED, BRANCHFREE> {
     DISPATCHER_GEN(int16_t, s16_branchfree)
 };
 template <>
-struct dispatcher<uint16_t, BRANCHFULL> {
+struct dispatcher<16, UNSIGNED, BRANCHFULL> {
     DISPATCHER_GEN(uint16_t, u16)
 };
 template <>
-struct dispatcher<uint16_t, BRANCHFREE> {
+struct dispatcher<16, UNSIGNED, BRANCHFREE> {
     DISPATCHER_GEN(uint16_t, u16_branchfree)
 };
 template <>
-struct dispatcher<int32_t, BRANCHFULL> {
+struct dispatcher<32, SIGNED, BRANCHFULL> {
     DISPATCHER_GEN(int32_t, s32)
 };
 template <>
-struct dispatcher<int32_t, BRANCHFREE> {
+struct dispatcher<32, SIGNED, BRANCHFREE> {
     DISPATCHER_GEN(int32_t, s32_branchfree)
 };
 template <>
-struct dispatcher<uint32_t, BRANCHFULL> {
+struct dispatcher<32, UNSIGNED, BRANCHFULL> {
     DISPATCHER_GEN(uint32_t, u32)
 };
 template <>
-struct dispatcher<uint32_t, BRANCHFREE> {
+struct dispatcher<32, UNSIGNED, BRANCHFREE> {
     DISPATCHER_GEN(uint32_t, u32_branchfree)
 };
 template <>
-struct dispatcher<int64_t, BRANCHFULL> {
+struct dispatcher<64, SIGNED, BRANCHFULL> {
     DISPATCHER_GEN(int64_t, s64)
 };
 template <>
-struct dispatcher<int64_t, BRANCHFREE> {
+struct dispatcher<64, SIGNED, BRANCHFREE> {
     DISPATCHER_GEN(int64_t, s64_branchfree)
 };
 template <>
-struct dispatcher<uint64_t, BRANCHFULL> {
+struct dispatcher<64, UNSIGNED, BRANCHFULL> {
     DISPATCHER_GEN(uint64_t, u64)
 };
 template <>
-struct dispatcher<uint64_t, BRANCHFREE> {
+struct dispatcher<64, UNSIGNED, BRANCHFREE> {
     DISPATCHER_GEN(uint64_t, u64_branchfree)
 };
+} // namespace detail
+
+#if defined(LIBDIVIDE_NEON)
+// Allow NeonVecFor outside of detail namespace.
+template <typename T>
+struct NeonVecFor {
+    typedef typename detail::NeonVecFor<T>::type type;
+};
+#endif
+

 // This is the main divider class for use by the user (C++ API).
 // The actual division algorithm is selected using the dispatcher struct
-// based on the integer and algorithm template parameters.
+// based on the integer width and algorithm template parameters.
 template <typename T, Branching ALGO = BRANCHFULL>
 class divider {
   private:
-    typedef dispatcher<T, ALGO> dispatcher_t;
+    // Dispatch based on the size and signedness.
+    // We avoid using type_traits as it's not available in AVR.
+    // Detect signedness by checking if T(-1) is less than T(0).
+    // Also throw in a shift by 0, which prevents floating point types from being passed.
+    typedef detail::dispatcher<sizeof(T) * 8, (((T)0 >> 0) > (T)(-1) ? detail::SIGNED : detail::UNSIGNED), ALGO>
+        dispatcher_t;

   public:
     // We leave the default constructor empty so that creating
@@ -3006,6 +3173,9 @@ class divider {
     // later doesn't slow us down.
     divider() {}

+    // constexpr zero-initialization to allow for use w/ static constinit
+    explicit LIBDIVIDE_CONSTEXPR divider(decltype(nullptr)) : div(nullptr) {}
+
     // Constructor that takes the divisor as a parameter
     LIBDIVIDE_INLINE divider(T d) : div(d) {}

@@ -3017,7 +3187,7 @@ class divider {
     T recover() const { return div.recover(); }

     bool operator==(const divider &other) const {
-        return div.denom.magic == other.denom.magic && div.denom.more == other.denom.more;
+        return div.denom.magic == other.div.denom.magic && div.denom.more == other.div.denom.more;
     }

     bool operator!=(const divider &other) const { return !(*this == other); }
@@ -3098,12 +3268,14 @@ LIBDIVIDE_INLINE __m512i operator/=(__m512i &n, const divider &div) {

 #if defined(LIBDIVIDE_NEON)
 template <typename T, Branching ALGO>
-LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
+LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/(
+    typename NeonVecFor<T>::type n, const divider<T, ALGO> &div) {
     return div.divide(n);
 }

 template <typename T, Branching ALGO>
-LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
+LIBDIVIDE_INLINE typename NeonVecFor<T>::type operator/=(
+    typename NeonVecFor<T>::type &n, const divider<T, ALGO> &div) {
     n = div.divide(n);
     return n;
 }
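The new LIBDIVIDE_CONSTEXPR machinery and the explicit divider(decltype(nullptr)) constructor added above exist so a divider can be zero-initialized at compile time and assigned a real divisor later. A sketch of that usage under C++20 (constinit), assuming the compiler takes a constexpr branch of LIBDIVIDE_CONSTEXPR; all names below are illustrative:

    // Illustrative sketch: compile-time zero-init with divider(nullptr), filled in later.
    #include <cstdint>
    #include "libdivide.h"

    // Zero-initialized at compile time; no dynamic initializer runs at startup.
    static constinit libdivide::divider<uint32_t> chunk_div(nullptr);

    void set_chunk_size(uint32_t runtime_divisor) {
        // Replace the placeholder once the real divisor is known (must be non-zero).
        chunk_div = libdivide::divider<uint32_t>(runtime_divisor);
    }

    uint32_t chunk_index(uint32_t offset) {
        return offset / chunk_div;  // operator/ overload provided by this header
    }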