libdivide-3.0/.gitignore

*.exe
*.obj
*.o
*.dSYM
benchmark
primes
tester
/build*
/.vscode
CMakeCache.txt
CMakeFiles
CMakeScripts
Makefile
cmake_install.cmake
install_manifest.txt
CTestTestfile.cmake

libdivide-3.0/CHANGELOG.md

# Changelog

This is a list of notable changes to libdivide.

## [3.0](https://github.com/ridiculousfish/libdivide/releases/tag/v3.0) - 2019-10-16
* BREAKING
  * ```libdivide.h``` now requires C++11 or later
* BUG FIXES
  * Support all 32-bit and 64-bit integer types in C++ ([#58](https://github.com/ridiculousfish/libdivide/issues/58))
  * Fix cross compilation ([#59](https://github.com/ridiculousfish/libdivide/issues/59))
* ENHANCEMENT
  * Add support for CMake ```find_package(libdivide)```

## [2.0](https://github.com/ridiculousfish/libdivide/releases/tag/v2.0) - 2019-06-22
* BREAKING
  * Removed unswitch functionality ([#46](https://github.com/ridiculousfish/libdivide/issues/46))
  * Renamed macro ```LIBDIVIDE_USE_SSE2``` to ```LIBDIVIDE_SSE2```
  * Renamed ```divider::recover_divisor()``` to ```divider::recover()```
* BUG FIXES
  * Remove ```_udiv128()``` as not yet supported by clang-cl and icl compilers
  * Fix C++ linker issue caused by anonymous namespace ([#54](https://github.com/ridiculousfish/libdivide/pull/54))
  * Fix clang-cl (Windows) linker issue ([#56](https://github.com/ridiculousfish/libdivide/issues/56))
* ENHANCEMENT
  * Add AVX2 & AVX512 vector division
  * Speed up SSE2 ```libdivide_mullhi_u64_vector()```
  * Support +1 & -1 signed branchfree dividers ([4a1d5a7](https://github.com/ridiculousfish/libdivide/commit/4a1d5a7008af7f401f8f39c2f44f3dd0116a9839))
  * Speed up unsigned branchfull power of 2 dividers ([2422199](https://github.com/ridiculousfish/libdivide/commit/24221996a082774bd486b04f329a1cdd1a80c8df))
  * Simplify C++ templates
  * Simplify ```more``` bit flags of the ```libdivide_*_t``` structs
  * Get rid of ```MAYBE_VECTOR()``` hack
* TESTING
  * ```tester.cpp```: Convert to modern C++
  * ```tester.cpp```: Add more test cases
  * ```benchmark_branchfree.cpp```: Convert to modern C++
  * ```benchmark.c```: Prevent compilers from optimizing too much
* BUILD
  * Automatically detect SSE2/AVX2/AVX512
* DOCS
  * ```doc/C-API.md```: Add C API reference
  * ```doc/CPP-API.md```: Add C++ API reference
  * ```README.md```: Add [vector division](https://github.com/ridiculousfish/libdivide#vector-division) and [performance tips](https://github.com/ridiculousfish/libdivide#performance-tips) sections

## [1.1](https://github.com/ridiculousfish/libdivide/releases/tag/v1.1) - 2019-05-29
* BUG FIXES
  * Fix bug in ```libdivide_128_div_64_to_64()``` ([#45](https://github.com/ridiculousfish/libdivide/issues/45))
  * Fix MSVC ARM 64-bit bug ([07931e9](https://github.com/ridiculousfish/libdivide/commit/07931e9cb670311ba67cd6a16fdbcb67bb97a592))
  * Fix ```-Wshift-count-overflow``` warning on avr CPU architecture ([#41](https://github.com/ridiculousfish/libdivide/pull/41))
  * Fix ```-Wshadow``` warning in ```libdivide_s32_do()```
  * Fix ```-Wignored-attributes``` warnings when compiling SSE2 code using GCC 9
* ENHANCEMENT
  * ```libdivide_128_div_64_to_64()```: optimize using ```_udiv128()``` for MSVC 2019 or later
```libdivide_128_div_64_to_64()```: optimize using ```__uint128_t``` for GCC/Clang on 64-bit CPU architectures * Add ```LIBDIVIDE_VERSION``` macro to ```libdivide.h``` * Clean up SSE2 code in ```libdivide.h``` * Increase runtime of test cases in ```primes_benchmark.cpp``` * BUILD * Remove windows directory with legacy Visual Studio project files * Move test programs to test directory ## [1.0](https://github.com/ridiculousfish/libdivide/releases/tag/v1.0) - 2018-01-21 * BREAKING * Branchfull divider must not be ```0``` ([#38](https://github.com/ridiculousfish/libdivide/pull/38)) * Branchfree divider must not be ```-1```, ```0```, ```1``` ([#38](https://github.com/ridiculousfish/libdivide/pull/38)) * ENHANCEMENT * Add proper error handling ([#38](https://github.com/ridiculousfish/libdivide/pull/38)) * Add C++ support for ```/=``` operator * Speedup 64-bit divisor recovery by up to 30% * Simplify C++ templates * Add include guards to ```libdivide.h```! * Get rid of ```goto``` in ```libdivide_128_div_64_to_64()``` * Use ```#if defined(MACRO)``` instead of ```#if MACRO``` * Silence compiler warnings from crash functions * TESTING * Tests should ```exit(1)``` on error, required by ```make test``` * Silence unused parameter warnings * Silence GCC 7.2.0 maybe uninitialized warnings * Silence unused return value warning * BUILD * Port build system from ```make``` to ```CMake``` * Automatically detect if the CPU and compiler support SSE2 * Automatically enable C++11 * DOCS * Update build instructions in ```README.md``` * Update benchmark section with branchfree divider * Add C example section * Add C++ example section * Add "Branchfull vs branchfree" section * Add section about unswitching * New ```CHANGELOG.md```file libdivide-3.0/CMakeLists.txt000066400000000000000000000170761355155642500160750ustar00rootroot00000000000000cmake_minimum_required(VERSION 3.4) set(LIBDIVIDE_VERSION "3.0") project(libdivide C CXX) include(CheckCXXCompilerFlag) include(CheckCXXSourceRuns) include(GNUInstallDirs) include(CMakePackageConfigHelpers) include(CMakePushCheckState) # Build options ################################################ option(BUILD_TESTS "Build the test programs" ON) option(ENABLE_VECTOR_EXTENSIONS "Enable CPU vector instructions for test programs" ON) # By default we automatically enable the widest vector # instruction set supported by your x86/x64 CPU. # But you can also force a specific vector instruction # set using the options below. option(LIBDIVIDE_SSE2 "Enable SSE2 vector instructions" OFF) option(LIBDIVIDE_AVX2 "Enable AVX2 vector instructions" OFF) option(LIBDIVIDE_AVX512 "Enable AVX512 vector instructions" OFF) # By default enable release mode ############################### if(NOT CMAKE_VERSION VERSION_LESS 3.9) get_property(isMultiConfig GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) elseif(CMAKE_CONFIGURATION_TYPES) set(isMultiConfig TRUE) endif() if(NOT isMultiConfig AND NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." FORCE) endif() # Enable assertions in debug mode ############################## string(TOUPPER "${CMAKE_BUILD_TYPE}" BUILD_TYPE) if("${BUILD_TYPE}" MATCHES DEBUG) set(LIBDIVIDE_ASSERTIONS -DLIBDIVIDE_ASSERTIONS_ON) endif() # Check if x86/x64 CPU ######################################## # Note that check_cxx_source_runs() must not be used when # cross-compiling otherwise the following error will occur: # CMake Error: TRY_RUN() invoked in cross-compiling mode, ... 
if (BUILD_TESTS AND ENABLE_VECTOR_EXTENSIONS AND NOT CMAKE_CROSSCOMPILING)
    check_cxx_source_runs("
        int main()
        {
            #if !defined(__i386__) && \
                !defined(__x86_64__) && \
                !defined(_M_IX86) && \
                !defined(_M_X64)
                Compile error: not x86 CPU architecture
            #endif
            return 0;
        }" cpu_x86)

    if (cpu_x86)
        set(ENABLE_X86_VECTOR_EXTENSIONS ON)
        cmake_push_check_state()
        set(CMAKE_REQUIRED_FLAGS -Werror)
        check_cxx_compiler_flag(-march=native march_native)
        cmake_pop_check_state()

        if (march_native)
            # -march=native required for AVX2/AVX512 on x86
            set(NATIVE_FLAG -march=native)
        endif()
    endif()
endif()

# Disable auto vectorization ###################################

# We disable auto vectorization on x86 (and x86-64) in order
# to prevent the compiler from vectorizing our scalar benchmarks
# which would make the benchmark results less useful.

if(ENABLE_X86_VECTOR_EXTENSIONS)
    if("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang")
        check_cxx_compiler_flag("-fno-vectorize" fno_vectorize)
        if(fno_vectorize)
            set(NO_VECTORIZE -fno-vectorize)
        endif()
    elseif("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU")
        check_cxx_compiler_flag("-fno-tree-vectorize" fno_tree_vectorize)
        if(fno_tree_vectorize)
            set(NO_VECTORIZE -fno-tree-vectorize)
        endif()
    endif()
endif()

# Check if CPU supports AVX512/AVX2/SSE2 #######################

if(LIBDIVIDE_AVX512)
    set(LIBDIVIDE_VECTOR_EXT -DLIBDIVIDE_AVX512)
elseif(LIBDIVIDE_AVX2)
    set(LIBDIVIDE_VECTOR_EXT -DLIBDIVIDE_AVX2)
elseif(LIBDIVIDE_SSE2)
    set(LIBDIVIDE_VECTOR_EXT -DLIBDIVIDE_SSE2)
elseif(ENABLE_X86_VECTOR_EXTENSIONS)
    cmake_push_check_state()

    if(march_native)
        set(CMAKE_REQUIRED_FLAGS "-march=native")
    endif()

    check_cxx_source_runs("
        #include <immintrin.h>
        int main()
        {
            __m512i a = _mm512_set_epi32(1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0);
            __m512i b = _mm512_set_epi32(0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1);
            __m512i c = _mm512_add_epi32(a, b);
            __m512i d = _mm512_srai_epi32(c, 23);
            d = _mm512_shuffle_epi32(d, (_MM_PERM_ENUM) 0xB1);
            return 0;
        }" avx512)

    if(avx512)
        set(LIBDIVIDE_VECTOR_EXT -DLIBDIVIDE_AVX512)
    else()
        check_cxx_source_runs("
            #include <immintrin.h>
            int main()
            {
                __m256i a = _mm256_set_epi32(1, 0, 1, 0, 1, 0, 1, 0);
                __m256i b = _mm256_set_epi32(0, 1, 0, 1, 0, 1, 0, 1);
                b = _mm256_add_epi32(a, b);
                return 0;
            }" avx2)

        if(avx2)
            set(LIBDIVIDE_VECTOR_EXT -DLIBDIVIDE_AVX2)
        else()
            check_cxx_source_runs("
                #include <emmintrin.h>
                int main()
                {
                    __m128i a = _mm_set_epi32(1, 0, 1, 0);
                    __m128i b = _mm_set_epi32(0, 1, 0, 1);
                    b = _mm_add_epi32(a, b);
                    return 0;
                }" sse2)

            if(sse2)
                set(LIBDIVIDE_VECTOR_EXT -DLIBDIVIDE_SSE2)
            endif()
        endif()
    endif()

    cmake_pop_check_state()
endif()

# libdivide header-only library target #########################

add_library(libdivide INTERFACE)
add_library(libdivide::libdivide ALIAS libdivide)

target_compile_features(libdivide INTERFACE cxx_alias_templates)

target_include_directories(libdivide INTERFACE
    $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
    $<INSTALL_INTERFACE:${CMAKE_INSTALL_INCLUDEDIR}>)

install(FILES libdivide.h
        COMPONENT libdivide-header
        DESTINATION "${CMAKE_INSTALL_INCLUDEDIR}")

# CMake find_package(libdivide) support ########################

install(TARGETS libdivide
        EXPORT libdivideConfig
        RUNTIME DESTINATION "${CMAKE_INSTALL_BINDIR}"
        LIBRARY DESTINATION "${CMAKE_INSTALL_LIBDIR}"
        ARCHIVE DESTINATION "${CMAKE_INSTALL_LIBDIR}")

export(TARGETS libdivide
       NAMESPACE libdivide::
       FILE "${CMAKE_CURRENT_BINARY_DIR}/libdivideConfig.cmake")

install(EXPORT libdivideConfig
        NAMESPACE libdivide::
        DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/libdivide")

write_basic_package_version_file(
    "${CMAKE_CURRENT_BINARY_DIR}/libdivideConfigVersion.cmake"
    VERSION ${LIBDIVIDE_VERSION}
    COMPATIBILITY SameMajorVersion)
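# Example of consuming the installed package from a downstream
# project (illustrative only, not part of this build;
# "myprog"/"main.cpp" are placeholder names):
#
#   find_package(libdivide REQUIRED)
#   add_executable(myprog main.cpp)
#   target_link_libraries(myprog libdivide::libdivide)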
install(FILES "${CMAKE_CURRENT_BINARY_DIR}/libdivideConfigVersion.cmake" DESTINATION "${CMAKE_INSTALL_LIBDIR}/cmake/libdivide") # Build test programs ########################################## if (BUILD_TESTS) find_package(Threads REQUIRED QUIET) add_executable(tester test/tester.cpp) add_executable(benchmark test/benchmark.c) add_executable(benchmark_branchfree test/benchmark_branchfree.cpp) target_link_libraries(tester libdivide Threads::Threads) target_link_libraries(benchmark libdivide) target_link_libraries(benchmark_branchfree libdivide) target_compile_options(tester PRIVATE "${NATIVE_FLAG}" "${NO_VECTORIZE}") target_compile_options(benchmark PRIVATE "${NATIVE_FLAG}" "${NO_VECTORIZE}") target_compile_options(benchmark_branchfree PRIVATE "${NATIVE_FLAG}" "${NO_VECTORIZE}") target_compile_definitions(tester PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}") target_compile_definitions(benchmark PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}") target_compile_definitions(benchmark_branchfree PRIVATE "${LIBDIVIDE_ASSERTIONS}" "${LIBDIVIDE_VECTOR_EXT}") endif() # Enable testing ############################################### if (BUILD_TESTS) enable_testing() add_test(tester tester) add_test(benchmark_branchfree benchmark_branchfree) endif() libdivide-3.0/LICENSE.txt000066400000000000000000000052201355155642500151440ustar00rootroot00000000000000 libdivide is made available under two licenses. You may choose either of the following licenses when using libdivide. zlib License ------------ Copyright (C) 2010 - 2019 ridiculous_fish, Copyright (C) 2016 - 2019 Kim Walisch, This software is provided 'as-is', without any express or implied warranty. In no event will the authors be held liable for any damages arising from the use of this software. Permission is granted to anyone to use this software for any purpose, including commercial applications, and to alter it and redistribute it freely, subject to the following restrictions: 1. The origin of this software must not be misrepresented; you must not claim that you wrote the original software. If you use this software in a product, an acknowledgment in the product documentation would be appreciated but is not required. 2. Altered source versions must be plainly marked as such, and must not be misrepresented as being the original software. 3. This notice may not be removed or altered from any source distribution. Boost License ------------- Copyright (C) 2010 - 2019 ridiculous_fish, Copyright (C) 2016 - 2019 Kim Walisch, Boost Software License - Version 1.0 - August 17th, 2003 Permission is hereby granted, free of charge, to any person or organization obtaining a copy of the software and accompanying documentation covered by this license (the "Software") to use, reproduce, display, distribute, execute, and transmit the Software, and to prepare derivative works of the Software, and to permit third-parties to whom the Software is furnished to do so, all subject to the following: The copyright notices in the Software and this entire statement, including the above license grant, this restriction and the following disclaimer, must be included in all copies of the Software, in whole or in part, and all derivative works of the Software, unless such copies or derivative works are solely in the form of machine-executable object code generated by a source language processor. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE, TITLE AND NON-INFRINGEMENT. IN NO EVENT
SHALL THE COPYRIGHT HOLDERS OR ANYONE DISTRIBUTING THE SOFTWARE BE LIABLE
FOR ANY DAMAGES OR OTHER LIABILITY, WHETHER IN CONTRACT, TORT OR OTHERWISE,
ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
DEALINGS IN THE SOFTWARE.

libdivide-3.0/README.md

# libdivide

[![Build Status](https://ci.appveyor.com/api/projects/status/github/ridiculousfish/libdivide?branch=master&svg=true)](https://ci.appveyor.com/project/kimwalisch/libdivide)
[![Github Releases](https://img.shields.io/github/release/ridiculousfish/libdivide.svg)](https://github.com/ridiculousfish/libdivide/releases)

```libdivide.h``` is a header-only C/C++ library for optimizing integer division.

Integer division is one of the slowest instructions on most CPUs, e.g. on current x64 CPUs a 64-bit integer division has a latency of up to 90 clock cycles whereas a multiplication has a latency of only 3 clock cycles. libdivide allows you to replace expensive integer division instructions by a sequence of shift, add and multiply instructions that will calculate the integer division much faster.

On current CPUs you can get a **speedup of up to 10x** for 64-bit integer division and a speedup of up to 5x for 32-bit integer division when using libdivide. libdivide also supports [SSE2](https://en.wikipedia.org/wiki/SSE2), [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) and [AVX512](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) vector division which provides an even larger speedup. You can test how much speedup you can achieve on your CPU using the [benchmark](#benchmark-program) program.

See https://libdivide.com for more information on libdivide.

# C++ example

The first code snippet divides all integers in a vector using integer division. This is slow, as integer division is at least one order of magnitude slower than any other integer arithmetic operation on current CPUs.

```C++
void divide(std::vector<int64_t>& vect, int64_t divisor)
{
    // Slow, uses integer division
    for (auto& n : vect)
        n /= divisor;
}
```

The second code snippet runs much faster: it uses libdivide to compute the integer division using a sequence of shift, add and multiply instructions, hence avoiding the slow integer division operation.

```C++
#include "libdivide.h"

void divide(std::vector<int64_t>& vect, int64_t divisor)
{
    libdivide::divider<int64_t> fast_d(divisor);

    // Fast, computes division using libdivide
    for (auto& n : vect)
        n /= fast_d;
}
```

Generally libdivide will give a significant speedup if:

* The divisor is only known at runtime
* The divisor is reused multiple times e.g. in a loop

# C example

You first need to generate a libdivide divider using one of the ```libdivide_*_gen``` functions (```*```: ```s32```, ```u32```, ```s64```, ```u64```) which can then be used to compute the actual integer division using the corresponding ```libdivide_*_do``` function.

```C
#include "libdivide.h"

void divide(int64_t *array, size_t size, int64_t divisor)
{
    struct libdivide_s64_t fast_d = libdivide_s64_gen(divisor);

    // Fast, computes division using libdivide
    for (size_t i = 0; i < size; i++)
        array[i] = libdivide_s64_do(array[i], &fast_d);
}
```

# API reference

* [C API](https://github.com/ridiculousfish/libdivide/blob/master/doc/C-API.md)
* [C++ API](https://github.com/ridiculousfish/libdivide/blob/master/doc/CPP-API.md)

# Branchfull vs branchfree

The default libdivide divider makes use of [branches](https://en.wikipedia.org/wiki/Branch_(computer_science)) to compute the integer division. When the same divider is used inside a hot loop, as in the C++ example section, the CPU will accurately predict the branches and there will be no performance slowdown. Often the compiler is even able to move the branches outside the body of the loop, completely eliminating them; this is called loop-invariant code motion.

libdivide also has a branchfree divider type which computes the integer division without using any branch instructions. The branchfree divider generally uses a few more instructions than the default branchfull divider. The main use case for the branchfree divider is when you have an array of different divisors and you need to iterate over the divisors. In this case the default branchfull divider would exhibit poor performance as the CPU won't be able to correctly predict the branches.

```C++
#include "libdivide.h"

// 64-bit branchfree divider type
using branchfree_t = libdivide::branchfree_divider<uint64_t>;

uint64_t divide(uint64_t x, std::vector<branchfree_t>& vect)
{
    uint64_t sum = 0;

    for (auto& fast_d : vect)
        sum += x / fast_d;

    return sum;
}
```

Caveats of the branchfree divider:

* Unsigned branchfree divider cannot be ```1```
* Faster for unsigned types than for signed types

# Vector division

libdivide supports [SSE2](https://en.wikipedia.org/wiki/SSE2), [AVX2](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) and [AVX512](https://en.wikipedia.org/wiki/Advanced_Vector_Extensions) vector division on x86 and x64 CPUs. In the example below we divide the packed 32-bit integers inside an AVX512 vector using libdivide. libdivide supports 32-bit and 64-bit vector division for both signed and unsigned integers.

```C++
#include "libdivide.h"

void divide(std::vector<__m512i>& vect, uint32_t divisor)
{
    libdivide::divider<uint32_t> fast_d(divisor);

    // AVX512 vector division
    for (auto& n : vect)
        n /= fast_d;
}
```

Note that you need to define one of the macros below to enable vector division:

* ```LIBDIVIDE_SSE2```
* ```LIBDIVIDE_AVX2```
* ```LIBDIVIDE_AVX512```

# Performance tips

* If possible use unsigned integer types because libdivide's unsigned division is measurably faster than its signed division. This is especially true for the branchfree divider.
* Try both the default branchfull divider and the branchfree divider in your program and choose the one that performs best. The branchfree divider is more likely to get auto vectorized by the compiler (if you compile with e.g. ```-march=native```). But don't forget that the unsigned branchfree divider cannot be 1.
* Vector division is much faster for 32-bit than for 64-bit. This is because there are currently no vector multiplication instructions on x86 to efficiently calculate a 64-bit * 64-bit to 128-bit product.

# Build instructions

libdivide has one test program and two benchmark programs which can be built using cmake and a recent C++ compiler that supports C++11 or later.
Optionally ```libdivide.h``` can also be installed to ```/usr/local/include```. ```bash cmake . make -j sudo make install ``` # Tester program You can pass the **tester** program one or more of the following arguments: ```u32```, ```s32```, ```u64```, ```s64``` to test the four cases (signed, unsigned, 32-bit, or 64-bit), or run it with no arguments to test all four. The tester will verify the correctness of libdivide via a set of randomly chosen numerators and denominators, by comparing the result of libdivide's division to hardware division. It will stop with an error message as soon as it finds a discrepancy. # Benchmark program You can pass the **benchmark** program one or more of the following arguments: ```u32```, ```s32```, ```u64```, ```s64``` to compare libdivide's speed against hardware division. **benchmark** tests a simple function that inputs an array of random numerators and a single divisor, and returns the sum of their quotients. It tests this using both hardware division, and the various division approaches supported by libdivide, including vector division. It will output data like this: ```bash # system scalar scl_bf vector vec_bf gener algo 1 9.684 0.792 0.783 0.426 0.426 1.346 0 2 9.078 0.781 1.609 0.426 1.529 1.346 0 3 9.078 1.355 1.609 1.334 1.531 29.045 1 4 9.076 0.787 1.609 0.426 1.529 1.346 0 5 9.074 1.349 1.609 1.334 1.531 29.045 1 6 9.078 1.349 1.609 1.334 1.531 29.045 1 ... ``` It will keep going as long as you let it, so it's best to stop it when you are happy with the denominators tested. These columns have the following significance. All times are in nanoseconds, lower is better. ```bash #: The divisor that is tested system: Hardware divide time scalar: libdivide time, using scalar division scl_bf: libdivide time, using scalar branchfree division vector: libdivide time, using vector division vec_bf: libdivide time, using vector branchfree division gener: Time taken to generate the divider struct algo: The algorithm used. ``` The **benchmark** program will also verify that each function returns the same value, so benchmark is valuable for its verification as well. # Contributing We currently do not have automated testing! Hence, before sending in patches, it would be nice if you compiled your new code at high warning levels using at least MSVC and GCC (or Clang). Also run the tester program to verify correctness and the benchmark programs to ensure you have not introduced any performance regressions. ### Happy hacking! libdivide-3.0/appveyor.yml000066400000000000000000000056101355155642500157140ustar00rootroot00000000000000# Automated Windows and Linux testing using appveyor.com # https://ci.appveyor.com/projects version: '{branch}-{build}' image: - Ubuntu1804 - Visual Studio 2015 - Visual Studio 2017 platform: - x86 - x64 for: - matrix: only: - image: Ubuntu1804 platform: x86 environment: CFLAGS: "-Wall -Wextra -pedantic -Werror -O1 -g -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer" CXXFLAGS: "-Wall -Wextra -pedantic -Werror -O1 -g -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer" install: - sudo apt install --yes cppcheck build_script: - cmake . -DCMAKE_BUILD_TYPE=Debug - make VERBOSE=1 test_script: - cppcheck . 
--error-exitcode=1 --force -i doc - ./tester - ./benchmark_branchfree - matrix: only: - image: Ubuntu1804 platform: x64 environment: CFLAGS: "-Wall -Wextra -pedantic -Werror -O1 -g -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer" CXXFLAGS: "-Wall -Wextra -pedantic -Werror -O1 -g -fsanitize=address -fsanitize=undefined -fno-sanitize-recover=all -fno-omit-frame-pointer" install: - sudo apt install --yes cppcheck build_script: - CC=clang CXX=clang++ cmake . -DCMAKE_BUILD_TYPE=Debug - make VERBOSE=1 test_script: - cppcheck . --error-exitcode=1 --force -i doc - ./tester - ./benchmark_branchfree - matrix: only: - image: Visual Studio 2015 platform: x86 environment: CFLAGS: "/DLIBDIVIDE_ASSERTIONS_ON" CXXFLAGS: "/DLIBDIVIDE_ASSERTIONS_ON" build_script: - cmake . -G "Visual Studio 14 2015" - cmake --build . --config Release test_script: - cd Release - tester.exe - benchmark_branchfree.exe - matrix: only: - image: Visual Studio 2015 platform: x64 environment: CFLAGS: "/DLIBDIVIDE_ASSERTIONS_ON" CXXFLAGS: "/DLIBDIVIDE_ASSERTIONS_ON" build_script: - cmake . -G "Visual Studio 14 2015 Win64" - cmake --build . --config Release test_script: - cd Release - tester.exe - benchmark_branchfree.exe - matrix: only: - image: Visual Studio 2017 platform: x86 build_script: - cmake . -G "Visual Studio 15 2017" - cmake --build . --config Release test_script: - cd Release - tester.exe - benchmark_branchfree.exe - matrix: only: - image: Visual Studio 2017 platform: x64 environment: CFLAGS: "/W3 /WX /DLIBDIVIDE_ASSERTIONS_ON" CXXFLAGS: "/W3 /WX /DLIBDIVIDE_ASSERTIONS_ON" build_script: - cmake . -G "Visual Studio 15 2017 Win64" - cmake --build . --config Release test_script: - cd Release - tester.exe - benchmark_branchfree.exe libdivide-3.0/doc/000077500000000000000000000000001355155642500140675ustar00rootroot00000000000000libdivide-3.0/doc/C-API.md000066400000000000000000000123311355155642500152020ustar00rootroot00000000000000# libdivide C API Note that all of libdivide's public API functions are declared as ```static inline``` for performance reasons, however ```static inline``` is omitted in the code sections below in order to increase readability. 
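The typical flow is to generate a divider once and then reuse it for many divisions. Here is a minimal end-to-end sketch (illustrative only, not part of the upstream docs) tying the ```gen```, ```do``` and ```recover``` function families together:

```C
#include "libdivide.h"
#include <assert.h>
#include <stdint.h>

int main(void)
{
    /* Generate a divider for 7 once, reuse it, then recover the divisor */
    struct libdivide_u32_t d7 = libdivide_u32_gen(7);
    uint32_t q = libdivide_u32_do(100, &d7);
    assert(q == 100 / 7);
    assert(libdivide_u32_recover(&d7) == 7);
    return 0;
}
```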
## Generate libdivide divider ```C /* Generate a libdivide divider */ struct libdivide_s32_t libdivide_s32_gen(int32_t d); struct libdivide_u32_t libdivide_u32_gen(uint32_t d); struct libdivide_s64_t libdivide_s64_gen(int64_t d); struct libdivide_u64_t libdivide_u64_gen(uint64_t d); /* Generate a branchfree libdivide divider */ struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); ``` ## libdivide division ```C /* libdivide division */ int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom); int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom); /* libdivide branchfree division */ int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom); uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom); int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom); uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom); ``` ## libdivide SSE2 vector division ```C /* libdivide SSE2 division */ __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); /* libdivide SSE2 branchfree division */ __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom); __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); ``` You need to define ```LIBDIVIDE_SSE2``` to enable SSE2 vector division. ## libdivide AVX2 vector division ```C /* libdivide AVX2 division */ __m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom); __m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom); __m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom); __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom); /* libdivide AVX2 branchfree division */ __m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom); __m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom); __m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom); __m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom); ``` You need to define ```LIBDIVIDE_AVX2``` to enable AVX2 vector division. 
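As a usage sketch (assuming ```LIBDIVIDE_SSE2``` is defined and the CPU supports SSE2; ```divide_sse2``` is an illustrative name), the SSE2 functions divide four packed 32-bit integers per call:

```C
#include "libdivide.h" // compile with -DLIBDIVIDE_SSE2
#include <emmintrin.h>
#include <stddef.h>
#include <stdint.h>

void divide_sse2(__m128i *data, size_t n, uint32_t divisor)
{
    struct libdivide_u32_t fast_d = libdivide_u32_gen(divisor);

    /* Each call divides 4 x uint32_t by divisor */
    for (size_t i = 0; i < n; i++)
        data[i] = libdivide_u32_do_vector(data[i], &fast_d);
}
```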
## libdivide AVX512 vector division

```C
/* libdivide AVX512 division */
__m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom);
__m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom);
__m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom);
__m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom);

/* libdivide AVX512 branchfree division */
__m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom);
__m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom);
__m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom);
__m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom);
```

You need to define ```LIBDIVIDE_AVX512``` to enable AVX512 vector division.

## Recover divider

```C
/* Recover the original divider */
int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom);
uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom);
int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom);
uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom);

/* Recover the original divider */
int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom);
uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom);
int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom);
uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom);
```

libdivide-3.0/doc/CPP-API.md

# libdivide C++ API

The entire content of ```libdivide.h``` is wrapped inside the ```libdivide``` namespace; for clarity, the ```libdivide``` namespace is omitted in the code sections below.

## divider class

```C++
// This is the main divider class for use by the user (C++ API).
// The actual division algorithm is selected using the dispatcher struct
// based on the integer and algorithm template parameters.
template<typename T, Branching ALGO = BRANCHFULL>
class divider {
public:
    // Generate a libdivide divisor for d
    divider(T d);

    // Recover the original divider
    T recover() const;

    bool operator==(const divider<T, ALGO>& other) const;
    bool operator!=(const divider<T, ALGO>& other) const;

    // ...

private:
    // Storage for the actual divisor
    dispatcher<T, ALGO> div;
};
```

## branchfree_divider

```branchfree_divider``` is a convenience typedef which redirects to the divider class:

```C++
template<typename T>
using branchfree_divider = divider<T, BRANCHFREE>;
```

## Operator ```/``` and ```/=```

```C++
// Overload of operator /
template<typename T, Branching ALGO>
T operator/(T n, const divider<T, ALGO>& div);

// Overload of operator /=
template<typename T, Branching ALGO>
T& operator/=(T& n, const divider<T, ALGO>& div);
```

## SSE2 vector division

```C++
// Overload of operator /
template<typename T, Branching ALGO>
__m128i operator/(__m128i n, const divider<T, ALGO>& div);

// Overload of operator /=
template<typename T, Branching ALGO>
__m128i& operator/=(__m128i& n, const divider<T, ALGO>& div);
```

You need to define ```LIBDIVIDE_SSE2``` to enable SSE2 vector division.

## AVX2 vector division

```C++
// Overload of operator /
template<typename T, Branching ALGO>
__m256i operator/(__m256i n, const divider<T, ALGO>& div);

// Overload of operator /=
template<typename T, Branching ALGO>
__m256i& operator/=(__m256i& n, const divider<T, ALGO>& div);
```

You need to define ```LIBDIVIDE_AVX2``` to enable AVX2 vector division.
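These overloads make vector division look like scalar division. A minimal sketch (assuming ```LIBDIVIDE_AVX2``` is defined and the target supports AVX2; ```divide_avx2``` is an illustrative name):

```C++
#include "libdivide.h" // compile with -DLIBDIVIDE_AVX2 and e.g. -mavx2
#include <cstdint>
#include <vector>

void divide_avx2(std::vector<__m256i>& data, uint32_t divisor)
{
    libdivide::divider<uint32_t> fast_d(divisor);

    // Each /= divides eight packed 32-bit integers
    for (auto& n : data)
        n /= fast_d;
}
```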
## AVX512 vector division

```C++
// Overload of operator /
template<typename T, Branching ALGO>
__m512i operator/(__m512i n, const divider<T, ALGO>& div);

// Overload of operator /=
template<typename T, Branching ALGO>
__m512i& operator/=(__m512i& n, const divider<T, ALGO>& div);
```

You need to define ```LIBDIVIDE_AVX512``` to enable AVX512 vector division.

libdivide-3.0/doc/divide_by_constants_codegen_reference.c

/* Reference implementations of computing and using the "magic number"
 approach to dividing by constants, including codegen instructions.
 The unsigned division incorporates the "round down" optimization per
 ridiculous_fish.

 This is free and unencumbered software. Any copyright is dedicated
 to the Public Domain.
 */

#include <limits.h> // for CHAR_BIT
#include <assert.h>

/* Types used in the computations below. These can be redefined to the
 types appropriate for the desired division type (i.e. uint can be
 defined as unsigned long long). Note that the uint type is used in
 compute_signed_magic_info, so the uint type must not be smaller than
 the sint type.
 */
typedef unsigned int uint;
typedef signed int sint;

/* Computes "magic info" for performing signed division by a fixed
 integer D. The type 'sint' is assumed to be defined as a signed
 integer type large enough to hold both the dividend and the divisor.
 Here >> is arithmetic (signed) shift, and >>> is logical shift.

 To emit code for n/d, rounding towards zero, use the following sequence:

     m = compute_signed_magic_info(D)
     emit("result = (m.multiplier * n) >> SINT_BITS");
     if d > 0 and m.multiplier < 0: emit("result += n")
     if d < 0 and m.multiplier > 0: emit("result -= n")
     if m.shift > 0: emit("result >>= m.shift")
     emit("result += (result < 0)")

 The shifts by SINT_BITS may be "free" if the high half of the full
 multiply is put in a separate register.

 The final add can of course be implemented via the sign bit, e.g.

     result += (result >>> (SINT_BITS - 1))
 or
     result -= (result >> (SINT_BITS - 1))

 This code is heavily indebted to Hacker's Delight by Henry Warren.
 See http://www.hackersdelight.org/HDcode/magic.c.txt
 Used with permission from http://www.hackersdelight.org/permissions.htm
 */
struct magics_info {
    sint multiplier; // the "magic number" multiplier
    unsigned shift; // shift for the dividend after multiplying
};
struct magics_info compute_signed_magic_info(sint D);

/* Computes "magic info" for performing unsigned division by a fixed
 positive integer D. The type 'uint' is assumed to be defined as an
 unsigned integer type large enough to hold both the dividend and the
 divisor. num_bits can be set appropriately if n is known to be smaller
 than the largest uint; if this is not known then pass
 (sizeof(uint) * CHAR_BIT) for num_bits.

 Assume we have a hardware register of width UINT_BITS, a known constant
 D which is not zero and not a power of 2, and a variable n of width
 num_bits (which may be up to UINT_BITS).
To emit code for n/d, use one of the two following sequences (here >>> refers to a logical bitshift): m = compute_unsigned_magic_info(D, num_bits) if m.pre_shift > 0: emit("n >>>= m.pre_shift") if m.increment: emit("n = saturated_increment(n)") emit("result = (m.multiplier * n) >>> UINT_BITS") if m.post_shift > 0: emit("result >>>= m.post_shift") or m = compute_unsigned_magic_info(D, num_bits) if m.pre_shift > 0: emit("n >>>= m.pre_shift") emit("result = m.multiplier * n") if m.increment: emit("result = result + m.multiplier") emit("result >>>= UINT_BITS") if m.post_shift > 0: emit("result >>>= m.post_shift") The shifts by UINT_BITS may be "free" if the high half of the full multiply is put in a separate register. saturated_increment(n) means "increment n unless it would wrap to 0," i.e. if n == (1 << UINT_BITS)-1: result = n else: result = n+1 A common way to implement this is with the carry bit. For example, on x86: add 1 sbb 0 Some invariants: 1: At least one of pre_shift and increment is zero 2: multiplier is never zero This code incorporates the "round down" optimization per ridiculous_fish. */ struct magicu_info { uint multiplier; // the "magic number" multiplier unsigned pre_shift; // shift for the dividend before multiplying unsigned post_shift; //shift for the dividend after multiplying int increment; // 0 or 1; if set then increment the numerator, using one of the two strategies }; struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits); /* Implementations follow */ struct magicu_info compute_unsigned_magic_info(uint D, unsigned num_bits) { //The numerator must fit in a uint assert(num_bits > 0 && num_bits <= sizeof(uint) * CHAR_BIT); // D must be larger than zero and not a power of 2 assert(D & (D-1)); // The eventual result struct magicu_info result; // Bits in a uint const unsigned UINT_BITS = sizeof(uint) * CHAR_BIT; // The extra shift implicit in the difference between UINT_BITS and num_bits const unsigned extra_shift = UINT_BITS - num_bits; // The initial power of 2 is one less than the first one that can possibly work const uint initial_power_of_2 = (uint)1 << (UINT_BITS-1); // The remainder and quotient of our power of 2 divided by d uint quotient = initial_power_of_2 / D, remainder = initial_power_of_2 % D; // ceil(log_2 D) unsigned ceil_log_2_D; // The magic info for the variant "round down" algorithm uint down_multiplier = 0; unsigned down_exponent = 0; int has_magic_down = 0; // Compute ceil(log_2 D) ceil_log_2_D = 0; uint tmp; for (tmp = D; tmp > 0; tmp >>= 1) ceil_log_2_D += 1; // Begin a loop that increments the exponent, until we find a power of 2 that works. unsigned exponent; for (exponent = 0; ; exponent++) { // Quotient and remainder is from previous exponent; compute it for this exponent. if (remainder >= D - remainder) { // Doubling remainder will wrap around D quotient = quotient * 2 + 1; remainder = remainder * 2 - D; } else { // Remainder will not wrap quotient = quotient * 2; remainder = remainder * 2; } // We're done if this exponent works for the round_up algorithm. // Note that exponent may be larger than the maximum shift supported, // so the check for >= ceil_log_2_D is critical. if ((exponent + extra_shift >= ceil_log_2_D) || (D - remainder) <= ((uint)1 << (exponent + extra_shift))) break; // Set magic_down if we have not set it yet and this exponent works for the round_down algorithm if (! 
has_magic_down && remainder <= ((uint)1 << (exponent + extra_shift))) { has_magic_down = 1; down_multiplier = quotient; down_exponent = exponent; } } if (exponent < ceil_log_2_D) { // magic_up is efficient result.multiplier = quotient + 1; result.pre_shift = 0; result.post_shift = exponent; result.increment = 0; } else if (D & 1) { // Odd divisor, so use magic_down, which must have been set assert(has_magic_down); result.multiplier = down_multiplier; result.pre_shift = 0; result.post_shift = down_exponent; result.increment = 1; } else { // Even divisor, so use a prefix-shifted dividend unsigned pre_shift = 0; uint shifted_D = D; while ((shifted_D & 1) == 0) { shifted_D >>= 1; pre_shift += 1; } result = compute_unsigned_magic_info(shifted_D, num_bits - pre_shift); assert(result.increment == 0 && result.pre_shift == 0); //expect no increment or pre_shift in this path result.pre_shift = pre_shift; } return result; } struct magics_info compute_signed_magic_info(sint D) { // D must not be zero and must not be a power of 2 (or its negative) assert(D != 0 && (D & -D) != D && (D & -D) != -D); // Our result struct magics_info result; // Bits in an sint const unsigned SINT_BITS = sizeof(sint) * CHAR_BIT; // Absolute value of D (we know D is not the most negative value since that's a power of 2) const uint abs_d = (D < 0 ? -D : D); // The initial power of 2 is one less than the first one that can possibly work // "two31" in Warren unsigned exponent = SINT_BITS - 1; const uint initial_power_of_2 = (uint)1 << exponent; // Compute the absolute value of our "test numerator," // which is the largest dividend whose remainder with d is d-1. // This is called anc in Warren. const uint tmp = initial_power_of_2 + (D < 0); const uint abs_test_numer = tmp - 1 - tmp % abs_d; // Initialize our quotients and remainders (q1, r1, q2, r2 in Warren) uint quotient1 = initial_power_of_2 / abs_test_numer, remainder1 = initial_power_of_2 % abs_test_numer; uint quotient2 = initial_power_of_2 / abs_d, remainder2 = initial_power_of_2 % abs_d; uint delta; // Begin our loop do { // Update the exponent exponent++; // Update quotient1 and remainder1 quotient1 *= 2; remainder1 *= 2; if (remainder1 >= abs_test_numer) { quotient1 += 1; remainder1 -= abs_test_numer; } // Update quotient2 and remainder2 quotient2 *= 2; remainder2 *= 2; if (remainder2 >= abs_d) { quotient2 += 1; remainder2 -= abs_d; } // Keep going as long as (2**exponent) / abs_d <= delta delta = abs_d - remainder2; } while (quotient1 < delta || (quotient1 == delta && remainder1 == 0)); result.multiplier = quotient2 + 1; if (D < 0) result.multiplier = -result.multiplier; result.shift = exponent - SINT_BITS; return result; } libdivide-3.0/libdivide.h000066400000000000000000002340411355155642500154320ustar00rootroot00000000000000// libdivide.h - Optimized integer division // https://libdivide.com // // Copyright (C) 2010 - 2019 ridiculous_fish, // Copyright (C) 2016 - 2019 Kim Walisch, // // libdivide is dual-licensed under the Boost or zlib licenses. // You may use libdivide under the terms of either of these. // See LICENSE.txt for more details. 
#ifndef LIBDIVIDE_H #define LIBDIVIDE_H #define LIBDIVIDE_VERSION "3.0" #define LIBDIVIDE_VERSION_MAJOR 3 #define LIBDIVIDE_VERSION_MINOR 0 #include #if defined(__cplusplus) #include #include #include #else #include #include #endif #if defined(LIBDIVIDE_AVX512) #include #elif defined(LIBDIVIDE_AVX2) #include #elif defined(LIBDIVIDE_SSE2) #include #endif #if defined(_MSC_VER) #include // disable warning C4146: unary minus operator applied // to unsigned type, result still unsigned #pragma warning(disable: 4146) #define LIBDIVIDE_VC #endif #if !defined(__has_builtin) #define __has_builtin(x) 0 #endif #if defined(__SIZEOF_INT128__) #define HAS_INT128_T // clang-cl on Windows does not yet support 128-bit division #if !(defined(__clang__) && defined(LIBDIVIDE_VC)) #define HAS_INT128_DIV #endif #endif #if defined(__x86_64__) || defined(_M_X64) #define LIBDIVIDE_X86_64 #endif #if defined(__i386__) #define LIBDIVIDE_i386 #endif #if defined(__GNUC__) || defined(__clang__) #define LIBDIVIDE_GCC_STYLE_ASM #endif #if defined(__cplusplus) || defined(LIBDIVIDE_VC) #define LIBDIVIDE_FUNCTION __FUNCTION__ #else #define LIBDIVIDE_FUNCTION __func__ #endif #define LIBDIVIDE_ERROR(msg) \ do { \ fprintf(stderr, "libdivide.h:%d: %s(): Error: %s\n", \ __LINE__, LIBDIVIDE_FUNCTION, msg); \ exit(-1); \ } while (0) #if defined(LIBDIVIDE_ASSERTIONS_ON) #define LIBDIVIDE_ASSERT(x) \ do { \ if (!(x)) { \ fprintf(stderr, "libdivide.h:%d: %s(): Assertion failed: %s\n", \ __LINE__, LIBDIVIDE_FUNCTION, #x); \ exit(-1); \ } \ } while (0) #else #define LIBDIVIDE_ASSERT(x) #endif #ifdef __cplusplus namespace libdivide { #endif // pack divider structs to prevent compilers from padding. // This reduces memory usage by up to 43% when using a large // array of libdivide dividers and improves performance // by up to 10% because of reduced memory bandwidth. #pragma pack(push, 1) struct libdivide_u32_t { uint32_t magic; uint8_t more; }; struct libdivide_s32_t { int32_t magic; uint8_t more; }; struct libdivide_u64_t { uint64_t magic; uint8_t more; }; struct libdivide_s64_t { int64_t magic; uint8_t more; }; struct libdivide_u32_branchfree_t { uint32_t magic; uint8_t more; }; struct libdivide_s32_branchfree_t { int32_t magic; uint8_t more; }; struct libdivide_u64_branchfree_t { uint64_t magic; uint8_t more; }; struct libdivide_s64_branchfree_t { int64_t magic; uint8_t more; }; #pragma pack(pop) // Explanation of the "more" field: // // * Bits 0-5 is the shift value (for shift path or mult path). // * Bit 6 is the add indicator for mult path. // * Bit 7 is set if the divisor is negative. We use bit 7 as the negative // divisor indicator so that we can efficiently use sign extension to // create a bitmask with all bits set to 1 (if the divisor is negative) // or 0 (if the divisor is positive). // // u32: [0-4] shift value // [5] ignored // [6] add indicator // magic number of 0 indicates shift path // // s32: [0-4] shift value // [5] ignored // [6] add indicator // [7] indicates negative divisor // magic number of 0 indicates shift path // // u64: [0-5] shift value // [6] add indicator // magic number of 0 indicates shift path // // s64: [0-5] shift value // [6] add indicator // [7] indicates negative divisor // magic number of 0 indicates shift path // // In s32 and s64 branchfree modes, the magic number is negated according to // whether the divisor is negated. In branchfree strategy, it is not negated. 
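// Worked example (illustrative, not part of the library): for the u32
// divisor d = 7, the generator produces magic = 0x24924925 and
// more = 2 | LIBDIVIDE_ADD_MARKER. A division then computes
// q = libdivide_mullhi_u32(magic, n) and t = ((n - q) >> 1) + q,
// and returns t >> 2; e.g. n = 100 gives q = 14, t = 57, and
// 57 >> 2 = 14 = 100 / 7.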
enum { LIBDIVIDE_32_SHIFT_MASK = 0x1F, LIBDIVIDE_64_SHIFT_MASK = 0x3F, LIBDIVIDE_ADD_MARKER = 0x40, LIBDIVIDE_NEGATIVE_DIVISOR = 0x80 }; static inline struct libdivide_s32_t libdivide_s32_gen(int32_t d); static inline struct libdivide_u32_t libdivide_u32_gen(uint32_t d); static inline struct libdivide_s64_t libdivide_s64_gen(int64_t d); static inline struct libdivide_u64_t libdivide_u64_gen(uint64_t d); static inline struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d); static inline struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d); static inline struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d); static inline struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d); static inline int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom); static inline uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom); static inline int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom); static inline uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom); static inline int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom); static inline uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom); static inline int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom); static inline uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom); static inline int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom); static inline uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom); static inline int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom); static inline uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom); static inline int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom); static inline uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom); static inline int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom); static inline uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom); //////// Internal Utility Functions static inline uint32_t libdivide_mullhi_u32(uint32_t x, uint32_t y) { uint64_t xl = x, yl = y; uint64_t rl = xl * yl; return (uint32_t)(rl >> 32); } static inline int32_t libdivide_mullhi_s32(int32_t x, int32_t y) { int64_t xl = x, yl = y; int64_t rl = xl * yl; // needs to be arithmetic shift return (int32_t)(rl >> 32); } static inline uint64_t libdivide_mullhi_u64(uint64_t x, uint64_t y) { #if defined(LIBDIVIDE_VC) && \ defined(LIBDIVIDE_X86_64) return __umulh(x, y); #elif defined(HAS_INT128_T) __uint128_t xl = x, yl = y; __uint128_t rl = xl * yl; return (uint64_t)(rl >> 64); #else // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) uint32_t mask = 0xFFFFFFFF; uint32_t x0 = (uint32_t)(x & mask); uint32_t x1 = (uint32_t)(x >> 32); uint32_t y0 = (uint32_t)(y & mask); uint32_t y1 = (uint32_t)(y >> 32); uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); uint64_t x0y1 = x0 * (uint64_t)y1; uint64_t x1y0 = x1 * (uint64_t)y0; uint64_t x1y1 = x1 * (uint64_t)y1; uint64_t temp = x1y0 + x0y0_hi; uint64_t temp_lo = temp & mask; uint64_t temp_hi = temp >> 32; return x1y1 + temp_hi + ((temp_lo + x0y1) >> 32); #endif } static inline int64_t 
libdivide_mullhi_s64(int64_t x, int64_t y) { #if defined(LIBDIVIDE_VC) && \ defined(LIBDIVIDE_X86_64) return __mulh(x, y); #elif defined(HAS_INT128_T) __int128_t xl = x, yl = y; __int128_t rl = xl * yl; return (int64_t)(rl >> 64); #else // full 128 bits are x0 * y0 + (x0 * y1 << 32) + (x1 * y0 << 32) + (x1 * y1 << 64) uint32_t mask = 0xFFFFFFFF; uint32_t x0 = (uint32_t)(x & mask); uint32_t y0 = (uint32_t)(y & mask); int32_t x1 = (int32_t)(x >> 32); int32_t y1 = (int32_t)(y >> 32); uint32_t x0y0_hi = libdivide_mullhi_u32(x0, y0); int64_t t = x1 * (int64_t)y0 + x0y0_hi; int64_t w1 = x0 * (int64_t)y1 + (t & mask); return x1 * (int64_t)y1 + (t >> 32) + (w1 >> 32); #endif } static inline int32_t libdivide_count_leading_zeros32(uint32_t val) { #if defined(__GNUC__) || \ __has_builtin(__builtin_clz) // Fast way to count leading zeros return __builtin_clz(val); #elif defined(LIBDIVIDE_VC) unsigned long result; if (_BitScanReverse(&result, val)) { return 31 - result; } return 0; #else int32_t result = 0; uint32_t hi = 1U << 31; for (; ~val & hi; hi >>= 1) { result++; } return result; #endif } static inline int32_t libdivide_count_leading_zeros64(uint64_t val) { #if defined(__GNUC__) || \ __has_builtin(__builtin_clzll) // Fast way to count leading zeros return __builtin_clzll(val); #elif defined(LIBDIVIDE_VC) && defined(_WIN64) unsigned long result; if (_BitScanReverse64(&result, val)) { return 63 - result; } return 0; #else uint32_t hi = val >> 32; uint32_t lo = val & 0xFFFFFFFF; if (hi != 0) return libdivide_count_leading_zeros32(hi); return 32 + libdivide_count_leading_zeros32(lo); #endif } // libdivide_64_div_32_to_32: divides a 64-bit uint {u1, u0} by a 32-bit // uint {v}. The result must fit in 32 bits. // Returns the quotient directly and the remainder in *r static inline uint32_t libdivide_64_div_32_to_32(uint32_t u1, uint32_t u0, uint32_t v, uint32_t *r) { #if (defined(LIBDIVIDE_i386) || defined(LIBDIVIDE_X86_64)) && \ defined(LIBDIVIDE_GCC_STYLE_ASM) uint32_t result; __asm__("divl %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1) ); return result; #else uint64_t n = ((uint64_t)u1 << 32) | u0; uint32_t result = (uint32_t)(n / v); *r = (uint32_t)(n - result * (uint64_t)v); return result; #endif } // libdivide_128_div_64_to_64: divides a 128-bit uint {u1, u0} by a 64-bit // uint {v}. The result must fit in 64 bits. // Returns the quotient directly and the remainder in *r static uint64_t libdivide_128_div_64_to_64(uint64_t u1, uint64_t u0, uint64_t v, uint64_t *r) { #if defined(LIBDIVIDE_X86_64) && \ defined(LIBDIVIDE_GCC_STYLE_ASM) uint64_t result; __asm__("divq %[v]" : "=a"(result), "=d"(*r) : [v] "r"(v), "a"(u0), "d"(u1) ); return result; #elif defined(HAS_INT128_T) && \ defined(HAS_INT128_DIV) __uint128_t n = ((__uint128_t)u1 << 64) | u0; uint64_t result = (uint64_t)(n / v); *r = (uint64_t)(n - result * (__uint128_t)v); return result; #else // Code taken from Hacker's Delight: // http://www.hackersdelight.org/HDcode/divlu.c. // License permits inclusion here per: // http://www.hackersdelight.org/permissions.htm const uint64_t b = (1ULL << 32); // Number base (32 bits) uint64_t un1, un0; // Norm. dividend LSD's uint64_t vn1, vn0; // Norm. divisor digits uint64_t q1, q0; // Quotient digits uint64_t un64, un21, un10; // Dividend digit pairs uint64_t rhat; // A remainder int32_t s; // Shift amount for norm // If overflow, set rem. 
to an impossible value, // and return the largest possible quotient if (u1 >= v) { *r = (uint64_t) -1; return (uint64_t) -1; } // count leading zeros s = libdivide_count_leading_zeros64(v); if (s > 0) { // Normalize divisor v = v << s; un64 = (u1 << s) | (u0 >> (64 - s)); un10 = u0 << s; // Shift dividend left } else { // Avoid undefined behavior of (u0 >> 64). // The behavior is undefined if the right operand is // negative, or greater than or equal to the length // in bits of the promoted left operand. un64 = u1; un10 = u0; } // Break divisor up into two 32-bit digits vn1 = v >> 32; vn0 = v & 0xFFFFFFFF; // Break right half of dividend into two digits un1 = un10 >> 32; un0 = un10 & 0xFFFFFFFF; // Compute the first quotient digit, q1 q1 = un64 / vn1; rhat = un64 - q1 * vn1; while (q1 >= b || q1 * vn0 > b * rhat + un1) { q1 = q1 - 1; rhat = rhat + vn1; if (rhat >= b) break; } // Multiply and subtract un21 = un64 * b + un1 - q1 * v; // Compute the second quotient digit q0 = un21 / vn1; rhat = un21 - q0 * vn1; while (q0 >= b || q0 * vn0 > b * rhat + un0) { q0 = q0 - 1; rhat = rhat + vn1; if (rhat >= b) break; } *r = (un21 * b + un0 - q0 * v) >> s; return q1 * b + q0; #endif } // Bitshift a u128 in place, left (signed_shift > 0) or right (signed_shift < 0) static inline void libdivide_u128_shift(uint64_t *u1, uint64_t *u0, int32_t signed_shift) { if (signed_shift > 0) { uint32_t shift = signed_shift; *u1 <<= shift; *u1 |= *u0 >> (64 - shift); *u0 <<= shift; } else if (signed_shift < 0) { uint32_t shift = -signed_shift; *u0 >>= shift; *u0 |= *u1 << (64 - shift); *u1 >>= shift; } } // Computes a 128 / 128 -> 64 bit division, with a 128 bit remainder. static uint64_t libdivide_128_div_128_to_64(uint64_t u_hi, uint64_t u_lo, uint64_t v_hi, uint64_t v_lo, uint64_t *r_hi, uint64_t *r_lo) { #if defined(HAS_INT128_T) && \ defined(HAS_INT128_DIV) __uint128_t ufull = u_hi; __uint128_t vfull = v_hi; ufull = (ufull << 64) | u_lo; vfull = (vfull << 64) | v_lo; uint64_t res = (uint64_t)(ufull / vfull); __uint128_t remainder = ufull - (vfull * res); *r_lo = (uint64_t)remainder; *r_hi = (uint64_t)(remainder >> 64); return res; #else // Adapted from "Unsigned Doubleword Division" in Hacker's Delight // We want to compute u / v typedef struct { uint64_t hi; uint64_t lo; } u128_t; u128_t u = {u_hi, u_lo}; u128_t v = {v_hi, v_lo}; if (v.hi == 0) { // divisor v is a 64 bit value, so we just need one 128/64 division // Note that we are simpler than Hacker's Delight here, because we know // the quotient fits in 64 bits whereas Hacker's Delight demands a full // 128 bit quotient *r_hi = 0; return libdivide_128_div_64_to_64(u.hi, u.lo, v.lo, r_lo); } // Here v >= 2**64 // We know that v.hi != 0, so count leading zeros is OK // We have 0 <= n <= 63 uint32_t n = libdivide_count_leading_zeros64(v.hi); // Normalize the divisor so its MSB is 1 u128_t v1t = v; libdivide_u128_shift(&v1t.hi, &v1t.lo, n); uint64_t v1 = v1t.hi; // i.e. v1 = v1t >> 64 // To ensure no overflow u128_t u1 = u; libdivide_u128_shift(&u1.hi, &u1.lo, -1); // Get quotient from divide unsigned insn. uint64_t rem_ignored; uint64_t q1 = libdivide_128_div_64_to_64(u1.hi, u1.lo, v1, &rem_ignored); // Undo normalization and division of u by 2. u128_t q0 = {0, q1}; libdivide_u128_shift(&q0.hi, &q0.lo, n); libdivide_u128_shift(&q0.hi, &q0.lo, -63); // Make q0 correct or too small by 1 // Equivalent to `if (q0 != 0) q0 = q0 - 1;` if (q0.hi != 0 || q0.lo != 0) { q0.hi -= (q0.lo == 0); // borrow q0.lo -= 1; } // Now q0 is correct. 
// Compute q0 * v as q0v // = (q0.hi << 64 + q0.lo) * (v.hi << 64 + v.lo) // = (q0.hi * v.hi << 128) + (q0.hi * v.lo << 64) + // (q0.lo * v.hi << 64) + q0.lo * v.lo) // Each term is 128 bit // High half of full product (upper 128 bits!) are dropped u128_t q0v = {0, 0}; q0v.hi = q0.hi*v.lo + q0.lo*v.hi + libdivide_mullhi_u64(q0.lo, v.lo); q0v.lo = q0.lo*v.lo; // Compute u - q0v as u_q0v // This is the remainder u128_t u_q0v = u; u_q0v.hi -= q0v.hi + (u.lo < q0v.lo); // second term is borrow u_q0v.lo -= q0v.lo; // Check if u_q0v >= v // This checks if our remainder is larger than the divisor if ((u_q0v.hi > v.hi) || (u_q0v.hi == v.hi && u_q0v.lo >= v.lo)) { // Increment q0 q0.lo += 1; q0.hi += (q0.lo == 0); // carry // Subtract v from remainder u_q0v.hi -= v.hi + (u_q0v.lo < v.lo); u_q0v.lo -= v.lo; } *r_hi = u_q0v.hi; *r_lo = u_q0v.lo; LIBDIVIDE_ASSERT(q0.hi == 0); return q0.lo; #endif } ////////// UINT32 static inline struct libdivide_u32_t libdivide_internal_u32_gen(uint32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } struct libdivide_u32_t result; uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(d); // Power of 2 if ((d & (d - 1)) == 0) { // We need to subtract 1 from the shift value in case of an unsigned // branchfree divider because there is a hardcoded right shift by 1 // in its division algorithm. Because of this we also need to add back // 1 in its recovery algorithm. result.magic = 0; result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); } else { uint8_t more; uint32_t rem, proposed_m; proposed_m = libdivide_64_div_32_to_32(1U << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint32_t e = d - rem; // This power works if e < 2**floor_log_2_d. if (!branchfree && (e < (1U << floor_log_2_d))) { // This power works more = floor_log_2_d; } else { // We have to use the general 33-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and // its remainder. By doubling both, and then correcting the // remainder, we can compute the larger division. // don't care about overflow here - in fact, we expect it proposed_m += proposed_m; const uint32_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; } result.magic = 1 + proposed_m; result.more = more; // result.more's shift should in general be ceil_log_2_d. But if we // used the smaller power, we subtract one from the shift because we're // using the smaller power. If we're using the larger power, we // subtract one from the shift because it's taken care of by the add // indicator. So floor_log_2_d happens to be correct in both cases. 
    }
    return result;
}

struct libdivide_u32_t libdivide_u32_gen(uint32_t d) {
    return libdivide_internal_u32_gen(d, 0);
}

struct libdivide_u32_branchfree_t libdivide_u32_branchfree_gen(uint32_t d) {
    if (d == 1) {
        LIBDIVIDE_ERROR("branchfree divider must be != 1");
    }
    struct libdivide_u32_t tmp = libdivide_internal_u32_gen(d, 1);
    struct libdivide_u32_branchfree_t ret = {tmp.magic,
        (uint8_t)(tmp.more & LIBDIVIDE_32_SHIFT_MASK)};
    return ret;
}

uint32_t libdivide_u32_do(uint32_t numer, const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) {
        return numer >> more;
    } else {
        uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            uint32_t t = ((numer - q) >> 1) + q;
            return t >> (more & LIBDIVIDE_32_SHIFT_MASK);
        } else {
            // All upper bits are 0,
            // don't need to mask them off.
            return q >> more;
        }
    }
}

uint32_t libdivide_u32_branchfree_do(uint32_t numer, const struct libdivide_u32_branchfree_t *denom) {
    uint32_t q = libdivide_mullhi_u32(denom->magic, numer);
    uint32_t t = ((numer - q) >> 1) + q;
    return t >> denom->more;
}

uint32_t libdivide_u32_recover(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!denom->magic) {
        return 1U << shift;
    } else if (!(more & LIBDIVIDE_ADD_MARKER)) {
        // We compute q = n/d = n*m / 2^(32 + shift)
        // Therefore we have d = 2^(32 + shift) / m
        // We need to ceil it.
        // We know d is not a power of 2, so m is not a power of 2,
        // so we can just add 1 to the floor
        uint32_t hi_dividend = 1U << shift;
        uint32_t rem_ignored;
        return 1 + libdivide_64_div_32_to_32(hi_dividend, 0, denom->magic, &rem_ignored);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        uint64_t half_n = 1ULL << (32 + shift);
        uint64_t d = (1ULL << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)(half_n / d);
        uint64_t rem = half_n % d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
        // Note that rem<<1 cannot overflow, since rem < d and d is 33 bits
        uint32_t full_q = half_q + half_q + ((rem<<1) >= d);
        // We rounded down in gen (hence +1)
        return full_q + 1;
    }
}

uint32_t libdivide_u32_branchfree_recover(const struct libdivide_u32_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!denom->magic) {
        return 1U << (shift + 1);
    } else {
        // Here we wish to compute d = 2^(32+shift+1)/(m+2^32).
        // Notice (m + 2^32) is a 33 bit number. Use 64 bit division for now
        // Also note that shift may be as high as 31, so shift + 1 will
        // overflow. So we have to compute it as 2^(32+shift)/(m+2^32), and
        // then double the quotient and remainder.
        uint64_t half_n = 1ULL << (32 + shift);
        uint64_t d = (1ULL << 32) | denom->magic;
        // Note that the quotient is guaranteed <= 32 bits, but the remainder
        // may need 33!
        uint32_t half_q = (uint32_t)(half_n / d);
        uint64_t rem = half_n % d;
        // We computed 2^(32+shift)/(m+2^32)
        // Need to double it, and then add 1 to the quotient if doubling the
        // remainder would increase the quotient.
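        //
        // Continuing the d = 7 worked example (magic = 613566757, shift = 2):
        // half_n = 2**34 and d = 2**32 + 613566757 = 4908534053, giving
        // half_q = 3 and rem = 2454267025. Doubling the remainder gives
        // 4908534050 < d, so full_q = 6 and we return 6 + 1 = 7, the
        // original divisor.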
// Note that rem<<1 cannot overflow, since rem < d and d is 33 bits uint32_t full_q = half_q + half_q + ((rem<<1) >= d); // We rounded down in gen (hence +1) return full_q + 1; } } /////////// UINT64 static inline struct libdivide_u64_t libdivide_internal_u64_gen(uint64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } struct libdivide_u64_t result; uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(d); // Power of 2 if ((d & (d - 1)) == 0) { // We need to subtract 1 from the shift value in case of an unsigned // branchfree divider because there is a hardcoded right shift by 1 // in its division algorithm. Because of this we also need to add back // 1 in its recovery algorithm. result.magic = 0; result.more = (uint8_t)(floor_log_2_d - (branchfree != 0)); } else { uint64_t proposed_m, rem; uint8_t more; // (1 << (64 + floor_log_2_d)) / d proposed_m = libdivide_128_div_64_to_64(1ULL << floor_log_2_d, 0, d, &rem); LIBDIVIDE_ASSERT(rem > 0 && rem < d); const uint64_t e = d - rem; // This power works if e < 2**floor_log_2_d. if (!branchfree && e < (1ULL << floor_log_2_d)) { // This power works more = floor_log_2_d; } else { // We have to use the general 65-bit algorithm. We need to compute // (2**power) / d. However, we already have (2**(power-1))/d and // its remainder. By doubling both, and then correcting the // remainder, we can compute the larger division. // don't care about overflow here - in fact, we expect it proposed_m += proposed_m; const uint64_t twice_rem = rem + rem; if (twice_rem >= d || twice_rem < rem) proposed_m += 1; more = floor_log_2_d | LIBDIVIDE_ADD_MARKER; } result.magic = 1 + proposed_m; result.more = more; // result.more's shift should in general be ceil_log_2_d. But if we // used the smaller power, we subtract one from the shift because we're // using the smaller power. If we're using the larger power, we // subtract one from the shift because it's taken care of by the add // indicator. So floor_log_2_d happens to be correct in both cases, // which is why we do it outside of the if statement. } return result; } struct libdivide_u64_t libdivide_u64_gen(uint64_t d) { return libdivide_internal_u64_gen(d, 0); } struct libdivide_u64_branchfree_t libdivide_u64_branchfree_gen(uint64_t d) { if (d == 1) { LIBDIVIDE_ERROR("branchfree divider must be != 1"); } struct libdivide_u64_t tmp = libdivide_internal_u64_gen(d, 1); struct libdivide_u64_branchfree_t ret = {tmp.magic, (uint8_t)(tmp.more & LIBDIVIDE_64_SHIFT_MASK)}; return ret; } uint64_t libdivide_u64_do(uint64_t numer, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return numer >> more; } else { uint64_t q = libdivide_mullhi_u64(denom->magic, numer); if (more & LIBDIVIDE_ADD_MARKER) { uint64_t t = ((numer - q) >> 1) + q; return t >> (more & LIBDIVIDE_64_SHIFT_MASK); } else { // All upper bits are 0, // don't need to mask them off. return q >> more; } } } uint64_t libdivide_u64_branchfree_do(uint64_t numer, const struct libdivide_u64_branchfree_t *denom) { uint64_t q = libdivide_mullhi_u64(denom->magic, numer); uint64_t t = ((numer - q) >> 1) + q; return t >> denom->more; } uint64_t libdivide_u64_recover(const struct libdivide_u64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { return 1ULL << shift; } else if (!(more & LIBDIVIDE_ADD_MARKER)) { // We compute q = n/d = n*m / 2^(64 + shift) // Therefore we have d = 2^(64 + shift) / m // We need to ceil it. 
// We know d is not a power of 2, so m is not a power of 2, // so we can just add 1 to the floor uint64_t hi_dividend = 1ULL << shift; uint64_t rem_ignored; return 1 + libdivide_128_div_64_to_64(hi_dividend, 0, denom->magic, &rem_ignored); } else { // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). // Notice (m + 2^64) is a 65 bit number. This gets hairy. See // libdivide_u32_recover for more on what we do here. // TODO: do something better than 128 bit math // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! uint64_t r_hi, r_lo; uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); return full_q + 1; } } uint64_t libdivide_u64_branchfree_recover(const struct libdivide_u64_branchfree_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (!denom->magic) { return 1ULL << (shift + 1); } else { // Here we wish to compute d = 2^(64+shift+1)/(m+2^64). // Notice (m + 2^64) is a 65 bit number. This gets hairy. See // libdivide_u32_recover for more on what we do here. // TODO: do something better than 128 bit math // Full n is a (potentially) 129 bit value // half_n is a 128 bit value // Compute the hi half of half_n. Low half is 0. uint64_t half_n_hi = 1ULL << shift, half_n_lo = 0; // d is a 65 bit value. The high bit is always set to 1. const uint64_t d_hi = 1, d_lo = denom->magic; // Note that the quotient is guaranteed <= 64 bits, // but the remainder may need 65! uint64_t r_hi, r_lo; uint64_t half_q = libdivide_128_div_128_to_64(half_n_hi, half_n_lo, d_hi, d_lo, &r_hi, &r_lo); // We computed 2^(64+shift)/(m+2^64) // Double the remainder ('dr') and check if that is larger than d // Note that d is a 65 bit value, so r1 is small and so r1 + r1 // cannot overflow uint64_t dr_lo = r_lo + r_lo; uint64_t dr_hi = r_hi + r_hi + (dr_lo < r_lo); // last term is carry int dr_exceeds_d = (dr_hi > d_hi) || (dr_hi == d_hi && dr_lo >= d_lo); uint64_t full_q = half_q + half_q + (dr_exceeds_d ? 1 : 0); return full_q + 1; } } /////////// SINT32 static inline struct libdivide_s32_t libdivide_internal_s32_gen(int32_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } struct libdivide_s32_t result; // If d is a power of 2, or negative a power of 2, we have to use a shift. // This is especially important because the magic algorithm fails for -1. // To check if d is a power of 2 or its inverse, it suffices to check // whether its absolute value has exactly one bit set. This works even for // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set // and is a power of 2. uint32_t ud = (uint32_t)d; uint32_t absD = (d < 0) ? 
        -ud : ud;
    uint32_t floor_log_2_d = 31 - libdivide_count_leading_zeros32(absD);

    // check if exactly one bit is set,
    // don't care if absD is 0 since that's divide by zero
    if ((absD & (absD - 1)) == 0) {
        // Branchfree and normal paths are exactly the same
        result.magic = 0;
        result.more = floor_log_2_d | (d < 0 ? LIBDIVIDE_NEGATIVE_DIVISOR : 0);
    } else {
        LIBDIVIDE_ASSERT(floor_log_2_d >= 1);
        uint8_t more;
        // the dividend here is 2**(floor_log_2_d + 31), so the low 32 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint32_t rem, proposed_m;
        proposed_m = libdivide_64_div_32_to_32(1U << (floor_log_2_d - 1), 0, absD, &rem);
        const uint32_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < (1U << floor_log_2_d)) {
            // This power works
            more = floor_log_2_d - 1;
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int32_t.
            proposed_m += proposed_m;
            const uint32_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }

        proposed_m += 1;
        int32_t magic = (int32_t)proposed_m;

        // Mark if we are negative. Note we only negate the magic number in the
        // branchfull case.
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}

struct libdivide_s32_t libdivide_s32_gen(int32_t d) {
    return libdivide_internal_s32_gen(d, 0);
}

struct libdivide_s32_branchfree_t libdivide_s32_branchfree_gen(int32_t d) {
    struct libdivide_s32_t tmp = libdivide_internal_s32_gen(d, 1);
    struct libdivide_s32_branchfree_t result = {tmp.magic, tmp.more};
    return result;
}

int32_t libdivide_s32_do(int32_t numer, const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK;

    if (!denom->magic) {
        uint32_t sign = (int8_t)more >> 7;
        uint32_t mask = (1U << shift) - 1;
        uint32_t uq = numer + ((numer >> 31) & mask);
        int32_t q = (int32_t)uq;
        q >>= shift;
        q = (q ^ sign) - sign;
        return q;
    } else {
        uint32_t uq = (uint32_t)libdivide_mullhi_s32(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift and then sign extend
            int32_t sign = (int8_t)more >> 7;
            // q += (more < 0 ?
-numer : numer) // cast required to avoid UB uq += ((uint32_t)numer ^ sign) - sign; } int32_t q = (int32_t)uq; q >>= shift; q += (q < 0); return q; } } int32_t libdivide_s32_branchfree_do(int32_t numer, const struct libdivide_s32_branchfree_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; // must be arithmetic shift and then sign extend int32_t sign = (int8_t)more >> 7; int32_t magic = denom->magic; int32_t q = libdivide_mullhi_s32(magic, numer); q += numer; // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is a power of // 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); uint32_t q_sign = (uint32_t)(q >> 31); q += q_sign & ((1U << shift) - is_power_of_2); // Now arithmetic right shift q >>= shift; // Negate if needed q = (q ^ sign) - sign; return q; } int32_t libdivide_s32_recover(const struct libdivide_s32_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; if (!denom->magic) { uint32_t absD = 1U << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } return (int32_t)absD; } else { // Unsigned math is much easier // We negate the magic number only in the branchfull case, and we don't // know which case we're in. However we have enough information to // determine the correct sign of the magic number. The divisor was // negative if LIBDIVIDE_NEGATIVE_DIVISOR is set. If ADD_MARKER is set, // the magic number's sign is opposite that of the divisor. // We want to compute the positive magic number. int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; // Handle the power of 2 case (including branchfree) if (denom->magic == 0) { int32_t result = 1U << shift; return negative_divisor ? -result : result; } uint32_t d = (uint32_t)(magic_was_negated ? -denom->magic : denom->magic); uint64_t n = 1ULL << (32 + shift); // this shift cannot exceed 30 uint32_t q = (uint32_t)(n / d); int32_t result = (int32_t)q; result += 1; return negative_divisor ? -result : result; } } int32_t libdivide_s32_branchfree_recover(const struct libdivide_s32_branchfree_t *denom) { return libdivide_s32_recover((const struct libdivide_s32_t *)denom); } ///////////// SINT64 static inline struct libdivide_s64_t libdivide_internal_s64_gen(int64_t d, int branchfree) { if (d == 0) { LIBDIVIDE_ERROR("divider must be != 0"); } struct libdivide_s64_t result; // If d is a power of 2, or negative a power of 2, we have to use a shift. // This is especially important because the magic algorithm fails for -1. // To check if d is a power of 2 or its inverse, it suffices to check // whether its absolute value has exactly one bit set. This works even for // INT_MIN, because abs(INT_MIN) == INT_MIN, and INT_MIN has one bit set // and is a power of 2. uint64_t ud = (uint64_t)d; uint64_t absD = (d < 0) ? -ud : ud; uint32_t floor_log_2_d = 63 - libdivide_count_leading_zeros64(absD); // check if exactly one bit is set, // don't care if absD is 0 since that's divide by zero if ((absD & (absD - 1)) == 0) { // Branchfree and non-branchfree cases are the same result.magic = 0; result.more = floor_log_2_d | (d < 0 ? 
        LIBDIVIDE_NEGATIVE_DIVISOR : 0);
    } else {
        // the dividend here is 2**(floor_log_2_d + 63), so the low 64 bit word
        // is 0 and the high word is 2**(floor_log_2_d - 1)
        uint8_t more;
        uint64_t rem, proposed_m;
        proposed_m = libdivide_128_div_64_to_64(1ULL << (floor_log_2_d - 1), 0, absD, &rem);
        const uint64_t e = absD - rem;

        // We are going to start with a power of floor_log_2_d - 1.
        // This works if e < 2**floor_log_2_d.
        if (!branchfree && e < (1ULL << floor_log_2_d)) {
            // This power works
            more = floor_log_2_d - 1;
        } else {
            // We need to go one higher. This should not make proposed_m
            // overflow, but it will make it negative when interpreted as an
            // int64_t.
            proposed_m += proposed_m;
            const uint64_t twice_rem = rem + rem;
            if (twice_rem >= absD || twice_rem < rem) proposed_m += 1;
            // note that we only set the LIBDIVIDE_NEGATIVE_DIVISOR bit if we
            // also set ADD_MARKER; this is an annoying optimization that
            // enables algorithm #4 to avoid the mask. However we always set it
            // in the branchfree case
            more = floor_log_2_d | LIBDIVIDE_ADD_MARKER;
        }
        proposed_m += 1;
        int64_t magic = (int64_t)proposed_m;

        // Mark if we are negative
        if (d < 0) {
            more |= LIBDIVIDE_NEGATIVE_DIVISOR;
            if (!branchfree) {
                magic = -magic;
            }
        }

        result.more = more;
        result.magic = magic;
    }
    return result;
}

struct libdivide_s64_t libdivide_s64_gen(int64_t d) {
    return libdivide_internal_s64_gen(d, 0);
}

struct libdivide_s64_branchfree_t libdivide_s64_branchfree_gen(int64_t d) {
    struct libdivide_s64_t tmp = libdivide_internal_s64_gen(d, 1);
    struct libdivide_s64_branchfree_t ret = {tmp.magic, tmp.more};
    return ret;
}

int64_t libdivide_s64_do(int64_t numer, const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;

    if (!denom->magic) { // shift path
        uint64_t mask = (1ULL << shift) - 1;
        uint64_t uq = numer + ((numer >> 63) & mask);
        int64_t q = (int64_t)uq;
        q >>= shift;
        // must be arithmetic shift and then sign-extend
        int64_t sign = (int8_t)more >> 7;
        q = (q ^ sign) - sign;
        return q;
    } else {
        uint64_t uq = (uint64_t)libdivide_mullhi_s64(denom->magic, numer);
        if (more & LIBDIVIDE_ADD_MARKER) {
            // must be arithmetic shift and then sign extend
            int64_t sign = (int8_t)more >> 7;
            // q += (more < 0 ? -numer : numer)
            // cast required to avoid UB
            uq += ((uint64_t)numer ^ sign) - sign;
        }
        int64_t q = (int64_t)uq;
        q >>= shift;
        q += (q < 0);
        return q;
    }
}

int64_t libdivide_s64_branchfree_do(int64_t numer, const struct libdivide_s64_branchfree_t *denom) {
    uint8_t more = denom->more;
    uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK;
    // must be arithmetic shift and then sign extend
    int64_t sign = (int8_t)more >> 7;
    int64_t magic = denom->magic;
    int64_t q = libdivide_mullhi_s64(magic, numer);
    q += numer;

    // If q is non-negative, we have nothing to do.
    // If q is negative, we want to add either (2**shift)-1 if d is a power of
    // 2, or (2**shift) if it is not a power of 2.
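    // For example, with d = 2 (so magic == 0 and shift == 1) and numer = -7:
    // here q starts as -7, and a bare arithmetic shift would give
    // -7 >> 1 == -4 (rounded toward negative infinity). Adding
    // (2**1) - 1 == 1 first gives (-7 + 1) >> 1 == -3, the truncated
    // quotient that C division requires.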
uint64_t is_power_of_2 = (magic == 0); uint64_t q_sign = (uint64_t)(q >> 63); q += q_sign & ((1ULL << shift) - is_power_of_2); // Arithmetic right shift q >>= shift; // Negate if needed q = (q ^ sign) - sign; return q; } int64_t libdivide_s64_recover(const struct libdivide_s64_t *denom) { uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; if (denom->magic == 0) { // shift path uint64_t absD = 1ULL << shift; if (more & LIBDIVIDE_NEGATIVE_DIVISOR) { absD = -absD; } return (int64_t)absD; } else { // Unsigned math is much easier int negative_divisor = (more & LIBDIVIDE_NEGATIVE_DIVISOR); int magic_was_negated = (more & LIBDIVIDE_ADD_MARKER) ? denom->magic > 0 : denom->magic < 0; uint64_t d = (uint64_t)(magic_was_negated ? -denom->magic : denom->magic); uint64_t n_hi = 1ULL << shift, n_lo = 0; uint64_t rem_ignored; uint64_t q = libdivide_128_div_64_to_64(n_hi, n_lo, d, &rem_ignored); int64_t result = (int64_t)(q + 1); if (negative_divisor) { result = -result; } return result; } } int64_t libdivide_s64_branchfree_recover(const struct libdivide_s64_branchfree_t *denom) { return libdivide_s64_recover((const struct libdivide_s64_t *)denom); } #if defined(LIBDIVIDE_AVX512) static inline __m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom); static inline __m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom); static inline __m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom); static inline __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom); static inline __m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom); static inline __m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom); static inline __m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom); static inline __m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions static inline __m512i libdivide_s64_signbits(__m512i v) {; return _mm512_srai_epi64(v, 63); } static inline __m512i libdivide_s64_shift_right_vector(__m512i v, int amt) { return _mm512_srai_epi64(v, amt); } // Here, b is assumed to contain one 32-bit value repeated. static inline __m512i libdivide_mullhi_u32_vector(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epu32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epu32(a1X3X, b), mask); return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); } // b is one 32-bit value repeated. static inline __m512i libdivide_mullhi_s32_vector(__m512i a, __m512i b) { __m512i hi_product_0Z2Z = _mm512_srli_epi64(_mm512_mul_epi32(a, b), 32); __m512i a1X3X = _mm512_srli_epi64(a, 32); __m512i mask = _mm512_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0, -1, 0); __m512i hi_product_Z1Z3 = _mm512_and_si512(_mm512_mul_epi32(a1X3X, b), mask); return _mm512_or_si512(hi_product_0Z2Z, hi_product_Z1Z3); } // Here, y is assumed to contain one 64-bit value repeated. 
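// The function below computes the high 64 bits of each 64x64 product by
// schoolbook multiplication of 32-bit halves: writing x = xh*2**32 + xl
// and y = yh*2**32 + yl,
//   x*y = xh*yh*2**64 + (xh*yl + xl*yh)*2**32 + xl*yl,
// so the returned high half is xh*yh plus the carries propagated out of
// the two middle partial products. The signed variant further corrects
// the unsigned result by subtracting (x < 0 ? y : 0) + (y < 0 ? x : 0).
// The AVX2 and SSE2 sections below use the same scheme.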
// https://stackoverflow.com/a/28827013 static inline __m512i libdivide_mullhi_u64_vector(__m512i x, __m512i y) { __m512i lomask = _mm512_set1_epi64(0xffffffff); __m512i xh = _mm512_shuffle_epi32(x, (_MM_PERM_ENUM) 0xB1); __m512i yh = _mm512_shuffle_epi32(y, (_MM_PERM_ENUM) 0xB1); __m512i w0 = _mm512_mul_epu32(x, y); __m512i w1 = _mm512_mul_epu32(x, yh); __m512i w2 = _mm512_mul_epu32(xh, y); __m512i w3 = _mm512_mul_epu32(xh, yh); __m512i w0h = _mm512_srli_epi64(w0, 32); __m512i s1 = _mm512_add_epi64(w1, w0h); __m512i s1l = _mm512_and_si512(s1, lomask); __m512i s1h = _mm512_srli_epi64(s1, 32); __m512i s2 = _mm512_add_epi64(w2, s1l); __m512i s2h = _mm512_srli_epi64(s2, 32); __m512i hi = _mm512_add_epi64(w3, s1h); hi = _mm512_add_epi64(hi, s2h); return hi; } // y is one 64-bit value repeated. static inline __m512i libdivide_mullhi_s64_vector(__m512i x, __m512i y) { __m512i p = libdivide_mullhi_u64_vector(x, y); __m512i t1 = _mm512_and_si512(libdivide_s64_signbits(x), y); __m512i t2 = _mm512_and_si512(libdivide_s64_signbits(y), x); p = _mm512_sub_epi64(p, t1); p = _mm512_sub_epi64(p, t2); return p; } ////////// UINT32 __m512i libdivide_u32_do_vector(__m512i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm512_srli_epi32(numers, more); } else { __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); return _mm512_srli_epi32(t, shift); } else { return _mm512_srli_epi32(q, more); } } } __m512i libdivide_u32_branchfree_do_vector(__m512i numers, const struct libdivide_u32_branchfree_t *denom) { __m512i q = libdivide_mullhi_u32_vector(numers, _mm512_set1_epi32(denom->magic)); __m512i t = _mm512_add_epi32(_mm512_srli_epi32(_mm512_sub_epi32(numers, q), 1), q); return _mm512_srli_epi32(t, denom->more); } ////////// UINT64 __m512i libdivide_u64_do_vector(__m512i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm512_srli_epi64(numers, more); } else { __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); return _mm512_srli_epi64(t, shift); } else { return _mm512_srli_epi64(q, more); } } } __m512i libdivide_u64_branchfree_do_vector(__m512i numers, const struct libdivide_u64_branchfree_t *denom) { __m512i q = libdivide_mullhi_u64_vector(numers, _mm512_set1_epi64(denom->magic)); __m512i t = _mm512_add_epi64(_mm512_srli_epi64(_mm512_sub_epi64(numers, q), 1), q); return _mm512_srli_epi64(t, denom->more); } ////////// SINT32 __m512i libdivide_s32_do_vector(__m512i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint32_t mask = (1U << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m512i q = _mm512_add_epi32(numers, _mm512_and_si512(_mm512_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm512_srai_epi32(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = 
_mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); return q; } else { __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm512_add_epi32(q, _mm512_sub_epi32(_mm512_xor_si512(numers, sign), sign)); } // q >>= shift q = _mm512_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); q = _mm512_add_epi32(q, _mm512_srli_epi32(q, 31)); // q += (q < 0) return q; } } __m512i libdivide_s32_branchfree_do_vector(__m512i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); __m512i q = libdivide_mullhi_s32_vector(numers, _mm512_set1_epi32(magic)); q = _mm512_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m512i q_sign = _mm512_srai_epi32(q, 31); // q_sign = q >> 31 __m512i mask = _mm512_set1_epi32((1U << shift) - is_power_of_2); q = _mm512_add_epi32(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) q = _mm512_srai_epi32(q, shift); // q >>= shift q = _mm512_sub_epi32(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 __m512i libdivide_s64_do_vector(__m512i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint64_t mask = (1ULL << shift) - 1; __m512i roundToZeroTweak = _mm512_set1_epi64(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m512i q = _mm512_add_epi64(numers, _mm512_and_si512(libdivide_s64_signbits(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vector(q, shift); __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); return q; } else { __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm512_add_epi64(q, _mm512_sub_epi64(_mm512_xor_si512(numers, sign), sign)); } // q >>= denom->mult_path.shift q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); q = _mm512_add_epi64(q, _mm512_srli_epi64(q, 63)); // q += (q < 0) return q; } } __m512i libdivide_s64_branchfree_do_vector(__m512i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m512i sign = _mm512_set1_epi32((int8_t)more >> 7); // libdivide_mullhi_s64(numers, magic); __m512i q = libdivide_mullhi_s64_vector(numers, _mm512_set1_epi64(magic)); q = _mm512_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); __m512i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 __m512i mask = _mm512_set1_epi64((1ULL << shift) - is_power_of_2); q = _mm512_add_epi64(q, _mm512_and_si512(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift q = _mm512_sub_epi64(_mm512_xor_si512(q, sign), sign); // q = (q ^ sign) - sign return q; } #elif defined(LIBDIVIDE_AVX2) static inline __m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom); static inline __m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom); static inline __m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom); static inline __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom); static inline __m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom); static inline __m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom); static inline __m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom); static inline __m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm256_srai_epi64(v, 63) (from AVX512). static inline __m256i libdivide_s64_signbits(__m256i v) { __m256i hiBitsDuped = _mm256_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m256i signBits = _mm256_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm256_srai_epi64 (from AVX512). static inline __m256i libdivide_s64_shift_right_vector(__m256i v, int amt) { const int b = 64 - amt; __m256i m = _mm256_set1_epi64x(1ULL << (b - 1)); __m256i x = _mm256_srli_epi64(v, amt); __m256i result = _mm256_sub_epi64(_mm256_xor_si256(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. static inline __m256i libdivide_mullhi_u32_vector(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epu32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epu32(a1X3X, b), mask); return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); } // b is one 32-bit value repeated. static inline __m256i libdivide_mullhi_s32_vector(__m256i a, __m256i b) { __m256i hi_product_0Z2Z = _mm256_srli_epi64(_mm256_mul_epi32(a, b), 32); __m256i a1X3X = _mm256_srli_epi64(a, 32); __m256i mask = _mm256_set_epi32(-1, 0, -1, 0, -1, 0, -1, 0); __m256i hi_product_Z1Z3 = _mm256_and_si256(_mm256_mul_epi32(a1X3X, b), mask); return _mm256_or_si256(hi_product_0Z2Z, hi_product_Z1Z3); } // Here, y is assumed to contain one 64-bit value repeated. 
// https://stackoverflow.com/a/28827013 static inline __m256i libdivide_mullhi_u64_vector(__m256i x, __m256i y) { __m256i lomask = _mm256_set1_epi64x(0xffffffff); __m256i xh = _mm256_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h __m256i yh = _mm256_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h __m256i w0 = _mm256_mul_epu32(x, y); // x0l*y0l, x1l*y1l __m256i w1 = _mm256_mul_epu32(x, yh); // x0l*y0h, x1l*y1h __m256i w2 = _mm256_mul_epu32(xh, y); // x0h*y0l, x1h*y0l __m256i w3 = _mm256_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h __m256i w0h = _mm256_srli_epi64(w0, 32); __m256i s1 = _mm256_add_epi64(w1, w0h); __m256i s1l = _mm256_and_si256(s1, lomask); __m256i s1h = _mm256_srli_epi64(s1, 32); __m256i s2 = _mm256_add_epi64(w2, s1l); __m256i s2h = _mm256_srli_epi64(s2, 32); __m256i hi = _mm256_add_epi64(w3, s1h); hi = _mm256_add_epi64(hi, s2h); return hi; } // y is one 64-bit value repeated. static inline __m256i libdivide_mullhi_s64_vector(__m256i x, __m256i y) { __m256i p = libdivide_mullhi_u64_vector(x, y); __m256i t1 = _mm256_and_si256(libdivide_s64_signbits(x), y); __m256i t2 = _mm256_and_si256(libdivide_s64_signbits(y), x); p = _mm256_sub_epi64(p, t1); p = _mm256_sub_epi64(p, t2); return p; } ////////// UINT32 __m256i libdivide_u32_do_vector(__m256i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi32(numers, more); } else { __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, shift); } else { return _mm256_srli_epi32(q, more); } } } __m256i libdivide_u32_branchfree_do_vector(__m256i numers, const struct libdivide_u32_branchfree_t *denom) { __m256i q = libdivide_mullhi_u32_vector(numers, _mm256_set1_epi32(denom->magic)); __m256i t = _mm256_add_epi32(_mm256_srli_epi32(_mm256_sub_epi32(numers, q), 1), q); return _mm256_srli_epi32(t, denom->more); } ////////// UINT64 __m256i libdivide_u64_do_vector(__m256i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm256_srli_epi64(numers, more); } else { __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, shift); } else { return _mm256_srli_epi64(q, more); } } } __m256i libdivide_u64_branchfree_do_vector(__m256i numers, const struct libdivide_u64_branchfree_t *denom) { __m256i q = libdivide_mullhi_u64_vector(numers, _mm256_set1_epi64x(denom->magic)); __m256i t = _mm256_add_epi64(_mm256_srli_epi64(_mm256_sub_epi64(numers, q), 1), q); return _mm256_srli_epi64(t, denom->more); } ////////// SINT32 __m256i libdivide_s32_do_vector(__m256i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint32_t mask = (1U << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m256i q = _mm256_add_epi32(numers, _mm256_and_si256(_mm256_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm256_srai_epi32(q, shift); __m256i sign 
= _mm256_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); return q; } else { __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm256_add_epi32(q, _mm256_sub_epi32(_mm256_xor_si256(numers, sign), sign)); } // q >>= shift q = _mm256_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); q = _mm256_add_epi32(q, _mm256_srli_epi32(q, 31)); // q += (q < 0) return q; } } __m256i libdivide_s32_branchfree_do_vector(__m256i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); __m256i q = libdivide_mullhi_s32_vector(numers, _mm256_set1_epi32(magic)); q = _mm256_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m256i q_sign = _mm256_srai_epi32(q, 31); // q_sign = q >> 31 __m256i mask = _mm256_set1_epi32((1U << shift) - is_power_of_2); q = _mm256_add_epi32(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) q = _mm256_srai_epi32(q, shift); // q >>= shift q = _mm256_sub_epi32(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 __m256i libdivide_s64_do_vector(__m256i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint64_t mask = (1ULL << shift) - 1; __m256i roundToZeroTweak = _mm256_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m256i q = _mm256_add_epi64(numers, _mm256_and_si256(libdivide_s64_signbits(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vector(q, shift); __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); return q; } else { __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm256_add_epi64(q, _mm256_sub_epi64(_mm256_xor_si256(numers, sign), sign)); } // q >>= denom->mult_path.shift q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); q = _mm256_add_epi64(q, _mm256_srli_epi64(q, 63)); // q += (q < 0) return q; } } __m256i libdivide_s64_branchfree_do_vector(__m256i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m256i sign = _mm256_set1_epi32((int8_t)more >> 7); // libdivide_mullhi_s64(numers, magic); __m256i q = libdivide_mullhi_s64_vector(numers, _mm256_set1_epi64x(magic)); q = _mm256_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
uint32_t is_power_of_2 = (magic == 0); __m256i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63 __m256i mask = _mm256_set1_epi64x((1ULL << shift) - is_power_of_2); q = _mm256_add_epi64(q, _mm256_and_si256(q_sign, mask)); // q = q + (q_sign & mask) q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift q = _mm256_sub_epi64(_mm256_xor_si256(q, sign), sign); // q = (q ^ sign) - sign return q; } #elif defined(LIBDIVIDE_SSE2) static inline __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom); static inline __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom); static inline __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom); static inline __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom); static inline __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom); static inline __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom); static inline __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom); static inline __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom); //////// Internal Utility Functions // Implementation of _mm_srai_epi64(v, 63) (from AVX512). static inline __m128i libdivide_s64_signbits(__m128i v) { __m128i hiBitsDuped = _mm_shuffle_epi32(v, _MM_SHUFFLE(3, 3, 1, 1)); __m128i signBits = _mm_srai_epi32(hiBitsDuped, 31); return signBits; } // Implementation of _mm_srai_epi64 (from AVX512). static inline __m128i libdivide_s64_shift_right_vector(__m128i v, int amt) { const int b = 64 - amt; __m128i m = _mm_set1_epi64x(1ULL << (b - 1)); __m128i x = _mm_srli_epi64(v, amt); __m128i result = _mm_sub_epi64(_mm_xor_si128(x, m), m); return result; } // Here, b is assumed to contain one 32-bit value repeated. static inline __m128i libdivide_mullhi_u32_vector(__m128i a, __m128i b) { __m128i hi_product_0Z2Z = _mm_srli_epi64(_mm_mul_epu32(a, b), 32); __m128i a1X3X = _mm_srli_epi64(a, 32); __m128i mask = _mm_set_epi32(-1, 0, -1, 0); __m128i hi_product_Z1Z3 = _mm_and_si128(_mm_mul_epu32(a1X3X, b), mask); return _mm_or_si128(hi_product_0Z2Z, hi_product_Z1Z3); } // SSE2 does not have a signed multiplication instruction, but we can convert // unsigned to signed pretty efficiently. Again, b is just a 32 bit value // repeated four times. static inline __m128i libdivide_mullhi_s32_vector(__m128i a, __m128i b) { __m128i p = libdivide_mullhi_u32_vector(a, b); // t1 = (a >> 31) & y, arithmetic shift __m128i t1 = _mm_and_si128(_mm_srai_epi32(a, 31), b); __m128i t2 = _mm_and_si128(_mm_srai_epi32(b, 31), a); p = _mm_sub_epi32(p, t1); p = _mm_sub_epi32(p, t2); return p; } // Here, y is assumed to contain one 64-bit value repeated. 
// https://stackoverflow.com/a/28827013 static inline __m128i libdivide_mullhi_u64_vector(__m128i x, __m128i y) { __m128i lomask = _mm_set1_epi64x(0xffffffff); __m128i xh = _mm_shuffle_epi32(x, 0xB1); // x0l, x0h, x1l, x1h __m128i yh = _mm_shuffle_epi32(y, 0xB1); // y0l, y0h, y1l, y1h __m128i w0 = _mm_mul_epu32(x, y); // x0l*y0l, x1l*y1l __m128i w1 = _mm_mul_epu32(x, yh); // x0l*y0h, x1l*y1h __m128i w2 = _mm_mul_epu32(xh, y); // x0h*y0l, x1h*y0l __m128i w3 = _mm_mul_epu32(xh, yh); // x0h*y0h, x1h*y1h __m128i w0h = _mm_srli_epi64(w0, 32); __m128i s1 = _mm_add_epi64(w1, w0h); __m128i s1l = _mm_and_si128(s1, lomask); __m128i s1h = _mm_srli_epi64(s1, 32); __m128i s2 = _mm_add_epi64(w2, s1l); __m128i s2h = _mm_srli_epi64(s2, 32); __m128i hi = _mm_add_epi64(w3, s1h); hi = _mm_add_epi64(hi, s2h); return hi; } // y is one 64-bit value repeated. static inline __m128i libdivide_mullhi_s64_vector(__m128i x, __m128i y) { __m128i p = libdivide_mullhi_u64_vector(x, y); __m128i t1 = _mm_and_si128(libdivide_s64_signbits(x), y); __m128i t2 = _mm_and_si128(libdivide_s64_signbits(y), x); p = _mm_sub_epi64(p, t1); p = _mm_sub_epi64(p, t2); return p; } ////////// UINT32 __m128i libdivide_u32_do_vector(__m128i numers, const struct libdivide_u32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm_srli_epi32(numers, more); } else { __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, shift); } else { return _mm_srli_epi32(q, more); } } } __m128i libdivide_u32_branchfree_do_vector(__m128i numers, const struct libdivide_u32_branchfree_t *denom) { __m128i q = libdivide_mullhi_u32_vector(numers, _mm_set1_epi32(denom->magic)); __m128i t = _mm_add_epi32(_mm_srli_epi32(_mm_sub_epi32(numers, q), 1), q); return _mm_srli_epi32(t, denom->more); } ////////// UINT64 __m128i libdivide_u64_do_vector(__m128i numers, const struct libdivide_u64_t *denom) { uint8_t more = denom->more; if (!denom->magic) { return _mm_srli_epi64(numers, more); } else { __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // uint32_t t = ((numer - q) >> 1) + q; // return t >> denom->shift; uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, shift); } else { return _mm_srli_epi64(q, more); } } } __m128i libdivide_u64_branchfree_do_vector(__m128i numers, const struct libdivide_u64_branchfree_t *denom) { __m128i q = libdivide_mullhi_u64_vector(numers, _mm_set1_epi64x(denom->magic)); __m128i t = _mm_add_epi64(_mm_srli_epi64(_mm_sub_epi64(numers, q), 1), q); return _mm_srli_epi64(t, denom->more); } ////////// SINT32 __m128i libdivide_s32_do_vector(__m128i numers, const struct libdivide_s32_t *denom) { uint8_t more = denom->more; if (!denom->magic) { uint32_t shift = more & LIBDIVIDE_32_SHIFT_MASK; uint32_t mask = (1U << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi32(mask); // q = numer + ((numer >> 31) & roundToZeroTweak); __m128i q = _mm_add_epi32(numers, _mm_and_si128(_mm_srai_epi32(numers, 31), roundToZeroTweak)); q = _mm_srai_epi32(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); return q; } else { __m128i q = 
libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(denom->magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm_add_epi32(q, _mm_sub_epi32(_mm_xor_si128(numers, sign), sign)); } // q >>= shift q = _mm_srai_epi32(q, more & LIBDIVIDE_32_SHIFT_MASK); q = _mm_add_epi32(q, _mm_srli_epi32(q, 31)); // q += (q < 0) return q; } } __m128i libdivide_s32_branchfree_do_vector(__m128i numers, const struct libdivide_s32_branchfree_t *denom) { int32_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_32_SHIFT_MASK; // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); __m128i q = libdivide_mullhi_s32_vector(numers, _mm_set1_epi32(magic)); q = _mm_add_epi32(q, numers); // q += numers // If q is non-negative, we have nothing to do // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2 uint32_t is_power_of_2 = (magic == 0); __m128i q_sign = _mm_srai_epi32(q, 31); // q_sign = q >> 31 __m128i mask = _mm_set1_epi32((1U << shift) - is_power_of_2); q = _mm_add_epi32(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask) q = _mm_srai_epi32(q, shift); // q >>= shift q = _mm_sub_epi32(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign return q; } ////////// SINT64 __m128i libdivide_s64_do_vector(__m128i numers, const struct libdivide_s64_t *denom) { uint8_t more = denom->more; int64_t magic = denom->magic; if (magic == 0) { // shift path uint32_t shift = more & LIBDIVIDE_64_SHIFT_MASK; uint64_t mask = (1ULL << shift) - 1; __m128i roundToZeroTweak = _mm_set1_epi64x(mask); // q = numer + ((numer >> 63) & roundToZeroTweak); __m128i q = _mm_add_epi64(numers, _mm_and_si128(libdivide_s64_signbits(numers), roundToZeroTweak)); q = libdivide_s64_shift_right_vector(q, shift); __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q = (q ^ sign) - sign; q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); return q; } else { __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic)); if (more & LIBDIVIDE_ADD_MARKER) { // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // q += ((numer ^ sign) - sign); q = _mm_add_epi64(q, _mm_sub_epi64(_mm_xor_si128(numers, sign), sign)); } // q >>= denom->mult_path.shift q = libdivide_s64_shift_right_vector(q, more & LIBDIVIDE_64_SHIFT_MASK); q = _mm_add_epi64(q, _mm_srli_epi64(q, 63)); // q += (q < 0) return q; } } __m128i libdivide_s64_branchfree_do_vector(__m128i numers, const struct libdivide_s64_branchfree_t *denom) { int64_t magic = denom->magic; uint8_t more = denom->more; uint8_t shift = more & LIBDIVIDE_64_SHIFT_MASK; // must be arithmetic shift __m128i sign = _mm_set1_epi32((int8_t)more >> 7); // libdivide_mullhi_s64(numers, magic); __m128i q = libdivide_mullhi_s64_vector(numers, _mm_set1_epi64x(magic)); q = _mm_add_epi64(q, numers); // q += numers // If q is non-negative, we have nothing to do. // If q is negative, we want to add either (2**shift)-1 if d is // a power of 2, or (2**shift) if it is not a power of 2. 
    uint32_t is_power_of_2 = (magic == 0);
    __m128i q_sign = libdivide_s64_signbits(q); // q_sign = q >> 63
    __m128i mask = _mm_set1_epi64x((1ULL << shift) - is_power_of_2);
    q = _mm_add_epi64(q, _mm_and_si128(q_sign, mask)); // q = q + (q_sign & mask)
    q = libdivide_s64_shift_right_vector(q, shift); // q >>= shift
    q = _mm_sub_epi64(_mm_xor_si128(q, sign), sign); // q = (q ^ sign) - sign
    return q;
}

#endif

/////////// C++ stuff

#ifdef __cplusplus

// The C++ divider class is templated on both an integer type
// (like uint64_t) and an algorithm type.
// * BRANCHFULL is the default algorithm type.
// * BRANCHFREE is the branchfree algorithm type.
enum {
    BRANCHFULL,
    BRANCHFREE
};

#if defined(LIBDIVIDE_AVX512)
    #define LIBDIVIDE_VECTOR_TYPE __m512i
#elif defined(LIBDIVIDE_AVX2)
    #define LIBDIVIDE_VECTOR_TYPE __m256i
#elif defined(LIBDIVIDE_SSE2)
    #define LIBDIVIDE_VECTOR_TYPE __m128i
#endif

#if !defined(LIBDIVIDE_VECTOR_TYPE)
    #define LIBDIVIDE_DIVIDE_VECTOR(ALGO)
#else
    #define LIBDIVIDE_DIVIDE_VECTOR(ALGO) \
        LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const { \
            return libdivide_##ALGO##_do_vector(n, &denom); \
        }
#endif

// The DISPATCHER_GEN() macro generates C++ methods (for the given integer
// and algorithm types) that redirect to libdivide's C API.
#define DISPATCHER_GEN(T, ALGO) \
    libdivide_##ALGO##_t denom; \
    dispatcher() { } \
    dispatcher(T d) \
        : denom(libdivide_##ALGO##_gen(d)) \
    { } \
    T divide(T n) const { \
        return libdivide_##ALGO##_do(n, &denom); \
    } \
    LIBDIVIDE_DIVIDE_VECTOR(ALGO) \
    T recover() const { \
        return libdivide_##ALGO##_recover(&denom); \
    }

// The dispatcher selects a specific division algorithm for a given
// type and ALGO using partial template specialization.
template<bool IS_INTEGRAL, bool IS_SIGNED, int SIZEOF, int ALGO> struct dispatcher { };

template<> struct dispatcher<true, true, sizeof(int32_t), BRANCHFULL> { DISPATCHER_GEN(int32_t, s32) };
template<> struct dispatcher<true, true, sizeof(int32_t), BRANCHFREE> { DISPATCHER_GEN(int32_t, s32_branchfree) };
template<> struct dispatcher<true, false, sizeof(uint32_t), BRANCHFULL> { DISPATCHER_GEN(uint32_t, u32) };
template<> struct dispatcher<true, false, sizeof(uint32_t), BRANCHFREE> { DISPATCHER_GEN(uint32_t, u32_branchfree) };
template<> struct dispatcher<true, true, sizeof(int64_t), BRANCHFULL> { DISPATCHER_GEN(int64_t, s64) };
template<> struct dispatcher<true, true, sizeof(int64_t), BRANCHFREE> { DISPATCHER_GEN(int64_t, s64_branchfree) };
template<> struct dispatcher<true, false, sizeof(uint64_t), BRANCHFULL> { DISPATCHER_GEN(uint64_t, u64) };
template<> struct dispatcher<true, false, sizeof(uint64_t), BRANCHFREE> { DISPATCHER_GEN(uint64_t, u64_branchfree) };

// This is the main divider class for use by the user (C++ API).
// The actual division algorithm is selected using the dispatcher struct
// based on the integer and algorithm template parameters.
template<typename T, int ALGO = BRANCHFULL>
class divider {
public:
    // We leave the default constructor empty so that creating
    // an array of dividers and then initializing them
    // later doesn't slow us down.
    divider() { }

    // Constructor that takes the divisor as a parameter
    divider(T d) : div(d) { }

    // Divides n by the divisor
    T divide(T n) const {
        return div.divide(n);
    }

    // Recovers the divisor, returns the value that was
    // used to initialize this divider object.
    T recover() const {
        return div.recover();
    }

    bool operator==(const divider<T, ALGO>& other) const {
        return div.denom.magic == other.denom.magic &&
               div.denom.more == other.denom.more;
    }

    bool operator!=(const divider<T, ALGO>& other) const {
        return !(*this == other);
    }

#if defined(LIBDIVIDE_VECTOR_TYPE)
    // Treats the vector as packed integer values with the same type as
    // the divider (e.g. s32, u32, s64, u64) and divides each of
    // them by the divider, returning the packed quotients.
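    //
    // A minimal usage sketch (assuming this header is compiled with, e.g.,
    // LIBDIVIDE_AVX2 defined, so LIBDIVIDE_VECTOR_TYPE is __m256i):
    //
    //   libdivide::divider<uint32_t> fast_d(7);   // BRANCHFULL by default
    //   __m256i numers = _mm256_set1_epi32(100);
    //   __m256i quots = numers / fast_d;          // eight lanes of 100/7 == 14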
    LIBDIVIDE_VECTOR_TYPE divide(LIBDIVIDE_VECTOR_TYPE n) const {
        return div.divide(n);
    }
#endif

private:
    // Storage for the actual divisor
    dispatcher<std::is_integral<T>::value,
               std::is_signed<T>::value, sizeof(T), ALGO> div;
};

// Overload of operator / for scalar division
template<typename T, int ALGO>
T operator/(T n, const divider<T, ALGO>& div) {
    return div.divide(n);
}

// Overload of operator /= for scalar division
template<typename T, int ALGO>
T& operator/=(T& n, const divider<T, ALGO>& div) {
    n = div.divide(n);
    return n;
}

#if defined(LIBDIVIDE_VECTOR_TYPE)
// Overload of operator / for vector division
template<typename T, int ALGO>
LIBDIVIDE_VECTOR_TYPE operator/(LIBDIVIDE_VECTOR_TYPE n, const divider<T, ALGO>& div) {
    return div.divide(n);
}

// Overload of operator /= for vector division
template<typename T, int ALGO>
LIBDIVIDE_VECTOR_TYPE& operator/=(LIBDIVIDE_VECTOR_TYPE& n, const divider<T, ALGO>& div) {
    n = div.divide(n);
    return n;
}
#endif

// libdivide::branchfree_divider
template <typename T>
using branchfree_divider = divider<T, BRANCHFREE>;

}  // namespace libdivide

#endif  // __cplusplus

#endif  // LIBDIVIDE_H
libdivide-3.0/test/000077500000000000000000000000001355155642500143015ustar00rootroot00000000000000libdivide-3.0/test/benchmark.c000066400000000000000000000720131355155642500164020ustar00rootroot00000000000000
// Usage: benchmark [OPTIONS]
//
// You can pass the benchmark program one or more of the following
// options: u32, s32, u64, s64 to compare libdivide's speed against
// hardware division. If benchmark is run without any options, u64
// is used as the default option. benchmark tests a simple function that
// inputs an array of random numerators and a single divisor, and
// returns the sum of their quotients. It tests this using both
// hardware division, and the various division approaches supported
// by libdivide, including vector division.

// Silence MSVC sprintf unsafe warnings
#define _CRT_SECURE_NO_WARNINGS

#include "libdivide.h"

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <inttypes.h>

#if defined(__GNUC__)
    #define NOINLINE __attribute__((__noinline__))
#elif defined(_MSC_VER)
    #define NOINLINE __declspec(noinline)
#else
    #define NOINLINE
#endif

#if defined(LIBDIVIDE_AVX512)
    #define VECTOR_TYPE __m512i
    #define SETZERO_SI _mm512_setzero_si512
    #define LOAD_SI _mm512_load_si512
    #define ADD_EPI64 _mm512_add_epi64
    #define ADD_EPI32 _mm512_add_epi32
#elif defined(LIBDIVIDE_AVX2)
    #define VECTOR_TYPE __m256i
    #define SETZERO_SI _mm256_setzero_si256
    #define LOAD_SI _mm256_load_si256
    #define ADD_EPI64 _mm256_add_epi64
    #define ADD_EPI32 _mm256_add_epi32
#elif defined(LIBDIVIDE_SSE2)
    #define VECTOR_TYPE __m128i
    #define SETZERO_SI _mm_setzero_si128
    #define LOAD_SI _mm_load_si128
    #define ADD_EPI64 _mm_add_epi64
    #define ADD_EPI32 _mm_add_epi32
#endif

#define NANOSEC_PER_SEC 1000000000ULL
#define NANOSEC_PER_USEC 1000ULL
#define NANOSEC_PER_MILLISEC 1000000ULL

#define SEED { 2147483563, 2147483563 ^ 0x49616E42 }

#if defined(__cplusplus)
using namespace libdivide;
#endif

#if defined(_WIN32) || defined(WIN32)
    #define NOMINMAX
    #define WIN32_LEAN_AND_MEAN
    #define VC_EXTRALEAN
    #include <windows.h>
    #include <mmsystem.h>
    #define LIBDIVIDE_WINDOWS
    #pragma comment(lib, "winmm")
#endif

#if !defined(LIBDIVIDE_WINDOWS)
    #include <sys/time.h> // for gettimeofday()
#endif

struct random_state {
    uint32_t hi;
    uint32_t lo;
};

uint64_t sGlobalUInt64;
size_t iters = 1 << 19;
size_t genIters = 1 << 16;

static uint32_t my_random(struct random_state *state) {
    state->hi = (state->hi << 16) + (state->hi >> 16);
    state->hi += state->lo;
    state->lo += state->hi;
    return state->hi;
}

#if defined(LIBDIVIDE_WINDOWS)
static LARGE_INTEGER gPerfCounterFreq;
#endif

#if !defined(LIBDIVIDE_WINDOWS)
static uint64_t nanoseconds(void) {
    struct
timeval now; gettimeofday(&now, NULL); return now.tv_sec * NANOSEC_PER_SEC + now.tv_usec * NANOSEC_PER_USEC; } #endif struct FunctionParams_t { const void *d; //a pointer to e.g. a uint32_t const void *denomPtr; // a pointer to e.g. libdivide_u32_t const void *denomBranchfreePtr; // a pointer to e.g. libdivide_u32_t from branchfree const void *data; // a pointer to the data to be divided }; struct time_result { uint64_t time; uint64_t result; }; static struct time_result time_function(uint64_t (*func)(struct FunctionParams_t*), struct FunctionParams_t *params) { struct time_result tresult; #if defined(LIBDIVIDE_WINDOWS) LARGE_INTEGER start, end; QueryPerformanceCounter(&start); uint64_t result = func(params); QueryPerformanceCounter(&end); uint64_t diff = end.QuadPart - start.QuadPart; sGlobalUInt64 += result; tresult.result = result; tresult.time = (diff * 1000000000) / gPerfCounterFreq.QuadPart; #else uint64_t start = nanoseconds(); uint64_t result = func(params); uint64_t end = nanoseconds(); uint64_t diff = end - start; sGlobalUInt64 += result; tresult.result = result; tresult.time = diff; #endif return tresult; } // U32 NOINLINE static uint64_t mine_u32(struct FunctionParams_t *params) { const struct libdivide_u32_t denom = *(struct libdivide_u32_t *)params->denomPtr; const uint32_t *data = (const uint32_t *)params->data; uint32_t sum = 0; for (size_t iter = 0; iter < iters; iter++) { uint32_t numer = data[iter]; sum += libdivide_u32_do(numer, &denom); } return sum; } NOINLINE static uint64_t mine_u32_branchfree(struct FunctionParams_t *params) { const struct libdivide_u32_branchfree_t denom = *(struct libdivide_u32_branchfree_t *)params->denomBranchfreePtr; const uint32_t *data = (const uint32_t *)params->data; uint32_t sum = 0; for (size_t iter = 0; iter < iters; iter++) { uint32_t numer = data[iter]; sum += libdivide_u32_branchfree_do(numer, &denom); } return sum; } #if defined(LIBDIVIDE_AVX512) || \ defined(LIBDIVIDE_AVX2) || \ defined(LIBDIVIDE_SSE2) NOINLINE static uint64_t mine_u32_vector(struct FunctionParams_t *params) { size_t count = sizeof(VECTOR_TYPE) / sizeof(uint32_t); const struct libdivide_u32_t denom = *(struct libdivide_u32_t *)params->denomPtr; const uint32_t *data = (const uint32_t *)params->data; VECTOR_TYPE sumX4 = SETZERO_SI(); for (size_t iter = 0; iter < iters; iter += count) { VECTOR_TYPE numers = LOAD_SI((const VECTOR_TYPE*)(data + iter)); sumX4 = ADD_EPI32(sumX4, libdivide_u32_do_vector(numers, &denom)); } const uint32_t *comps = (const uint32_t *)&sumX4; uint32_t sum = 0; for (size_t i = 0; i < count; i++) { sum += comps[i]; } return sum; } NOINLINE static uint64_t mine_u32_vector_branchfree(struct FunctionParams_t *params) { size_t count = sizeof(VECTOR_TYPE) / sizeof(uint32_t); const struct libdivide_u32_branchfree_t denom = *(struct libdivide_u32_branchfree_t *)params->denomBranchfreePtr; const uint32_t *data = (const uint32_t *)params->data; VECTOR_TYPE sumX4 = SETZERO_SI(); for (size_t iter = 0; iter < iters; iter += count) { VECTOR_TYPE numers = LOAD_SI((const VECTOR_TYPE*)(data + iter)); sumX4 = ADD_EPI32(sumX4, libdivide_u32_branchfree_do_vector(numers, &denom)); } const uint32_t *comps = (const uint32_t *)&sumX4; uint32_t sum = 0; for (size_t i = 0; i < count; i++) { sum += comps[i]; } return sum; } #endif NOINLINE static uint64_t his_u32(struct FunctionParams_t *params) { const uint32_t *data = (const uint32_t *)params->data; const uint32_t d = *(uint32_t *)params->d; uint32_t sum = 0; for (size_t iter= 0; iter < iters; iter++) { uint32_t numer 
NOINLINE static uint64_t mine_u32_generate(struct FunctionParams_t *params) {
    uint32_t *dPtr = (uint32_t *)params->d;
    struct libdivide_u32_t *denomPtr = (struct libdivide_u32_t *)params->denomPtr;
    for (size_t iter = 0; iter < genIters; iter++) {
        *denomPtr = libdivide_u32_gen(*dPtr);
    }
    return *dPtr;
}

// S32

NOINLINE static uint64_t mine_s32(struct FunctionParams_t *params) {
    const struct libdivide_s32_t denom = *(struct libdivide_s32_t *)params->denomPtr;
    const int32_t *data = (const int32_t *)params->data;
    uint32_t sum = 0;
    for (size_t iter = 0; iter < iters; iter++) {
        int32_t numer = data[iter];
        sum += libdivide_s32_do(numer, &denom);
    }
    return sum;
}

NOINLINE static uint64_t mine_s32_branchfree(struct FunctionParams_t *params) {
    const struct libdivide_s32_branchfree_t denom = *(struct libdivide_s32_branchfree_t *)params->denomBranchfreePtr;
    const int32_t *data = (const int32_t *)params->data;
    uint32_t sum = 0;
    for (size_t iter = 0; iter < iters; iter++) {
        int32_t numer = data[iter];
        sum += libdivide_s32_branchfree_do(numer, &denom);
    }
    return sum;
}

#if defined(LIBDIVIDE_AVX512) || \
    defined(LIBDIVIDE_AVX2) || \
    defined(LIBDIVIDE_SSE2)

NOINLINE static uint64_t mine_s32_vector(struct FunctionParams_t *params) {
    size_t count = sizeof(VECTOR_TYPE) / sizeof(int32_t);
    VECTOR_TYPE sumX4 = SETZERO_SI();
    const struct libdivide_s32_t denom = *(struct libdivide_s32_t *)params->denomPtr;
    const int32_t *data = (const int32_t *)params->data;
    for (size_t iter = 0; iter < iters; iter += count) {
        VECTOR_TYPE numers = LOAD_SI((const VECTOR_TYPE*)(data + iter));
        sumX4 = ADD_EPI32(sumX4, libdivide_s32_do_vector(numers, &denom));
    }
    const int32_t *comps = (const int32_t *)&sumX4;
    uint32_t sum = 0;
    for (size_t i = 0; i < count; i++) {
        sum += comps[i];
    }
    return sum;
}

NOINLINE static uint64_t mine_s32_vector_branchfree(struct FunctionParams_t *params) {
    size_t count = sizeof(VECTOR_TYPE) / sizeof(int32_t);
    VECTOR_TYPE sumX4 = SETZERO_SI();
    const struct libdivide_s32_branchfree_t denom = *(struct libdivide_s32_branchfree_t *)params->denomBranchfreePtr;
    const int32_t *data = (const int32_t *)params->data;
    for (size_t iter = 0; iter < iters; iter += count) {
        VECTOR_TYPE numers = LOAD_SI((const VECTOR_TYPE*)(data + iter));
        sumX4 = ADD_EPI32(sumX4, libdivide_s32_branchfree_do_vector(numers, &denom));
    }
    const int32_t *comps = (const int32_t *)&sumX4;
    uint32_t sum = 0;
    for (size_t i = 0; i < count; i++) {
        sum += comps[i];
    }
    return sum;
}

#endif

NOINLINE static uint64_t his_s32(struct FunctionParams_t *params) {
    uint32_t sum = 0;
    const int32_t d = *(int32_t *)params->d;
    const int32_t *data = (const int32_t *)params->data;
    for (size_t iter = 0; iter < iters; iter++) {
        int32_t numer = data[iter];
        sum += numer / d;
    }
    return sum;
}

NOINLINE static uint64_t mine_s32_generate(struct FunctionParams_t *params) {
    int32_t *dPtr = (int32_t *)params->d;
    struct libdivide_s32_t *denomPtr = (struct libdivide_s32_t *)params->denomPtr;
    for (size_t iter = 0; iter < genIters; iter++) {
        *denomPtr = libdivide_s32_gen(*dPtr);
    }
    return *dPtr;
}

// U64

NOINLINE static uint64_t mine_u64(struct FunctionParams_t *params) {
    const struct libdivide_u64_t denom = *(struct libdivide_u64_t *)params->denomPtr;
    const uint64_t *data = (const uint64_t *)params->data;
    uint64_t sum = 0;
    for (size_t iter = 0; iter < iters; iter++) {
        uint64_t numer = data[iter];
        sum += libdivide_u64_do(numer, &denom);
    }
    return sum;
}
NOINLINE static uint64_t mine_u64_branchfree(struct FunctionParams_t *params) {
    const struct libdivide_u64_branchfree_t denom = *(struct libdivide_u64_branchfree_t *)params->denomBranchfreePtr;
    const uint64_t *data = (const uint64_t *)params->data;
    uint64_t sum = 0;
    for (size_t iter = 0; iter < iters; iter++) {
        uint64_t numer = data[iter];
        sum += libdivide_u64_branchfree_do(numer, &denom);
    }
    return sum;
}

#if defined(LIBDIVIDE_AVX512) || \
    defined(LIBDIVIDE_AVX2) || \
    defined(LIBDIVIDE_SSE2)

NOINLINE static uint64_t mine_u64_vector(struct FunctionParams_t *params) {
    size_t count = sizeof(VECTOR_TYPE) / sizeof(uint64_t);
    VECTOR_TYPE sumX2 = SETZERO_SI();
    const struct libdivide_u64_t denom = *(struct libdivide_u64_t *)params->denomPtr;
    const uint64_t *data = (const uint64_t *)params->data;
    for (size_t iter = 0; iter < iters; iter += count) {
        VECTOR_TYPE numers = LOAD_SI((const VECTOR_TYPE*)(data + iter));
        sumX2 = ADD_EPI64(sumX2, libdivide_u64_do_vector(numers, &denom));
    }
    const uint64_t *comps = (const uint64_t *)&sumX2;
    uint64_t sum = 0;
    for (size_t i = 0; i < count; i++) {
        sum += comps[i];
    }
    return sum;
}

NOINLINE static uint64_t mine_u64_vector_branchfree(struct FunctionParams_t *params) {
    size_t count = sizeof(VECTOR_TYPE) / sizeof(uint64_t);
    VECTOR_TYPE sumX2 = SETZERO_SI();
    const struct libdivide_u64_branchfree_t denom = *(struct libdivide_u64_branchfree_t *)params->denomBranchfreePtr;
    const uint64_t *data = (const uint64_t *)params->data;
    for (size_t iter = 0; iter < iters; iter += count) {
        VECTOR_TYPE numers = LOAD_SI((const VECTOR_TYPE*)(data + iter));
        sumX2 = ADD_EPI64(sumX2, libdivide_u64_branchfree_do_vector(numers, &denom));
    }
    const uint64_t *comps = (const uint64_t *)&sumX2;
    uint64_t sum = 0;
    for (size_t i = 0; i < count; i++) {
        sum += comps[i];
    }
    return sum;
}

#endif

NOINLINE static uint64_t his_u64(struct FunctionParams_t *params) {
    uint64_t sum = 0;
    const uint64_t d = *(uint64_t *)params->d;
    const uint64_t *data = (const uint64_t *)params->data;
    for (size_t iter = 0; iter < iters; iter++) {
        uint64_t numer = data[iter];
        sum += numer / d;
    }
    return sum;
}

NOINLINE static uint64_t mine_u64_generate(struct FunctionParams_t *params) {
    uint64_t *dPtr = (uint64_t *)params->d;
    struct libdivide_u64_t *denomPtr = (struct libdivide_u64_t *)params->denomPtr;
    for (size_t iter = 0; iter < genIters; iter++) {
        *denomPtr = libdivide_u64_gen(*dPtr);
    }
    return *dPtr;
}

NOINLINE static uint64_t mine_s64(struct FunctionParams_t *params) {
    const struct libdivide_s64_t denom = *(struct libdivide_s64_t *)params->denomPtr;
    const int64_t *data = (const int64_t *)params->data;
    uint64_t sum = 0;
    for (size_t iter = 0; iter < iters; iter++) {
        int64_t numer = data[iter];
        sum += libdivide_s64_do(numer, &denom);
    }
    return sum;
}

NOINLINE static uint64_t mine_s64_branchfree(struct FunctionParams_t *params) {
    const struct libdivide_s64_branchfree_t denom = *(struct libdivide_s64_branchfree_t *)params->denomBranchfreePtr;
    const int64_t *data = (const int64_t *)params->data;
    uint64_t sum = 0;
    for (size_t iter = 0; iter < iters; iter++) {
        int64_t numer = data[iter];
        sum += libdivide_s64_branchfree_do(numer, &denom);
    }
    return sum;
}
#if defined(LIBDIVIDE_AVX512) || \
    defined(LIBDIVIDE_AVX2) || \
    defined(LIBDIVIDE_SSE2)

NOINLINE static uint64_t mine_s64_vector(struct FunctionParams_t *params) {
    const struct libdivide_s64_t denom = *(struct libdivide_s64_t *)params->denomPtr;
    const int64_t *data = (const int64_t *)params->data;
    size_t count = sizeof(VECTOR_TYPE) / sizeof(int64_t);
    VECTOR_TYPE sumX2 = SETZERO_SI();
    for (size_t iter = 0; iter < iters; iter += count) {
        VECTOR_TYPE numers = LOAD_SI((const VECTOR_TYPE*)(data + iter));
        sumX2 = ADD_EPI64(sumX2, libdivide_s64_do_vector(numers, &denom));
    }
    const int64_t *comps = (const int64_t *)&sumX2;
    uint64_t sum = 0;
    for (size_t i = 0; i < count; i++) {
        sum += comps[i];
    }
    return sum;
}

NOINLINE static uint64_t mine_s64_vector_branchfree(struct FunctionParams_t *params) {
    const struct libdivide_s64_branchfree_t denom = *(struct libdivide_s64_branchfree_t *)params->denomBranchfreePtr;
    const int64_t *data = (const int64_t *)params->data;
    size_t count = sizeof(VECTOR_TYPE) / sizeof(int64_t);
    VECTOR_TYPE sumX2 = SETZERO_SI();
    for (size_t iter = 0; iter < iters; iter += count) {
        VECTOR_TYPE numers = LOAD_SI((const VECTOR_TYPE*)(data + iter));
        sumX2 = ADD_EPI64(sumX2, libdivide_s64_branchfree_do_vector(numers, &denom));
    }
    const int64_t *comps = (const int64_t *)&sumX2;
    uint64_t sum = 0;
    for (size_t i = 0; i < count; i++) {
        sum += comps[i];
    }
    return sum;
}

#endif

NOINLINE static uint64_t his_s64(struct FunctionParams_t *params) {
    const int64_t *data = (const int64_t *)params->data;
    const int64_t d = *(int64_t *)params->d;
    uint64_t sum = 0;
    for (size_t iter = 0; iter < iters; iter++) {
        int64_t numer = data[iter];
        sum += numer / d;
    }
    return sum;
}

NOINLINE static uint64_t mine_s64_generate(struct FunctionParams_t *params) {
    int64_t *dPtr = (int64_t *)params->d;
    struct libdivide_s64_t *denomPtr = (struct libdivide_s64_t *)params->denomPtr;
    for (size_t iter = 0; iter < genIters; iter++) {
        *denomPtr = libdivide_s64_gen(*dPtr);
    }
    return *dPtr;
}

/* Stub functions for when we have no AVX512/AVX2/SSE2 */
#if !defined(LIBDIVIDE_AVX512) && \
    !defined(LIBDIVIDE_AVX2) && \
    !defined(LIBDIVIDE_SSE2)
NOINLINE static uint64_t mine_u32_vector(struct FunctionParams_t *params) { return mine_u32(params); }
NOINLINE static uint64_t mine_u32_vector_branchfree(struct FunctionParams_t *params) { return mine_u32_branchfree(params); }
NOINLINE static uint64_t mine_s32_vector(struct FunctionParams_t *params) { return mine_s32(params); }
NOINLINE static uint64_t mine_s32_vector_branchfree(struct FunctionParams_t *params) { return mine_s32_branchfree(params); }
NOINLINE static uint64_t mine_u64_vector(struct FunctionParams_t *params) { return mine_u64(params); }
NOINLINE static uint64_t mine_u64_vector_branchfree(struct FunctionParams_t *params) { return mine_u64_branchfree(params); }
NOINLINE static uint64_t mine_s64_vector(struct FunctionParams_t *params) { return mine_s64(params); }
NOINLINE static uint64_t mine_s64_vector_branchfree(struct FunctionParams_t *params) { return mine_s64_branchfree(params); }
#endif

struct TestResult {
    double my_base_time;
    double my_branchfree_time;
    double my_vector_time;
    double my_vector_branchfree_time;
    double his_time;
    double gen_time;
    int algo;
};

static uint64_t find_min(const uint64_t *vals, size_t cnt) {
    uint64_t result = vals[0];
    size_t i;
    for (i = 1; i < cnt; i++) {
        if (vals[i] < result)
            result = vals[i];
    }
    return result;
}

typedef uint64_t (*TestFunc_t)(struct FunctionParams_t *params);
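/*
 * Note: test_one() below times each implementation TEST_COUNT times and
 * keeps only the fastest run (see find_min() above), which filters out
 * scheduler and cache noise; the reported figure is that minimum divided
 * by the number of operations performed (iters or genIters).
 */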
NOINLINE struct TestResult test_one(TestFunc_t mine, TestFunc_t mine_branchfree,
                                    TestFunc_t mine_vector, TestFunc_t mine_vector_branchfree,
                                    TestFunc_t his, TestFunc_t generate,
                                    struct FunctionParams_t *params) {
#define TEST_COUNT 30
    struct TestResult result;
    memset(&result, 0, sizeof result);

#define CHECK(actual, expected) do { \
        if (1 && actual != expected) \
            printf("Failure on line %lu\n", (unsigned long)__LINE__); \
    } while (0)

    uint64_t my_times[TEST_COUNT], my_times_branchfree[TEST_COUNT],
             my_times_vector[TEST_COUNT], my_times_vector_branchfree[TEST_COUNT],
             his_times[TEST_COUNT], gen_times[TEST_COUNT];
    struct time_result tresult;
    for (size_t iter = 0; iter < TEST_COUNT; iter++) {
        tresult = time_function(his, params);
        his_times[iter] = tresult.time;
        const uint64_t expected = tresult.result;

        tresult = time_function(mine, params);
        my_times[iter] = tresult.time;
        CHECK(tresult.result, expected);

        tresult = time_function(mine_branchfree, params);
        my_times_branchfree[iter] = tresult.time;
        CHECK(tresult.result, expected);

#if defined(LIBDIVIDE_AVX512) || \
    defined(LIBDIVIDE_AVX2) || \
    defined(LIBDIVIDE_SSE2)
        tresult = time_function(mine_vector, params);
        my_times_vector[iter] = tresult.time;
        CHECK(tresult.result, expected);

        tresult = time_function(mine_vector_branchfree, params);
        my_times_vector_branchfree[iter] = tresult.time;
        CHECK(tresult.result, expected);
#else
        my_times_vector[iter] = 0;
        my_times_vector_branchfree[iter] = 0;
        (void) mine_vector;
        (void) mine_vector_branchfree;
#endif

        tresult = time_function(generate, params);
        gen_times[iter] = tresult.time;
    }

    result.gen_time = find_min(gen_times, TEST_COUNT) / (double)genIters;
    result.my_base_time = find_min(my_times, TEST_COUNT) / (double)iters;
    result.my_branchfree_time = find_min(my_times_branchfree, TEST_COUNT) / (double)iters;
    result.my_vector_time = find_min(my_times_vector, TEST_COUNT) / (double)iters;
    result.my_vector_branchfree_time = find_min(my_times_vector_branchfree, TEST_COUNT) / (double)iters;
    result.his_time = find_min(his_times, TEST_COUNT) / (double)iters;
    return result;
#undef TEST_COUNT
}

// Returns which internal algorithm the divider uses:
// 0 = power of 2 (shift only), 1 = magic number without the add
// indicator, 2 = magic number with the add indicator.
int libdivide_u32_get_algorithm(const struct libdivide_u32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) return 0;
    else if (!(more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}
NOINLINE struct TestResult test_one_u32(uint32_t d, const uint32_t *data) {
    int no_branchfree = (d == 1);
    struct libdivide_u32_t div_struct = libdivide_u32_gen(d);
    struct libdivide_u32_branchfree_t div_struct_bf = libdivide_u32_branchfree_gen(no_branchfree ? 2 : d);

    struct FunctionParams_t params;
    params.d = &d;
    params.denomPtr = &div_struct;
    params.denomBranchfreePtr = &div_struct_bf;
    params.data = data;

    struct TestResult result = test_one(mine_u32,
                                        no_branchfree ? mine_u32 : mine_u32_branchfree,
                                        mine_u32_vector,
                                        no_branchfree ? mine_u32_vector : mine_u32_vector_branchfree,
                                        his_u32, mine_u32_generate, &params);
    result.algo = libdivide_u32_get_algorithm(&div_struct);
    return result;
}

int libdivide_s32_get_algorithm(const struct libdivide_s32_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) return 0;
    else if (!(more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}

NOINLINE struct TestResult test_one_s32(int32_t d, const int32_t *data) {
    struct libdivide_s32_t div_struct = libdivide_s32_gen(d);
    struct libdivide_s32_branchfree_t div_struct_bf = libdivide_s32_branchfree_gen(d);

    struct FunctionParams_t params;
    params.d = &d;
    params.denomPtr = &div_struct;
    params.denomBranchfreePtr = &div_struct_bf;
    params.data = data;

    struct TestResult result = test_one(mine_s32, mine_s32_branchfree,
                                        mine_s32_vector, mine_s32_vector_branchfree,
                                        his_s32, mine_s32_generate, &params);
    result.algo = libdivide_s32_get_algorithm(&div_struct);
    return result;
}

int libdivide_u64_get_algorithm(const struct libdivide_u64_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) return 0;
    else if (!(more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}

NOINLINE struct TestResult test_one_u64(uint64_t d, const uint64_t *data) {
    int no_branchfree = (d == 1);
    struct libdivide_u64_t div_struct = libdivide_u64_gen(d);
    struct libdivide_u64_branchfree_t div_struct_bf = libdivide_u64_branchfree_gen(no_branchfree ? 2 : d);

    struct FunctionParams_t params;
    params.d = &d;
    params.denomPtr = &div_struct;
    params.denomBranchfreePtr = &div_struct_bf;
    params.data = data;

    struct TestResult result = test_one(mine_u64,
                                        no_branchfree ? mine_u64 : mine_u64_branchfree,
                                        mine_u64_vector,
                                        no_branchfree ? mine_u64_vector : mine_u64_vector_branchfree,
                                        his_u64, mine_u64_generate, &params);
    result.algo = libdivide_u64_get_algorithm(&div_struct);
    return result;
}

int libdivide_s64_get_algorithm(const struct libdivide_s64_t *denom) {
    uint8_t more = denom->more;
    if (!denom->magic) return 0;
    else if (!(more & LIBDIVIDE_ADD_MARKER)) return 1;
    else return 2;
}

NOINLINE struct TestResult test_one_s64(int64_t d, const int64_t *data) {
    struct libdivide_s64_t div_struct = libdivide_s64_gen(d);
    struct libdivide_s64_branchfree_t div_struct_bf = libdivide_s64_branchfree_gen(d);

    struct FunctionParams_t params;
    params.d = &d;
    params.denomPtr = &div_struct;
    params.denomBranchfreePtr = (void *)&div_struct_bf;
    params.data = data;

    struct TestResult result = test_one(mine_s64, mine_s64_branchfree,
                                        mine_s64_vector, mine_s64_vector_branchfree,
                                        his_s64, mine_s64_generate, &params);
    result.algo = libdivide_s64_get_algorithm(&div_struct);
    return result;
}

static void report_header(void) {
    printf("%6s%9s%8s%8s%8s%8s%8s%7s\n", "#", "system", "scalar", "scl_bf", "vector", "vec_bf", "gener", "algo");
}

static void report_result(const char *input, struct TestResult result) {
    printf("%6s%8.3f%8.3f%8.3f%8.3f%8.3f%9.3f%4d\n", input,
           result.his_time, result.my_base_time, result.my_branchfree_time,
           result.my_vector_time, result.my_vector_branchfree_time,
           result.gen_time, result.algo);
}

static void test_many_u32(const uint32_t *data) {
    printf("\n%50s", "=== libdivide u32 benchmark ===\n\n");
    report_header();
    uint32_t d;
    for (d = 1; d > 0; d++) {
        struct TestResult result = test_one_u32(d, data);
        char input_buff[32];
        sprintf(input_buff, "%u", d);
        report_result(input_buff, result);
    }
}
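/*
 * The signed drivers below walk d through 1, -1, 2, -2, 3, -3, ... by
 * negating d on every step and incrementing it whenever it turns positive
 * again, so each magnitude is benchmarked with both signs.
 */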
static void test_many_s32(const int32_t *data) {
    printf("\n%50s", "=== libdivide s32 benchmark ===\n\n");
    report_header();
    int32_t d;
    for (d = 1; d != 0;) {
        struct TestResult result = test_one_s32(d, data);
        char input_buff[32];
        sprintf(input_buff, "%d", d);
        report_result(input_buff, result);
        d = -d;
        if (d > 0) d++;
    }
}

static void test_many_u64(const uint64_t *data) {
    printf("\n%50s", "=== libdivide u64 benchmark ===\n\n");
    report_header();
    uint64_t d;
    for (d = 1; d > 0; d++) {
        struct TestResult result = test_one_u64(d, data);
        char input_buff[32];
        sprintf(input_buff, "%" PRIu64, d);
        report_result(input_buff, result);
    }
}

static void test_many_s64(const int64_t *data) {
    printf("\n%50s", "=== libdivide s64 benchmark ===\n\n");
    report_header();
    int64_t d;
    for (d = 1; d != 0;) {
        struct TestResult result = test_one_s64(d, data);
        char input_buff[32];
        sprintf(input_buff, "%" PRId64, d);
        report_result(input_buff, result);
        d = -d;
        if (d > 0) d++;
    }
}

static const uint32_t *random_data(unsigned sizeOfType) {
#if defined(LIBDIVIDE_WINDOWS)
    /* Align memory to 64 byte boundary for AVX512 */
    uint32_t *data = (uint32_t *) _aligned_malloc(iters * sizeOfType, 64);
#else
    /* Align memory to 64 byte boundary for AVX512 */
    void *ptr = NULL;
    int failed = posix_memalign(&ptr, 64, iters * sizeOfType);
    if (failed) {
        printf("Failed to align memory!\n");
        exit(1);
    }
    uint32_t *data = (uint32_t*) ptr;
#endif
    size_t size = (iters * sizeOfType) / sizeof(*data);
    struct random_state state = SEED;
    for (size_t i = 0; i < size; i++) {
        data[i] = my_random(&state);
    }
    return data;
}

int main(int argc, char* argv[]) {
    // Disable printf buffering.
    // This is mainly required for Windows.
    setbuf(stdout, NULL);

#if defined(LIBDIVIDE_WINDOWS)
    QueryPerformanceFrequency(&gPerfCounterFreq);
#endif

    int u32 = 0;
    int s32 = 0;
    int u64 = 0;
    int s64 = 0;

    if (argc == 1) {
        // By default test only u64
        u64 = 1;
    }
    else {
        for (int i = 1; i < argc; i++) {
            if (! strcmp(argv[i], "u32")) u32 = 1;
            else if (! strcmp(argv[i], "u64")) u64 = 1;
            else if (! strcmp(argv[i], "s32")) s32 = 1;
            else if (! strcmp(argv[i], "s64")) s64 = 1;
            else {
                printf("Usage: benchmark [OPTIONS]\n"
                       "\n"
                       "You can pass the benchmark program one or more of the following\n"
                       "options: u32, s32, u64, s64 to compare libdivide's speed against\n"
                       "hardware division. If benchmark is run without any options u64\n"
                       "is used as default option. benchmark tests a simple function that\n"
                       "inputs an array of random numerators and a single divisor, and\n"
                       "returns the sum of their quotients. It tests this using both\n"
                       "hardware division, and the various division approaches supported\n"
                       "by libdivide, including vector division.\n");
                exit(1);
            }
        }
    }

    // Make sure that the number of iterations is not
    // known at compile time to prevent the compiler
    // from magically calculating results at compile
    // time and hence falsifying the benchmark.
    srand((unsigned) time(NULL));
    iters += (rand() % 3) * (1 << 10);
    genIters += (rand() % 3) * (1 << 10);

    const uint32_t *data = random_data(sizeof(uint32_t));
    if (u32) test_many_u32(data);
    if (s32) test_many_s32((const int32_t *)data);

#if defined(LIBDIVIDE_WINDOWS)
    _aligned_free((void *)data);
#else
    free((void *)data);
#endif

    data = random_data(sizeof(uint64_t));
    if (u64) test_many_u64((const uint64_t *)data);
    if (s64) test_many_s64((const int64_t *)data);

#if defined(LIBDIVIDE_WINDOWS)
    _aligned_free((void *)data);
#else
    free((void *)data);
#endif

    return 0;
}
libdivide-3.0/test/benchmark_branchfree.cpp000066400000000000000000000155551355155642500211270ustar00rootroot00000000000000
// Usage: benchmark_branchfree [u32] [u64] [s32] [s64] [branchfree] [branchfull] [sys|system]
//
// The branchfree benchmark iterates over an array of dividers and computes
// divisions. This is the use case where the branchfree divider generally
// shines and where the default branchfull divider performs poorly because
// the CPU is not able to correctly predict the branches of the many
// different dividers.
//
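// In terms of libdivide's C++ API, the two alternatives compared here look
// roughly as follows (an illustrative sketch only; the measured loops live
// in sum_dividers() below):
//
//     libdivide::divider<uint32_t> full(7);             // branchfull (default)
//     libdivide::branchfree_divider<uint32_t> bfree(7); // branchfree
//     uint32_t q1 = n / full;   // branches on the divider's algorithm
//     uint32_t q2 = n / bfree;  // straight-line code, same quotient
//
// An unsigned branchfree divider must not be 1; the prime dividers
// generated below start at 2, which satisfies this.
//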
#include "libdivide.h"

#include <algorithm>
#include <chrono>
#include <cstdlib>
#include <iostream>
#include <string>
#include <typeinfo>
#include <vector>

#if defined(__GNUC__)
    #define NOINLINE __attribute__((__noinline__))
#elif defined(_MSC_VER)
    #define NOINLINE __declspec(noinline)
#else
    #define NOINLINE
#endif

// Generate primes using the sieve of Eratosthenes.
// These primes will later be used as dividers in the benchmark.
template <typename divider_type, typename T>
std::vector<divider_type> get_primes(T max)
{
    uint64_t n = (uint64_t) max;
    std::vector<divider_type> primes;
    std::vector<bool> sieve(n + 1, true);

    for (uint64_t i = 2; i * i <= n; i++)
        if (sieve[i])
            for (uint64_t j = i * i; j <= n; j += i)
                sieve[j] = false;

    for (uint64_t i = 2; i <= n; i++)
        if (sieve[i])
            primes.push_back((T) i);

    return primes;
}

// Here we iterate over an array of dividers and compute divisions.
// libdivide's branchfull divider will not perform well as the
// CPU will not be able to correctly predict the branches.
// The branchfree divider is perfectly suited for this use case
// and will perform much better.
//
template <typename N, typename T>
NOINLINE size_t sum_dividers(N numerator, const T& dividers)
{
    size_t sum = 0;
    for (const auto& divider: dividers)
        sum += numerator / divider;
    return sum;
}

struct result_t {
    double duration;
    size_t sum;
};

template <typename T, typename D>
NOINLINE result_t benchmark_sum_dividers(const D& dividers, size_t iters)
{
    auto t1 = std::chrono::system_clock::now();
    size_t sum = 0;
    for (; iters > 0; iters--) {
        // Unsigned branchfree divider cannot be 1
        T numerator = std::max((T) 2, (T) iters);
        sum += sum_dividers(numerator, dividers);
    }
    auto t2 = std::chrono::system_clock::now();
    std::chrono::duration<double> seconds = t2 - t1;
    return result_t{seconds.count(), sum};
}

enum {
    TEST_U32 = 1 << 0,
    TEST_U64 = 1 << 1,
    TEST_S32 = 1 << 2,
    TEST_S64 = 1 << 3,
    TEST_ALL_TYPES = (TEST_U32 | TEST_U64 | TEST_S32 | TEST_S64),
    TEST_SYSTEM = 1 << 4,
    TEST_BRANCHFREE = 1 << 5,
    TEST_BRANCHFULL = 1 << 6,
    TEST_ALL_ALGOS = (TEST_SYSTEM | TEST_BRANCHFREE | TEST_BRANCHFULL),
};

using tasks_t = unsigned int;

template <typename T>
void benchmark(tasks_t tasks, size_t max, size_t iters)
{
    bool test_system = !!(tasks & TEST_SYSTEM);
    bool test_branchfull = !!(tasks & TEST_BRANCHFULL);
    bool test_branchfree = !!(tasks & TEST_BRANCHFREE);

    result_t sys = {0, 0};
    result_t branchfull = {0, 0};
    result_t branchfree = {0, 0};

    if (test_system) {
        using divider_type = T;
        auto dividers = get_primes<divider_type>((T) max);
        sys = benchmark_sum_dividers<T>(dividers, iters);
        std::cout << '.' << std::flush;
    }
    if (test_branchfull) {
        using divider_type = libdivide::divider<T>;
        auto dividers = get_primes<divider_type>((T) max);
        branchfull = benchmark_sum_dividers<T>(dividers, iters);
        std::cout << '.' << std::flush;
    }
    if (test_branchfree) {
        using divider_type = libdivide::branchfree_divider<T>;
        auto dividers = get_primes<divider_type>((T) max);
        branchfree = benchmark_sum_dividers<T>(dividers, iters);
        std::cout << '.' << std::endl;
    }

    if (test_system && test_branchfull && branchfull.sum != sys.sum) {
        std::cerr << "Error: branchfull_divider<" << typeid(T).name() << "> sum: " << branchfull.sum
                  << ", but system sum: " << sys.sum << std::endl;
        std::exit(1);
    }
    if (test_system && test_branchfree && branchfree.sum != sys.sum) {
        std::cerr << "Error: branchfree_divider<" << typeid(T).name() << "> sum: " << branchfree.sum
                  << ", but system sum: " << sys.sum << std::endl;
        std::exit(1);
    }

    if (test_system)
        std::cout << "    system: " << sys.duration << " seconds" << std::endl;
    if (test_branchfull)
        std::cout << "branchfull: " << branchfull.duration << " seconds" << std::endl;
    if (test_branchfree)
        std::cout << "branchfree: " << branchfree.duration << " seconds" << std::endl;

    std::cout << std::endl;
}

void usage() {
    std::cout << "Usage: benchmark_branchfree [u32] [u64] [s32] [s64] [branchfree] [branchfull] [sys|system]\n"
                 "\n"
                 "The branchfree benchmark iterates over an array of dividers and computes\n"
                 "divisions. This is the use case where the branchfree divider generally\n"
                 "shines and where the default branchfull divider performs poorly because\n"
                 "the CPU is not able to correctly predict the branches of the many different\n"
                 "dividers." << std::endl;
}

int main(int argc, const char *argv[]) {
    tasks_t tasks = 0;

    for (int i = 1; i < argc; i++) {
        std::string arg(argv[i]);
        if (arg == "u32") { tasks |= TEST_U32; }
        else if (arg == "s32") { tasks |= TEST_S32; }
        else if (arg == "u64") { tasks |= TEST_U64; }
        else if (arg == "s64") { tasks |= TEST_S64; }
        else if (arg == "branchfree") { tasks |= TEST_BRANCHFREE; }
        else if (arg == "branchfull") { tasks |= TEST_BRANCHFULL; }
        else if (arg == "sys" || arg == "system") { tasks |= TEST_SYSTEM; }
        else { usage(); return 1; }
    }

    // Set default tasks
    if (!(tasks & TEST_ALL_TYPES)) { tasks |= TEST_ALL_TYPES; }
    if (!(tasks & TEST_ALL_ALGOS)) { tasks |= TEST_ALL_ALGOS; }

    size_t iters = 3000;
    size_t max_divider = 1 << 22;

    if (tasks & TEST_U32) {
        std::cout << "----- u32 -----" << std::endl;
        benchmark<uint32_t>(tasks, max_divider, iters);
    }
    if (tasks & TEST_S32) {
        std::cout << "----- s32 -----" << std::endl;
        benchmark<int32_t>(tasks, max_divider, iters);
    }
    if (tasks & TEST_U64) {
        std::cout << "----- u64 -----" << std::endl;
        benchmark<uint64_t>(tasks, max_divider, iters);
    }
    if (tasks & TEST_S64) {
        std::cout << "----- s64 -----" << std::endl;
        benchmark<int64_t>(tasks, max_divider, iters);
    }

    std::cout << "All tests passed successfully!" << std::endl;
    return 0;
}
libdivide-3.0/test/tester.cpp000066400000000000000000000300151355155642500163120ustar00rootroot00000000000000
// Usage: tester [OPTIONS]
//
// You can pass the tester program one or more of the following options:
// u32, s32, u64, s64 or run it without arguments to test all four.
// The tester is multithreaded so it can test multiple cases simultaneously.
// The tester will verify the correctness of libdivide via a set of
// randomly chosen denominators, by comparing the result of libdivide's
// division to hardware division. It may take a long time to run, but it
// will output as soon as it finds a discrepancy.
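//
// The invariant every test case boils down to is, roughly (an illustrative
// sketch only; the real driving code is in the DivideTest class below):
//
//     libdivide::divider<uint32_t> fast_d(d); // any d != 0, precomputed once
//     assert(n / fast_d == n / d);            // must match hardware division
//
// applied to all four integer types, to both the BRANCHFULL and BRANCHFREE
// algorithms, and (when enabled) to the SSE2/AVX2/AVX512 vector paths.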
#include "libdivide.h"

#include <algorithm>
#include <future>
#include <iostream>
#include <limits>
#include <random>
#include <sstream>
#include <string>
#include <type_traits>
#include <vector>

using namespace std;
using namespace libdivide;

template <typename T>
class DivideTest {
private:
    using UT = typename std::make_unsigned<T>::type;
    using limits = std::numeric_limits<T>;
    std::string name;
    uint32_t seed = 0;
    UT rand_n = 0;

    // This random function slowly increases the random number
    // until there is an integer overflow, if this happens
    // the random number is reset to 0 and we restart at the
    // beginning. We do this to ensure that we get many test
    // cases (random integers) of varying bit length.
    T get_random() {
        // https://en.wikipedia.org/wiki/Linear_congruential_generator
        seed = seed * 1664525 + 1013904223;

        UT old = rand_n;
        rand_n = rand_n * (seed % 2 + 1) + rand_n % 30000001 + 3;

        // Reset upon integer overflow
        if (rand_n < old) {
            rand_n = seed % 19;
        }

        // The algorithm above generates mostly positive numbers.
        // Hence convert 50% of all values to negative.
        if (limits::is_signed) {
            if (seed % 2)
                return -(T) rand_n;
        }

        return (T) rand_n;
    }

    T random_denominator() {
        T denom = get_random();
        while (denom == 0) {
            denom = get_random();
        }
        return denom;
    }

    std::string testcase_name(int algo) const {
        std::string result = this->name;
        if (algo == BRANCHFREE) {
            result += " (branchfree)";
        }
        return result;
    }

    template <int ALGO>
    void test_one(T numer, T denom, const divider<T, ALGO>& the_divider) {
        // Don't crash with INT_MIN / -1
        // INT_MIN / -1 is undefined behavior in C/C++
        if (limits::is_signed && numer == limits::min() && denom == T(-1)) {
            return;
        }

        T expect = numer / denom;
        T result = numer / the_divider;
        if (result != expect) {
            ostringstream oss;
            oss << "Failure for " << testcase_name(ALGO) << ": " << numer << " / " << denom
                << " = " << expect << ", but got " << result << endl;
            cerr << oss.str();
            exit(1);
        }
    }

#if defined(LIBDIVIDE_AVX512) || \
    defined(LIBDIVIDE_AVX2) || \
    defined(LIBDIVIDE_SSE2)

#if defined(LIBDIVIDE_AVX512)
    #define VECTOR_TYPE __m512i
    #define VECTOR_LOAD _mm512_loadu_si512
#elif defined(LIBDIVIDE_AVX2)
    #define VECTOR_TYPE __m256i
    #define VECTOR_LOAD _mm256_loadu_si256
#elif defined(LIBDIVIDE_SSE2)
    #define VECTOR_TYPE __m128i
    #define VECTOR_LOAD _mm_loadu_si128
#endif

    template <int ALGO>
    void test_16(const T *numers, T denom, const divider<T, ALGO>& the_divider) {
        // Align memory to 64 byte boundary for AVX512
        char mem[16 * sizeof(T) + 64];
        size_t offset = 64 - (size_t)&mem % 64;
        T* results = (T*) &mem[offset];

        size_t iters = 64 / sizeof(VECTOR_TYPE);
        size_t size = sizeof(VECTOR_TYPE) / sizeof(T);
        for (size_t j = 0; j < iters; j++, numers += size) {
            VECTOR_TYPE x = VECTOR_LOAD((const VECTOR_TYPE*) numers);
            VECTOR_TYPE resultVector = x / the_divider;
            results = (T*) &resultVector;
            for (size_t i = 0; i < size; i++) {
                T numer = numers[i];
                T result = results[i];
                T expect = numer / denom;
                if (result != expect) {
                    ostringstream oss;
                    oss << "Vector failure for: " << testcase_name(ALGO) << ": " << numer << " / " << denom
                        << " = " << expect << ", but got " << result << endl;
                    cerr << oss.str();
                    exit(1);
                }
                else {
#if 0
                    ostringstream oss;
                    oss << "Vector success for: " << numer << " / " << denom << " = " << result << endl;
                    cout << oss.str();
#endif
                }
            }
        }
    }

#endif

    template <int ALGO>
    void test_many(T denom) {
        // Don't try dividing by 1 with unsigned branchfree
        if (ALGO == BRANCHFREE && std::is_unsigned<T>::value && denom == 1) {
            return;
        }

        const divider<T, ALGO> the_divider = divider<T, ALGO>(denom);
        T recovered = the_divider.recover();
        if (recovered != denom) {
            ostringstream oss;
            oss << "Failed to recover divisor for " << testcase_name(ALGO)
                << ": " << denom << ", but got " << recovered << endl;
            cerr << oss.str();
            exit(1);
        }
        T min = limits::min();
        T max = limits::max();
        vector<T> edgeCases = {
            0, 1, 2, 3, 4, 5, 6, 7, 8, 9,
            10, 11, 12, 13, 14, 15, 16, 17, 18, 19,
            20, 21, 22, 23, 24, 25, 26, 27, 28, 29,
            30, 31, 32, 33, 34, 35, 36, 37, 38, 39,
            40, 41, 42, 43, 44, 45, 46, 47, 48, 49,
            123, 1232, 36847, 506838, 3000003, 70000007,
            max, max-1, max-2, max-3, max-4, max-5,
            max-3213, max-2453242, max-432234231,
            min, min+1, min+2, min+3, min+4, min+5,
            min+3213, min+2453242, min+432234231,
            max/2, max/2+1, max/2-1,
            max/3, max/3+1, max/3-1,
            max/4, max/4+1, max/4-1,
            min/2, min/2+1, min/2-1,
            min/3, min/3+1, min/3-1,
            min/4, min/4+1, min/4-1
        };

        for (T numerator : edgeCases) {
            test_one(numerator, denom, the_divider);
        }

        // balance signed & unsigned testing
        int small_stop = (limits::is_signed) ? 1 << 14 : 1 << 16;

        // test small numerators < 2^16
        for (int i = 0; i < small_stop; i++) {
            test_one(i, denom, the_divider);
            if (limits::is_signed) {
                test_one(-i, denom, the_divider);
            }
        }

        // test power of 2 numerators: 2^i-1, 2^i, 2^i+1
        for (int i = 1; i < limits::digits; i++) {
            for (int j = -1; j <= 1; j++) {
                T numerator = ((T)1 << i) + j;
                test_one(numerator, denom, the_divider);
                if (limits::is_signed) {
                    test_one(-numerator, denom, the_divider);
                }
            }
        }

        // test all bits set:
        // 11111111, 11111110, 11111100, ...
        for (UT bits = (UT) ~0ull; bits != 0; bits <<= 1) {
            test_one((T) bits, denom, the_divider);
        }

        // Align memory to 64 byte boundary for AVX512
        char mem[16 * sizeof(T) + 64];
        size_t offset = 64 - (size_t)&mem % 64;
        T* numers = (T*) &mem[offset];

        // test random numerators
        for (size_t i = 0; i < 10000; i += 16) {
            for (size_t j = 0; j < 16; j++) {
                numers[j] = get_random();
            }
            for (size_t j = 0; j < 16; j++) {
                test_one(numers[j], denom, the_divider);
            }
#if defined(LIBDIVIDE_AVX512) || \
    defined(LIBDIVIDE_AVX2) || \
    defined(LIBDIVIDE_SSE2)
            test_16(numers, denom, the_divider);
#endif
        }
    }

public:
    DivideTest(const std::string &n) : name(n) {
        std::random_device randomDevice;
        std::mt19937 randGen(randomDevice());
        std::uniform_int_distribution<uint32_t> randDist(1, numeric_limits<uint32_t>::max());
        seed = randDist(randGen);
        rand_n = (UT) randDist(randGen);
    }

    void run() {
        // Test small values
        for (int denom = 1; denom < 1024; denom++) {
            test_many<BRANCHFULL>(denom);
            test_many<BRANCHFREE>(denom);
            if (limits::is_signed) {
                test_many<BRANCHFULL>(-denom);
                test_many<BRANCHFREE>(-denom);
            }
        }

        if (limits::is_signed) {
            test_many<BRANCHFULL>(limits::min());
            test_many<BRANCHFREE>(limits::min());
        }
        test_many<BRANCHFULL>(limits::max());
        test_many<BRANCHFREE>(limits::max());

        // test power of 2 denoms: 2^i-1, 2^i, 2^i+1
        for (int i = 1; i < limits::digits; i++) {
            for (int j = -1; j <= 1; j++) {
                T denom = ((T)1 << i) + j;
                test_many<BRANCHFULL>(denom);
                test_many<BRANCHFREE>(denom);
                if (limits::is_signed) {
                    test_many<BRANCHFULL>(-denom);
                    test_many<BRANCHFREE>(-denom);
                }
            }
        }
        // test all bits set:
        // 11111111, 11111110, 11111100, ...
        for (UT bits = (UT) ~0ull; bits != 0; bits <<= 1) {
            test_many<BRANCHFULL>((T) bits);
            test_many<BRANCHFREE>((T) bits);
        }

        // Test random denominators
        for (int i = 0; i < 10000; i++) {
            T denom = random_denominator();
            test_many<BRANCHFULL>(denom);
            test_many<BRANCHFREE>(denom);
        }
    }
};

void run_test(int idx) {
    switch (idx) {
        case 0: {
            std::string msg = "Testing int32_t\n";
            cout << msg << flush;
            DivideTest<int32_t> dt("s32");
            dt.run();
            break;
        }
        case 1: {
            std::string msg = "Testing uint32_t\n";
            cout << msg << flush;
            DivideTest<uint32_t> dt("u32");
            dt.run();
            break;
        }
        case 2: {
            std::string msg = "Testing int64_t\n";
            cout << msg << flush;
            DivideTest<int64_t> dt("s64");
            dt.run();
            break;
        }
        case 3: {
            std::string msg = "Testing uint64_t\n";
            cout << msg << flush;
            DivideTest<uint64_t> dt("u64");
            dt.run();
            break;
        }
    }
}

int main(int argc, char* argv[]) {
    vector<bool> is_test(4, false);

    if (argc == 1) {
        // Test all
        fill(is_test.begin(), is_test.end(), true);
    }
    else {
        for (int i = 1; i < argc; i++) {
            string arg(argv[i]);
            if (arg == "s32") is_test[0] = true;
            else if (arg == "u32") is_test[1] = true;
            else if (arg == "s64") is_test[2] = true;
            else if (arg == "u64") is_test[3] = true;
            else {
                cout << "Usage: tester [OPTIONS]\n"
                        "\n"
                        "You can pass the tester program one or more of the following options:\n"
                        "u32, s32, u64, s64 or run it without arguments to test all four.\n"
                        "The tester is multithreaded so it can test multiple cases simultaneously.\n"
                        "The tester will verify the correctness of libdivide via a set of\n"
                        "randomly chosen denominators, by comparing the result of libdivide's\n"
                        "division to hardware division. It may take a long time to run, but it\n"
                        "will output as soon as it finds a discrepancy." << endl;
                exit(1);
            }
        }
    }

    vector<future<void>> futures;
    futures.reserve(4);

    // Start 4 threads
    for (int test_id = 0; test_id < 4; test_id++) {
        if (is_test.at(test_id))
            futures.emplace_back(async(launch::async, run_test, test_id));
    }

    // Wait until threads finish
    for (auto &f : futures) {
        f.get();
    }

    cout << "\nAll tests passed successfully!" << endl;
    return 0;
}