pax_global_header00006660000000000000000000000064146752460110014520gustar00rootroot0000000000000052 comment=ecd1d474e91aaea74f04b62df371b151c2a504fa RenderKit-rkcommon-988718e/000077500000000000000000000000001467524601100155315ustar00rootroot00000000000000RenderKit-rkcommon-988718e/.clang-format000066400000000000000000000055131467524601100201100ustar00rootroot00000000000000--- Language: Cpp # BasedOnStyle: Google AccessModifierOffset: -1 AlignAfterOpenBracket: DontAlign AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false #EscapedNewlineAlignmentStyle: Right AlignEscapedNewlines: Right AlignOperands: false AlignTrailingComments: false AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty #AllowShortLambdasOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false BraceWrapping: AfterClass: true AfterControlStatement: false AfterEnum: true AfterFunction: true AfterNamespace: false AfterStruct: true AfterUnion: true BeforeCatch: false BeforeElse: false IndentBraces: false SplitEmptyFunction: false BreakBeforeBinaryOperators: NonAssignment BreakBeforeBraces: Custom BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false #BreakConstructorInitializersStyle: BeforeComma BreakStringLiterals: false ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true ForEachMacros: [ foreach, foreach_active, foreach_tiled, foreach_unique, cdo, cfor, cif, cwhile ] IncludeCategories: - Regex: '^<.*\.i?h>' Priority: 1 - Regex: '^<.*' Priority: 2 - Regex: '.*' Priority: 3 IncludeIsMainRegex: '([-_](test|unittest))?$' IndentCaseLabels: false IndentWidth: 2 IndentWrappedFunctionNames: false KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 #PPDirectiveIndentStyle: AfterHash PointerAlignment: Right ReflowComments: true SortIncludes: true SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: false SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 2 UseTab: Never ... 
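# A minimal usage sketch (not part of the original config; assumes a recent
# clang-format is installed and is invoked from the repository root):
#   clang-format -i -style=file rkcommon/common.cpp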
RenderKit-rkcommon-988718e/.gitattributes000066400000000000000000000000661467524601100204260ustar00rootroot00000000000000*.rc text working-tree-encoding=UTF-16LE-BOM eol=CRLF RenderKit-rkcommon-988718e/.github/000077500000000000000000000000001467524601100170715ustar00rootroot00000000000000RenderKit-rkcommon-988718e/.github/scripts/000077500000000000000000000000001467524601100205605ustar00rootroot00000000000000RenderKit-rkcommon-988718e/.github/scripts/build.ps1000077500000000000000000000004331467524601100223070ustar00rootroot00000000000000## Copyright 2020 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 md build cd build cmake --version cmake -L ` -G $args[0] ` -D RKCOMMON_TASKING_SYSTEM=INTERNAL ` -D RKCOMMON_WARN_AS_ERRORS=ON ` .. cmake --build . --config Release --target ALL_BUILD exit $LASTEXITCODE RenderKit-rkcommon-988718e/.github/scripts/build.sh000077500000000000000000000003411467524601100222140ustar00rootroot00000000000000#!/bin/bash -x ## Copyright 2020 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 mkdir build cd build cmake --version cmake "$@" -DRKCOMMON_TASKING_SYSTEM=INTERNAL -DRKCOMMON_WARN_AS_ERRORS=ON .. cmake --build . RenderKit-rkcommon-988718e/.github/scripts/run_tests.ps1000066400000000000000000000002771467524601100232410ustar00rootroot00000000000000## Copyright 2020 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 echo "Running tests" $env:Path += ";.\build\Release" .\build\Release\rkcommon_test_suite.exe exit $LASTEXITCODE RenderKit-rkcommon-988718e/.github/workflows/000077500000000000000000000000001467524601100211265ustar00rootroot00000000000000RenderKit-rkcommon-988718e/.github/workflows/ci.linux.yml000066400000000000000000000133461467524601100234110ustar00rootroot00000000000000## Copyright 2022 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 name: (Internal) CI Linux on: push: workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true permissions: read-all jobs: ## Build Jobs ## build-rocky8: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: rockylinux:8 cmd: | .github/scripts/build.sh artifact-out: build-rocky8 artifact-path: build build-rocky8-icx: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: rockylinux:8 cmd: | module load intel/2022.1 export CC=icx export CXX=icpx export LDFLAGS="-static-intel" .github/scripts/build.sh artifact-out: build-rocky8-icx artifact-path: build build-rocky9: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: rockylinux:9 cmd: | .github/scripts/build.sh artifact-out: build-rocky9 artifact-path: build build-ubuntu2004: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:20.04 cmd: | .github/scripts/build.sh -G Ninja artifact-out: build-ubuntu2004 artifact-path: build build-ubuntu2204: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:22.04 cmd: | .github/scripts/build.sh -G Ninja artifact-out: build-ubuntu2204 artifact-path: build build-ubuntu2404: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:24.04 cmd: | .github/scripts/build.sh -G Ninja artifact-out: build-ubuntu2404 artifact-path: build build-arch: uses: 
intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: archlinux:latest cmd: | .github/scripts/build.sh -G Ninja artifact-out: build-arch artifact-path: build build-arch-clang: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: archlinux:latest cmd: | export CC=clang export CXX=clang++ .github/scripts/build.sh -G Ninja artifact-out: build-arch-clang artifact-path: build build-arch-clang-sanitizer: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: archlinux:latest cmd: | export CC=clang export CXX=clang++ export CFLAGS=-fsanitize=address export CXXFLAGS=-fsanitize=address export LDFLAGS=-fsanitize=address .github/scripts/build.sh -G Ninja -DCMAKE_BUILD_TYPE=Debug artifact-out: build-arch-clang-sanitizer artifact-path: build ## Functional Test Jobs ## test-rocky8: needs: build-rocky8 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: rockylinux:8 cmd: | ./build/rkcommon_test_suite artifact-in: build-rocky8 test-rocky8-icx: needs: build-rocky8-icx uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: rockylinux:8 cmd: | ./build/rkcommon_test_suite artifact-in: build-rocky8-icx test-rocky9: needs: build-rocky9 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: rockylinux:9 cmd: | ./build/rkcommon_test_suite artifact-in: build-rocky9 test-ubuntu2004: needs: build-ubuntu2004 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:20.04 cmd: | ./build/rkcommon_test_suite artifact-in: build-ubuntu2004 test-ubuntu2204: needs: build-ubuntu2204 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:22.04 cmd: | ./build/rkcommon_test_suite artifact-in: build-ubuntu2204 test-ubuntu2404: needs: build-ubuntu2404 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:24.04 cmd: | ./build/rkcommon_test_suite artifact-in: build-ubuntu2404 test-arch: needs: build-arch uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: archlinux:latest cmd: | ./build/rkcommon_test_suite artifact-in: build-arch test-arch-clang: needs: build-arch-clang uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: archlinux:latest cmd: | ./build/rkcommon_test_suite artifact-in: build-arch-clang test-arch-clang-sanitizer: needs: build-arch-clang-sanitizer uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: archlinux:latest cmd: | ./build/rkcommon_test_suite artifact-in: build-arch-clang-sanitizer ## Static Analysis ## static-analysis: secrets: inherit uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/static_analysis.yml@main with: project: RKCommon prebuild: cmake -S . 
-B build -DRKCOMMON_TASKING_SYSTEM=INTERNAL build: cmake --build build RenderKit-rkcommon-988718e/.github/workflows/ci.macos.yml000066400000000000000000000030531467524601100233460ustar00rootroot00000000000000## Copyright 2022 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 name: (Internal) CI MacOS on: push: workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true permissions: read-all jobs: ## Build Jobs ## build-macos: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/macos.yml@main with: runs-on: '[ "macOS", "build", "x86_64" ]' cmd: | .github/scripts/build.sh artifact-out: build-macos artifact-path: build build-macos-arm: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/macos.yml@main with: runs-on: '[ "macOS", "build", "arm" ]' cmd: | .github/scripts/build.sh artifact-out: build-macos-arm artifact-path: build ## Functional Test Jobs ## test-macos: needs: build-macos uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/macos.yml@main with: runs-on: '[ "macOS", "build", "x86_64" ]' cmd: | export DYLD_FALLBACK_LIBRARY_PATH=./build:$DYLD_FALLBACK_LIBRARY_PATH ./build/rkcommon_test_suite artifact-in: build-macos test-macos-arm: needs: build-macos-arm uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/macos.yml@main with: runs-on: '[ "macOS", "build", "arm" ]' cmd: | export DYLD_FALLBACK_LIBRARY_PATH=./build:$DYLD_FALLBACK_LIBRARY_PATH ./build/rkcommon_test_suite artifact-in: build-macos-arm RenderKit-rkcommon-988718e/.github/workflows/ci.windows.yml000066400000000000000000000035641467524601100237450ustar00rootroot00000000000000## Copyright 2022 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 name: (Internal) CI Windows on: push: workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true permissions: read-all jobs: ## Build Jobs ## build-windows-msvc15: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | .github\scripts\build.ps1 "Visual Studio 15 2017 Win64" artifact-out: build-windows-msvc15 artifact-path: build build-windows-msvc16: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | .github\scripts\build.ps1 "Visual Studio 16 2019" artifact-out: build-windows-msvc16 artifact-path: build build-windows-msvc17: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | .github\scripts\build.ps1 "Visual Studio 17 2022" artifact-out: build-windows-msvc17 artifact-path: build ## Functional Test Jobs ## test-windows-msvc15: needs: build-windows-msvc15 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | .github\scripts\run_tests.ps1 artifact-in: build-windows-msvc15 test-windows-msvc16: needs: build-windows-msvc16 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | .github\scripts\run_tests.ps1 artifact-in: build-windows-msvc16 test-windows-msvc17: needs: build-windows-msvc17 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | .github\scripts\run_tests.ps1 artifact-in: build-windows-msvc17 RenderKit-rkcommon-988718e/.github/workflows/external.ci.linux.yml000066400000000000000000000057271467524601100252360ustar00rootroot00000000000000## Copyright 
2024 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 name: CI Linux on: push: pull_request: workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true permissions: read-all jobs: build-rocky-8: runs-on: ubuntu-latest container: image: rockylinux:8 steps: - name: Install packages run: | echo "Installing build dependencies..." dnf update -y dnf install -y git cmake tbb-devel dnf group install -y "Development Tools" - name: Checkout Repository uses: actions/checkout@v4 - name: Build run: | mkdir build cd build cmake -D CMAKE_INSTALL_PREFIX=`pwd`/install .. make -j`nproc` install - name: Upload Artifact uses: actions/upload-artifact@v4 with: name: build-rocky-8 path: build/install test-rocky-8: needs: build-rocky-8 runs-on: ubuntu-latest container: image: rockylinux:8 steps: - name: Install packages run: | echo "Installing runtime dependencies..." dnf update -y dnf install -y tbb - name: Download Artifact uses: actions/download-artifact@v4 with: name: build-rocky-8 - name: Test run: | # Adding execution bit to binaries is needed since upload/download GHA is using zip compression # and it can't preserve files permissions - https://github.com/actions/upload-artifact/issues/38 chmod +x ./bin/* LD_LIBRARY_PATH=./lib64/ ./bin/rkcommon_test_suite -d yes build-ubuntu-2204: runs-on: ubuntu-latest container: image: ubuntu:22.04 steps: - name: Install packages run: | echo "Installing build dependencies..." apt update apt upgrade -y apt install -y build-essential cmake git libtbb2-dev - name: Checkout Repository uses: actions/checkout@v4 - name: Build run: | mkdir build cd build cmake -D CMAKE_INSTALL_PREFIX=`pwd`/install .. make -j`nproc` install - name: Upload Artifact uses: actions/upload-artifact@v4 with: name: build-ubuntu-2204 path: build/install test-ubuntu-2204: needs: build-ubuntu-2204 runs-on: ubuntu-latest container: image: ubuntu:22.04 steps: - name: Install packages run: | echo "Installing runtime dependencies..." apt update apt upgrade -y apt install -y libtbb2 - name: Download Artifact uses: actions/download-artifact@v4 with: name: build-ubuntu-2204 - name: Test run: | # Adding execution bit to binaries is needed since upload/download GHA is using zip compression # and it can't preserve files permissions - https://github.com/actions/upload-artifact/issues/38 chmod +x ./bin/* LD_LIBRARY_PATH=./lib/ ./bin/rkcommon_test_suite -d yesRenderKit-rkcommon-988718e/.gitignore000066400000000000000000000002331467524601100175170ustar00rootroot00000000000000*~ *# bin *.user* build*/ *.sw? 
tags .ycm_extra_conf.pyc *.autosave *DS_Store* *.gz *.rpm *.zip *.bak *.patch .vscode .idea/ premake.local.* cmake-build*/ RenderKit-rkcommon-988718e/CMakeLists.txt000066400000000000000000000053231467524601100202740ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 ## Global CMake options ## if (RKCOMMON_TASKING_SYSTEM STREQUAL "OpenMP") cmake_minimum_required(VERSION 3.9) # NOTE(jda): rely on OpenMP targets else() cmake_minimum_required(VERSION 3.5) endif() set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_DISABLE_SOURCE_CHANGES ON) set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) ## Establish project ## project(rkcommon VERSION 1.14.2 LANGUAGES CXX) include(GNUInstallDirs) configure_file( ${PROJECT_SOURCE_DIR}/rkcommon/version.h.in ${PROJECT_BINARY_DIR}/rkcommon/version.h @ONLY ) set(RKCOMMON_RESOURCE ${PROJECT_SOURCE_DIR}/rkcommon/rkcommon.rc) ## Add rkcommon specific macros ## set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake) include(rkcommon_macros) rkcommon_configure_build_type() rkcommon_configure_compiler() rkcommon_configure_tasking_system() rkcommon_create_tasking_target(FALSE) ## Build options and specific configuration ## option(BUILD_SHARED_LIBS "Build rkcommon as a shared library" ON) option(RKCOMMON_ADDRSAN "Build rkcommon with dlclose disabled for addrsan" OFF) option(RKCOMMON_NO_SIMD "Build rkcommon not using SIMD instructions" OFF) option(RKCOMMON_WARN_AS_ERRORS "Treat warnings as errors" OFF) set(CMAKE_SKIP_INSTALL_RPATH OFF) if (APPLE) set(CMAKE_MACOSX_RPATH ON) set(CMAKE_INSTALL_RPATH "@loader_path/") else() set(CMAKE_INSTALL_RPATH "\$ORIGIN") endif() include(CTest) if (BUILD_TESTING) enable_testing() endif() if (WIN32) option(INSTALL_DEPS "Install rkcommon DLL dependencies" ON) else() set(INSTALL_DEPS OFF) endif() if (INSTALL_DEPS) include(rkcommon_redist_deps) endif() set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) ## Build library and tests ## add_subdirectory(rkcommon) if (BUILD_TESTING) add_subdirectory(tests) endif() ## Configure CMake find_package() config files ## include(CMakePackageConfigHelpers) configure_package_config_file( "${PROJECT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in" "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rkcommon-${PROJECT_VERSION} ) write_basic_package_version_file( "${PROJECT_NAME}ConfigVersion.cmake" VERSION ${PROJECT_VERSION} COMPATIBILITY SameMajorVersion ) install(FILES ${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake ${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake cmake/FindTBB.cmake cmake/rkcommon_macros.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rkcommon-${PROJECT_VERSION} ) # Must be last include(CPack) RenderKit-rkcommon-988718e/LICENSE.txt000066400000000000000000000261361467524601100173640ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. 
"Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. 
Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. 
This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
RenderKit-rkcommon-988718e/README.md000066400000000000000000000010701467524601100170060ustar00rootroot00000000000000# rkcommon - C++/CMake infrastructure This project represents a common set of C++ infrastructure and CMake utilities used by various components of IntelĀ® Rendering Toolkit (Render Kit). ### Requirements - CMake - C++11 compiler - TBB 4.4.3 or higher (by default, other tasking system options available via the `RKCOMMON_TASKING_SYSTEM` CMake variable) ### Building Build with: ```bash git clone https://github.com/ospray/rkcommon.git cd rkcommon mkdir build cd build cmake .. cmake --build . ``` Run tests from the build directory with: ```bash ctest . ``` RenderKit-rkcommon-988718e/cmake/000077500000000000000000000000001467524601100166115ustar00rootroot00000000000000RenderKit-rkcommon-988718e/cmake/FindTBB.cmake000066400000000000000000000410461467524601100210300ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 #=============================================================================== # This script will attempt to find TBB and set up a TBB target. # # The user may specify a version and lists of required and optional components: # # find_package(TBB 2017.0 EXACT REQUIRED # tbb tbbmalloc # OPTIONAL_COMPONENTS tbbmalloc_proxy # QUIET) # # If this target exists already, the script will attempt to re-use it, but fail # if version or components do not match the user-specified requirements. # # If all the required component targets (e.g. TBB::tbb) exist, the script will # attempt to create a target TBB and link existing component targets to it. # It will fail if the component target version does not match the user-specified # requirements. # # The user may specify the following variables to help the search process: # - TBB_ROOT # - TBB_INCLUDE_DIR # # After the script has run successfully, there is a target TBB, as well as # component targets TBB::, e.g. TBB::tbbmalloc. # # The targets will attempt to link to release versions of TBB in release mode, # and debug versions in debug mode. # # In addition to the targets, the script defines: # # TBB_FOUND # TBB_INCLUDE_DIRS # #=============================================================================== # We use INTERFACE libraries, which are only supported in 3.x cmake_minimum_required(VERSION 3.5) # These two are used to automatically find the root and include directories. set(_TBB_INCLUDE_SUBDIR "include") set(_TBB_HEADER "tbb/tbb.h") # Initialize cache variable; but use existing non-cache variable as the default, # and fall back to the environment variable. if (NOT TBB_ROOT) set(TBB_ROOT "$ENV{TBB_ROOT}") endif() set(TBB_ROOT "${TBB_ROOT}" CACHE PATH "The root path of TBB.") #=============================================================================== # Error messages that respect the user's wishes about peace and quiet. #=============================================================================== function(rk_tbb_status) if (NOT TBB_FIND_QUIETLY) message(STATUS "${ARGV}") endif() endfunction() function(rk_tbb_warning) if (NOT TBB_FIND_QUIETLY) message(WARNING "${ARGV}") endif() endfunction() macro(rk_tbb_error) if (TBB_FIND_REQUIRED) message(FATAL_ERROR "${ARGV}") else() rk_tbb_warning("${ARGV}") endif() return() endmacro() #=============================================================================== # Extract a list of required and optional components. 
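# Illustrative example (not part of the module): a caller invoking
#   find_package(TBB REQUIRED tbb OPTIONAL_COMPONENTS tbbmalloc)
# ends up with _REQUIRED_COMPONENTS holding "tbb" and _OPTIONAL_COMPONENTS
# holding "tbbmalloc" after this macro runs.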
#=============================================================================== macro(rk_tbb_list_components) # cmake provides the TBB_FIND_COMPONENTS and # TBB_FIND_REQUIRED_ variables based on the invocation # of find_package. if (TBB_FIND_COMPONENTS STREQUAL "") set(_REQUIRED_COMPONENTS "tbb") set(_OPTIONAL_COMPONENTS "tbbmalloc" "tbbmalloc_proxy" "tbbbind" "tbbpreview") else() set(_REQUIRED_COMPONENTS "") set(_OPTIONAL_COMPONENTS "") foreach (C IN LISTS TBB_FIND_COMPONENTS) if (${TBB_FIND_REQUIRED_${C}}) list(APPEND _REQUIRED_COMPONENTS ${C}) else() list(APPEND _OPTIONAL_COMPONENTS ${C}) endif() endforeach() endif() rk_tbb_status("Looking for TBB components ${_REQUIRED_COMPONENTS}" " (${_OPTIONAL_COMPONENTS})") endmacro() #=============================================================================== # List components that are available, and check if any REQUIRED components # are missing. #=============================================================================== macro(rk_tbb_check_components) set(_TBB_MISSING_COMPONENTS "") set(_TBB_AVAILABLE_COMPONENTS "") foreach (C IN LISTS _REQUIRED_COMPONENTS) if (TARGET TBB::${C}) list(APPEND _TBB_AVAILABLE_COMPONENTS ${C}) else() list(APPEND _TBB_MISSING_COMPONENTS ${C}) endif() endforeach() foreach (C IN LISTS _OPTIONAL_COMPONENTS) if (TARGET TBB::${C}) list(APPEND _TBB_AVAILABLE_COMPONENTS ${C}) endif() endforeach() endmacro() #=============================================================================== # Check the version of the TBB root we found. #=============================================================================== macro(rk_tbb_check_version) # Extract the version we found in our root. if(EXISTS "${TBB_INCLUDE_DIR}/oneapi/tbb/version.h") set(_TBB_VERSION_HEADER "oneapi/tbb/version.h") elseif(EXISTS "${TBB_INCLUDE_DIR}/tbb/tbb_stddef.h") set(_TBB_VERSION_HEADER "tbb/tbb_stddef.h") elseif(EXISTS "${TBB_INCLUDE_DIR}/tbb/version.h") set(_TBB_VERSION_HEADER "tbb/version.h") else() rk_tbb_error("Missing TBB version information. Could not find" "tbb/tbb_stddef.h or tbb/version.h in ${TBB_INCLUDE_DIR}") endif() file(READ ${TBB_INCLUDE_DIR}/${_TBB_VERSION_HEADER} VERSION_HEADER_CONTENT) string(REGEX MATCH "#define TBB_VERSION_MAJOR ([0-9]+)" DUMMY "${VERSION_HEADER_CONTENT}") set(TBB_VERSION_MAJOR ${CMAKE_MATCH_1}) string(REGEX MATCH "#define TBB_VERSION_MINOR ([0-9]+)" DUMMY "${VERSION_HEADER_CONTENT}") set(TBB_VERSION_MINOR ${CMAKE_MATCH_1}) set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}") set(TBB_VERSION_STRING "${TBB_VERSION}") # If the user provided information about required versions, check them! if (TBB_FIND_VERSION) if (${TBB_FIND_VERSION_EXACT} AND NOT TBB_VERSION VERSION_EQUAL ${TBB_FIND_VERSION}) rk_tbb_error("Requested exact TBB version ${TBB_FIND_VERSION}," " but found ${TBB_VERSION}") elseif(TBB_VERSION VERSION_LESS ${TBB_FIND_VERSION}) rk_tbb_error("Requested minimum TBB version ${TBB_FIND_VERSION}," " but found ${TBB_VERSION}") endif() endif() rk_tbb_status("Found TBB version ${TBB_VERSION} at ${TBB_ROOT}") endmacro() #=============================================================================== # Reuse existing targets. # NOTE: This must be a macro, as we rely on return() to exit this script. 
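# (This covers, e.g., the case where a parent project has already imported
# oneTBB through its own CONFIG package, so TBB::tbb and friends are defined
# before this module runs.)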
#=============================================================================== macro(rk_tbb_reuse_existing_target_components) rk_tbb_check_components() if (_TBB_MISSING_COMPONENTS STREQUAL "") rk_tbb_status("Found existing TBB component targets: ${_TBB_AVAILABLE_COMPONENTS}") # Get TBB_INCLUDE_DIR if not already set to check for the version of the # existing component targets (making the assumption that they all have # the same version) if (NOT TBB_INCLUDE_DIR) list(GET _TBB_AVAILABLE_COMPONENTS 0 first_target) get_target_property(TBB_INCLUDE_DIR TBB::${first_target} INTERFACE_INCLUDE_DIRECTORIES) foreach(TGT IN LISTS _TBB_AVAILABLE_COMPONENTS) get_target_property(_TGT_INCLUDE_DIR TBB::${TGT} INTERFACE_INCLUDE_DIRECTORIES) if (NOT _TGT_INCLUDE_DIR STREQUAL "${TBB_INCLUDE_DIR}") rk_tbb_error("Existing TBB component targets have inconsistent include directories.") endif() endforeach() endif() find_path(TBB_INCLUDE_DIR NAMES "${_TBB_HEADER}" PATHS "${TBB_INCLUDE_DIRS}") # Extract TBB_ROOT from the include path so that rk_tbb_check_version # prints the correct tbb location string(REPLACE "/${_TBB_INCLUDE_SUBDIR}" "" TBB_ROOT "${TBB_INCLUDE_DIR}") rk_tbb_check_version() # Add target TBB and link all available components if (NOT TARGET TBB) add_library(TBB INTERFACE) foreach(C IN LISTS _TBB_AVAILABLE_COMPONENTS) target_link_libraries(TBB INTERFACE TBB::${C}) endforeach() endif() set(TBB_FOUND TRUE) set(TBB_INCLUDE_DIRS "${TBB_INCLUDE_DIR}") return() elseif ((TARGET TBB) OR (NOT _TBB_AVAILABLE_COMPONENTS STREQUAL "")) rk_tbb_error("Ignoring existing TBB targets because required components are missing: ${_TBB_MISSING_COMPONENTS}") endif() endmacro() #=============================================================================== # Find the root directory if a manual override is not specified. # Sets TBB_ROOT in the parent scope, but does not check for failure. #=============================================================================== function(rk_tbb_find_root) if (NOT TBB_ROOT OR TBB_ROOT STREQUAL "") set(TBB_HINTS "") set(TBB_PATHS "") if (WIN32) # workaround for parentheses in variable name / CMP0053 set(PROGRAMFILESx86 "PROGRAMFILES(x86)") set(PROGRAMFILES32 "$ENV{${PROGRAMFILESx86}}") if(NOT PROGRAMFILES32) set(PROGRAMFILES32 "$ENV{PROGRAMFILES}") endif() if(NOT PROGRAMFILES32) set(PROGRAMFILES32 "C:/Program Files (x86)") endif() set(TBB_PATHS "${PROJECT_SOURCE_DIR}/../tbb" "${PROGRAMFILES32}/IntelSWTools/compilers_and_libraries/windows/tbb" "${PROGRAMFILES32}/Intel/Composer XE/tbb" "${PROGRAMFILES32}/Intel/compilers_and_libraries/windows/tbb") else() set(TBB_HINTS "/usr/local") set(TBB_PATHS "${PROJECT_SOURCE_DIR}/tbb" "/opt/intel/oneapi/tbb/latest" "/opt/intel/tbb" "/opt/intel/compilers_and_libraries/tbb" "/opt/intel/compilers_and_libraries/linux/tbb" "/opt/intel/composerxe/tbb") endif() set(TBB_ROOT "TBB_ROOT-NOTFOUND") find_path(TBB_ROOT NAMES "${_TBB_INCLUDE_SUBDIR}/${_TBB_HEADER}" HINTS ${TBB_HINTS} PATHS ${TBB_PATHS} NO_PACKAGE_ROOT_PATH) endif() endfunction() #=============================================================================== # Find the include directory if a manual override is not specified. # Assumes TBB_ROOT to be set. 
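# If auto-detection fails, both locations can be overridden on the command
# line; the paths below are placeholders, not defaults of this module:
#   cmake -DTBB_ROOT=/opt/intel/oneapi/tbb/latest \
#         -DTBB_INCLUDE_DIR=/opt/intel/oneapi/tbb/latest/include ..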
#=============================================================================== function(rk_tbb_find_include_directory) find_path(TBB_INCLUDE_DIR NAMES "${_TBB_HEADER}" HINTS "${TBB_ROOT}/${_TBB_INCLUDE_SUBDIR}" NO_PACKAGE_ROOT_PATH) endfunction() #=============================================================================== # Find a specific library and create a target for it. #=============================================================================== function(rk_tbb_find_library COMPONENT_NAME BUILD_CONFIG) set(LIB_VAR "${COMPONENT_NAME}_LIBRARY_${BUILD_CONFIG}") set(BIN_DIR_VAR "${COMPONENT_NAME}_BIN_DIR_${BUILD_CONFIG}") set(DLL_VAR "${COMPONENT_NAME}_DLL_${BUILD_CONFIG}") if (BUILD_CONFIG STREQUAL "DEBUG") set(LIB_NAME "${COMPONENT_NAME}_debug") else() set(LIB_NAME "${COMPONENT_NAME}") endif() unset(LIB_PATHS) if (WIN32) if(CMAKE_SIZEOF_VOID_P EQUAL 8) set(TBB_ARCH intel64) else() set(TBB_ARCH ia32) endif() if(MSVC10) set(TBB_VCVER vc10) elseif(MSVC11) set(TBB_VCVER vc11) elseif(MSVC12) set(TBB_VCVER vc12) else() set(TBB_VCVER vc14) endif() set(LIB_PATHS ${TBB_ROOT}/lib/${TBB_ARCH}/${TBB_VCVER} ${TBB_ROOT}/lib ) # On window, also search the DLL so that the client may install it. set(DLL_NAME "${LIB_NAME}.dll") # lib name with version suffix to handle oneTBB tbb12.dll set(LIB_NAME_VERSION "") if (${COMPONENT_NAME} STREQUAL "tbb") if (BUILD_CONFIG STREQUAL "DEBUG") set(LIB_NAME_VERSION "tbb12_debug") else() set(LIB_NAME_VERSION "tbb12") endif() endif() set(DLL_NAME_VERSION "${LIB_NAME_VERSION}.dll") set(BIN_FILE BIN_FILE-NOTFOUND) find_file(BIN_FILE NAMES ${DLL_NAME} ${DLL_NAME_VERSION} PATHS "${TBB_ROOT}/bin/${TBB_ARCH}/${TBB_VCVER}" "${TBB_ROOT}/bin" "${TBB_ROOT}/redist/${TBB_ARCH}/${TBB_VCVER}" "${TBB_ROOT}/../redist/${TBB_ARCH}/tbb/${TBB_VCVER}" "${TBB_ROOT}/../redist/${TBB_ARCH}_win/tbb/${TBB_VCVER}" NO_DEFAULT_PATH) get_filename_component(${BIN_DIR_VAR} ${BIN_FILE} DIRECTORY) set(${DLL_VAR} "${BIN_FILE}" CACHE PATH "${COMPONENT_NAME} ${BUILD_CONFIG} dll path") elseif(APPLE) set(LIB_PATHS ${TBB_ROOT}/lib) else() file(GLOB LIB_PATHS PATHS ${TBB_ROOT}/lib/intel64/gcc*) list(REVERSE LIB_PATHS) list(APPEND LIB_PATHS ${TBB_ROOT}/lib ${TBB_ROOT}/lib/x86_64-linux-gnu ${TBB_ROOT}/lib64 ${TBB_ROOT}/libx86_64-linux-gnu) endif() # We prefer finding the versioned file on Unix so that the library path # variable will not point to a symlink. This makes installing TBB as a # dependency easier. if (UNIX) set(LIB_NAME lib${LIB_NAME}.so.2 ${LIB_NAME}) endif() find_library(${LIB_VAR} NAMES ${LIB_NAME} PATHS ${LIB_PATHS} NO_DEFAULT_PATH) # Hide this variable if we found something, otherwise display it for # easy override. if(${LIB_VAR}) mark_as_advanced(${LIB_VAR}) endif() if(${BIN_DIR_VAR}) mark_as_advanced(${BIN_DIR_VAR}) endif() if(${DLL_VAR}) mark_as_advanced(${DLL_VAR}) endif() endfunction() #=============================================================================== # Find the given component. # This macro attempts to find both release and debug versions, and falls back # appropriately if only one can be found. # On success, it creates a target ${TARGET}::${COMPONENT_NAME} and links # it to the overall ${TARGET}. 
# # For more information on the variables set here, see # https://cmake.org/cmake/help/v3.17/manual/cmake-developer.7.html#a-sample-find-module #=============================================================================== function(rk_tbb_find_and_link_component COMPONENT_NAME) set(COMPONENT_TARGET "TBB::${COMPONENT_NAME}") rk_tbb_find_library("${COMPONENT_NAME}" RELEASE) rk_tbb_find_library("${COMPONENT_NAME}" DEBUG) if (${COMPONENT_NAME}_LIBRARY_RELEASE OR ${COMPONENT_NAME}_LIBRARY_DEBUG) # Note: We *must* use SHARED here rather than UNKNOWN as our # IMPORTED_NO_SONAME trick a few lines down does not work with # UNKNOWN. add_library(${COMPONENT_TARGET} SHARED IMPORTED) if (${COMPONENT_NAME}_LIBRARY_RELEASE) set_property(TARGET ${COMPONENT_TARGET} APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) if(WIN32) set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_LOCATION_RELEASE "${${COMPONENT_NAME}_DLL_RELEASE}") set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_IMPLIB_RELEASE "${${COMPONENT_NAME}_LIBRARY_RELEASE}") else() set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_LOCATION_RELEASE "${${COMPONENT_NAME}_LIBRARY_RELEASE}") endif() endif() if (${COMPONENT_NAME}_LIBRARY_DEBUG) set_property(TARGET ${COMPONENT_TARGET} APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) if(WIN32) set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_LOCATION_DEBUG "${${COMPONENT_NAME}_DLL_DEBUG}") set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_IMPLIB_DEBUG "${${COMPONENT_NAME}_LIBRARY_DEBUG}") else() set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_LOCATION_DEBUG "${${COMPONENT_NAME}_LIBRARY_DEBUG}") endif() endif() set_target_properties(${COMPONENT_TARGET} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TBB_INCLUDE_DIR}" INTERFACE_COMPILE_DEFINITIONS "__TBB_NO_IMPLICIT_LINKAGE=1" ) if(NOT WIN32) # Note: IMPORTED_NO_SONAME must be set or cmake will attempt # to link to the full path of libtbb.so. Instead, we # rely on the linker to find libtbb.so.2. set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_NO_SONAME TRUE ) endif() target_link_libraries(TBB INTERFACE ${COMPONENT_TARGET}) endif() endfunction() #=============================================================================== # Note: The order of these is important. # Some of these macros create variables that are used in later calls. rk_tbb_list_components() rk_tbb_reuse_existing_target_components() rk_tbb_find_root() if (NOT EXISTS "${TBB_ROOT}") rk_tbb_error("Unable to find root directory ${TBB_ROOT}") endif() mark_as_advanced(TBB_ROOT) # Hide, we found something. rk_tbb_find_include_directory() if (NOT EXISTS "${TBB_INCLUDE_DIR}") rk_tbb_error("Unable to find include directory ${TBB_INCLUDE_DIR}") endif() mark_as_advanced(TBB_INCLUDE_DIR) # Hide, we found something. 
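# With the root and include directory resolved: verify the TBB version,
# create the aggregate interface target TBB, attach every requested component
# that can be found, and finally fail (or merely warn when not REQUIRED) if a
# required component is still missing.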
rk_tbb_check_version() add_library(TBB INTERFACE) foreach(C IN LISTS _REQUIRED_COMPONENTS _OPTIONAL_COMPONENTS) rk_tbb_find_and_link_component(${C}) endforeach() rk_tbb_check_components() if (_TBB_MISSING_COMPONENTS) rk_tbb_error("Cannot find required components: " "${_TBB_MISSING_COMPONENTS}") endif() set(TBB_FOUND TRUE) set(TBB_INCLUDE_DIRS "${TBB_INCLUDE_DIR}") RenderKit-rkcommon-988718e/cmake/rkcommonConfig.cmake.in000066400000000000000000000017251467524601100232000ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 @PACKAGE_INIT@ include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@_Exports.cmake") include("${CMAKE_CURRENT_LIST_DIR}/rkcommon_macros.cmake") check_required_components("@PROJECT_NAME@") ## Stash incoming CMAKE_MODULE_PATH ## set(RKCOMMON_CALLERS_CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}) ## Create rkcommon tasking target ## set(RKCOMMON_TASKING_SYSTEM @RKCOMMON_TASKING_SYSTEM@) set(RKCOMMON_TASKING_TBB @RKCOMMON_TASKING_TBB@) set(RKCOMMON_TASKING_OPENMP @RKCOMMON_TASKING_OPENMP@) set(RKCOMMON_TASKING_INTERNAL @RKCOMMON_TASKING_INTERNAL@) set(RKCOMMON_TASKING_DEBUG @RKCOMMON_TASKING_DEBUG@) rkcommon_create_tasking_target(TRUE) ## Restore CMAKE_MODULE_PATH ## set(CMAKE_MODULE_PATH ${RKCOMMON_CALLERS_CMAKE_MODULE_PATH}) ## Standard signal that the package was found ## set(RKCOMMON_FOUND TRUE) RenderKit-rkcommon-988718e/cmake/rkcommon_macros.cmake000066400000000000000000000204171467524601100230100ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 # use a backported version of find_dependency(), renamed as # find_dependency_39(), from CMake 3.9.0, which correctly supports passing # components to find_package(). this allows us to maintain our current minimum # CMake version of 3.1. 
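# It is called exactly like find_dependency(); e.g. rkcommon_create_tasking_target()
# below uses:
#   find_dependency_39(TBB 4.4 REQUIRED tbb tbbmalloc)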
macro(find_dependency_39 dep) if (NOT ${dep}_FOUND) set(cmake_fd_quiet_arg) if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) set(cmake_fd_quiet_arg QUIET) endif() set(cmake_fd_required_arg) if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) set(cmake_fd_required_arg REQUIRED) endif() get_property(cmake_fd_alreadyTransitive GLOBAL PROPERTY _CMAKE_${dep}_TRANSITIVE_DEPENDENCY ) find_package(${dep} ${ARGN} ${cmake_fd_quiet_arg} ${cmake_fd_required_arg} ) if(NOT DEFINED cmake_fd_alreadyTransitive OR cmake_fd_alreadyTransitive) set_property(GLOBAL PROPERTY _CMAKE_${dep}_TRANSITIVE_DEPENDENCY TRUE) endif() if (NOT ${dep}_FOUND) set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "${CMAKE_FIND_PACKAGE_NAME} could not be found because dependency ${dep} could not be found.") set(${CMAKE_FIND_PACKAGE_NAME}_FOUND False) return() endif() set(cmake_fd_required_arg) set(cmake_fd_quiet_arg) set(cmake_fd_exact_arg) endif() endmacro() ## Macro for printing CMake variables ## macro(print var) message("${var} = ${${var}}") endmacro() ## Macro to print a warning message that only appears once ## macro(rkcommon_warn_once IDENTIFIER MESSAGE) set(INTERNAL_WARNING "RKCOMMON_WARNED_${IDENTIFIER}") if(NOT ${INTERNAL_WARNING}) message(WARNING ${MESSAGE}) set(${INTERNAL_WARNING} ON CACHE INTERNAL "Warned about '${MESSAGE}'") endif() endmacro() ## Get a list of subdirectories (single level) under a given directory macro(get_subdirectories result curdir) file(GLOB children RELATIVE ${curdir} ${curdir}/*) set(dirlist "") foreach(child ${children}) if(IS_DIRECTORY ${curdir}/${child}) list(APPEND dirlist ${child}) endif() endforeach() set(${result} ${dirlist}) endmacro() ## Setup CMAKE_BUILD_TYPE to have a default + cycle between options in UI macro(rkcommon_configure_build_type) set(CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo") if (WIN32) if (NOT RKCOMMON_DEFAULT_CMAKE_CONFIGURATION_TYPES_SET) set(CMAKE_CONFIGURATION_TYPES "${CONFIGURATION_TYPES}" CACHE STRING "List of generated configurations." FORCE) set(RKCOMMON_DEFAULT_CMAKE_CONFIGURATION_TYPES_SET ON CACHE INTERNAL "Default CMake configuration types set.") endif() else() if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the build type." 
FORCE) endif() set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CONFIGURATION_TYPES}) endif() endmacro() ## Compiler configuration macros ## macro(rkcommon_configure_compiler) if (WIN32) set(RKCOMMON_PLATFORM_WIN 1) set(RKCOMMON_PLATFORM_UNIX 0) else() set(RKCOMMON_PLATFORM_WIN 0) set(RKCOMMON_PLATFORM_UNIX 1) endif() # unhide compiler to make it easier for users to see what they are using mark_as_advanced(CLEAR CMAKE_CXX_COMPILER) option(RKCOMMON_STRICT_BUILD "Build with additional warning flags" ON) mark_as_advanced(RKCOMMON_STRICT_BUILD) option(RKCOMMON_WARN_AS_ERRORS "Treat warnings as errors" OFF) mark_as_advanced(RKCOMMON_WARN_AS_ERRORS) set(RKCOMMON_COMPILER_ICC FALSE) set(RKCOMMON_COMPILER_GCC FALSE) set(RKCOMMON_COMPILER_CLANG FALSE) set(RKCOMMON_COMPILER_MSVC FALSE) set(RKCOMMON_COMPILER_DPCPP FALSE) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") set(RKCOMMON_COMPILER_ICC TRUE) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(RKCOMMON_COMPILER_GCC TRUE) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") set(RKCOMMON_COMPILER_CLANG TRUE) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") set(RKCOMMON_COMPILER_MSVC TRUE) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM") set(RKCOMMON_COMPILER_DPCPP TRUE) else() message(FATAL_ERROR "Unsupported compiler specified: '${CMAKE_CXX_COMPILER_ID}'") endif() if (WIN32 AND NOT RKCOMMON_COMPILER_MSVC) # workaround for https://gitlab.kitware.com/cmake/cmake/-/issues/18311 set(CMAKE_NINJA_CMCLDEPS_RC OFF) endif() # setting DEPENDENTLOADFLAG:LOAD_LIBRARY_SAFE_CURRENT_DIRS on rkcommon DLL if(WIN32) if(RKCOMMON_COMPILER_MSVC) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEPENDENTLOADFLAG:0x2000") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEPENDENTLOADFLAG:0x2000") elseif(RKCOMMON_COMPILER_DPCPP) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /Qoption,link,/DEPENDENTLOADFLAG:0x2000") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /Qoption,link,/DEPENDENTLOADFLAG:0x2000") else() message(WARNING "Unrecognized WIN32 compiler, DEPENDENTLOADFLAG can't be set") endif() endif() endmacro() ## Tasking System macros ## macro(rkcommon_configure_tasking_system) set(RKCOMMON_TASKING_SYSTEM TBB CACHE STRING "Per-node thread tasking system [TBB,OpenMP,Internal,Debug]") set_property(CACHE RKCOMMON_TASKING_SYSTEM PROPERTY STRINGS TBB OpenMP Internal Debug) # NOTE(jda) - Make the RKCOMMON_TASKING_SYSTEM build option case-insensitive string(TOUPPER ${RKCOMMON_TASKING_SYSTEM} RKCOMMON_TASKING_SYSTEM_ID) set(RKCOMMON_TASKING_TBB FALSE) set(RKCOMMON_TASKING_OPENMP FALSE) set(RKCOMMON_TASKING_INTERNAL FALSE) set(RKCOMMON_TASKING_DEBUG FALSE) if(${RKCOMMON_TASKING_SYSTEM_ID} STREQUAL "TBB") set(RKCOMMON_TASKING_TBB TRUE) else() unset(TBB_INCLUDE_DIR CACHE) unset(TBB_LIBRARY CACHE) unset(TBB_LIBRARY_DEBUG CACHE) unset(TBB_LIBRARY_MALLOC CACHE) unset(TBB_LIBRARY_MALLOC_DEBUG CACHE) if(${RKCOMMON_TASKING_SYSTEM_ID} STREQUAL "OPENMP") set(RKCOMMON_TASKING_OPENMP TRUE) elseif(${RKCOMMON_TASKING_SYSTEM_ID} STREQUAL "INTERNAL") set(RKCOMMON_TASKING_INTERNAL TRUE) else() set(RKCOMMON_TASKING_DEBUG TRUE) endif() endif() endmacro() macro(rkcommon_create_tasking_target FROM_INSTALL) set(CMAKE_THREAD_PREFER_PTHREAD TRUE) set(THREADS_PREFER_PTHREAD_FLAG TRUE) find_package(Threads REQUIRED) set(RKCOMMON_TASKING_LIBS ${CMAKE_THREAD_LIBS_INIT}) if(RKCOMMON_TASKING_TBB) if(POLICY CMP0074) # Our FindTBB script uses TBB_ROOT, which is the NEW behaviour for # CMP0074. 
cmake_policy(SET CMP0074 NEW) endif() if (DEFINED RKCOMMON_TBB_ROOT AND NOT RKCOMMON_TBB_ROOT STREQUAL "") set(TBB_FIND_PACKAGE_OPTION "ONLY_CMAKE_FIND_ROOT_PATH") set(CMAKE_FIND_ROOT_PATH ${RKCOMMON_TBB_ROOT}) set(TBB_ROOT ${RKCOMMON_TBB_ROOT}) list(APPEND CMAKE_PREFIX_PATH ${RKCOMMON_TBB_ROOT}) endif() # Try getting TBB via config first find_package(TBB 2021.1 QUIET COMPONENTS tbb tbbmalloc CONFIG ${TBB_FIND_PACKAGE_OPTION}) if (TBB_FOUND) list(APPEND RKCOMMON_TASKING_LIBS TBB::tbb TBB::tbbmalloc) set(RKCOMMON_TASKING_DEFINITIONS RKCOMMON_TASKING_TBB) else() # If not found try getting older TBB via module (FindTBB.cmake) unset(TBB_DIR CACHE) if (${FROM_INSTALL}) find_dependency_39(TBB 4.4 REQUIRED tbb tbbmalloc) else() find_package(TBB 4.4 REQUIRED tbb tbbmalloc) endif() if (TBB_FOUND) list(APPEND RKCOMMON_TASKING_LIBS TBB) set(RKCOMMON_TASKING_DEFINITIONS RKCOMMON_TASKING_TBB) endif() endif() elseif(RKCOMMON_TASKING_OPENMP) find_dependency_39(OpenMP) if (OPENMP_FOUND) list(APPEND RKCOMMON_TASKING_LIBS OpenMP::OpenMP_CXX) set(RKCOMMON_TASKING_DEFINITIONS RKCOMMON_TASKING_OMP) endif() elseif(RKCOMMON_TASKING_INTERNAL) set(RKCOMMON_TASKING_DEFINITIONS RKCOMMON_TASKING_INTERNAL) else()#Debug # Do nothing, will fall back to scalar code (useful for debugging) endif() if (NOT TARGET rkcommon_tasking) add_library(rkcommon_tasking INTERFACE IMPORTED) set_target_properties(rkcommon_tasking PROPERTIES INTERFACE_LINK_LIBRARIES "${RKCOMMON_TASKING_LIBS}" INTERFACE_COMPILE_DEFINITIONS "${RKCOMMON_TASKING_DEFINITIONS}" ) endif() endmacro() RenderKit-rkcommon-988718e/cmake/rkcommon_redist_deps.cmake000066400000000000000000000025661467524601100240360ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 if (WIN32 AND RKCOMMON_TASKING_TBB) if (NOT TBB_ARCH) if (CMAKE_SIZEOF_VOID_P EQUAL 8) set(TBB_ARCH intel64) else() set(TBB_ARCH ia32) endif() endif() set(TBB_DLL_HINTS HINTS ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}_win/tbb/vc14 ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}_win/tbb/vc14 ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}/tbb/vc14 ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}/tbb/vc14 ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}/vc14 ${RKCOMMON_TBB_ROOT}/redist/${TBB_ARCH}/vc14 ${RKCOMMON_TBB_ROOT}/bin/${TBB_ARCH}/vc14 ${RKCOMMON_TBB_ROOT}/bin ) find_file(TBB_DLL NAMES tbb12.dll tbb.dll ${TBB_DLL_HINTS}) find_file(TBB_DLL_DEBUG NAMES tbb12_debug.dll tbb_debug.dll ${TBB_DLL_HINTS}) find_file(TBB_DLL_MALLOC tbbmalloc.dll ${TBB_DLL_HINTS}) find_file(TBB_DLL_MALLOC_DEBUG tbbmalloc_debug.dll ${TBB_DLL_HINTS}) mark_as_advanced(TBB_DLL) mark_as_advanced(TBB_DLL_DEBUG) mark_as_advanced(TBB_DLL_MALLOC) mark_as_advanced(TBB_DLL_MALLOC_DEBUG) install(PROGRAMS ${TBB_DLL} ${TBB_DLL_MALLOC} DESTINATION ${CMAKE_INSTALL_BINDIR} CONFIGURATIONS Release RelWithDebInfo) install(PROGRAMS ${TBB_DLL_DEBUG} ${TBB_DLL_MALLOC_DEBUG} DESTINATION ${CMAKE_INSTALL_BINDIR} CONFIGURATIONS Debug) endif() RenderKit-rkcommon-988718e/rkcommon/000077500000000000000000000000001467524601100173565ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/CMakeLists.txt000066400000000000000000000051161467524601100221210ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 if (RKCOMMON_TASKING_INTERNAL) set(EXTRA_TASKING_SOURCES tasking/detail/enkiTS/TaskScheduler.cpp tasking/detail/TaskSys.cpp ) endif() add_library(${PROJECT_NAME} ${RKCOMMON_RESOURCE} common.cpp memory/malloc.cpp networking/DataStreaming.cpp networking/Fabric.cpp 
os/FileName.cpp os/library.cpp tasking/detail/tasking_system_init.cpp ${EXTRA_TASKING_SOURCES} utility/demangle.cpp utility/ParameterizedObject.cpp utility/PseudoURL.cpp utility/TimeStamp.cpp xml/XML.cpp tracing/Tracing.cpp ) target_link_libraries(${PROJECT_NAME} PUBLIC rkcommon_tasking ${CMAKE_DL_LIBS} $<${RKCOMMON_PLATFORM_WIN}:ws2_32> ) target_include_directories(${PROJECT_NAME} PUBLIC $ $ $ PRIVATE ${CMAKE_CURRENT_LIST_DIR} ) if (RKCOMMON_TASKING_INTERNAL) target_compile_definitions(${PROJECT_NAME} PRIVATE -DENKITS_BUILD_DLL) endif() if (RKCOMMON_ADDRSAN) target_compile_definitions(${PROJECT_NAME} PUBLIC -DRKCOMMON_ADDRSAN) endif() if (RKCOMMON_NO_SIMD) target_compile_definitions(${PROJECT_NAME} PUBLIC -DRKCOMMON_NO_SIMD) endif() if (RKCOMMON_WARN_AS_ERRORS) if(MSVC) target_compile_options(${PROJECT_NAME} PRIVATE /WX) else() target_compile_options(${PROJECT_NAME} PRIVATE -Werror) endif() endif() set_property(TARGET ${PROJECT_NAME} PROPERTY POSITION_INDEPENDENT_CODE ON) ## Install library + targets ################################################## set_target_properties(${PROJECT_NAME} PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR}) install(TARGETS ${PROJECT_NAME} EXPORT rkcommon_Exports LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} NAMELINK_SKIP # on Windows put the dlls into bin RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} # ... and the import lib into the devel package ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) install(EXPORT rkcommon_Exports DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rkcommon-${PROJECT_VERSION} NAMESPACE rkcommon:: ) install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} NAMELINK_ONLY RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) ## Install headers ############################################################ install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN *.h PATTERN *.inl PATTERN *.hpp PATTERN *.ih ) RenderKit-rkcommon-988718e/rkcommon/array3D/000077500000000000000000000000001467524601100206635ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/array3D/Array3D.h000066400000000000000000000300361467524601100223030ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // ospray #include #include #include "../common.h" #include "../math/range.h" #include "for_each.h" namespace rkcommon { namespace array3D { /*! ABSTRACTION for a 3D array of data */ template struct Array3D { virtual ~Array3D() = default; /*! return size (ie, "dimensions") of volume */ virtual vec3i size() const = 0; /*! get cell value at given location, but ensure that location is actually a valid cell ID inside the volume (clamps to nearest cell in volume if 'where' is outside) */ virtual value_t get(const vec3i &where) const = 0; /*! get the range/interval of all cell values in the given begin/end region of the volume */ range_t getValueRange(const vec3i &begin, const vec3i &end) const { range_t v = get(begin); for_each(begin, end, [&](const vec3i &idx) { v.extend(get(idx)); }); return v; } /*! get value range over entire volume */ range_t getValueRange() const { return getValueRange(vec3i(0), size()); } /*! returns number of elements (as 64-bit int) across all dimensions */ virtual size_t numElements() const = 0; }; /*! 
implementation for an actual array3d that stores a 3D array of values */ template struct ActualArray3D : public Array3D { ActualArray3D(const vec3i &dims, void *externalMem = nullptr); ~ActualArray3D() override { if (valuesAreMine) delete[] value; } /*! return size (ie, "dimensions") of volume */ vec3i size() const override; /*! get cell value at location \warning 'where' MUST be a valid cell location */ value_t get(const vec3i &where) const override; /*! set cell value at location to given value \warning 'where' MUST be a valid cell location */ void set(const vec3i &where, const value_t &t); void clear(const value_t &t); /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override; /* compute the (1D) linear array index for a (3D) grid coordinate */ size_t indexOf(const vec3i &pos) const { return pos.x + size_t(dims.x) * (pos.y + size_t(dims.y) * pos.z); } const vec3i dims; value_t *value; // bool that specified whether it was us that alloc'ed this mem, // and thus, whether we should free it upon termination. bool valuesAreMine; }; /*! shifts another array3d by a given amount */ template struct IndexShiftedArray3D : public Array3D { IndexShiftedArray3D(std::shared_ptr> _actual, const vec3i &_shift) : actual(_actual), shift(_shift) { } /*! return size (ie, "dimensions") of volume */ vec3i size() const override { return actual->size(); } /*! get cell value at location \warning 'where' MUST be a valid cell location */ value_t get(const vec3i &where) const override { return actual->get((where + size() + shift) % size()); } /*! set cell value at location to given value \warning 'where' MUST be a valid cell location */ void set(const vec3i &, const value_t &) { throw std::runtime_error("cannot 'set' in a IndexShiftArray3D"); } /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override { return actual->numElements(); } const vec3i shift; const std::shared_ptr> actual; }; /*! implemnetaiton of a wrapper class that makes an actual array3d of one type look like that of another type */ template struct Array3DAccessor : public Array3D { Array3DAccessor(std::shared_ptr> actual); /*! return size (ie, "dimensions") of volume */ vec3i size() const override; /*! get cell value at location \warning 'where' MUST be a valid cell location */ out_t get(const vec3i &where) const override; /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override; private: //! the actual 3D array we're wrapping around const std::shared_ptr> actual; }; /*! wrapper class that generates an artificially larger data set by simply repeating the given input */ template struct Array3DRepeater : public Array3D { Array3DRepeater(const std::shared_ptr> &actual, const vec3i &repeatedSize); /*! return size (ie, "dimensions") of volume */ vec3i size() const override; /*! get cell value at location \warning 'where' MUST be a valid cell location */ T get(const vec3i &where) const override; /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override; const vec3i repeatedSize; const std::shared_ptr> actual; }; /*! 
implements a sub-set of another array3d */ template struct SubBoxArray3D : public Array3D { SubBoxArray3D(const std::shared_ptr> &actual, const box3i &clipBox) : clipBox(clipBox), actual(actual) { assert(actual); assert(clipBox.upper.x <= actual->size().x); assert(clipBox.upper.y <= actual->size().y); assert(clipBox.upper.z <= actual->size().z); } /*! return size (ie, "dimensions") of volume */ vec3i size() const override { return clipBox.size(); } /*! get cell value at location \warning 'where' MUST be a valid cell location */ value_t get(const vec3i &where) const override { return actual->get(where + clipBox.lower); } /*! set cell value at location to given value \warning 'where' MUST be a valid cell location */ void set(const vec3i &, const value_t &) { throw std::runtime_error("cannot 'set' in a SubBoxArray3D"); } /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override { vec3i dims = clipBox.size(); return size_t(dims.x) * size_t(dims.y) * size_t(dims.z); } const box3i clipBox; const std::shared_ptr> actual; }; /*! implements a array3d that's composed of multiple individual slices */ template struct MultiSliceArray3D : public Array3D { MultiSliceArray3D( const std::vector>> &slice) : slice(slice) { } /*! return size (ie, "dimensions") of volume */ vec3i size() const override { return vec3i(slice[0]->size().x, slice[0]->size().y, slice.size()); } /*! get cell value at location \warning 'where' MUST be a valid cell location */ value_t get(const vec3i &where) const override { return slice[clamp(where.z, 0, (int)slice.size() - 1)]->get( vec3i(where.x, where.y, 0)); } /*! set cell value at location to given value \warning 'where' MUST be a valid cell location */ void set(const vec3i &, const value_t &) { throw std::runtime_error("cannot 'set' in a MultiSliceArray3D"); } /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override { return slice[0]->numElements() * slice.size(); } const std::vector>> slice; }; #ifndef _WIN32 /*! load raw file with given dimensions. the 'type' of the raw file (uint8,float,...) is given through the function's template parameter */ template std::shared_ptr> RKCOMMON_INTERFACE loadRAW(const std::string &fileName, const vec3i &dims); /*! load raw file with given dimensions. the 'type' of the raw file (uint8,float,...) 
is given through the function's template parameter */ template std::shared_ptr> RKCOMMON_INTERFACE mmapRAW(const std::string &fileName, const vec3i &dims); #endif // Inlined definitions //////////////////////////////////////////////////// // ActualArray3D // template inline vec3i ActualArray3D::size() const { return dims; } template inline T ActualArray3D::get(const vec3i &_where) const { assert(value != nullptr); const vec3i where = max(vec3i(0), min(_where, dims - vec3i(1))); size_t index = where.x + size_t(dims.x) * (where.y + size_t(dims.y) * (where.z)); assert(value); assert(index < numElements()); const T v = value[index]; return v; } template inline size_t ActualArray3D::numElements() const { return size_t(dims.x) * size_t(dims.y) * size_t(dims.z); } template inline ActualArray3D::ActualArray3D(const vec3i &dims, void *externalMem) : dims(dims), value((T *)externalMem), valuesAreMine(externalMem == nullptr) { try { if (!value) { const size_t numVoxels = longProduct(dims); value = new T[numVoxels]; } } catch (const std::bad_alloc &) { std::stringstream ss; ss << "could not allocate memory for Array3D of dimensions " << dims << " (in Array3D::Array3D())"; throw std::runtime_error(ss.str()); } } template inline void ActualArray3D::set(const vec3i &where, const T &t) { value[longIndex(where, size())] = t; } template inline void ActualArray3D::clear(const T &t) { for_each(size(), [&](const vec3i &idx) { set(idx, t); }); } // Array3DAccessor // template inline Array3DAccessor::Array3DAccessor( std::shared_ptr> actual) : actual(actual) { } template inline vec3i Array3DAccessor::size() const { return actual->size(); } template inline out_t Array3DAccessor::get(const vec3i &where) const { return (out_t)actual->get(where); } template inline size_t Array3DAccessor::numElements() const { assert(actual); return actual->numElements(); } // Array3DRepeater // template inline Array3DRepeater::Array3DRepeater( const std::shared_ptr> &actual, const vec3i &repeatedSize) : repeatedSize(repeatedSize), actual(actual) { } template inline vec3i Array3DRepeater::size() const { return repeatedSize; } template inline T Array3DRepeater::get(const vec3i &_where) const { vec3i where(_where.x % repeatedSize.x, _where.y % repeatedSize.y, _where.z % repeatedSize.z); if ((_where.x / repeatedSize.x) % 2) where.x = repeatedSize.x - 1 - where.x; if ((_where.y / repeatedSize.y) % 2) where.y = repeatedSize.y - 1 - where.y; if ((_where.z / repeatedSize.z) % 2) where.z = repeatedSize.z - 1 - where.z; return actual->get(where); } template inline size_t Array3DRepeater::numElements() const { return size_t(repeatedSize.x) * size_t(repeatedSize.y) * size_t(repeatedSize.z); } } // namespace array3D } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/array3D/for_each.h000066400000000000000000000043131467524601100226030ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../math/box.h" /*! \file array3D/for_each Helper templates to do 3D iterations via lambda functions */ namespace rkcommon { namespace array3D { using namespace rkcommon::math; /*! compute - in 64 bit - the number of voxels in a vec3i */ inline size_t longProduct(const vec3i &dims) { return dims.x * size_t(dims.y) * dims.z; } /*! compute - in 64 bit - the linear array index of vec3i index in a vec3i sized array */ inline size_t longIndex(const vec3i &idx, const vec3i &dims) { return idx.x + size_t(dims.x) * (idx.y + size_t(dims.y) * idx.z); } /*! 
reverse mapping form a cell index to the cell coordinates, for a given volume size 'dims' */ inline vec3i coordsOf(const size_t idx, const vec3i &dims) { return vec3i( idx % dims.x, (idx / dims.x) % dims.y, (idx / dims.x) / dims.y); } /*! iterate through all indices in [lower,upper), EXCLUSING the 'upper' value */ template inline void for_each(const vec3i &lower, const vec3i &upper, Functor &&functor) { for (int iz = lower.z; iz < upper.z; iz++) for (int iy = lower.y; iy < upper.y; iy++) for (int ix = lower.x; ix < upper.x; ix++) functor(vec3i(ix, iy, iz)); } /*! a template that calls the given functor (typically a lambda) for every vec3i(ix,iy,iz) with 0<=ixsize(),[&](const vec3i &idx){ doSomeThing(volume,index); }); */ template inline void for_each(const vec3i &size, Functor &&functor) { for_each({0, 0, 0}, size, std::forward(functor)); } /*! iterate through all indices in [lower,upper), EXCLUSING the 'upper' value */ template inline void for_each(const box3i &coords, Functor &&functor) { for_each(coords.lower, coords.upper, std::forward(functor)); } } // namespace array3D } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/common.cpp000066400000000000000000000053531467524601100213600ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "common.h" #include "os/library.h" #include namespace rkcommon { void removeArgs(int &ac, const char **&av, int where, int howMany) { for (int i = where + howMany; i < ac; i++) av[i - howMany] = av[i]; ac -= howMany; } void loadLibrary(const void *anchorAddress, const std::string &name, const std::vector &version) { LibraryRepository::getInstance()->add(anchorAddress, name, version); } void unloadLibrary(const std::string &name) { LibraryRepository::getInstance()->remove(name); } void *getSymbol(const std::string &name) { return LibraryRepository::getInstance()->getSymbol(name); } #ifdef _WIN32 #define osp_snprintf sprintf_s #else #define osp_snprintf snprintf #endif std::string prettyDouble(double val) { const double absVal = std::abs(val); char result[1000]; if (absVal >= 1e+15f) osp_snprintf(result, 1000, "%.1f%c", val / 1e18f, 'E'); else if (absVal >= 1e+15f) osp_snprintf(result, 1000, "%.1f%c", val / 1e15f, 'P'); else if (absVal >= 1e+12f) osp_snprintf(result, 1000, "%.1f%c", val / 1e12f, 'T'); else if (absVal >= 1e+09f) osp_snprintf(result, 1000, "%.1f%c", val / 1e09f, 'G'); else if (absVal >= 1e+06f) osp_snprintf(result, 1000, "%.1f%c", val / 1e06f, 'M'); else if (absVal >= 1e+03f) osp_snprintf(result, 1000, "%.1f%c", val / 1e03f, 'k'); else if (absVal <= 1e-12f) osp_snprintf(result, 1000, "%.1f%c", val * 1e15f, 'f'); else if (absVal <= 1e-09f) osp_snprintf(result, 1000, "%.1f%c", val * 1e12f, 'p'); else if (absVal <= 1e-06f) osp_snprintf(result, 1000, "%.1f%c", val * 1e09f, 'n'); else if (absVal <= 1e-03f) osp_snprintf(result, 1000, "%.1f%c", val * 1e06f, 'u'); else if (absVal <= 1e-00f) osp_snprintf(result, 1000, "%.1f%c", val * 1e03f, 'm'); else osp_snprintf(result, 1000, "%f", (float)val); return result; } std::string prettyNumber(size_t s) { const double val = s; char result[1000]; if (val >= 1e+15f) osp_snprintf(result, 1000, "%.1f%c", val / 1e18f, 'E'); else if (val >= 1e+15f) osp_snprintf(result, 1000, "%.1f%c", val / 1e15f, 'P'); else if (val >= 1e+12f) osp_snprintf(result, 1000, "%.1f%c", val / 1e12f, 'T'); else if (val >= 1e+09f) osp_snprintf(result, 1000, "%.1f%c", val / 1e09f, 'G'); else if (val >= 1e+06f) osp_snprintf(result, 1000, "%.1f%c", val / 1e06f, 'M'); else if 
(val >= 1e+03f) osp_snprintf(result, 1000, "%.1f%c", val / 1e03f, 'k'); else osp_snprintf(result, 1000, "%zu", s); return result; } #undef osp_snprintf } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/common.h000066400000000000000000000035571467524601100210310ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // rkcommon #include "platform.h" // std #include #include #ifdef _WIN32 // ----------- windows only ----------- typedef unsigned long long id_t; #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif #include #ifdef _M_X64 typedef long long ssize_t; #else typedef int ssize_t; #endif #else // ----------- NOT windows ----------- #include "unistd.h" #endif #ifdef _WIN32 #ifdef rkcommon_EXPORTS #define RKCOMMON_INTERFACE __declspec(dllexport) #else #define RKCOMMON_INTERFACE __declspec(dllimport) #endif #else #define RKCOMMON_INTERFACE #endif #ifdef _WIN32 #define __PRETTY_FUNCTION__ __FUNCSIG__ #endif namespace rkcommon { using byte_t = unsigned char; /*! remove specified num arguments from an ac/av arglist */ RKCOMMON_INTERFACE void removeArgs(int &ac, const char **&av, int where, int howMany); // anchorAddress = nullptr will disable anchored loads RKCOMMON_INTERFACE void loadLibrary( const void *anchorAddress, const std::string &name, const std::vector &version = {}); RKCOMMON_INTERFACE void unloadLibrary(const std::string &name); RKCOMMON_INTERFACE void *getSymbol(const std::string &name); RKCOMMON_INTERFACE std::string prettyDouble(double x); RKCOMMON_INTERFACE std::string prettyNumber(size_t x); // NOTE(jda) - Implement make_unique() as it didn't show up until C++14... template inline std::unique_ptr make_unique(Args &&... args) { return std::unique_ptr(new T(std::forward(args)...)); } template T *getDataSafe(std::vector &v) { return v.empty() ? nullptr : v.data(); } } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/containers/000077500000000000000000000000001467524601100215235ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/containers/AlignedVector.h000066400000000000000000000005131467524601100244210ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include "aligned_allocator.h" namespace rkcommon { namespace containers { template using AlignedVector = std::vector>; } // namespace containers } // namespace rkcommonRenderKit-rkcommon-988718e/rkcommon/containers/FlatMap.h000066400000000000000000000170661467524601100232320ustar00rootroot00000000000000// Copyright 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include namespace rkcommon { namespace containers { // A small map data structure with a similar interface to std::map<>, but // uses an underlying std::vector<> to store the key/value pairs instead of // a tree. This makes lookups O(n), but inserts are O(1) and it is sortable // like an array to enable things like std::binary_search() on either the // keys or values. 
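//
// Minimal usage sketch (illustrative only; it assumes the usual
// FlatMap<KEY, VALUE> template parameters declared below, and the
// <string>/<iostream> headers are needed only for this example):
//
//   rkcommon::containers::FlatMap<std::string, int> settings;
//   settings["spp"] = 16;               // O(1) append when the key is new
//   if (settings.contains("spp"))
//     std::cout << settings.at("spp");  // O(n) lookup; at() throws if absent
//   for (const auto &kv : settings)     // iterates in insertion order
//     std::cout << kv.first << " = " << kv.second << '\n';
//   settings.erase("spp");              // erases the key, preserves relative order
//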
template struct FlatMap { using item_t = std::pair; using storage_t = std::vector; using iterator_t = decltype(std::declval().begin()); using citerator_t = decltype(std::declval().cbegin()); using riterator_t = decltype(std::declval().rbegin()); using criterator_t = decltype(std::declval().crbegin()); FlatMap() = default; ~FlatMap() = default; // Key-based lookups // VALUE &at(const KEY &key); const VALUE &at(const KEY &key) const; VALUE &operator[](const KEY &key); const VALUE &operator[](const KEY &key) const; // Index-based lookups // item_t &at_index(size_t index); const item_t &at_index(size_t index) const; // Property queries // size_t size() const; size_t empty() const; bool contains(const KEY &key) const; // Storage mutation // void erase(const KEY &key); void clear(); void reserve(size_t size); // Iterators // iterator_t begin(); citerator_t begin() const; citerator_t cbegin() const; iterator_t end(); citerator_t end() const; citerator_t cend() const; riterator_t rbegin(); criterator_t rbegin() const; criterator_t crbegin() const; riterator_t rend(); criterator_t rend() const; criterator_t crend() const; private: // Helpers // iterator_t lookup(const KEY &key); citerator_t lookup(const KEY &key) const; // Data // storage_t values; }; // Inlined definitions //////////////////////////////////////////////////// template inline VALUE &FlatMap::at(const KEY &key) { auto itr = lookup(key); if (itr == values.end()) throw std::out_of_range("key wasn't found in FlatMap<>"); return itr->second; } template inline const VALUE &FlatMap::at(const KEY &key) const { auto itr = lookup(key); if (itr == values.end()) throw std::out_of_range("key wasn't found in FlatMap<>"); return itr->second; } template inline VALUE &FlatMap::operator[](const KEY &key) { auto itr = lookup(key); if (itr == values.end()) { values.push_back(std::make_pair(key, VALUE())); return values.back().second; } else { return itr->second; } } template inline const VALUE &FlatMap::operator[](const KEY &key) const { auto itr = lookup(key); if (itr == values.end()) { values.push_back(std::make_pair(key, VALUE())); return values.back().second; } else { return itr->second; } } template inline typename FlatMap::item_t &FlatMap::at_index( size_t index) { return values.at(index); } template inline const typename FlatMap::item_t & FlatMap::at_index(size_t index) const { return values.at(index); } template inline size_t FlatMap::size() const { return values.size(); } template inline size_t FlatMap::empty() const { return values.empty(); } template inline bool FlatMap::contains(const KEY &key) const { return lookup(key) != values.cend(); } template inline void FlatMap::erase(const KEY &key) { auto itr = std::stable_partition( values.begin(), values.end(), [&](const item_t &i) { return i.first != key; }); values.resize(std::distance(values.begin(), itr)); } template inline void FlatMap::clear() { values.clear(); } template inline void FlatMap::reserve(size_t size) { return values.reserve(size); } // Iterators // template inline typename FlatMap::iterator_t FlatMap::begin() { return values.begin(); } template inline typename FlatMap::citerator_t FlatMap::begin() const { return cbegin(); } template inline typename FlatMap::citerator_t FlatMap::cbegin() const { return values.cbegin(); } template inline typename FlatMap::iterator_t FlatMap::end() { return values.end(); } template inline typename FlatMap::citerator_t FlatMap::end() const { return cend(); } template inline typename FlatMap::citerator_t FlatMap::cend() const { return 
values.cend(); } template inline typename FlatMap::riterator_t FlatMap::rbegin() { return values.rbegin(); } template inline typename FlatMap::criterator_t FlatMap::rbegin() const { return crbegin(); } template inline typename FlatMap::criterator_t FlatMap::crbegin() const { return values.crbegin(); } template inline typename FlatMap::riterator_t FlatMap::rend() { return values.rend(); } template inline typename FlatMap::criterator_t FlatMap::rend() const { return crend(); } template inline typename FlatMap::criterator_t FlatMap::crend() const { return values.crend(); } // Helper functions // template inline typename FlatMap::iterator_t FlatMap::lookup( const KEY &key) { return std::find_if(values.begin(), values.end(), [&](item_t &item) { return item.first == key; }); } template inline typename FlatMap::citerator_t FlatMap::lookup(const KEY &key) const { return std::find_if( values.cbegin(), values.cend(), [&](const item_t &item) { return item.first == key; }); } } // namespace containers } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/containers/TransactionalBuffer.h000066400000000000000000000034321467524601100256320ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include namespace rkcommon { namespace containers { template struct TransactionalBuffer { TransactionalBuffer() = default; // Insert into the buffer (producer) void push_back(const T &); void push_back(T &&); // Take all contents of the buffer (consumer) std::vector consume(); size_t size() const; bool empty() const; private: // Data members // std::vector buffer; mutable std::mutex bufferMutex; // NOTE(jda) - Marked mutable so 'const' // methods can take the lock... }; // Inlined members //////////////////////////////////////////////////////// template inline void TransactionalBuffer::push_back(const T &v) { std::lock_guard lock(bufferMutex); buffer.push_back(v); } template inline void TransactionalBuffer::push_back(T &&v) { std::lock_guard lock(bufferMutex); buffer.push_back(std::forward(v)); } template inline std::vector TransactionalBuffer::consume() { std::lock_guard lock(bufferMutex); return std::move(buffer); } template inline size_t TransactionalBuffer::size() const { std::lock_guard lock(bufferMutex); return buffer.size(); } template inline bool TransactionalBuffer::empty() const { std::lock_guard lock(bufferMutex); return buffer.empty(); } } // namespace containers } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/containers/aligned_allocator.h000066400000000000000000000116141467524601100253420ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include // Required for size_t and ptrdiff_t and nullptr #include // Required for placement new and std::bad_alloc #include // Required for std::length_error #include "../memory/malloc.h" namespace rkcommon { namespace containers { // NOTE(jda) - aligned_allocator implementation loosely based off of Stephen // T. 
Lavavej's "Mallocator" example: // // https://blogs.msdn.microsoft.com/vcblog/2008/08/28/the-mallocator/ #define OSPRAY_DEFAULT_ALIGNMENT 64 template struct aligned_allocator { // Compile-time info // using pointer = T *; using const_pointer = const T *; using reference = T &; using const_reference = const T &; using value_type = T; using size_type = size_t; using difference_type = ptrdiff_t; template struct rebind { using other = aligned_allocator; }; // Implementation // aligned_allocator() = default; aligned_allocator(const aligned_allocator &) = default; ~aligned_allocator() = default; aligned_allocator &operator=(const aligned_allocator &) = delete; template aligned_allocator(const aligned_allocator &); template aligned_allocator &operator=(const aligned_allocator &); T *address(T &r) const; const T *address(const T &s) const; size_t max_size() const; bool operator!=(const aligned_allocator &other) const; void construct(T *const p, const T &t) const; void destroy(T *const p) const; // Returns true if and only if storage allocated from *this // can be deallocated from other, and vice versa. // Always returns true for stateless allocators. bool operator==(const aligned_allocator &) const; // The following will be different for each allocator. T *allocate(const size_t n) const; void deallocate(T *const p, const size_t n) const; template T *allocate(const size_t n, const U * /* const hint */) const; }; // Inlined member definitions ///////////////////////////////////////////// template template aligned_allocator::aligned_allocator(const aligned_allocator &) { } template template aligned_allocator &aligned_allocator::operator=( const aligned_allocator &) { } template inline T *aligned_allocator::address(T &r) const { return &r; } template inline const T *aligned_allocator::address(const T &s) const { return &s; } template inline size_t aligned_allocator::max_size() const { // The following has been carefully written to be independent of // the definition of size_t and to avoid signed/unsigned warnings. 
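// (The unsigned wrap-around of 0 - 1 yields the largest representable
// size_t value; dividing that byte count by sizeof(T) bounds the number
// of elements this allocator can ever be asked for.)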
return (static_cast(0) - static_cast(1)) / sizeof(T); } template inline bool aligned_allocator::operator!=( const aligned_allocator &other) const { return !(*this == other); } template inline void aligned_allocator::construct(T *const p, const T &t) const { void *const pv = static_cast(p); new (pv) T(t); } template inline bool aligned_allocator::operator==( const aligned_allocator &) const { return true; } template inline T *aligned_allocator::allocate(const size_t n) const { if (n == 0) return nullptr; if (n > max_size()) { throw std::length_error( "aligned_allocator::allocate() – Integer overflow."); } void *const pv = memory::alignedMalloc(n * sizeof(T), A); if (pv == nullptr) throw std::bad_alloc(); return static_cast(pv); } template inline void aligned_allocator::deallocate(T *const p, const size_t) const { memory::alignedFree(p); } template template inline T *aligned_allocator::allocate(const size_t n, const U *) const { return allocate(n); } template inline void aligned_allocator::destroy(T *const p) const { p->~T(); } } // namespace containers } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/math/000077500000000000000000000000001467524601100203075ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/math/AffineSpace.h000066400000000000000000000237011467524601100226270ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "LinearSpace.h" #include "box.h" namespace rkcommon { namespace math { #define VectorT typename L::Vector #define ScalarT typename L::Vector::Scalar /////////////////////////////////////////////////////////////////////////// // Affine Space /////////////////////////////////////////////////////////////////////////// template struct AffineSpaceT { L l; /*< linear part of affine space */ VectorT p; /*< affine part of affine space */ ///////////////////////////////////////////////////////////////////////// // Constructors, Assignment, Cast, Copy Operations ///////////////////////////////////////////////////////////////////////// inline AffineSpaceT() = default; inline AffineSpaceT(const AffineSpaceT &other) { l = other.l; p = other.p; } inline AffineSpaceT(const L &other) { l = other; p = VectorT(zero); } inline AffineSpaceT &operator=(const AffineSpaceT &other) { l = other.l; p = other.p; return *this; } inline AffineSpaceT(const VectorT &vx, const VectorT &vy, const VectorT &vz, const VectorT &p) : l(vx, vy, vz), p(p) { } inline AffineSpaceT(const L &l, const VectorT &p) : l(l), p(p) {} template inline AffineSpaceT(const AffineSpaceT &s) : l(s.l), p(s.p) { } inline operator L*() { return static_cast(&l); } inline operator const L*() const { return static_cast(&l); } ///////////////////////////////////////////////////////////////////////// // Constants ///////////////////////////////////////////////////////////////////////// inline AffineSpaceT(ZeroTy) : l(zero), p(zero) {} inline AffineSpaceT(OneTy) : l(one), p(zero) {} /*! return matrix for scaling */ static inline AffineSpaceT scale(const VectorT &s) { return L::scale(s); } /*! return matrix for translation */ static inline AffineSpaceT translate(const VectorT &p) { return AffineSpaceT(one, p); } /*! return matrix for rotation, only in 2D */ static inline AffineSpaceT rotate(const ScalarT &r) { return L::rotate(r); } /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ static inline AffineSpaceT rotate(const VectorT &u, const ScalarT &r) { return L::rotate(u, r); } /*! 
return matrix for rotation quaternion, only in 3D */ static inline AffineSpaceT rotate(const QuaternionT &q) { return L(q); } /*! return matrix for rotation around arbitrary axis and point, only in 3D */ static inline AffineSpaceT rotate(const VectorT &p, const VectorT &u, const ScalarT &r) { return translate(+p) * rotate(u, r) * translate(-p); } /*! return matrix for rotation with quaternion around point, only in 3D */ static inline AffineSpaceT rotate(const VectorT &p, const QuaternionT &q) { return translate(+p) * L(q) * translate(-p); } /*! return matrix for looking at given point, only in 3D; right-handed * coordinate system */ static inline AffineSpaceT lookat(const VectorT &eye, const VectorT &point, const VectorT &up) { VectorT Z = normalize(point - eye); VectorT U = normalize(cross(Z, up)); VectorT V = cross(U, Z); return AffineSpaceT(L(U, V, Z), eye); } }; /////////////////////////////////////////////////////////////////////////// // Unary Operators /////////////////////////////////////////////////////////////////////////// template inline AffineSpaceT operator-(const AffineSpaceT &a) { return AffineSpaceT(-a.l, -a.p); } template inline AffineSpaceT operator+(const AffineSpaceT &a) { return AffineSpaceT(+a.l, +a.p); } template inline AffineSpaceT rcp(const AffineSpaceT &a) { L il = rcp(a.l); return AffineSpaceT(il, -(il * a.p)); } /////////////////////////////////////////////////////////////////////////// // Binary Operators /////////////////////////////////////////////////////////////////////////// template inline AffineSpaceT operator+(const AffineSpaceT &a, const AffineSpaceT &b) { return AffineSpaceT(a.l + b.l, a.p + b.p); } template inline AffineSpaceT operator-(const AffineSpaceT &a, const AffineSpaceT &b) { return AffineSpaceT(a.l - b.l, a.p - b.p); } template inline AffineSpaceT operator*(const ScalarT &a, const AffineSpaceT &b) { return AffineSpaceT(a * b.l, a * b.p); } template inline AffineSpaceT operator*(const AffineSpaceT &a, const AffineSpaceT &b) { return AffineSpaceT(a.l * b.l, a.l * b.p + a.p); } template inline AffineSpaceT operator/(const AffineSpaceT &a, const AffineSpaceT &b) { return a * rcp(b); } template inline AffineSpaceT operator/(const AffineSpaceT &a, const ScalarT &b) { return a * rcp(b); } template inline AffineSpaceT &operator*=(AffineSpaceT &a, const AffineSpaceT &b) { return a = a * b; } template inline AffineSpaceT &operator*=(AffineSpaceT &a, const ScalarT &b) { return a = a * b; } template inline AffineSpaceT &operator/=(AffineSpaceT &a, const AffineSpaceT &b) { return a = a / b; } template inline AffineSpaceT &operator/=(AffineSpaceT &a, const ScalarT &b) { return a = a / b; } template inline const VectorT xfmPoint(const AffineSpaceT &m, const VectorT &p) { return madd(VectorT(p.x), m.l.vx, madd(VectorT(p.y), m.l.vy, madd(VectorT(p.z), m.l.vz, m.p))); } template inline const VectorT xfmVector(const AffineSpaceT &m, const VectorT &v) { return xfmVector(m.l, v); } template inline const VectorT xfmNormal(const AffineSpaceT &m, const VectorT &n) { return xfmNormal(m.l, n); } template inline const box_t xfmBounds( const AffineSpaceT>> &m, const box_t &b) { box_t dst = empty; const vec_t p0(b.lower.x, b.lower.y, b.lower.z); dst.extend(xfmPoint(m, p0)); const vec_t p1(b.lower.x, b.lower.y, b.upper.z); dst.extend(xfmPoint(m, p1)); const vec_t p2(b.lower.x, b.upper.y, b.lower.z); dst.extend(xfmPoint(m, p2)); const vec_t p3(b.lower.x, b.upper.y, b.upper.z); dst.extend(xfmPoint(m, p3)); const vec_t p4(b.upper.x, b.lower.y, b.lower.z); 
dst.extend(xfmPoint(m, p4)); const vec_t p5(b.upper.x, b.lower.y, b.upper.z); dst.extend(xfmPoint(m, p5)); const vec_t p6(b.upper.x, b.upper.y, b.lower.z); dst.extend(xfmPoint(m, p6)); const vec_t p7(b.upper.x, b.upper.y, b.upper.z); dst.extend(xfmPoint(m, p7)); return dst; } /////////////////////////////////////////////////////////////////////////// /// Comparison Operators /////////////////////////////////////////////////////////////////////////// template inline bool operator==(const AffineSpaceT &a, const AffineSpaceT &b) { return a.l == b.l && a.p == b.p; } template inline bool operator!=(const AffineSpaceT &a, const AffineSpaceT &b) { return a.l != b.l || a.p != b.p; } /////////////////////////////////////////////////////////////////////////// // Output Operators /////////////////////////////////////////////////////////////////////////// template inline std::ostream &operator<<(std::ostream &cout, const AffineSpaceT &m) { return cout << "{ l = " << m.l << ", p = " << m.p << " }"; } /////////////////////////////////////////////////////////////////////////// // Type Aliases /////////////////////////////////////////////////////////////////////////// using AffineSpace2f = AffineSpaceT; using AffineSpace3f = AffineSpaceT; using AffineSpace3fa = AffineSpaceT; using OrthonormalSpace3f = AffineSpaceT; using affine2f = AffineSpace2f; using affine3f = AffineSpace3f; /////////////////////////////////////////////////////////////////////////// /*! Template Specialization for 2D: return matrix for rotation around point * (rotation around arbitrarty vector is not meaningful in 2D) */ template <> inline AffineSpace2f AffineSpace2f::rotate(const vec2f &p, const float &r) { return translate(+p) * AffineSpace2f(LinearSpace2f::rotate(r)) * translate(-p); } #undef VectorT #undef ScalarT } // namespace math } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/math/AffineSpace.ih000066400000000000000000000213611467524601100230000ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "LinearSpace.ih" #ifndef ISPC namespace ispc { #endif // A Affine vector space; i.e., a Linear Space with a translation struct AffineSpace3f { LinearSpace3f l; vec3f p; #ifndef ISPC AffineSpace3f() = default; AffineSpace3f(const float v) : l(v), p(v) {} #endif }; // short-hand name for AffineSpace3f typedef AffineSpace3f affine3f; // create a new affine space from given basis vectors v{x,y,z} and translation p inline ISPC_UNIFORM AffineSpace3f make_AffineSpace3f( const ISPC_UNIFORM vec3f vx, const ISPC_UNIFORM vec3f vy, const ISPC_UNIFORM vec3f vz, const ISPC_UNIFORM vec3f p) { ISPC_UNIFORM AffineSpace3f xfm; xfm.l.vx = vx; xfm.l.vy = vy; xfm.l.vz = vz; xfm.p = p; return xfm; } inline ISPC_UNIFORM AffineSpace3f make_AffineSpace3f_identity() { return make_AffineSpace3f(make_vec3f(1.f, 0.f, 0.f), make_vec3f(0.f, 1.f, 0.f), make_vec3f(0.f, 0.f, 1.f), make_vec3f(0.f)); } inline ISPC_UNIFORM AffineSpace3f make_AffineSpace3f( const ISPC_UNIFORM LinearSpace3f l) { ISPC_UNIFORM AffineSpace3f xfm; xfm.l = l; xfm.p = make_vec3f(0, 0, 0); return xfm; } #define __define_transform(univary_r, univary_a, univary_v) \ /* apply given affine transformation to given _point_ v */ \ inline univary_r vec3f xfmPoint( \ const univary_a AffineSpace3f a, const univary_v vec3f v) \ { \ return a.p + xfmVector(a.l, v); \ } \ /* apply affine transform to given _vector_ v, i.e., _without_ the \ * translation */ \ inline univary_r vec3f xfmVector( \ const univary_a AffineSpace3f a, 
const univary_v vec3f v) \ { \ return xfmVector(a.l, v); \ } #ifdef ISPC __define_transform(uniform, uniform, uniform); __define_transform(varying, uniform, varying); __define_transform(varying, varying, varying); #else __define_transform(, , ); #endif #undef __define_transform #define __define_other(univary) \ inline univary AffineSpace3f make_AffineSpace3f( \ const univary LinearSpace3f l, const univary vec3f p) \ { \ univary AffineSpace3f xfm; \ xfm.l = l; \ xfm.p = p; \ return xfm; \ } \ inline univary AffineSpace3f operator+( \ const univary AffineSpace3f a, const univary AffineSpace3f b) \ { \ return make_AffineSpace3f(a.l + b.l, a.p + b.p); \ } \ inline univary AffineSpace3f operator-( \ const univary AffineSpace3f a, const univary AffineSpace3f b) \ { \ return make_AffineSpace3f(a.l - b.l, a.p - b.p); \ } \ inline univary AffineSpace3f operator*( \ const univary float a, const univary AffineSpace3f b) \ { \ return make_AffineSpace3f(a * b.l, a * b.p); \ } \ inline univary AffineSpace3f operator*( \ const univary AffineSpace3f a, const univary float b) \ { \ return make_AffineSpace3f(a.l * b, a.p * b); \ } \ inline univary AffineSpace3f operator*( \ const univary AffineSpace3f a, const univary AffineSpace3f b) \ { \ return make_AffineSpace3f(a.l * b.l, a.l * b.p + a.p); \ } \ inline univary AffineSpace3f neg(const univary AffineSpace3f a) \ { \ return make_AffineSpace3f(neg(a.l), neg(a.p)); \ } \ inline univary AffineSpace3f rcp(const univary AffineSpace3f a) \ { \ univary LinearSpace3f il = rcp(a.l); \ return make_AffineSpace3f(il, neg(il * a.p)); \ } #ifdef ISPC __define_other(uniform); __define_other(varying); #else __define_other(); #endif #undef __define_other //////////////////////////////////////////////////////////////////////////////// // Rudimentary 2D affine space, used for texture coordinate transformations //////////////////////////////////////////////////////////////////////////////// // A 2D Affine vector space; i.e., a Linear Space with a translation struct AffineSpace2f { LinearSpace2f l; vec2f p; #ifndef ISPC AffineSpace2f() = default; AffineSpace2f(const float v) : l(v), p(v) {} #endif }; // short-hand name for AffineSpace2f typedef AffineSpace2f affine2f; // create a new affine space from given basis vectors v{x,y,z} and translation p inline ISPC_UNIFORM AffineSpace2f make_AffineSpace2f( const ISPC_UNIFORM LinearSpace2f l, const ISPC_UNIFORM vec2f p) { ISPC_UNIFORM AffineSpace2f xfm; xfm.l = l; xfm.p = p; return xfm; } inline ISPC_UNIFORM AffineSpace2f make_AffineSpace2f( const ISPC_UNIFORM vec2f vx, const ISPC_UNIFORM vec2f vy, const ISPC_UNIFORM vec2f p) { return make_AffineSpace2f(make_LinearSpace2f(vx, vy), p); } inline ISPC_UNIFORM AffineSpace2f make_AffineSpace2f_identity() { return make_AffineSpace2f( make_vec2f(1.f, 0.f), make_vec2f(0.f, 1.f), make_vec2f(0.f)); } #define __define_transform2f(univary_r, univary_a, univary_v) \ inline univary_r vec2f operator*( \ const univary_a AffineSpace2f a, const univary_v vec2f v) \ { \ return a.p + xfmVector(a.l, v); \ } \ /* apply given affine transformation to given _point_ v */ \ inline univary_r vec2f xfmPoint( \ const univary_a AffineSpace2f a, const univary_v vec2f v) \ { \ return a.p + xfmVector(a.l, v); \ } \ /* apply affine transform to given _vector_ v, i.e., _without_ the \ * translation */ \ inline univary_r vec2f xfmVector( \ const univary_a AffineSpace2f a, const univary_v vec2f v) \ { \ return xfmVector(a.l, v); \ } #ifdef ISPC __define_transform2f(uniform, uniform, uniform); 
__define_transform2f(varying, uniform, varying); __define_transform2f(varying, varying, varying); #else __define_transform2f(, , ); #endif #undef __define_transform2f #ifndef ISPC } #endif RenderKit-rkcommon-988718e/rkcommon/math/LinearSpace.h000066400000000000000000000431661467524601100226600ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "Quaternion.h" #include "vec.h" namespace rkcommon { namespace math { /////////////////////////////////////////////////////////////////////////// // 2D Linear Transform (2x2 Matrix) /////////////////////////////////////////////////////////////////////////// template struct LinearSpace2 { using Vector = T; using Scalar = typename T::scalar_t; /*! default matrix constructor */ inline LinearSpace2() = default; inline LinearSpace2(const LinearSpace2 &other) { vx = other.vx; vy = other.vy; } inline LinearSpace2 &operator=(const LinearSpace2 &other) { vx = other.vx; vy = other.vy; return *this; } template inline LinearSpace2(const LinearSpace2 &s) : vx(s.vx), vy(s.vy) { } /*! matrix construction from column vectors */ inline LinearSpace2(const Vector &vx, const Vector &vy) : vx(vx), vy(vy) { } /*! matrix construction from row mayor data */ inline LinearSpace2(const Scalar &m00, const Scalar &m01, const Scalar &m10, const Scalar &m11) : vx(m00, m10), vy(m01, m11) { } /*! compute the determinant of the matrix */ inline const Scalar det() const { return vx.x * vy.y - vx.y * vy.x; } /*! compute adjoint matrix */ inline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y, -vy.x, -vx.y, vx.x); } /*! compute inverse matrix */ inline const LinearSpace2 inverse() const { return adjoint() / det(); } /*! compute transposed matrix */ inline const LinearSpace2 transposed() const { return LinearSpace2(vx.x, vx.y, vy.x, vy.y); } /*! returns first row of matrix */ inline const Vector row0() const { return Vector(vx.x, vy.x); } /*! returns second row of matrix */ inline const Vector row1() const { return Vector(vx.y, vy.y); } ///////////////////////////////////////////////////////////////////////// /// Constants ///////////////////////////////////////////////////////////////////////// inline LinearSpace2(ZeroTy) : vx(zero), vy(zero) {} inline LinearSpace2(OneTy) : vx(one, zero), vy(zero, one) {} /*! return matrix for scaling */ static inline LinearSpace2 scale(const Vector &s) { return LinearSpace2(s.x, 0, 0, s.y); } /*! return matrix for rotation */ static inline LinearSpace2 rotate(const Scalar &r) { Scalar s = sin(r), c = cos(r); return LinearSpace2(c, -s, s, c); } /*! return closest orthogonal matrix (i.e. a general rotation including * reflection) */ LinearSpace2 orthogonal() const { LinearSpace2 m = *this; // mirrored? Scalar mirror(one); if (m.det() < Scalar(zero)) { m.vx = -m.vx; mirror = -mirror; } // rotation for (int i = 0; i < 99; i++) { const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); const LinearSpace2 d = m_next - m; m = m_next; // norm^2 of difference small enough? if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) break; } // rotation * mirror_x return LinearSpace2(mirror * m.vx, m.vy); } inline operator Scalar*() { return static_cast(&vx); } inline operator const Scalar*() const { return static_cast(&vx); } public: /*! 
the column vectors of the matrix */ Vector vx, vy; }; /////////////////////////////////////////////////////////////////////////// // Unary Operators /////////////////////////////////////////////////////////////////////////// template inline LinearSpace2 operator-(const LinearSpace2 &a) { return LinearSpace2(-a.vx, -a.vy); } template inline LinearSpace2 operator+(const LinearSpace2 &a) { return LinearSpace2(+a.vx, +a.vy); } template inline LinearSpace2 rcp(const LinearSpace2 &a) { return a.inverse(); } /////////////////////////////////////////////////////////////////////////// // Binary Operators /////////////////////////////////////////////////////////////////////////// template inline LinearSpace2 operator+(const LinearSpace2 &a, const LinearSpace2 &b) { return LinearSpace2(a.vx + b.vx, a.vy + b.vy); } template inline LinearSpace2 operator-(const LinearSpace2 &a, const LinearSpace2 &b) { return LinearSpace2(a.vx - b.vx, a.vy - b.vy); } template inline LinearSpace2 operator*(const typename T::Scalar &a, const LinearSpace2 &b) { return LinearSpace2(a * b.vx, a * b.vy); } template inline T operator*(const LinearSpace2 &a, const T &b) { return b.x * a.vx + b.y * a.vy; } template inline LinearSpace2 operator*(const LinearSpace2 &a, const LinearSpace2 &b) { return LinearSpace2(a * b.vx, a * b.vy); } template inline LinearSpace2 operator/(const LinearSpace2 &a, const typename T::Scalar &b) { return LinearSpace2(a.vx / b, a.vy / b); } template inline LinearSpace2 operator/(const LinearSpace2 &a, const LinearSpace2 &b) { return a * rcp(b); } template inline LinearSpace2 &operator*=(LinearSpace2 &a, const LinearSpace2 &b) { return a = a * b; } template inline LinearSpace2 &operator/=(LinearSpace2 &a, const LinearSpace2 &b) { return a = a / b; } /////////////////////////////////////////////////////////////////////////// /// Comparison Operators /////////////////////////////////////////////////////////////////////////// template inline bool operator==(const LinearSpace2 &a, const LinearSpace2 &b) { return a.vx == b.vx && a.vy == b.vy; } template inline bool operator!=(const LinearSpace2 &a, const LinearSpace2 &b) { return a.vx != b.vx || a.vy != b.vy; } /////////////////////////////////////////////////////////////////////////// /// Output Operators /////////////////////////////////////////////////////////////////////////// template static std::ostream &operator<<(std::ostream &cout, const LinearSpace2 &m) { return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; } /////////////////////////////////////////////////////////////////////////// /// 3D Linear Transform (3x3 Matrix) /////////////////////////////////////////////////////////////////////////// template struct LinearSpace3 { using Vector = T; using Scalar = typename T::scalar_t; /*! default matrix constructor */ inline LinearSpace3() = default; inline LinearSpace3(const LinearSpace3 &other) { vx = other.vx; vy = other.vy; vz = other.vz; } inline LinearSpace3 &operator=(const LinearSpace3 &other) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } template inline LinearSpace3(const LinearSpace3 &s) : vx(s.vx), vy(s.vy), vz(s.vz) { } /*! matrix construction from column vectors */ inline LinearSpace3(const Vector &vx, const Vector &vy, const Vector &vz) : vx(vx), vy(vy), vz(vz) { } /*! 
construction from quaternion */ inline LinearSpace3(const QuaternionT &q) : vx((q.r * q.r + q.i * q.i - q.j * q.j - q.k * q.k), Scalar(2.0) * (q.i * q.j + q.r * q.k), Scalar(2.0) * (q.i * q.k - q.r * q.j)), vy(Scalar(2.0) * (q.i * q.j - q.r * q.k), (q.r * q.r - q.i * q.i + q.j * q.j - q.k * q.k), Scalar(2.0) * (q.j * q.k + q.r * q.i)), vz(Scalar(2.0) * (q.i * q.k + q.r * q.j), Scalar(2.0) * (q.j * q.k - q.r * q.i), (q.r * q.r - q.i * q.i - q.j * q.j + q.k * q.k)) { } /*! matrix construction from row mayor data */ inline LinearSpace3(const Scalar &m00, const Scalar &m01, const Scalar &m02, const Scalar &m10, const Scalar &m11, const Scalar &m12, const Scalar &m20, const Scalar &m21, const Scalar &m22) : vx(m00, m10, m20), vy(m01, m11, m21), vz(m02, m12, m22) { } /*! compute the determinant of the matrix */ inline const Scalar det() const { return dot(vx, cross(vy, vz)); } /*! compute adjoint matrix */ inline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy, vz), cross(vz, vx), cross(vx, vy)) .transposed(); } /*! compute inverse matrix */ inline const LinearSpace3 inverse() const { return adjoint() / det(); } /*! compute transposed matrix */ inline const LinearSpace3 transposed() const { return LinearSpace3( vx.x, vx.y, vx.z, vy.x, vy.y, vy.z, vz.x, vz.y, vz.z); } /*! returns first row of matrix */ inline const Vector row0() const { return Vector(vx.x, vy.x, vz.x); } /*! returns second row of matrix */ inline const Vector row1() const { return Vector(vx.y, vy.y, vz.y); } /*! returns third row of matrix */ inline const Vector row2() const { return Vector(vx.z, vy.z, vz.z); } ///////////////////////////////////////////////////////////////////////// // Constants ///////////////////////////////////////////////////////////////////////// inline LinearSpace3(ZeroTy) : vx(zero), vy(zero), vz(zero) {} inline LinearSpace3(OneTy) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) { } /*! return matrix for scaling */ static inline LinearSpace3 scale(const Vector &s) { return LinearSpace3(s.x, 0, 0, 0, s.y, 0, 0, 0, s.z); } /*! return matrix for rotation around arbitrary axis */ static inline LinearSpace3 rotate(const Vector &_u, const Scalar &r) { Vector u = normalize(_u); Scalar s = sin(r), c = cos(r); return LinearSpace3(u.x * u.x + (1 - u.x * u.x) * c, u.x * u.y * (1 - c) - u.z * s, u.x * u.z * (1 - c) + u.y * s, u.x * u.y * (1 - c) + u.z * s, u.y * u.y + (1 - u.y * u.y) * c, u.y * u.z * (1 - c) - u.x * s, u.x * u.z * (1 - c) - u.y * s, u.y * u.z * (1 - c) + u.x * s, u.z * u.z + (1 - u.z * u.z) * c); } inline operator Scalar*() { return static_cast(&vx); } inline operator const Scalar*() const { return static_cast(&vx); } public: /*! the column vectors of the matrix */ Vector vx, vy, vz; }; /////////////////////////////////////////////////////////////////////////// // Unary Operators /////////////////////////////////////////////////////////////////////////// template inline LinearSpace3 operator-(const LinearSpace3 &a) { return LinearSpace3(-a.vx, -a.vy, -a.vz); } template inline LinearSpace3 operator+(const LinearSpace3 &a) { return LinearSpace3(+a.vx, +a.vy, +a.vz); } template inline LinearSpace3 rcp(const LinearSpace3 &a) { return a.inverse(); } /* constructs a coordinate frame from a normalized normal */ template inline LinearSpace3 frame(const T &N) { const T dx0 = cross(T(one, zero, zero), N); const T dx1 = cross(T(zero, one, zero), N); const T dx = normalize(dot(dx0, dx0) > dot(dx1, dx1) ? 
dx0 : dx1); const T dy = normalize(cross(N, dx)); return LinearSpace3(dx, dy, N); } /* constructs a coordinate frame from a normal and approximate up direction */ template inline LinearSpace3 frame(const T &N, const T &up) { if (abs(dot(up, N)) > 0.99f) return frame(N); // fallback in case N and up are very parallel const T dx = normalize(cross(up, N)); const T dy = normalize(cross(N, dx)); return LinearSpace3(dx, dy, N); } /* clamps linear space to range -1 to +1 */ template inline LinearSpace3 clamp(const LinearSpace3 &space) { return LinearSpace3(clamp(space.vx, T(-1.0f), T(1.0f)), clamp(space.vy, T(-1.0f), T(1.0f)), clamp(space.vz, T(-1.0f), T(1.0f))); } /////////////////////////////////////////////////////////////////////////// // Binary Operators /////////////////////////////////////////////////////////////////////////// template inline LinearSpace3 operator+(const LinearSpace3 &a, const LinearSpace3 &b) { return LinearSpace3(a.vx + b.vx, a.vy + b.vy, a.vz + b.vz); } template inline LinearSpace3 operator-(const LinearSpace3 &a, const LinearSpace3 &b) { return LinearSpace3(a.vx - b.vx, a.vy - b.vy, a.vz - b.vz); } template inline LinearSpace3 operator*(const typename T::Scalar &a, const LinearSpace3 &b) { return LinearSpace3(a * b.vx, a * b.vy, a * b.vz); } template inline T operator*(const LinearSpace3 &a, const T &b) { return b.x * a.vx + b.y * a.vy + b.z * a.vz; } template inline LinearSpace3 operator*(const LinearSpace3 &a, const LinearSpace3 &b) { return LinearSpace3(a * b.vx, a * b.vy, a * b.vz); } template inline LinearSpace3 operator/(const LinearSpace3 &a, const typename T::Scalar &b) { return LinearSpace3(a.vx / b, a.vy / b, a.vz / b); } template inline LinearSpace3 operator/(const LinearSpace3 &a, const LinearSpace3 &b) { return a * rcp(b); } template inline LinearSpace3 &operator*=(LinearSpace3 &a, const LinearSpace3 &b) { return a = a * b; } template inline LinearSpace3 &operator/=(LinearSpace3 &a, const LinearSpace3 &b) { return a = a / b; } template inline T xfmPoint(const LinearSpace3 &s, const T &a) { return madd(T(a.x), s.vx, madd(T(a.y), s.vy, T(a.z * s.vz))); } template inline T xfmVector(const LinearSpace3 &s, const T &a) { return madd(T(a.x), s.vx, madd(T(a.y), s.vy, T(a.z * s.vz))); } template inline T xfmNormal(const LinearSpace3 &s, const T &a) { return xfmVector(s.inverse().transposed(), a); } /////////////////////////////////////////////////////////////////////////// /// Comparison Operators /////////////////////////////////////////////////////////////////////////// template inline bool operator==(const LinearSpace3 &a, const LinearSpace3 &b) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } template inline bool operator!=(const LinearSpace3 &a, const LinearSpace3 &b) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } /////////////////////////////////////////////////////////////////////////// /// Output Operators /////////////////////////////////////////////////////////////////////////// template inline std::ostream &operator<<(std::ostream &cout, const LinearSpace3 &m) { return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; } /*! Shortcuts for common linear spaces. 
*/ using LinearSpace2f = LinearSpace2; using LinearSpace3f = LinearSpace3; using LinearSpace3fa = LinearSpace3; using linear2f = LinearSpace2f; using linear3f = LinearSpace3f; } // namespace math } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/math/LinearSpace.ih000066400000000000000000000303321467524601100230200ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "vec.ih" #ifndef ISPC namespace ispc { #endif // Linear vector space, or a linear transformation struct LinearSpace3f { vec3f vx; vec3f vy; vec3f vz; #ifndef ISPC LinearSpace3f() = default; LinearSpace3f(const float v) : vx(v), vy(v), vz(v) {} LinearSpace3f(const vec3f &vx, const vec3f &vy, const vec3f &vz) : vx(vx), vy(vy), vz(vz) {} #endif }; // short-hand name for LinearSpace3f typedef LinearSpace3f linear3f; //////////////////////////////////////////////////////////////////////////////// /// Constructors //////////////////////////////////////////////////////////////////////////////// #define __define_make_LinearSpace3f(univary) \ inline univary LinearSpace3f make_LinearSpace3f( \ const univary vec3f x, const univary vec3f y, const univary vec3f z) \ { \ univary LinearSpace3f l; \ l.vx = x; \ l.vy = y; \ l.vz = z; \ return l; \ } #ifdef ISPC __define_make_LinearSpace3f(uniform); __define_make_LinearSpace3f(varying); #else __define_make_LinearSpace3f(); #endif #undef __define_make_LinearSpace3f inline ISPC_UNIFORM LinearSpace3f make_LinearSpace3f_identity() { return make_LinearSpace3f(make_vec3f(1.f, 0.f, 0.f), make_vec3f(0.f, 1.f, 0.f), make_vec3f(0.f, 0.f, 1.f)); } inline ISPC_VARYING LinearSpace3f make_LinearSpace3f_varying_identity() { return make_LinearSpace3f(make_vec3f(1.f, 0.f, 0.f), make_vec3f(0.f, 1.f, 0.f), make_vec3f(0.f, 0.f, 1.f)); } //////////////////////////////////////////////////////////////////////////////// // Transformation //////////////////////////////////////////////////////////////////////////////// #define __define_transform(univary_r, univary_l, univary_v) \ inline univary_r vec3f operator*( \ const univary_l LinearSpace3f l, const univary_v vec3f v) \ { \ return v.x * l.vx + v.y * l.vy + v.z * l.vz; \ } \ inline univary_r vec3f xfmVector( \ const univary_l LinearSpace3f l, const univary_v vec3f v) \ { \ return v.x * l.vx + v.y * l.vy + v.z * l.vz; \ } #ifdef ISPC __define_transform(uniform, uniform, uniform); __define_transform(varying, uniform, varying); __define_transform(varying, varying, varying); #else __define_transform(, , ); #endif #undef __define_transform //////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// #define __define_binary_ops(univary) \ inline univary LinearSpace3f operator+( \ const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return make_LinearSpace3f(a.vx + b.vx, a.vy + b.vy, a.vz + b.vz); \ } \ inline univary LinearSpace3f operator-( \ const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return make_LinearSpace3f(a.vx - b.vx, a.vy - b.vy, a.vz - b.vz); \ } \ inline univary LinearSpace3f operator*( \ const univary float a, const univary LinearSpace3f b) \ { \ return make_LinearSpace3f((a * b.vx), (a * b.vy), (a * b.vz)); \ } \ inline univary LinearSpace3f operator*( \ const univary LinearSpace3f a, const univary float b) \ { \ return make_LinearSpace3f((a.vx * b), (a.vy * b), (a.vz * b)); \ } \ inline univary LinearSpace3f operator*( \ 
const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return make_LinearSpace3f((a * b.vx), (a * b.vy), (a * b.vz)); \ } #ifdef ISPC __define_binary_ops(uniform); __define_binary_ops(varying); #else __define_binary_ops(); #endif #undef __define_binary_ops //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// #define __define_comp_ops(univary) \ inline univary bool eq( \ const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return eq(a.vx, b.vx) & eq(a.vy, b.vy) & eq(a.vz, b.vz); \ } \ inline univary bool ne( \ const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return ne(a.vx, b.vx) | ne(a.vy, b.vy) | ne(a.vz, b.vz); \ } #ifdef ISPC __define_comp_ops(uniform); __define_comp_ops(varying); #else __define_comp_ops(); #endif #undef __define_comp_ops //////////////////////////////////////////////////////////////////////////////// // Unary Operators //////////////////////////////////////////////////////////////////////////////// #define __define_unary_fct(univary) \ inline univary LinearSpace3f neg(const univary LinearSpace3f l) \ { \ return make_LinearSpace3f(neg(l.vx), neg(l.vy), neg(l.vz)); \ } \ /* compute the determinant of the matrix */ \ inline univary float det(const univary LinearSpace3f l) \ { \ return dot(l.vx, cross(l.vy, l.vz)); \ } \ /* compute transposed matrix */ \ inline univary LinearSpace3f transposed(const univary LinearSpace3f l) \ { \ return make_LinearSpace3f(make_vec3f(l.vx.x, l.vy.x, l.vz.x), \ make_vec3f(l.vx.y, l.vy.y, l.vz.y), \ make_vec3f(l.vx.z, l.vy.z, l.vz.z)); \ } \ /* compute adjoint matrix */ \ inline univary LinearSpace3f adjoint(const univary LinearSpace3f l) \ { \ return transposed(make_LinearSpace3f( \ cross(l.vy, l.vz), cross(l.vz, l.vx), cross(l.vx, l.vy))); \ } \ /* calculates orthogonal coordinate frame with z-vector pointing towards N \ */ \ inline univary LinearSpace3f frame(const univary vec3f N) \ { \ const univary vec3f dx0 = make_vec3f(0.0f, N.z, -N.y); \ const univary vec3f dx1 = make_vec3f(-N.z, 0.0f, N.x); \ const univary vec3f dx = normalize(abs(N.x) < abs(N.y) ? 
dx0 : dx1); \ const univary vec3f dy = cross(N, dx); \ return make_LinearSpace3f(dx, dy, N); \ } \ inline univary LinearSpace3f rcp(const univary LinearSpace3f l) \ { \ return adjoint(l) * rcpf(det(l)); \ } #ifdef ISPC __define_unary_fct(uniform); __define_unary_fct(varying); #else __define_unary_fct(); #endif #undef __define_unary_fct //////////////////////////////////////////////////////////////////////////////// // Rudimentary 2D linear space, used for texture coordinate transformations //////////////////////////////////////////////////////////////////////////////// struct LinearSpace2f { vec2f vx; vec2f vy; #ifndef ISPC LinearSpace2f() = default; LinearSpace2f(const float v) : vx(v), vy(v) {} LinearSpace2f(const vec2f &vx, const vec2f &vy) : vx(vx), vy(vy) {} #endif }; // short-hand name for LinearSpace2f typedef LinearSpace2f linear2f; #define __define_make_LinearSpace2f(univary) \ inline univary LinearSpace2f make_LinearSpace2f( \ const univary vec2f x, const univary vec2f y) \ { \ univary LinearSpace2f l; \ l.vx = x; \ l.vy = y; \ return l; \ } #ifdef ISPC __define_make_LinearSpace2f(uniform); __define_make_LinearSpace2f(varying); #else __define_make_LinearSpace2f(); #endif #undef __define_make_LinearSpace2f inline ISPC_UNIFORM LinearSpace2f make_LinearSpace2f_identity() { return make_LinearSpace2f(make_vec2f(1.f, 0.f), make_vec2f(0.f, 1.f)); } #define __define_transform2f(univary_r, univary_l, univary_v) \ inline univary_r vec2f operator*( \ const univary_l LinearSpace2f l, const univary_v vec2f v) \ { \ return v.x * l.vx + v.y * l.vy; \ } \ inline univary_r vec2f xfmVector( \ const univary_l LinearSpace2f l, const univary_v vec2f v) \ { \ return v.x * l.vx + v.y * l.vy; \ } #ifdef ISPC __define_transform2f(uniform, uniform, uniform); __define_transform2f(varying, uniform, varying); __define_transform2f(varying, varying, varying); #else __define_transform2f(, , ); #endif #undef __define_transform2f #ifndef ISPC } #endif RenderKit-rkcommon-988718e/rkcommon/math/Quaternion.h000066400000000000000000000262201467524601100226070ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "vec.h" #include "../traits/rktraits.h" namespace rkcommon { namespace math { template > struct QuaternionT { using Vector = vec_t; using Scalar = T; QuaternionT() {} QuaternionT(const QuaternionT &other) { r = other.r; i = other.i; j = other.j; k = other.k; } QuaternionT &operator=(const QuaternionT &other) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } QuaternionT(const T &r) : r(r), i(zero), j(zero), k(zero) {} explicit QuaternionT(const Vector &v) : r(zero), i(v.x), j(v.y), k(v.z) {} QuaternionT(const T &r, const T &i, const T &j, const T &k) : r(r), i(i), j(j), k(k) { } QuaternionT(const T &r, const Vector &v) : r(r), i(v.x), j(v.y), k(v.z) {} QuaternionT(const Vector &vx, const Vector &vy, const Vector &vz); QuaternionT(const T &yaw, const T &pitch, const T &roll); QuaternionT(ZeroTy) : r(zero), i(zero), j(zero), k(zero) {} QuaternionT(OneTy) : r(one), i(zero), j(zero), k(zero) {} static inline QuaternionT rotate(const Vector &u, const T &r) { return QuaternionT(cos(T(0.5) * r), sin(T(0.5) * r) * normalize(u)); } /*! 
returns the rotation axis of the quaternion as a vector */ const Vector v() const { return Vector(i, j, k); } T i, j, k, r; }; template inline QuaternionT operator*(const T &a, const QuaternionT &b) { return QuaternionT(a * b.r, a * b.i, a * b.j, a * b.k); } template inline QuaternionT operator*(const QuaternionT &a, const T &b) { return QuaternionT(a.r * b, a.i * b, a.j * b, a.k * b); } template > inline auto operator*(const T &a, const QuaternionT &b) -> QuaternionT { using scalar_t = decltype(T() * U()); using quaternion_t = QuaternionT; return quaternion_t(scalar_t(a) * quaternion_t(b)); } template > inline auto operator*(const QuaternionT &a, const U &b) -> QuaternionT { using scalar_t = decltype(T() * U()); using quaternion_t = QuaternionT; return quaternion_t(quaternion_t(a) * scalar_t(b)); } template inline QuaternionT operator+(const QuaternionT &a) { return QuaternionT(+a.r, +a.i, +a.j, +a.k); } template inline QuaternionT operator-(const QuaternionT &a) { return QuaternionT(-a.r, -a.i, -a.j, -a.k); } template inline QuaternionT conj(const QuaternionT &a) { return QuaternionT(a.r, -a.i, -a.j, -a.k); } template inline T abs(const QuaternionT &a) { return sqrt(a.r * a.r + a.i * a.i + a.j * a.j + a.k * a.k); } template inline QuaternionT rcp(const QuaternionT &a) { return conj(a) * rcp(a.r * a.r + a.i * a.i + a.j * a.j + a.k * a.k); } template inline T dot(const QuaternionT &a, const QuaternionT &b) { return a.r * b.r + a.i * b.i + a.j * b.j + a.k * b.k; } template inline QuaternionT normalize(const QuaternionT &a) { return a * rsqrt(dot(a, a)); } template inline QuaternionT operator+(const T &a, const QuaternionT &b) { return QuaternionT(a + b.r, b.i, b.j, b.k); } template inline QuaternionT operator+(const QuaternionT &a, const T &b) { return QuaternionT(a.r + b, a.i, a.j, a.k); } template inline QuaternionT operator+(const QuaternionT &a, const QuaternionT &b) { return QuaternionT(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } template inline QuaternionT operator-(const T &a, const QuaternionT &b) { return QuaternionT(a - b.r, -b.i, -b.j, -b.k); } template inline QuaternionT operator-(const QuaternionT &a, const T &b) { return QuaternionT(a.r - b, a.i, a.j, a.k); } template inline QuaternionT operator-(const QuaternionT &a, const QuaternionT &b) { return QuaternionT(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } template inline typename QuaternionT::Vector operator*( const QuaternionT &a, const typename QuaternionT::Vector &b) { return (a * QuaternionT(b) * conj(a)).v(); } template inline QuaternionT operator*(const QuaternionT &a, const QuaternionT &b) { return QuaternionT(a.r * b.r - a.i * b.i - a.j * b.j - a.k * b.k, a.r * b.i + a.i * b.r + a.j * b.k - a.k * b.j, a.r * b.j - a.i * b.k + a.j * b.r + a.k * b.i, a.r * b.k + a.i * b.j - a.j * b.i + a.k * b.r); } template inline QuaternionT operator/(const T &a, const QuaternionT &b) { return a * rcp(b); } template inline QuaternionT operator/(const QuaternionT &a, const T &b) { return a * rcp(b); } template inline QuaternionT operator/(const QuaternionT &a, const QuaternionT &b) { return a * rcp(b); } template inline QuaternionT &operator+=(QuaternionT &a, const T &b) { return a = a + b; } template inline QuaternionT &operator+=(QuaternionT &a, const QuaternionT &b) { return a = a + b; } template inline QuaternionT &operator-=(QuaternionT &a, const T &b) { return a = a - b; } template inline QuaternionT &operator-=(QuaternionT &a, const QuaternionT &b) { return a = a - b; } template inline QuaternionT &operator*=(QuaternionT &a, 
const T &b) { return a = a * b; } template inline QuaternionT &operator*=(QuaternionT &a, const QuaternionT &b) { return a = a * b; } template inline QuaternionT &operator/=(QuaternionT &a, const T &b) { return a = a * rcp(b); } template inline QuaternionT &operator/=(QuaternionT &a, const QuaternionT &b) { return a = a * rcp(b); } template inline typename QuaternionT::Vector xfmPoint( const QuaternionT &a, const typename QuaternionT::Vector &b) { return a * b; } template inline QuaternionT xfmQuaternion( const QuaternionT &a, const QuaternionT &b) { return a * b; } template inline typename QuaternionT::Vector xfmNormal( const QuaternionT &a, const typename QuaternionT::Vector &b) { return a * b; } template inline bool operator==(const QuaternionT &a, const QuaternionT &b) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } template inline bool operator!=(const QuaternionT &a, const QuaternionT &b) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } template QuaternionT::QuaternionT(const typename QuaternionT::Vector &vx, const typename QuaternionT::Vector &vy, const typename QuaternionT::Vector &vz) { if (vx.x + vy.y + vz.z >= T(zero)) { const T t = T(one) + (vx.x + vy.y + vz.z); const T s = rsqrt(t) * T(.5); r = t * s; i = (vy.z - vz.y) * s; j = (vz.x - vx.z) * s; k = (vx.y - vy.x) * s; } else if (vx.x >= max(vy.y, vz.z)) { const T t = (T(one) + vx.x) - (vy.y + vz.z); const T s = rsqrt(t) * T(.5); r = (vy.z - vz.y) * s; i = t * s; j = (vx.y + vy.x) * s; k = (vz.x + vx.z) * s; } else if (vy.y >= vz.z) // if ( vy.y >= max(vz.z, vx.x) ) { const T t = (T(one) + vy.y) - (vz.z + vx.x); const T s = rsqrt(t) * T(.5); r = (vz.x - vx.z) * s; i = (vx.y + vy.x) * s; j = t * s; k = (vy.z + vz.y) * s; } else // if ( vz.z >= max(vy.y, vx.x) ) { const T t = (T(one) + vz.z) - (vx.x + vy.y); const T s = rsqrt(t) * T(.5); r = (vx.y - vy.x) * s; i = (vz.x + vx.z) * s; j = (vy.z + vz.y) * s; k = t * s; } } template QuaternionT::QuaternionT(const T &yaw, const T &pitch, const T &roll) { const T cya = cos(yaw * T(.5)); const T cpi = cos(pitch * T(.5)); const T cro = cos(roll * T(.5)); const T sya = sin(yaw * T(.5)); const T spi = sin(pitch * T(.5)); const T sro = sin(roll * T(.5)); r = cro * cya * cpi + sro * sya * spi; i = cro * cya * spi + sro * sya * cpi; j = cro * sya * cpi - sro * cya * spi; k = sro * cya * cpi - cro * sya * spi; } template static std::ostream &operator<<(std::ostream &cout, const QuaternionT &q) { return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; } template // a, b must be normalized inline QuaternionT slerp(const float factor, const QuaternionT &_a, const QuaternionT &b) { QuaternionT a(_a); T d = dot(a, b); if (d < 0.) 
{ // prevent "long way around" a = -a; d = -d; } if (d > 0.9995) { // angles too small, fallback to linear interpolation return normalize(rkcommon::math::lerp(factor, a, b)); } const T theta0 = std::acos(d); const T theta = theta0 * factor; const T fb = std::sin(theta) / std::sin(theta0); const T fa = std::cos(theta) - d * fb; return fa * a + fb * b; } using quaternionf = QuaternionT; using quatf = QuaternionT; using quaterniond = QuaternionT; using quatd = QuaternionT; } // namespace math } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/math/arm/000077500000000000000000000000001467524601100210665ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/math/arm/emulation.h000066400000000000000000000004351467524601100232360ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "sse2neon.h" /* Dummy defines for floating point control */ #define _MM_MASK_MASK 0x1f80 #define _MM_MASK_DIV_ZERO 0x200 #define _MM_MASK_DENORM 0x100 #define _MM_SET_EXCEPTION_MASK(x) RenderKit-rkcommon-988718e/rkcommon/math/arm/sse2neon.h000066400000000000000000014156571467524601100230160ustar00rootroot00000000000000#ifndef SSE2NEON_H #define SSE2NEON_H // This header file provides a simple API translation layer // between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions // // Contributors to this work are: // John W. Ratcliff // Brandon Rowlett // Ken Fast // Eric van Beurden // Alexander Potylitsin // Hasindu Gamaarachchi // Jim Huang // Mark Cheng // Malcolm James MacLeod // Devin Hussey (easyaspi314) // Sebastian Pop // Developer Ecosystem Engineering // Danila Kutenin // FranƧois Turban (JishinMaster) // Pei-Hsuan Hung // Yang-Hao Yuan // Syoyo Fujita // Brecht Van Lommel // Jonathan Hue // Cuda Chen // Aymen Qader // Anthony Roberts /* * sse2neon is freely redistributable under the MIT License. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* Tunable configurations */ /* Enable precise implementation of math operations * This would slow down the computation a bit, but gives consistent result with * x86 SSE. (e.g. 
would solve a hole or NaN pixel in the rendering result) */ /* _mm_min|max_ps|ss|pd|sd */ #ifndef SSE2NEON_PRECISE_MINMAX #define SSE2NEON_PRECISE_MINMAX (0) #endif /* _mm_rcp_ps and _mm_div_ps */ #ifndef SSE2NEON_PRECISE_DIV #define SSE2NEON_PRECISE_DIV (0) #endif /* _mm_sqrt_ps and _mm_rsqrt_ps */ #ifndef SSE2NEON_PRECISE_SQRT #define SSE2NEON_PRECISE_SQRT (0) #endif /* _mm_dp_pd */ #ifndef SSE2NEON_PRECISE_DP #define SSE2NEON_PRECISE_DP (0) #endif /* Enable inclusion of windows.h on MSVC platforms * This makes _mm_clflush functional on windows, as there is no builtin. */ #ifndef SSE2NEON_INCLUDE_WINDOWS_H #define SSE2NEON_INCLUDE_WINDOWS_H (0) #endif /* compiler specific definitions */ #if defined(__GNUC__) || defined(__clang__) #pragma push_macro("FORCE_INLINE") #pragma push_macro("ALIGN_STRUCT") #define FORCE_INLINE static inline __attribute__((always_inline)) #define ALIGN_STRUCT(x) __attribute__((aligned(x))) #define _sse2neon_likely(x) __builtin_expect(!!(x), 1) #define _sse2neon_unlikely(x) __builtin_expect(!!(x), 0) #elif defined(_MSC_VER) #if _MSVC_TRADITIONAL #error Using the traditional MSVC preprocessor is not supported! Use /Zc:preprocessor instead. #endif #ifndef FORCE_INLINE #define FORCE_INLINE static inline #endif #ifndef ALIGN_STRUCT #define ALIGN_STRUCT(x) __declspec(align(x)) #endif #define _sse2neon_likely(x) (x) #define _sse2neon_unlikely(x) (x) #else #pragma message("Macro name collisions may happen with unsupported compilers.") #endif #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 10 #warning "GCC versions earlier than 10 are not supported." #endif /* C language does not allow initializing a variable with a function call. */ #ifdef __cplusplus #define _sse2neon_const static const #else #define _sse2neon_const const #endif #include #include #if defined(_WIN32) /* Definitions for _mm_{malloc,free} are provided by * from both MinGW-w64 and MSVC. */ #define SSE2NEON_ALLOC_DEFINED #endif /* If using MSVC */ #ifdef _MSC_VER #include #if SSE2NEON_INCLUDE_WINDOWS_H #include #include #endif #if !defined(__cplusplus) #error SSE2NEON only supports C++ compilation with this compiler #endif #ifdef SSE2NEON_ALLOC_DEFINED #include #endif #if (defined(_M_AMD64) || defined(__x86_64__)) || \ (defined(_M_ARM64) || defined(__arm64__)) #define SSE2NEON_HAS_BITSCAN64 #endif #endif #if defined(__GNUC__) || defined(__clang__) #define _sse2neon_define0(type, s, body) \ __extension__({ \ type _a = (s); \ body \ }) #define _sse2neon_define1(type, s, body) \ __extension__({ \ type _a = (s); \ body \ }) #define _sse2neon_define2(type, a, b, body) \ __extension__({ \ type _a = (a), _b = (b); \ body \ }) #define _sse2neon_return(ret) (ret) #else #define _sse2neon_define0(type, a, body) [=](type _a) { body }(a) #define _sse2neon_define1(type, a, body) [](type _a) { body }(a) #define _sse2neon_define2(type, a, b, body) \ [](type _a, type _b) { body }((a), (b)) #define _sse2neon_return(ret) return ret #endif #define _sse2neon_init(...) \ { \ __VA_ARGS__ \ } /* Compiler barrier */ #if defined(_MSC_VER) #define SSE2NEON_BARRIER() _ReadWriteBarrier() #else #define SSE2NEON_BARRIER() \ do { \ __asm__ __volatile__("" ::: "memory"); \ (void) 0; \ } while (0) #endif /* Memory barriers * __atomic_thread_fence does not include a compiler barrier; instead, * the barrier is part of __atomic_load/__atomic_store's "volatile-like" * semantics. 
*/
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
#include <stdatomic.h>
#endif

FORCE_INLINE void _sse2neon_smp_mb(void)
{
    SSE2NEON_BARRIER();
#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L) && \
    !defined(__STDC_NO_ATOMICS__)
    atomic_thread_fence(memory_order_seq_cst);
#elif defined(__GNUC__) || defined(__clang__)
    __atomic_thread_fence(__ATOMIC_SEQ_CST);
#else /* MSVC */
    __dmb(_ARM64_BARRIER_ISH);
#endif
}

/* Architecture-specific build options */
/* FIXME: #pragma GCC push_options is only available on GCC */
#if defined(__GNUC__)
#if defined(__arm__) && __ARM_ARCH == 7
/* According to ARM C Language Extensions Architecture specification,
 * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON)
 * architecture supported.
 */
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON."
#endif
#if !defined(__clang__)
#pragma GCC push_options
#pragma GCC target("fpu=neon")
#endif
#elif defined(__aarch64__) || defined(_M_ARM64)
#if !defined(__clang__) && !defined(_MSC_VER)
#pragma GCC push_options
#pragma GCC target("+simd")
#endif
#elif __ARM_ARCH == 8
#if !defined(__ARM_NEON) || !defined(__ARM_NEON__)
#error \
    "You must enable NEON instructions (e.g. -mfpu=neon-fp-armv8) to use SSE2NEON."
#endif
#if !defined(__clang__) && !defined(_MSC_VER)
#pragma GCC push_options
#endif
#else
#error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A."
#endif
#endif

#include <arm_neon.h>
#if (!defined(__aarch64__) && !defined(_M_ARM64)) && (__ARM_ARCH == 8)
#if defined __has_include && __has_include(<arm_acle.h>)
#include <arm_acle.h>
#endif
#endif

/* Apple Silicon cache lines are double of what is commonly used by Intel, AMD
 * and other Arm microarchitectures use.
 * From sysctl -a on Apple M1:
 * hw.cachelinesize: 128
 */
#if defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__))
#define SSE2NEON_CACHELINE_SIZE 128
#else
#define SSE2NEON_CACHELINE_SIZE 64
#endif

/* Rounding functions require either Aarch64 instructions or libm fallback */
#if !defined(__aarch64__) && !defined(_M_ARM64)
#include <math.h>
#endif

/* On ARMv7, some registers, such as PMUSERENR and PMCCNTR, are read-only
 * or even not accessible in user mode.
 * To write or access to these registers in user mode,
 * we have to perform syscall instead.
 */
#if (!defined(__aarch64__) && !defined(_M_ARM64))
#include
#endif

/* "__has_builtin" can be used to query support for built-in functions
 * provided by gcc/clang and other compilers that support it.
 */
#ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */
/* Compatibility with gcc <= 9 */
#if defined(__GNUC__) && (__GNUC__ <= 9)
#define __has_builtin(x) HAS##x
#define HAS__builtin_popcount 1
#define HAS__builtin_popcountll 1

// __builtin_shuffle introduced in GCC 4.7.0
#if (__GNUC__ >= 5) || ((__GNUC__ == 4) && (__GNUC_MINOR__ >= 7))
#define HAS__builtin_shuffle 1
#else
#define HAS__builtin_shuffle 0
#endif

#define HAS__builtin_shufflevector 0
#define HAS__builtin_nontemporal_store 0
#else
#define __has_builtin(x) 0
#endif
#endif

/**
 * MACRO for shuffle parameter for _mm_shuffle_ps().
 * Argument fp3 is a digit[0123] that represents the fp from argument "b"
 * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same
 * for fp2 in result. fp1 is a digit[0123] that represents the fp from
 * argument "a" of mm_shuffle_ps that will be places in fp1 of result.
 * fp0 is the same for fp0 of result.
*/ #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) #if __has_builtin(__builtin_shufflevector) #define _sse2neon_shuffle(type, a, b, ...) \ __builtin_shufflevector(a, b, __VA_ARGS__) #elif __has_builtin(__builtin_shuffle) #define _sse2neon_shuffle(type, a, b, ...) \ __extension__({ \ type tmp = {__VA_ARGS__}; \ __builtin_shuffle(a, b, tmp); \ }) #endif #ifdef _sse2neon_shuffle #define vshuffle_s16(a, b, ...) _sse2neon_shuffle(int16x4_t, a, b, __VA_ARGS__) #define vshuffleq_s16(a, b, ...) _sse2neon_shuffle(int16x8_t, a, b, __VA_ARGS__) #define vshuffle_s32(a, b, ...) _sse2neon_shuffle(int32x2_t, a, b, __VA_ARGS__) #define vshuffleq_s32(a, b, ...) _sse2neon_shuffle(int32x4_t, a, b, __VA_ARGS__) #define vshuffle_s64(a, b, ...) _sse2neon_shuffle(int64x1_t, a, b, __VA_ARGS__) #define vshuffleq_s64(a, b, ...) _sse2neon_shuffle(int64x2_t, a, b, __VA_ARGS__) #endif /* Rounding mode macros. */ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 #define _MM_FROUND_TO_POS_INF 0x02 #define _MM_FROUND_TO_ZERO 0x03 #define _MM_FROUND_CUR_DIRECTION 0x04 #define _MM_FROUND_NO_EXC 0x08 #define _MM_FROUND_RAISE_EXC 0x00 #define _MM_FROUND_NINT (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_FLOOR (_MM_FROUND_TO_NEG_INF | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_CEIL (_MM_FROUND_TO_POS_INF | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_TRUNC (_MM_FROUND_TO_ZERO | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_RINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_RAISE_EXC) #define _MM_FROUND_NEARBYINT (_MM_FROUND_CUR_DIRECTION | _MM_FROUND_NO_EXC) #define _MM_ROUND_NEAREST 0x0000 #define _MM_ROUND_DOWN 0x2000 #define _MM_ROUND_UP 0x4000 #define _MM_ROUND_TOWARD_ZERO 0x6000 /* Flush zero mode macros. */ #define _MM_FLUSH_ZERO_MASK 0x8000 #define _MM_FLUSH_ZERO_ON 0x8000 #define _MM_FLUSH_ZERO_OFF 0x0000 /* Denormals are zeros mode macros. */ #define _MM_DENORMALS_ZERO_MASK 0x0040 #define _MM_DENORMALS_ZERO_ON 0x0040 #define _MM_DENORMALS_ZERO_OFF 0x0000 /* indicate immediate constant argument in a given range */ #define __constrange(a, b) const /* A few intrinsics accept traditional data types like ints or floats, but * most operate on data types that are specific to SSE. * If a vector type ends in d, it contains doubles, and if it does not have * a suffix, it contains floats. An integer vector type can contain any type * of integer, from chars to shorts to unsigned long longs. */ typedef int64x1_t __m64; typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ // On ARM 32-bit architecture, the float64x2_t is not supported. // The data type __m128d should be represented in a different way for related // intrinsic conversion. 
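// Illustrative sketch: because of this difference, client code should treat
// __m128d as opaque and only go through the intrinsics, which behave the same
// for either representation, e.g.
//   __m128d v  = _mm_set_pd(2.0, 1.0); // lanes {1.0, 2.0}, low to high
//   double  lo = _mm_cvtsd_f64(v);     // 1.0, reads the low lane
// (_mm_set_pd is declared below; _mm_cvtsd_f64 is assumed to be provided
// further down in this header, as in upstream sse2neon.)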
#if defined(__aarch64__) || defined(_M_ARM64) typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ #else typedef float32x4_t __m128d; #endif typedef int64x2_t __m128i; /* 128-bit vector containing integers */ // __int64 is defined in the Intrinsics Guide which maps to different datatype // in different data model #if !(defined(_WIN32) || defined(_WIN64) || defined(__int64)) #if (defined(__x86_64__) || defined(__i386__)) #define __int64 long long #else #define __int64 int64_t #endif #endif /* type-safe casting between types */ #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) #define vreinterpretq_m128_f32(x) (x) #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) #define vreinterpretq_f32_m128(x) (x) #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) #define vreinterpretq_m128i_s64(x) (x) #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) #define vreinterpretq_s64_m128i(x) (x) #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) #define vreinterpretq_u16_m128i(x) vreinterpretq_u16_s64(x) #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) #define vreinterpret_m64_s64(x) (x) #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) #define 
vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) #define vreinterpret_s64_m64(x) (x) #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) #if defined(__aarch64__) || defined(_M_ARM64) #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) #define vreinterpretq_m128d_f64(x) (x) #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f64(x) #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) #define vreinterpretq_f64_m128d(x) (x) #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) #else #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) #define vreinterpretq_m128d_f32(x) (x) #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) #define vreinterpretq_f32_m128d(x) (x) #endif // A struct is defined in this header file called 'SIMDVec' which can be used // by applications which attempt to access the contents of an __m128 struct // directly. It is important to note that accessing the __m128 struct directly // is bad coding practice by Microsoft: @see: // https://learn.microsoft.com/en-us/cpp/cpp/m128 // // However, some legacy source code may try to access the contents of an __m128 // struct directly so the developer can use the SIMDVec as an alias for it. Any // casting must be done manually by the developer, as you cannot cast or // otherwise alias the base NEON data type for intrinsic operations. // // union intended to allow direct access to an __m128 variable using the names // that the MSVC compiler provides. This union should really only be used when // trying to access the members of the vector as integer values. GCC/clang // allow native access to the float members through a simple array access // operator (in C since 4.6, in C++ since 4.8). // // Ideally direct accesses to SIMD vectors should not be used since it can cause // a performance hit. If it really is needed however, the original __m128 // variable can be aliased with a pointer to this union and used to access // individual components. The use of this union should be hidden behind a macro // that is used throughout the codebase to access the members instead of always // declaring this type of variable. typedef union ALIGN_STRUCT(16) SIMDVec { float m128_f32[4]; // as floats - DON'T USE. Added for convenience. int8_t m128_i8[16]; // as signed 8-bit integers. int16_t m128_i16[8]; // as signed 16-bit integers. int32_t m128_i32[4]; // as signed 32-bit integers. int64_t m128_i64[2]; // as signed 64-bit integers. uint8_t m128_u8[16]; // as unsigned 8-bit integers. uint16_t m128_u16[8]; // as unsigned 16-bit integers. uint32_t m128_u32[4]; // as unsigned 32-bit integers. uint64_t m128_u64[2]; // as unsigned 64-bit integers. 
} SIMDVec; // casting using SIMDVec #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) /* SSE macros */ #define _MM_GET_FLUSH_ZERO_MODE _sse2neon_mm_get_flush_zero_mode #define _MM_SET_FLUSH_ZERO_MODE _sse2neon_mm_set_flush_zero_mode #define _MM_GET_DENORMALS_ZERO_MODE _sse2neon_mm_get_denormals_zero_mode #define _MM_SET_DENORMALS_ZERO_MODE _sse2neon_mm_set_denormals_zero_mode // Function declaration // SSE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void); FORCE_INLINE __m128 _mm_move_ss(__m128, __m128); FORCE_INLINE __m128 _mm_or_ps(__m128, __m128); FORCE_INLINE __m128 _mm_set_ps1(float); FORCE_INLINE __m128 _mm_setzero_ps(void); // SSE2 FORCE_INLINE __m128i _mm_and_si128(__m128i, __m128i); FORCE_INLINE __m128i _mm_castps_si128(__m128); FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i, __m128i); FORCE_INLINE __m128i _mm_cvtps_epi32(__m128); FORCE_INLINE __m128d _mm_move_sd(__m128d, __m128d); FORCE_INLINE __m128i _mm_or_si128(__m128i, __m128i); FORCE_INLINE __m128i _mm_set_epi32(int, int, int, int); FORCE_INLINE __m128i _mm_set_epi64x(int64_t, int64_t); FORCE_INLINE __m128d _mm_set_pd(double, double); FORCE_INLINE __m128i _mm_set1_epi32(int); FORCE_INLINE __m128i _mm_setzero_si128(void); // SSE4.1 FORCE_INLINE __m128d _mm_ceil_pd(__m128d); FORCE_INLINE __m128 _mm_ceil_ps(__m128); FORCE_INLINE __m128d _mm_floor_pd(__m128d); FORCE_INLINE __m128 _mm_floor_ps(__m128); FORCE_INLINE __m128d _mm_round_pd(__m128d, int); FORCE_INLINE __m128 _mm_round_ps(__m128, int); // SSE4.2 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t, uint8_t); /* Backwards compatibility for compilers with lack of specific type support */ // Older gcc does not define vld1q_u8_x4 type #if defined(__GNUC__) && !defined(__clang__) && \ ((__GNUC__ <= 13 && defined(__arm__)) || \ (__GNUC__ == 10 && __GNUC_MINOR__ < 3 && defined(__aarch64__)) || \ (__GNUC__ <= 9 && defined(__aarch64__))) FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { uint8x16x4_t ret; ret.val[0] = vld1q_u8(p + 0); ret.val[1] = vld1q_u8(p + 16); ret.val[2] = vld1q_u8(p + 32); ret.val[3] = vld1q_u8(p + 48); return ret; } #else // Wraps vld1q_u8_x4 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { return vld1q_u8_x4(p); } #endif #if !defined(__aarch64__) && !defined(_M_ARM64) /* emulate vaddv u8 variant */ FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) { const uint64x1_t v1 = vpaddl_u32(vpaddl_u16(vpaddl_u8(v8))); return vget_lane_u8(vreinterpret_u8_u64(v1), 0); } #else // Wraps vaddv_u8 FORCE_INLINE uint8_t _sse2neon_vaddv_u8(uint8x8_t v8) { return vaddv_u8(v8); } #endif #if !defined(__aarch64__) && !defined(_M_ARM64) /* emulate vaddvq u8 variant */ FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) { uint8x8_t tmp = vpadd_u8(vget_low_u8(a), vget_high_u8(a)); uint8_t res = 0; for (int i = 0; i < 8; ++i) res += tmp[i]; return res; } #else // Wraps vaddvq_u8 FORCE_INLINE uint8_t _sse2neon_vaddvq_u8(uint8x16_t a) { return vaddvq_u8(a); } #endif #if !defined(__aarch64__) && !defined(_M_ARM64) /* emulate vaddvq u16 variant */ FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) { uint32x4_t m = vpaddlq_u16(a); uint64x2_t n = vpaddlq_u32(m); uint64x1_t o = vget_low_u64(n) + vget_high_u64(n); return vget_lane_u32((uint32x2_t) o, 0); } #else // Wraps vaddvq_u16 FORCE_INLINE uint16_t _sse2neon_vaddvq_u16(uint16x8_t a) { return vaddvq_u16(a); } #endif /* 
Function Naming Conventions
 * The naming convention of SSE intrinsics is straightforward. A generic SSE
 * intrinsic function is given as follows:
 *   _mm_<name>_<type>
 *
 * The parts of this format are given as follows:
 * 1. <name> describes the operation performed by the intrinsic
 * 2. <type> identifies the data type of the function's primary arguments
 *
 * This last part, <type>, is a little complicated. It identifies the
 * content of the input values, and can be set to any of the following values:
 * + ps - vectors contain floats (ps stands for packed single-precision)
 * + pd - vectors contain doubles (pd stands for packed double-precision)
 * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *   signed integers
 * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit
 *   unsigned integers
 * + si128 - unspecified 128-bit vector or 256-bit vector
 * + m128/m128i/m128d - identifies input vector types when they are different
 *   than the type of the returned vector
 *
 * For example, _mm_setzero_ps. The _mm implies that the function returns
 * a 128-bit vector. The _ps at the end implies that the argument vectors
 * contain floats.
 *
 * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8)
 *   // Set packed 16-bit integers. 128 bits, 8 short, per 16 bits
 *   __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8);
 *   // Set packed 8-bit integers
 *   // 128 bits, 16 chars, per 8 bits
 *   __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11,
 *                                  4, 5, 12, 13, 6, 7, 14, 15);
 *   // Shuffle packed 8-bit integers
 *   __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb
 */

/* Constants for use with _mm_prefetch. */
enum _mm_hint {
    _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */
    _MM_HINT_T0 = 1,  /* load data to L1 and L2 cache */
    _MM_HINT_T1 = 2,  /* load data to L2 cache only */
    _MM_HINT_T2 = 3,  /* load data to L2 cache only, mark it as NTA */
};

// The bit field mapping to the FPCR(floating-point control register)
typedef struct {
    uint16_t res0;
    uint8_t res1 : 6;
    uint8_t bit22 : 1;
    uint8_t bit23 : 1;
    uint8_t bit24 : 1;
    uint8_t res2 : 7;
#if defined(__aarch64__) || defined(_M_ARM64)
    uint32_t res3;
#endif
} fpcr_bitfield;

// Takes the upper 64 bits of a and places it in the low end of the result
// Takes the lower 64 bits of b and places it into the high end of the result.
FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b)
{
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(a32, b10));
}

// takes the lower two 32-bit values from a and swaps them and places in low
// end of result; takes the higher two 32 bit values from b and swaps them and
// places in high end of result.
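// Illustrative mapping: the function below yields dst = { a[1], a[0], b[3],
// b[2] } (low to high), i.e. the lane pattern selected by the
// _MM_SHUFFLE(2, 3, 0, 1) control of _mm_shuffle_ps.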
FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); } FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) { float32x2_t a21 = vget_high_f32( vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); float32x2_t b03 = vget_low_f32( vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); } FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) { float32x2_t a03 = vget_low_f32( vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); float32x2_t b21 = vget_high_f32( vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); } FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); } FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); } FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); } // keeps the low 64 bits of b in the low and puts the high 64 bits of a in the // high FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); } FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) { float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); } FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) { float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); } FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) { float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t b22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); } FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) { float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); } FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) { float32x2_t a33 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); } FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) { float32x2_t a10 = 
vget_low_f32(vreinterpretq_f32_m128(a)); float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); } FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32_t b2 = vgetq_lane_f32(b, 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); } FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32_t b2 = vgetq_lane_f32(b, 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); } // For MSVC, we check only if it is ARM64, as every single ARM64 processor // supported by WoA has crypto extensions. If this changes in the future, // this can be verified via the runtime-only method of: // IsProcessorFeaturePresent(PF_ARM_V8_CRYPTO_INSTRUCTIONS_AVAILABLE) #if (defined(_M_ARM64) && !defined(__clang__)) || \ (defined(__ARM_FEATURE_CRYPTO) && \ (defined(__aarch64__) || __has_builtin(__builtin_arm_crypto_vmullp64))) // Wraps vmull_p64 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); #if defined(_MSC_VER) __n64 a1 = {a}, b1 = {b}; return vreinterpretq_u64_p128(vmull_p64(a1, b1)); #else return vreinterpretq_u64_p128(vmull_p64(a, b)); #endif } #else // ARMv7 polyfill // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. // // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a // 64-bit->128-bit polynomial multiply. // // It needs some work and is somewhat slow, but it is still faster than all // known scalar methods. 
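// For intuition, a small carry-less (GF(2) polynomial) multiply:
//   0b0110 * 0b0011 = (0b0110 << 1) ^ (0b0110 << 0) = 0b1100 ^ 0b0110 = 0b1010
// Partial products are combined with XOR instead of addition, which is what
// the byte-wise vmull_p8 products below emulate for the 64x64-bit case.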
// // Algorithm adapted to C from // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted // from "Fast Software Polynomial Multiplication on ARM Processors Using the // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab // (https://hal.inria.fr/hal-01506572) static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly8x8_t a = vreinterpret_p8_u64(_a); poly8x8_t b = vreinterpret_p8_u64(_b); // Masks uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), vcreate_u8(0x00000000ffffffff)); uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), vcreate_u8(0x0000000000000000)); // Do the multiplies, rotating with vext to get all combinations uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 // Add cross products uint8x16_t l = veorq_u8(e, f); // L = E + F uint8x16_t m = veorq_u8(g, h); // M = G + H uint8x16_t n = veorq_u8(i, j); // N = I + J // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL // instructions. #if defined(__aarch64__) uint8x16_t lm_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t lm_p1 = vreinterpretq_u8_u64( vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t nk_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); uint8x16_t nk_p1 = vreinterpretq_u8_u64( vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); #else uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); #endif // t0 = (L) (P0 + P1) << 8 // t1 = (M) (P2 + P3) << 16 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); // t2 = (N) (P4 + P5) << 24 // t3 = (K) (P6 + P7) << 32 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); // De-interleave #if defined(__aarch64__) uint8x16_t t0 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t1 = vreinterpretq_u8_u64( vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t2 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); uint8x16_t t3 = vreinterpretq_u8_u64( vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); #else uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); #endif // Shift the cross products uint8x16_t t0_shift 
= vextq_u8(t0, t0, 15); // t0 << 8 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 // Accumulate the products uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); uint8x16_t mix = veorq_u8(d, cross1); uint8x16_t r = veorq_u8(mix, cross2); return vreinterpretq_u64_u8(r); } #endif // ARMv7 polyfill // C equivalent: // __m128i _mm_shuffle_epi32_default(__m128i a, // __constrange(0, 255) int imm) { // __m128i ret; // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; // return ret; // } #define _mm_shuffle_epi32_default(a, imm) \ vreinterpretq_m128i_s32(vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ vsetq_lane_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), \ ((imm) >> 2) & 0x3), \ vmovq_n_s32(vgetq_lane_s32( \ vreinterpretq_s32_m128i(a), (imm) & (0x3))), \ 1), \ 2), \ 3)) // Takes the upper 64 bits of a and places it in the low end of the result // Takes the lower 64 bits of a and places it into the high end of the result. FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) { int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); } // takes the lower two 32-bit values from a and swaps them and places in low end // of result takes the higher two 32 bit values from a and swaps them and places // in high end of result. FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); } // rotates the least significant 32 bits into the most significant 32 bits, and // shifts the rest down FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) { return vreinterpretq_m128i_s32( vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); } // rotates the most significant 32 bits into the least significant 32 bits, and // shifts the rest up FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) { return vreinterpretq_m128i_s32( vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); } // gets the lower 64 bits of a, and places it in the upper 64 bits // gets the lower 64 bits of a and places it in the lower 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) { int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); } // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); } // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and // places it in the lower 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); return 
vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); } FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) { int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); } FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) { int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); } FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) { int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); } #if defined(__aarch64__) || defined(_M_ARM64) #define _mm_shuffle_epi32_splat(a, imm) \ vreinterpretq_m128i_s32(vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))) #else #define _mm_shuffle_epi32_splat(a, imm) \ vreinterpretq_m128i_s32( \ vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))) #endif // NEON does not support a general purpose permute intrinsic. // Shuffle single-precision (32-bit) floating-point elements in a using the // control in imm8, and store the results in dst. // // C equivalent: // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, // __constrange(0, 255) int imm) { // __m128 ret; // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; // return ret; // } // // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_ps #define _mm_shuffle_ps_default(a, b, imm) \ vreinterpretq_m128_f32(vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ vmovq_n_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))), \ 1), \ 2), \ 3)) // Shuffle 16-bit integers in the low 64 bits of a using the control in imm8. // Store the results in the low 64 bits of dst, with the high 64 bits being // copied from a to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflelo_epi16 #define _mm_shufflelo_epi16_function(a, imm) \ _sse2neon_define1( \ __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ int16x4_t lowBits = vget_low_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ 1); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ 2); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ 3); \ _sse2neon_return(vreinterpretq_m128i_s16(ret));) // Shuffle 16-bit integers in the high 64 bits of a using the control in imm8. // Store the results in the high 64 bits of dst, with the low 64 bits being // copied from a to dst. 
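// Usage sketch (illustrative), assuming _mm_set_epi16 and the user-facing
// _mm_shufflehi_epi16 macro defined elsewhere in this header as in upstream
// sse2neon:
//   __m128i v = _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0); // words 0..7, low to high
//   __m128i r = _mm_shufflehi_epi16(v, _MM_SHUFFLE(0, 1, 2, 3));
//   // r = { 0, 1, 2, 3, 7, 6, 5, 4 }: low words untouched, high words reversed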
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shufflehi_epi16 #define _mm_shufflehi_epi16_function(a, imm) \ _sse2neon_define1( \ __m128i, a, int16x8_t ret = vreinterpretq_s16_m128i(_a); \ int16x4_t highBits = vget_high_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ 5); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ 6); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ 7); \ _sse2neon_return(vreinterpretq_m128i_s16(ret));) /* MMX */ //_mm_empty is a no-op on arm FORCE_INLINE void _mm_empty(void) {} /* SSE */ // Add packed single-precision (32-bit) floating-point elements in a and b, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ps FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Add the lower single-precision (32-bit) floating-point element in a and b, // store the result in the lower element of dst, and copy the upper 3 packed // elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_ss FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) { float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); // the upper values in the result must be the remnants of . return vreinterpretq_m128_f32(vaddq_f32(a, value)); } // Compute the bitwise AND of packed single-precision (32-bit) floating-point // elements in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_ps FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } // Compute the bitwise NOT of packed single-precision (32-bit) floating-point // elements in a and then AND with b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_ps FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vbicq_s32(vreinterpretq_s32_m128(b), vreinterpretq_s32_m128(a))); // *NOTE* argument swap } // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu16 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) { return vreinterpret_m64_u16( vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); } // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_pu8 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for equality, and store the results in dst. 
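// Usage sketch (illustrative): each result lane is a bit mask (all ones on
// equality, all zeros otherwise), so a branchless select can be built from
// the bitwise helpers in this header:
//   __m128 m = _mm_cmpeq_ps(a, b);
//   __m128 r = _mm_or_ps(_mm_and_ps(m, x), _mm_andnot_ps(m, y)); // m ? x : y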
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ps FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for equality, store the result in the lower element of dst, and copy the // upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_ss FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for greater-than-or-equal, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ps FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for greater-than-or-equal, store the result in the lower element of dst, // and copy the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_ss FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpge_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for greater-than, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ps FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for greater-than, store the result in the lower element of dst, and copy // the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_ss FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for less-than-or-equal, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ps FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for less-than-or-equal, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_ss FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmple_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for less-than, and store the results in dst. 
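// Note: like the other ordered comparisons above, a NaN in either operand
// produces an all-zero lane here, since NEON vclt/vcle also treat comparisons
// involving NaN as false.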
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ps FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for less-than, store the result in the lower element of dst, and copy the // upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_ss FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmplt_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for not-equal, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ps FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for not-equal, store the result in the lower element of dst, and copy the // upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_ss FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for not-greater-than-or-equal, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ps FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for not-greater-than-or-equal, store the result in the lower element of // dst, and copy the upper 3 packed elements from a to the upper elements of // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_ss FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnge_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for not-greater-than, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ps FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for not-greater-than, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_ss FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpngt_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for not-less-than-or-equal, and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ps FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for not-less-than-or-equal, store the result in the lower element of dst, // and copy the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_ss FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnle_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // for not-less-than, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ps FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b for not-less-than, store the result in the lower element of dst, and copy // the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_ss FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpnlt_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // to see if neither is NaN, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ps // // See also: // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) { // Note: NEON does not have ordered compare builtin // Need to compare a eq a and b eq b to check for NaN // Do AND of results to get final uint32x4_t ceqaa = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t ceqbb = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b to see if neither is NaN, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_ss FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpord_ps(a, b)); } // Compare packed single-precision (32-bit) floating-point elements in a and b // to see if either is NaN, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ps FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) { uint32x4_t f32a = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t f32b = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b to see if either is NaN, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. 
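// Illustrative usage sketch (not part of sse2neon): _mm_cmpunord_ps flags
// lanes where either operand is NaN, so comparing a vector against itself
// yields a per-lane "is NaN" mask. Hypothetical helper name.
#if 0 /* example only */
static inline __m128 example_isnan_ps(__m128 v)
{
    // All-ones in lanes holding NaN, all-zeros elsewhere.
    return _mm_cmpunord_ps(v, v);
}
#endif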
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_ss FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); } // Compare the lower single-precision (32-bit) floating-point element in a and b // for equality, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_ss FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) { uint32x4_t a_eq_b = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_eq_b, 0) & 0x1; } // Compare the lower single-precision (32-bit) floating-point element in a and b // for greater-than-or-equal, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_ss FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) { uint32x4_t a_ge_b = vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_ge_b, 0) & 0x1; } // Compare the lower single-precision (32-bit) floating-point element in a and b // for greater-than, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_ss FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) { uint32x4_t a_gt_b = vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_gt_b, 0) & 0x1; } // Compare the lower single-precision (32-bit) floating-point element in a and b // for less-than-or-equal, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_ss FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) { uint32x4_t a_le_b = vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_le_b, 0) & 0x1; } // Compare the lower single-precision (32-bit) floating-point element in a and b // for less-than, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_ss FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) { uint32x4_t a_lt_b = vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return vgetq_lane_u32(a_lt_b, 0) & 0x1; } // Compare the lower single-precision (32-bit) floating-point element in a and b // for not-equal, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_ss FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) { return !_mm_comieq_ss(a, b); } // Convert packed signed 32-bit integers in b to packed single-precision // (32-bit) floating-point elements, store the results in the lower 2 elements // of dst, and copy the upper 2 packed elements from a to the upper elements of // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_pi2ps FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) { return vreinterpretq_m128_f32( vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), vget_high_f32(vreinterpretq_f32_m128(a)))); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ps2pi FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) { #if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpret_m64_s32( vget_low_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))))); #else return vreinterpret_m64_s32(vcvt_s32_f32(vget_low_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION))))); #endif } // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_si2ss FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) { return vreinterpretq_m128_f32( vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvt_ss2si FORCE_INLINE int _mm_cvt_ss2si(__m128 a) { #if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vgetq_lane_s32(vcvtnq_s32_f32(vrndiq_f32(vreinterpretq_f32_m128(a))), 0); #else float32_t data = vgetq_lane_f32( vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0); return (int32_t) data; #endif } // Convert packed 16-bit integers in a to packed single-precision (32-bit) // floating-point elements, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi16_ps FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) { return vreinterpretq_m128_f32( vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); } // Convert packed 32-bit integers in b to packed single-precision (32-bit) // floating-point elements, store the results in the lower 2 elements of dst, // and copy the upper 2 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_ps FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) { return vreinterpretq_m128_f32( vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), vget_high_f32(vreinterpretq_f32_m128(a)))); } // Convert packed signed 32-bit integers in a to packed single-precision // (32-bit) floating-point elements, store the results in the lower 2 elements // of dst, then convert the packed signed 32-bit integers in b to // single-precision (32-bit) floating-point element, and store the results in // the upper 2 elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32x2_ps FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) { return vreinterpretq_m128_f32(vcvtq_f32_s32( vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); } // Convert the lower packed 8-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi8_ps FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) { return vreinterpretq_m128_f32(vcvtq_f32_s32( vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 16-bit integers, and store the results in dst. 
Note: this intrinsic // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and // 0x7FFFFFFF. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi16 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) { return vreinterpret_m64_s16( vqmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi32 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) // Convert packed single-precision (32-bit) floating-point elements in a to // packed 8-bit integers, and store the results in lower 4 elements of dst. // Note: this intrinsic will generate 0x7F, rather than 0x80, for input values // between 0x7F and 0x7FFFFFFF. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pi8 FORCE_INLINE __m64 _mm_cvtps_pi8(__m128 a) { return vreinterpret_m64_s8(vqmovn_s16( vcombine_s16(vreinterpret_s16_m64(_mm_cvtps_pi16(a)), vdup_n_s16(0)))); } // Convert packed unsigned 16-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu16_ps FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) { return vreinterpretq_m128_f32( vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); } // Convert the lower packed unsigned 8-bit integers in a to packed // single-precision (32-bit) floating-point elements, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpu8_ps FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) { return vreinterpretq_m128_f32(vcvtq_f32_u32( vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); } // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_ss #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) // Convert the signed 64-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_ss FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) { return vreinterpretq_m128_f32( vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } // Copy the lower single-precision (32-bit) floating-point element of a to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_f32 FORCE_INLINE float _mm_cvtss_f32(__m128 a) { return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si32 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_si64
FORCE_INLINE int64_t _mm_cvtss_si64(__m128 a)
{
#if (defined(__aarch64__) || defined(_M_ARM64)) || \
    defined(__ARM_FEATURE_DIRECTED_ROUNDING)
    return (int64_t) vgetq_lane_f32(vrndiq_f32(vreinterpretq_f32_m128(a)), 0);
#else
    float32_t data = vgetq_lane_f32(
        vreinterpretq_f32_m128(_mm_round_ps(a, _MM_FROUND_CUR_DIRECTION)), 0);
    return (int64_t) data;
#endif
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ps2pi
FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a)
{
    return vreinterpret_m64_s32(
        vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))));
}

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtt_ss2si
FORCE_INLINE int _mm_cvtt_ss2si(__m128 a)
{
    return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0);
}

// Convert packed single-precision (32-bit) floating-point elements in a to
// packed 32-bit integers with truncation, and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_pi32
#define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 32-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si32
#define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a)

// Convert the lower single-precision (32-bit) floating-point element in a to a
// 64-bit integer with truncation, and store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttss_si64
FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a)
{
    return (int64_t) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
}

// Divide packed single-precision (32-bit) floating-point elements in a by
// packed elements in b, and store the results in dst.
// Because ARMv7-A NEON has no precise division instruction, we implement
// division there by estimating b's reciprocal, refining the estimate with
// Newton-Raphson iterations, and multiplying a by the result.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ps
FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_f32(
        vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
#else
    float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b));
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b)));
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip));
#endif
}

// Divide the lower single-precision (32-bit) floating-point element in a by the
// lower single-precision (32-bit) floating-point element in b, store the result
// in the lower element of dst, and copy the upper 3 packed elements from a to
// the upper elements of dst.
// Warning: on ARMv7-A this does not produce the same result as Intel hardware
// and is not IEEE-compliant.
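// Illustrative usage sketch (not part of sse2neon): the cvt/cvtt pairs above
// differ only in rounding -- _mm_cvt_ss2si honours the current rounding mode,
// while _mm_cvtt_ss2si truncates toward zero. Hypothetical helper name;
// assumes the remainder of this header.
#if 0 /* example only */
static inline void example_round_vs_truncate(void)
{
    __m128 v = _mm_set_ss(2.7f);
    int rounded = _mm_cvt_ss2si(v);    // 3 under the default round-to-nearest
    int truncated = _mm_cvtt_ss2si(v); // always 2
    (void) rounded;
    (void) truncated;
}
#endif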
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_ss FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_pi16 #define _mm_extract_pi16(a, imm) \ (int32_t) vget_lane_u16(vreinterpret_u16_m64(a), (imm)) // Free aligned memory that was allocated with _mm_malloc. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_free #if !defined(SSE2NEON_ALLOC_DEFINED) FORCE_INLINE void _mm_free(void *addr) { free(addr); } #endif FORCE_INLINE uint64_t _sse2neon_get_fpcr(void) { uint64_t value; #if defined(_MSC_VER) value = _ReadStatusReg(ARM64_FPCR); #else __asm__ __volatile__("mrs %0, FPCR" : "=r"(value)); /* read */ #endif return value; } FORCE_INLINE void _sse2neon_set_fpcr(uint64_t value) { #if defined(_MSC_VER) _WriteStatusReg(ARM64_FPCR, value); #else __asm__ __volatile__("msr FPCR, %0" ::"r"(value)); /* write */ #endif } // Macro: Get the flush zero bits from the MXCSR control and status register. // The flush zero may contain any of the following flags: _MM_FLUSH_ZERO_ON or // _MM_FLUSH_ZERO_OFF // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_FLUSH_ZERO_MODE FORCE_INLINE unsigned int _sse2neon_mm_get_flush_zero_mode(void) { union { fpcr_bitfield field; #if defined(__aarch64__) || defined(_M_ARM64) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) || defined(_M_ARM64) r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif return r.field.bit24 ? _MM_FLUSH_ZERO_ON : _MM_FLUSH_ZERO_OFF; } // Macro: Get the rounding mode bits from the MXCSR control and status register. // The rounding mode may contain any of the following flags: _MM_ROUND_NEAREST, // _MM_ROUND_DOWN, _MM_ROUND_UP, _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_GET_ROUNDING_MODE FORCE_INLINE unsigned int _MM_GET_ROUNDING_MODE(void) { union { fpcr_bitfield field; #if defined(__aarch64__) || defined(_M_ARM64) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) || defined(_M_ARM64) r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif if (r.field.bit22) { return r.field.bit23 ? _MM_ROUND_TOWARD_ZERO : _MM_ROUND_UP; } else { return r.field.bit23 ? _MM_ROUND_DOWN : _MM_ROUND_NEAREST; } } // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_pi16 #define _mm_insert_pi16(a, b, imm) \ vreinterpret_m64_s16(vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))) // Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point // elements) from memory into dst. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. 
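// Illustrative usage sketch (not part of sse2neon): the emulated MXCSR queries
// above read FPCR on AArch64 and FPSCR on AArch32, as implemented.
// Hypothetical helper name.
#if 0 /* example only */
static inline int example_rounds_to_nearest(void)
{
    // Non-zero when the FPU is in the default round-to-nearest mode.
    return _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST;
}
#endif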
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps FORCE_INLINE __m128 _mm_load_ps(const float *p) { return vreinterpretq_m128_f32(vld1q_f32(p)); } // Load a single-precision (32-bit) floating-point element from memory into all // elements of dst. // // dst[31:0] := MEM[mem_addr+31:mem_addr] // dst[63:32] := MEM[mem_addr+31:mem_addr] // dst[95:64] := MEM[mem_addr+31:mem_addr] // dst[127:96] := MEM[mem_addr+31:mem_addr] // // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ps1 #define _mm_load_ps1 _mm_load1_ps // Load a single-precision (32-bit) floating-point element from memory into the // lower of dst, and zero the upper 3 elements. mem_addr does not need to be // aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_ss FORCE_INLINE __m128 _mm_load_ss(const float *p) { return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); } // Load a single-precision (32-bit) floating-point element from memory into all // elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_ps FORCE_INLINE __m128 _mm_load1_ps(const float *p) { return vreinterpretq_m128_f32(vld1q_dup_f32(p)); } // Load 2 single-precision (32-bit) floating-point elements from memory into the // upper 2 elements of dst, and copy the lower 2 elements from a to dst. // mem_addr does not need to be aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pi FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); } // Load 2 single-precision (32-bit) floating-point elements from memory into the // lower 2 elements of dst, and copy the upper 2 elements from a to dst. // mem_addr does not need to be aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pi FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); } // Load 4 single-precision (32-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_ps FORCE_INLINE __m128 _mm_loadr_ps(const float *p) { float32x4_t v = vrev64q_f32(vld1q_f32(p)); return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); } // Load 128-bits (composed of 4 packed single-precision (32-bit) floating-point // elements) from memory into dst. mem_addr does not need to be aligned on any // particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_ps FORCE_INLINE __m128 _mm_loadu_ps(const float *p) { // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are // equivalent for neon return vreinterpretq_m128_f32(vld1q_f32(p)); } // Load unaligned 16-bit integer from memory into the first element of dst. 
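// Illustrative usage sketch (not part of sse2neon): as the comment above
// notes, vld1q_f32 has no alignment requirement, so the aligned and unaligned
// load variants compile to the same instruction here; the distinction is kept
// only for x86 API compatibility. Hypothetical helper name.
#if 0 /* example only */
static inline __m128 example_load_xyzw(const float xyzw[4])
{
    // Loads xyzw[0]..xyzw[3] into lanes 0..3 (lowest address -> lane 0).
    return _mm_loadu_ps(xyzw);
}
#endif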
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si16 FORCE_INLINE __m128i _mm_loadu_si16(const void *p) { return vreinterpretq_m128i_s16( vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); } // Load unaligned 64-bit integer from memory into the first element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si64 FORCE_INLINE __m128i _mm_loadu_si64(const void *p) { return vreinterpretq_m128i_s64( vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); } // Allocate size bytes of memory, aligned to the alignment specified in align, // and return a pointer to the allocated memory. _mm_free should be used to free // memory that is allocated with _mm_malloc. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_malloc #if !defined(SSE2NEON_ALLOC_DEFINED) FORCE_INLINE void *_mm_malloc(size_t size, size_t align) { void *ptr; if (align == 1) return malloc(size); if (align == 2 || (sizeof(void *) == 8 && align == 4)) align = sizeof(void *); if (!posix_memalign(&ptr, align, size)) return ptr; return NULL; } #endif // Conditionally store 8-bit integer elements from a into memory using mask // (elements are not stored when the highest bit is not set in the corresponding // element) and a non-temporal memory hint. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmove_si64 FORCE_INLINE void _mm_maskmove_si64(__m64 a, __m64 mask, char *mem_addr) { int8x8_t shr_mask = vshr_n_s8(vreinterpret_s8_m64(mask), 7); __m128 b = _mm_load_ps((const float *) mem_addr); int8x8_t masked = vbsl_s8(vreinterpret_u8_s8(shr_mask), vreinterpret_s8_m64(a), vreinterpret_s8_u64(vget_low_u64(vreinterpretq_u64_m128(b)))); vst1_s8((int8_t *) mem_addr, masked); } // Conditionally store 8-bit integer elements from a into memory using mask // (elements are not stored when the highest bit is not set in the corresponding // element) and a non-temporal memory hint. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_maskmovq #define _m_maskmovq(a, mask, mem_addr) _mm_maskmove_si64(a, mask, mem_addr) // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pi16 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Compare packed single-precision (32-bit) floating-point elements in a and b, // and store packed maximum values in dst. dst does not follow the IEEE Standard // for Floating-Point Arithmetic (IEEE 754) maximum value when inputs are NaN or // signed-zero values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ps FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX float32x4_t _a = vreinterpretq_f32_m128(a); float32x4_t _b = vreinterpretq_f32_m128(b); return vreinterpretq_m128_f32(vbslq_f32(vcgtq_f32(_a, _b), _a, _b)); #else return vreinterpretq_m128_f32( vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #endif } // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. 
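// Illustrative usage sketch (not part of sse2neon): memory obtained from
// _mm_malloc above is paired with _mm_free; in the default implementation
// these reduce to posix_memalign/malloc and free. Hypothetical helper name.
#if 0 /* example only */
static inline float *example_alloc_vec4_array(size_t count)
{
    // 16-byte alignment mirrors the usual x86 requirement for __m128 data.
    return (float *) _mm_malloc(count * 4 * sizeof(float), 16);
    // ...release later with _mm_free().
}
#endif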
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pu8 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b, store the maximum value in the lower element of dst, and copy the upper 3 // packed elements from a to the upper element of dst. dst does not follow the // IEEE Standard for Floating-Point Arithmetic (IEEE 754) maximum value when // inputs are NaN or signed-zero values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_ss FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pi16 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Compare packed single-precision (32-bit) floating-point elements in a and b, // and store packed minimum values in dst. dst does not follow the IEEE Standard // for Floating-Point Arithmetic (IEEE 754) minimum value when inputs are NaN or // signed-zero values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ps FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX float32x4_t _a = vreinterpretq_f32_m128(a); float32x4_t _b = vreinterpretq_f32_m128(b); return vreinterpretq_m128_f32(vbslq_f32(vcltq_f32(_a, _b), _a, _b)); #else return vreinterpretq_m128_f32( vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #endif } // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pu8 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Compare the lower single-precision (32-bit) floating-point elements in a and // b, store the minimum value in the lower element of dst, and copy the upper 3 // packed elements from a to the upper element of dst. dst does not follow the // IEEE Standard for Floating-Point Arithmetic (IEEE 754) minimum value when // inputs are NaN or signed-zero values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_ss FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Move the lower single-precision (32-bit) floating-point element from b to the // lower element of dst, and copy the upper 3 packed elements from a to the // upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_ss FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), vreinterpretq_f32_m128(a), 0)); } // Move the upper 2 single-precision (32-bit) floating-point elements from b to // the lower 2 elements of dst, and copy the upper 2 elements from a to the // upper 2 elements of dst. 
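// Illustrative usage sketch (not part of sse2neon): min/max above make a
// branch-free clamp; see the SSE2NEON_PRECISE_MINMAX note regarding NaN and
// signed-zero behaviour. Hypothetical helper name.
#if 0 /* example only */
static inline __m128 example_clamp_ps(__m128 v, __m128 lo, __m128 hi)
{
    // Clamp every lane of v into [lo, hi].
    return _mm_min_ps(_mm_max_ps(v, lo), hi);
}
#endif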
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehl_ps
FORCE_INLINE __m128 _mm_movehl_ps(__m128 a, __m128 b)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128_u64(
        vzip2q_u64(vreinterpretq_u64_m128(b), vreinterpretq_u64_m128(a)));
#else
    float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a));
    float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b));
    return vreinterpretq_m128_f32(vcombine_f32(b32, a32));
#endif
}

// Move the lower 2 single-precision (32-bit) floating-point elements from b to
// the upper 2 elements of dst, and copy the lower 2 elements from a to the
// lower 2 elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movelh_ps
FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B)
{
    float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A));
    float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B));
    return vreinterpretq_m128_f32(vcombine_f32(a10, b10));
}

// Create mask from the most significant bit of each 8-bit element in a, and
// store the result in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pi8
FORCE_INLINE int _mm_movemask_pi8(__m64 a)
{
    uint8x8_t input = vreinterpret_u8_m64(a);
#if defined(__aarch64__) || defined(_M_ARM64)
    static const int8_t shift[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint8x8_t tmp = vshr_n_u8(input, 7);
    return vaddv_u8(vshl_u8(tmp, vld1_s8(shift)));
#else
    // Refer to the implementation of `_mm_movemask_epi8`
    uint16x4_t high_bits = vreinterpret_u16_u8(vshr_n_u8(input, 7));
    uint32x2_t paired16 =
        vreinterpret_u32_u16(vsra_n_u16(high_bits, high_bits, 7));
    uint8x8_t paired32 =
        vreinterpret_u8_u32(vsra_n_u32(paired16, paired16, 14));
    return vget_lane_u8(paired32, 0) | ((int) vget_lane_u8(paired32, 4) << 4);
#endif
}

// Set each bit of mask dst based on the most significant bit of the
// corresponding packed single-precision (32-bit) floating-point element in a.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_ps
FORCE_INLINE int _mm_movemask_ps(__m128 a)
{
    uint32x4_t input = vreinterpretq_u32_m128(a);
#if defined(__aarch64__) || defined(_M_ARM64)
    static const int32_t shift[4] = {0, 1, 2, 3};
    uint32x4_t tmp = vshrq_n_u32(input, 31);
    return vaddvq_u32(vshlq_u32(tmp, vld1q_s32(shift)));
#else
    // Uses the exact same method as _mm_movemask_epi8, see that for details.
    // Shift out everything but the sign bits with a 32-bit unsigned shift
    // right.
    uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31));
    // Merge the two pairs together with a 64-bit unsigned shift right + add.
    uint8x16_t paired =
        vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31));
    // Extract the result.
    return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2);
#endif
}

// Multiply packed single-precision (32-bit) floating-point elements in a and b,
// and store the results in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ps
FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b)
{
    return vreinterpretq_m128_f32(
        vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)));
}

// Multiply the lower single-precision (32-bit) floating-point element in a and
// b, store the result in the lower element of dst, and copy the upper 3 packed
// elements from a to the upper elements of dst.
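// Illustrative usage sketch (not part of sse2neon): _mm_movemask_ps above
// packs the four sign bits into bits 0..3 of an int, the usual way to turn a
// lane mask into a scalar branch. Hypothetical helper name.
#if 0 /* example only */
static inline int example_any_lane_negative(__m128 v)
{
    // Non-zero when at least one lane of v has its sign bit set.
    return _mm_movemask_ps(v) != 0;
}
#endif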
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_ss FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_mul_ps(a, b)); } // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_pu16 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) { return vreinterpret_m64_u16(vshrn_n_u32( vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); } // Compute the bitwise OR of packed single-precision (32-bit) floating-point // elements in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_ps FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgb #define _m_pavgb(a, b) _mm_avg_pu8(a, b) // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pavgw #define _m_pavgw(a, b) _mm_avg_pu16(a, b) // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pextrw #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_pinsrw #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxsw #define _m_pmaxsw(a, b) _mm_max_pi16(a, b) // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmaxub #define _m_pmaxub(a, b) _mm_max_pu8(a, b) // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminsw #define _m_pminsw(a, b) _mm_min_pi16(a, b) // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pminub #define _m_pminub(a, b) _mm_min_pu8(a, b) // Create mask from the most significant bit of each 8-bit element in a, and // store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmovmskb #define _m_pmovmskb(a) _mm_movemask_pi8(a) // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pmulhuw #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) // Fetch the line of data from memory that contains address p to a location in // the cache hierarchy specified by the locality hint i. 
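// Illustrative usage sketch (not part of sse2neon): _mm_mulhi_pu16 above keeps
// the high 16 bits of each 32-bit product, which amounts to scaling unsigned
// 16-bit lanes by a 0.16 fixed-point factor. Hypothetical helper name.
#if 0 /* example only */
static inline __m64 example_scale_u16(__m64 v, __m64 scale_q0_16)
{
    // Each lane becomes (v * scale) >> 16, i.e. v scaled by a fraction in [0, 1).
    return _mm_mulhi_pu16(v, scale_q0_16);
}
#endif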
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_prefetch FORCE_INLINE void _mm_prefetch(char const *p, int i) { (void) i; #if defined(_MSC_VER) switch (i) { case _MM_HINT_NTA: __prefetch2(p, 1); break; case _MM_HINT_T0: __prefetch2(p, 0); break; case _MM_HINT_T1: __prefetch2(p, 2); break; case _MM_HINT_T2: __prefetch2(p, 4); break; } #else switch (i) { case _MM_HINT_NTA: __builtin_prefetch(p, 0, 0); break; case _MM_HINT_T0: __builtin_prefetch(p, 0, 3); break; case _MM_HINT_T1: __builtin_prefetch(p, 0, 2); break; case _MM_HINT_T2: __builtin_prefetch(p, 0, 1); break; } #endif } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce four // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=m_psadbw #define _m_psadbw(a, b) _mm_sad_pu8(a, b) // Shuffle 16-bit integers in a using the control in imm8, and store the results // in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_m_pshufw #define _m_pshufw(a, imm) _mm_shuffle_pi16(a, imm) // Compute the approximate reciprocal of packed single-precision (32-bit) // floating-point elements in a, and store the results in dst. The maximum // relative error for this approximation is less than 1.5*2^-12. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ps FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) { float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); return vreinterpretq_m128_f32(recip); } // Compute the approximate reciprocal of the lower single-precision (32-bit) // floating-point element in a, store the result in the lower element of dst, // and copy the upper 3 packed elements from a to the upper elements of dst. The // maximum relative error for this approximation is less than 1.5*2^-12. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rcp_ss FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) { return _mm_move_ss(a, _mm_rcp_ps(a)); } // Compute the approximate reciprocal square root of packed single-precision // (32-bit) floating-point elements in a, and store the results in dst. The // maximum relative error for this approximation is less than 1.5*2^-12. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ps FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) { float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); // Generate masks for detecting whether input has any 0.0f/-0.0f // (which becomes positive/negative infinity by IEEE-754 arithmetic rules). const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); const uint32x4_t neg_inf = vdupq_n_u32(0xFF800000); const uint32x4_t has_pos_zero = vceqq_u32(pos_inf, vreinterpretq_u32_f32(out)); const uint32x4_t has_neg_zero = vceqq_u32(neg_inf, vreinterpretq_u32_f32(out)); out = vmulq_f32( out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); // Set output vector element to infinity/negative-infinity if // the corresponding input vector element is 0.0f/-0.0f. 
out = vbslq_f32(has_pos_zero, (float32x4_t) pos_inf, out); out = vbslq_f32(has_neg_zero, (float32x4_t) neg_inf, out); return vreinterpretq_m128_f32(out); } // Compute the approximate reciprocal square root of the lower single-precision // (32-bit) floating-point element in a, store the result in the lower element // of dst, and copy the upper 3 packed elements from a to the upper elements of // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_rsqrt_ss FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) { return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce four // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_pu8 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) { uint64x1_t t = vpaddl_u32(vpaddl_u16( vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))))); return vreinterpret_m64_u16( vset_lane_u16((int) vget_lane_u64(t, 0), vdup_n_u16(0), 0)); } // Macro: Set the flush zero bits of the MXCSR control and status register to // the value in unsigned 32-bit integer a. The flush zero may contain any of the // following flags: _MM_FLUSH_ZERO_ON or _MM_FLUSH_ZERO_OFF // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_FLUSH_ZERO_MODE FORCE_INLINE void _sse2neon_mm_set_flush_zero_mode(unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. union { fpcr_bitfield field; #if defined(__aarch64__) || defined(_M_ARM64) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) || defined(_M_ARM64) r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif r.field.bit24 = (flag & _MM_FLUSH_ZERO_MASK) == _MM_FLUSH_ZERO_ON; #if defined(__aarch64__) || defined(_M_ARM64) _sse2neon_set_fpcr(r.value); #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } // Set packed single-precision (32-bit) floating-point elements in dst with the // supplied values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Broadcast single-precision (32-bit) floating-point value a to all elements of // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ps1 FORCE_INLINE __m128 _mm_set_ps1(float _w) { return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } // Macro: Set the rounding mode bits of the MXCSR control and status register to // the value in unsigned 32-bit integer a. 
The rounding mode may contain any of // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, // _MM_ROUND_TOWARD_ZERO // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_MM_SET_ROUNDING_MODE FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { union { fpcr_bitfield field; #if defined(__aarch64__) || defined(_M_ARM64) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) || defined(_M_ARM64) r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif switch (rounding) { case _MM_ROUND_TOWARD_ZERO: r.field.bit22 = 1; r.field.bit23 = 1; break; case _MM_ROUND_DOWN: r.field.bit22 = 0; r.field.bit23 = 1; break; case _MM_ROUND_UP: r.field.bit22 = 1; r.field.bit23 = 0; break; default: //_MM_ROUND_NEAREST r.field.bit22 = 0; r.field.bit23 = 0; } #if defined(__aarch64__) || defined(_M_ARM64) _sse2neon_set_fpcr(r.value); #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } // Copy single-precision (32-bit) floating-point element a to the lower element // of dst, and zero the upper 3 elements. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_ss FORCE_INLINE __m128 _mm_set_ss(float a) { return vreinterpretq_m128_f32(vsetq_lane_f32(a, vdupq_n_f32(0), 0)); } // Broadcast single-precision (32-bit) floating-point value a to all elements of // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_ps FORCE_INLINE __m128 _mm_set1_ps(float _w) { return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } // Set the MXCSR control and status register with the value in unsigned 32-bit // integer a. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setcsr // FIXME: _mm_setcsr() implementation supports changing the rounding mode only. FORCE_INLINE void _mm_setcsr(unsigned int a) { _MM_SET_ROUNDING_MODE(a); } // Get the unsigned 32-bit value of the MXCSR control and status register. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_getcsr // FIXME: _mm_getcsr() implementation supports reading the rounding mode only. FORCE_INLINE unsigned int _mm_getcsr(void) { return _MM_GET_ROUNDING_MODE(); } // Set packed single-precision (32-bit) floating-point elements in dst with the // supplied values in reverse order. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_ps FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Return vector of type __m128 with all elements set to zero. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_ps FORCE_INLINE __m128 _mm_setzero_ps(void) { return vreinterpretq_m128_f32(vdupq_n_f32(0)); } // Shuffle 16-bit integers in a using the control in imm8, and store the results // in dst. 
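// Illustrative usage sketch (not part of sse2neon): _mm_set_ps above takes its
// arguments from the highest lane down, while _mm_setr_ps takes them in memory
// order, so the two calls below build identical vectors. Hypothetical helper
// name.
#if 0 /* example only */
static inline void example_set_order(void)
{
    __m128 a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);  // lane 0 = 1.0f, lane 3 = 4.0f
    __m128 b = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f); // same layout as a
    (void) a;
    (void) b;
}
#endif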
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi16 #ifdef _sse2neon_shuffle #define _mm_shuffle_pi16(a, imm) \ vreinterpret_m64_s16(vshuffle_s16( \ vreinterpret_s16_m64(a), vreinterpret_s16_m64(a), (imm & 0x3), \ ((imm >> 2) & 0x3), ((imm >> 4) & 0x3), ((imm >> 6) & 0x3))) #else #define _mm_shuffle_pi16(a, imm) \ _sse2neon_define1( \ __m64, a, int16x4_t ret; \ ret = vmov_n_s16( \ vget_lane_s16(vreinterpret_s16_m64(_a), (imm) & (0x3))); \ ret = vset_lane_s16( \ vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 2) & 0x3), ret, \ 1); \ ret = vset_lane_s16( \ vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 4) & 0x3), ret, \ 2); \ ret = vset_lane_s16( \ vget_lane_s16(vreinterpret_s16_m64(_a), ((imm) >> 6) & 0x3), ret, \ 3); \ _sse2neon_return(vreinterpret_m64_s16(ret));) #endif // Perform a serializing operation on all store-to-memory instructions that were // issued prior to this instruction. Guarantees that every store instruction // that precedes, in program order, is globally visible before any store // instruction which follows the fence in program order. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sfence FORCE_INLINE void _mm_sfence(void) { _sse2neon_smp_mb(); } // Perform a serializing operation on all load-from-memory and store-to-memory // instructions that were issued prior to this instruction. Guarantees that // every memory access that precedes, in program order, the memory fence // instruction is globally visible before any memory instruction which follows // the fence in program order. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mfence FORCE_INLINE void _mm_mfence(void) { _sse2neon_smp_mb(); } // Perform a serializing operation on all load-from-memory instructions that // were issued prior to this instruction. Guarantees that every load instruction // that precedes, in program order, is globally visible before any load // instruction which follows the fence in program order. 
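// Illustrative usage sketch (not part of sse2neon): the imm8 for the shuffle
// above is normally built with _MM_SHUFFLE(d, c, b, a), which selects the
// source lane for destination lanes 3..0. Hypothetical helper name.
#if 0 /* example only */
static inline __m64 example_reverse_pi16(__m64 v)
{
    // Reverse the four 16-bit lanes: dst lane 0 takes src lane 3, and so on.
    return _mm_shuffle_pi16(v, _MM_SHUFFLE(0, 1, 2, 3));
}
#endif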
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lfence FORCE_INLINE void _mm_lfence(void) { _sse2neon_smp_mb(); } // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) // int imm) #ifdef _sse2neon_shuffle #define _mm_shuffle_ps(a, b, imm) \ __extension__({ \ float32x4_t _input1 = vreinterpretq_f32_m128(a); \ float32x4_t _input2 = vreinterpretq_f32_m128(b); \ float32x4_t _shuf = \ vshuffleq_s32(_input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128_f32(_shuf); \ }) #else // generic #define _mm_shuffle_ps(a, b, imm) \ _sse2neon_define2( \ __m128, a, b, __m128 ret; switch (imm) { \ case _MM_SHUFFLE(1, 0, 3, 2): \ ret = _mm_shuffle_ps_1032(_a, _b); \ break; \ case _MM_SHUFFLE(2, 3, 0, 1): \ ret = _mm_shuffle_ps_2301(_a, _b); \ break; \ case _MM_SHUFFLE(0, 3, 2, 1): \ ret = _mm_shuffle_ps_0321(_a, _b); \ break; \ case _MM_SHUFFLE(2, 1, 0, 3): \ ret = _mm_shuffle_ps_2103(_a, _b); \ break; \ case _MM_SHUFFLE(1, 0, 1, 0): \ ret = _mm_movelh_ps(_a, _b); \ break; \ case _MM_SHUFFLE(1, 0, 0, 1): \ ret = _mm_shuffle_ps_1001(_a, _b); \ break; \ case _MM_SHUFFLE(0, 1, 0, 1): \ ret = _mm_shuffle_ps_0101(_a, _b); \ break; \ case _MM_SHUFFLE(3, 2, 1, 0): \ ret = _mm_shuffle_ps_3210(_a, _b); \ break; \ case _MM_SHUFFLE(0, 0, 1, 1): \ ret = _mm_shuffle_ps_0011(_a, _b); \ break; \ case _MM_SHUFFLE(0, 0, 2, 2): \ ret = _mm_shuffle_ps_0022(_a, _b); \ break; \ case _MM_SHUFFLE(2, 2, 0, 0): \ ret = _mm_shuffle_ps_2200(_a, _b); \ break; \ case _MM_SHUFFLE(3, 2, 0, 2): \ ret = _mm_shuffle_ps_3202(_a, _b); \ break; \ case _MM_SHUFFLE(3, 2, 3, 2): \ ret = _mm_movehl_ps(_b, _a); \ break; \ case _MM_SHUFFLE(1, 1, 3, 3): \ ret = _mm_shuffle_ps_1133(_a, _b); \ break; \ case _MM_SHUFFLE(2, 0, 1, 0): \ ret = _mm_shuffle_ps_2010(_a, _b); \ break; \ case _MM_SHUFFLE(2, 0, 0, 1): \ ret = _mm_shuffle_ps_2001(_a, _b); \ break; \ case _MM_SHUFFLE(2, 0, 3, 2): \ ret = _mm_shuffle_ps_2032(_a, _b); \ break; \ default: \ ret = _mm_shuffle_ps_default(_a, _b, (imm)); \ break; \ } _sse2neon_return(ret);) #endif // Compute the square root of packed single-precision (32-bit) floating-point // elements in a, and store the results in dst. // Due to ARMv7-A NEON's lack of a precise square root intrinsic, we implement // square root by multiplying input in with its reciprocal square root before // using the Newton-Raphson method to approximate the results. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ps FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); #else float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); // Test for vrsqrteq_f32(0) -> positive infinity case. // Change to zero, so that s * 1/sqrt(s) result is zero too. 
    const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000);
    const uint32x4_t div_by_zero =
        vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip));
    recip = vreinterpretq_f32_u32(
        vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip)));
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);
    // Additional Newton-Raphson iteration for accuracy
    recip = vmulq_f32(
        vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)),
        recip);
    // sqrt(s) = s * 1/sqrt(s)
    return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip));
#endif
}

// Compute the square root of the lower single-precision (32-bit) floating-point
// element in a, store the result in the lower element of dst, and copy the
// upper 3 packed elements from a to the upper elements of dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_ss
FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in)
{
    float32_t value =
        vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0);
    return vreinterpretq_m128_f32(
        vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0));
}

// Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point
// elements) from a into memory. mem_addr must be aligned on a 16-byte boundary
// or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps
FORCE_INLINE void _mm_store_ps(float *p, __m128 a)
{
    vst1q_f32(p, vreinterpretq_f32_m128(a));
}

// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ps1
FORCE_INLINE void _mm_store_ps1(float *p, __m128 a)
{
    float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0);
    vst1q_f32(p, vdupq_n_f32(a0));
}

// Store the lower single-precision (32-bit) floating-point element from a into
// memory. mem_addr does not need to be aligned on any particular boundary.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_ss
FORCE_INLINE void _mm_store_ss(float *p, __m128 a)
{
    vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0);
}

// Store the lower single-precision (32-bit) floating-point element from a into
// 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte
// boundary or a general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store1_ps
#define _mm_store1_ps _mm_store_ps1

// Store the upper 2 single-precision (32-bit) floating-point elements from a
// into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pi
FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_high_f32(a));
}

// Store the lower 2 single-precision (32-bit) floating-point elements from a
// into memory.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pi
FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a)
{
    *p = vreinterpret_m64_f32(vget_low_f32(a));
}

// Store 4 single-precision (32-bit) floating-point elements from a into memory
// in reverse order. mem_addr must be aligned on a 16-byte boundary or a
// general-protection exception may be generated.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_ps FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) { float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); float32x4_t rev = vextq_f32(tmp, tmp, 2); vst1q_f32(p, rev); } // Store 128-bits (composed of 4 packed single-precision (32-bit) floating-point // elements) from a into memory. mem_addr does not need to be aligned on any // particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_ps FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) { vst1q_f32(p, vreinterpretq_f32_m128(a)); } // Stores 16-bits of integer data a at the address p. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si16 FORCE_INLINE void _mm_storeu_si16(void *p, __m128i a) { vst1q_lane_s16((int16_t *) p, vreinterpretq_s16_m128i(a), 0); } // Stores 64-bits of integer data a at the address p. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si64 FORCE_INLINE void _mm_storeu_si64(void *p, __m128i a) { vst1q_lane_s64((int64_t *) p, vreinterpretq_s64_m128i(a), 0); } // Store 64-bits of integer data from a into memory using a non-temporal memory // hint. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pi FORCE_INLINE void _mm_stream_pi(__m64 *p, __m64 a) { vst1_s64((int64_t *) p, vreinterpret_s64_m64(a)); } // Store 128-bits (composed of 4 packed single-precision (32-bit) floating- // point elements) from a into memory using a non-temporal memory hint. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_ps FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, (float32x4_t *) p); #else vst1q_f32(p, vreinterpretq_f32_m128(a)); #endif } // Subtract packed single-precision (32-bit) floating-point elements in b from // packed single-precision (32-bit) floating-point elements in a, and store the // results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ps FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Subtract the lower single-precision (32-bit) floating-point element in b from // the lower single-precision (32-bit) floating-point element in a, store the // result in the lower element of dst, and copy the upper 3 packed elements from // a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_ss FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_sub_ps(a, b)); } // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the // transposed matrix in these vectors (row0 now contains column 0, etc.). 
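// (Illustrative sketch of the NEON mapping used below, assuming each row is a
// float32x4_t: vtrnq_f32(row0, row1) produces {r0[0],r1[0],r0[2],r1[2]} and
// {r0[1],r1[1],r0[3],r1[3]}; combining the low halves of those results with
// the low halves of the row2/row3 transpose yields columns 0 and 1, and the
// high halves yield columns 2 and 3.)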
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=MM_TRANSPOSE4_PS #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ do { \ float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ vget_low_f32(ROW23.val[0])); \ row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ vget_low_f32(ROW23.val[1])); \ row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ vget_high_f32(ROW23.val[0])); \ row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ vget_high_f32(ROW23.val[1])); \ } while (0) // according to the documentation, these intrinsics behave the same as the // non-'u' versions. We'll just alias them here. #define _mm_ucomieq_ss _mm_comieq_ss #define _mm_ucomige_ss _mm_comige_ss #define _mm_ucomigt_ss _mm_comigt_ss #define _mm_ucomile_ss _mm_comile_ss #define _mm_ucomilt_ss _mm_comilt_ss #define _mm_ucomineq_ss _mm_comineq_ss // Return vector of type __m128i with undefined elements. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_undefined_si128 FORCE_INLINE __m128i _mm_undefined_si128(void) { #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128i a; #if defined(_MSC_VER) a = _mm_setzero_si128(); #endif return a; #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif } // Return vector of type __m128 with undefined elements. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_ps FORCE_INLINE __m128 _mm_undefined_ps(void) { #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128 a; #if defined(_MSC_VER) a = _mm_setzero_ps(); #endif return a; #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif } // Unpack and interleave single-precision (32-bit) floating-point elements from // the high half a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_ps FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128_f32( vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); float32x2x2_t result = vzip_f32(a1, b1); return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); #endif } // Unpack and interleave single-precision (32-bit) floating-point elements from // the low half of a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_ps FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128_f32( vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); float32x2x2_t result = vzip_f32(a1, b1); return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); #endif } // Compute the bitwise XOR of packed single-precision (32-bit) floating-point // elements in a and b, and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_ps FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } /* SSE2 */ // Add packed 16-bit integers in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi16 FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Add packed 32-bit integers in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi32 FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Add packed 64-bit integers in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi64 FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) { return vreinterpretq_m128i_s64( vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); } // Add packed 8-bit integers in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_epi8 FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Add packed double-precision (64-bit) floating-point elements in a and b, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_pd FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] + db[0]; c[1] = da[1] + db[1]; return vld1q_f32((float32_t *) c); #endif } // Add the lower double-precision (64-bit) floating-point element in a and b, // store the result in the lower element of dst, and copy the upper element from // a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_sd FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_add_pd(a, b)); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] + db[0]; c[1] = da[1]; return vld1q_f32((float32_t *) c); #endif } // Add 64-bit integers a and b, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_add_si64 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) { return vreinterpret_m64_s64( vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); } // Add packed signed 16-bit integers in a and b using saturation, and store the // results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi16 FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Add packed signed 8-bit integers in a and b using saturation, and store the // results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epi8 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Add packed unsigned 16-bit integers in a and b using saturation, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu16 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Add packed unsigned 8-bit integers in a and b using saturation, and store the // results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_adds_epu8 FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Compute the bitwise AND of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_pd FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_and_si128 FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compute the bitwise NOT of packed double-precision (64-bit) floating-point // elements in a and then AND with b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_pd FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) { // *NOTE* argument swap return vreinterpretq_m128d_s64( vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); } // Compute the bitwise NOT of 128 bits (representing integer data) in a and then // AND with b, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_andnot_si128 FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vbicq_s32(vreinterpretq_s32_m128i(b), vreinterpretq_s32_m128i(a))); // *NOTE* argument swap } // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu16 FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) { return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); } // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_avg_epu8 FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Shift a left by imm8 bytes while shifting in zeros, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bslli_si128 #define _mm_bslli_si128(a, imm) _mm_slli_si128(a, imm) // Shift a right by imm8 bytes while shifting in zeros, and store the results in // dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_bsrli_si128
#define _mm_bsrli_si128(a, imm) _mm_srli_si128(a, imm)

// Cast vector of type __m128d to type __m128. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_ps
FORCE_INLINE __m128 _mm_castpd_ps(__m128d a)
{
    return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a));
}

// Cast vector of type __m128d to type __m128i. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castpd_si128
FORCE_INLINE __m128i _mm_castpd_si128(__m128d a)
{
    return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a));
}

// Cast vector of type __m128 to type __m128d. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_pd
FORCE_INLINE __m128d _mm_castps_pd(__m128 a)
{
    return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a));
}

// Cast vector of type __m128 to type __m128i. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castps_si128
FORCE_INLINE __m128i _mm_castps_si128(__m128 a)
{
    return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a));
}

// Cast vector of type __m128i to type __m128d. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_pd
FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a)
{
#if defined(__aarch64__) || defined(_M_ARM64)
    return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a));
#else
    return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a));
#endif
}

// Cast vector of type __m128i to type __m128. This intrinsic is only used for
// compilation and does not generate any instructions, thus it has zero latency.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps
FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a)
{
    return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a));
}

// Invalidate and flush the cache line that contains p from all levels of the
// cache hierarchy.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clflush
#if defined(__APPLE__)
#include <libkern/OSCacheControl.h>
#endif
FORCE_INLINE void _mm_clflush(void const *p)
{
    (void) p;

    /* sys_icache_invalidate is supported since macOS 10.5.
     * However, it does not work on non-jailbroken iOS devices, although the
     * compilation is successful.
     */
#if defined(__APPLE__)
    sys_icache_invalidate((void *) (uintptr_t) p, SSE2NEON_CACHELINE_SIZE);
#elif defined(__GNUC__) || defined(__clang__)
    uintptr_t ptr = (uintptr_t) p;
    __builtin___clear_cache((char *) ptr,
                            (char *) ptr + SSE2NEON_CACHELINE_SIZE);
#elif (_MSC_VER) && SSE2NEON_INCLUDE_WINDOWS_H
    FlushInstructionCache(GetCurrentProcess(), p, SSE2NEON_CACHELINE_SIZE);
#endif
}

// Compare packed 16-bit integers in a and b for equality, and store the results
// in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi16 FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compare packed 32-bit integers in a and b for equality, and store the results // in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi32 FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed 8-bit integers in a and b for equality, and store the results // in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_epi8 FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for equality, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_pd FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for equality, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpeq_sd FORCE_INLINE __m128d _mm_cmpeq_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpeq_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for greater-than-or-equal, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_pd FORCE_INLINE __m128d _mm_cmpge_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) >= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for greater-than-or-equal, store the result in the lower element of dst, // and copy the upper element from a to the upper element of dst. 
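// (Illustrative note: ARMv7-A NEON has no 64-bit floating-point compares, so
// the non-AArch64 fallbacks in this group move each lane's bit pattern into a
// uint64_t, reinterpret it as a double through a pointer cast, perform the
// comparison in scalar C, and build the SSE-style all-ones/all-zeros 64-bit
// mask by hand before reloading it with vld1q_u64.)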
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpge_sd FORCE_INLINE __m128d _mm_cmpge_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpge_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) >= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed signed 16-bit integers in a and b for greater-than, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi16 FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compare packed signed 32-bit integers in a and b for greater-than, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi32 FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed signed 8-bit integers in a and b for greater-than, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_epi8 FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for greater-than, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_pd FORCE_INLINE __m128d _mm_cmpgt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) > (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) > (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for greater-than, store the result in the lower element of dst, and copy // the upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpgt_sd FORCE_INLINE __m128d _mm_cmpgt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpgt_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) > (*(double *) &b0) ? 
~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed double-precision (64-bit) floating-point elements in a and b // for less-than-or-equal, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_pd FORCE_INLINE __m128d _mm_cmple_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) <= (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for less-than-or-equal, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmple_sd FORCE_INLINE __m128d _mm_cmple_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmple_pd(a, b)); #else // expand "_mm_cmpge_pd()" to reduce unnecessary operations uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) <= (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed signed 16-bit integers in a and b for less-than, and store the // results in dst. Note: This intrinsic emits the pcmpgtw instruction with the // order of the operands switched. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi16 FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compare packed signed 32-bit integers in a and b for less-than, and store the // results in dst. Note: This intrinsic emits the pcmpgtd instruction with the // order of the operands switched. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi32 FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed signed 8-bit integers in a and b for less-than, and store the // results in dst. Note: This intrinsic emits the pcmpgtb instruction with the // order of the operands switched. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_epi8 FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for less-than, and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_pd FORCE_INLINE __m128d _mm_cmplt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = (*(double *) &a1) < (*(double *) &b1) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for less-than, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmplt_sd FORCE_INLINE __m128d _mm_cmplt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmplt_pd(a, b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) < (*(double *) &b0) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-equal, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_pd FORCE_INLINE __m128d _mm_cmpneq_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_s32(vmvnq_s32(vreinterpretq_s32_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))))); #else // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128d_u32(vmvnq_u32(vandq_u32(cmp, swapped))); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-equal, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpneq_sd FORCE_INLINE __m128d _mm_cmpneq_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpneq_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-greater-than-or-equal, and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_pd FORCE_INLINE __m128d _mm_cmpnge_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_u64(veorq_u64( vcgeq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) >= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) >= (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-greater-than-or-equal, store the result in the lower element of // dst, and copy the upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnge_sd FORCE_INLINE __m128d _mm_cmpnge_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpnge_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-greater-than, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_cmpngt_pd FORCE_INLINE __m128d _mm_cmpngt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_u64(veorq_u64( vcgtq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) > (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) > (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-greater-than, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpngt_sd FORCE_INLINE __m128d _mm_cmpngt_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpngt_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-less-than-or-equal, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_pd FORCE_INLINE __m128d _mm_cmpnle_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_u64(veorq_u64( vcleq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) <= (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) <= (*(double *) &b1)) ? 
~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-less-than-or-equal, store the result in the lower element of dst, // and copy the upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnle_sd FORCE_INLINE __m128d _mm_cmpnle_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpnle_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for not-less-than, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_pd FORCE_INLINE __m128d _mm_cmpnlt_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_u64(veorq_u64( vcltq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)), vdupq_n_u64(UINT64_MAX))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = !((*(double *) &a0) < (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = !((*(double *) &a1) < (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b for not-less-than, store the result in the lower element of dst, and copy // the upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpnlt_sd FORCE_INLINE __m128d _mm_cmpnlt_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_cmpnlt_pd(a, b)); } // Compare packed double-precision (64-bit) floating-point elements in a and b // to see if neither is NaN, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_pd FORCE_INLINE __m128d _mm_cmpord_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) // Excluding NaNs, any two floating point numbers can be compared. uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); uint64x2_t not_nan_b = vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_u64(vandq_u64(not_nan_a, not_nan_b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = ((*(double *) &a1) == (*(double *) &a1) && (*(double *) &b1) == (*(double *) &b1)) ? ~UINT64_C(0) : UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b to see if neither is NaN, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. 
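// (The ordered/unordered compares above and below rely on the IEEE-754
// property that x == x is false exactly when x is NaN, so "a == a && b == b"
// tests that neither operand is NaN.)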
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpord_sd FORCE_INLINE __m128d _mm_cmpord_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpord_pd(a, b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? ~UINT64_C(0) : UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare packed double-precision (64-bit) floating-point elements in a and b // to see if either is NaN, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_pd FORCE_INLINE __m128d _mm_cmpunord_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) // Two NaNs are not equal in comparison operation. uint64x2_t not_nan_a = vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(a)); uint64x2_t not_nan_b = vceqq_f64(vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_s32( vmvnq_s32(vreinterpretq_s32_u64(vandq_u64(not_nan_a, not_nan_b)))); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? UINT64_C(0) : ~UINT64_C(0); d[1] = ((*(double *) &a1) == (*(double *) &a1) && (*(double *) &b1) == (*(double *) &b1)) ? UINT64_C(0) : ~UINT64_C(0); return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b to see if either is NaN, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpunord_sd FORCE_INLINE __m128d _mm_cmpunord_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_cmpunord_pd(a, b)); #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t d[2]; d[0] = ((*(double *) &a0) == (*(double *) &a0) && (*(double *) &b0) == (*(double *) &b0)) ? UINT64_C(0) : ~UINT64_C(0); d[1] = a1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for greater-than-or-equal, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comige_sd FORCE_INLINE int _mm_comige_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgeq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 >= *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for greater-than, and return the boolean result (0 or 1). 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comigt_sd FORCE_INLINE int _mm_comigt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcgtq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 > *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for less-than-or-equal, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comile_sd FORCE_INLINE int _mm_comile_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcleq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 <= *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for less-than, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comilt_sd FORCE_INLINE int _mm_comilt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vcltq_f64(a, b), 0) & 0x1; #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); return (*(double *) &a0 < *(double *) &b0); #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for equality, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comieq_sd FORCE_INLINE int _mm_comieq_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_u64(vceqq_f64(a, b), 0) & 0x1; #else uint32x4_t a_not_nan = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(a)); uint32x4_t b_not_nan = vceqq_u32(vreinterpretq_u32_m128d(b), vreinterpretq_u32_m128d(b)); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_eq_b = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint64x2_t and_results = vandq_u64(vreinterpretq_u64_u32(a_and_b_not_nan), vreinterpretq_u64_u32(a_eq_b)); return vgetq_lane_u64(and_results, 0) & 0x1; #endif } // Compare the lower double-precision (64-bit) floating-point element in a and b // for not-equal, and return the boolean result (0 or 1). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_comineq_sd FORCE_INLINE int _mm_comineq_sd(__m128d a, __m128d b) { return !_mm_comieq_sd(a, b); } // Convert packed signed 32-bit integers in a to packed double-precision // (64-bit) floating-point elements, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_pd FORCE_INLINE __m128d _mm_cvtepi32_pd(__m128i a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a))))); #else double a0 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); double a1 = (double) vgetq_lane_s32(vreinterpretq_s32_m128i(a), 1); return _mm_set_pd(a1, a0); #endif } // Convert packed signed 32-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_ps FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) { return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_epi32 FORCE_INLINE __m128i _mm_cvtpd_epi32(__m128d a) { // vrnd32xq_f64 not supported on clang #if defined(__ARM_FEATURE_FRINT) && !defined(__clang__) float64x2_t rounded = vrnd32xq_f64(vreinterpretq_f64_m128d(a)); int64x2_t integers = vcvtq_s64_f64(rounded); return vreinterpretq_m128i_s32( vcombine_s32(vmovn_s64(integers), vdup_n_s32(0))); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double d0 = ((double *) &rnd)[0]; double d1 = ((double *) &rnd)[1]; return _mm_set_epi32(0, 0, (int32_t) d1, (int32_t) d0); #endif } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_pi32 FORCE_INLINE __m64 _mm_cvtpd_pi32(__m128d a) { __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double d0 = ((double *) &rnd)[0]; double d1 = ((double *) &rnd)[1]; int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) d0, (int32_t) d1}; return vreinterpret_m64_s32(vld1_s32(data)); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed single-precision (32-bit) floating-point elements, and store the // results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpd_ps FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else float a0 = (float) ((double *) &a)[0]; float a1 = (float) ((double *) &a)[1]; return _mm_set_ps(0, 0, a1, a0); #endif } // Convert packed signed 32-bit integers in a to packed double-precision // (64-bit) floating-point elements, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtpi32_pd FORCE_INLINE __m128d _mm_cvtpi32_pd(__m64 a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vcvtq_f64_s64(vmovl_s32(vreinterpret_s32_m64(a)))); #else double a0 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 0); double a1 = (double) vget_lane_s32(vreinterpret_s32_m64(a), 1); return _mm_set_pd(a1, a0); #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_epi32 // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A // does not support! It is supported on ARMv8-A however. 
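// (Worked example of the ARMv7 _MM_ROUND_NEAREST emulation below, for
// illustration: with a lane holding 2.5, r_trunc = 2 and plusone = 1, so
// r_even = (2 + 1) & ~1 = 2, while r_normal = (int)(2.5 + 0.5) = 3; because
// delta = 2.5 - 2 = 0.5 equals the signed half, the tie selects r_even and the
// lane rounds to 2. A lane holding 3.5 takes the same path and rounds to 4.)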
FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) { #if defined(__ARM_FEATURE_FRINT) return vreinterpretq_m128i_s32(vcvtq_s32_f32(vrnd32xq_f32(a))); #elif (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); case _MM_ROUND_DOWN: return vreinterpretq_m128i_s32(vcvtmq_s32_f32(a)); case _MM_ROUND_UP: return vreinterpretq_m128i_s32(vcvtpq_s32_f32(a)); default: // _MM_ROUND_TOWARD_ZERO return vreinterpretq_m128i_s32(vcvtq_s32_f32(a)); } #else float *f = (float *) &a; switch (_MM_GET_ROUNDING_MODE()) { case _MM_ROUND_NEAREST: { uint32x4_t signmask = vdupq_n_u32(0x80000000); float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ int32x4_t r_trunc = vcvtq_s32_f32( vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ return vreinterpretq_m128i_s32( vbslq_s32(is_delta_half, r_even, r_normal)); } case _MM_ROUND_DOWN: return _mm_set_epi32(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); case _MM_ROUND_UP: return _mm_set_epi32(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); default: // _MM_ROUND_TOWARD_ZERO return _mm_set_epi32((int32_t) f[3], (int32_t) f[2], (int32_t) f[1], (int32_t) f[0]); } #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed double-precision (64-bit) floating-point elements, and store the // results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtps_pd FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); #else double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); return _mm_set_pd(a1, a0); #endif } // Copy the lower double-precision (64-bit) floating-point element of a to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_f64 FORCE_INLINE double _mm_cvtsd_f64(__m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); #else return ((double *) &a)[0]; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si32 FORCE_INLINE int32_t _mm_cvtsd_si32(__m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) return (int32_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double ret = ((double *) &rnd)[0]; return (int32_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64 FORCE_INLINE int64_t _mm_cvtsd_si64(__m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) return (int64_t) vgetq_lane_f64(vrndiq_f64(vreinterpretq_f64_m128d(a)), 0); #else __m128d rnd = _mm_round_pd(a, _MM_FROUND_CUR_DIRECTION); double ret = ((double *) &rnd)[0]; return (int64_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_si64x #define _mm_cvtsd_si64x _mm_cvtsd_si64 // Convert the lower double-precision (64-bit) floating-point element in b to a // single-precision (32-bit) floating-point element, store the result in the // lower element of dst, and copy the upper 3 packed elements from a to the // upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsd_ss FORCE_INLINE __m128 _mm_cvtsd_ss(__m128 a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128_f32(vsetq_lane_f32( vget_lane_f32(vcvt_f32_f64(vreinterpretq_f64_m128d(b)), 0), vreinterpretq_f32_m128(a), 0)); #else return vreinterpretq_m128_f32(vsetq_lane_f32((float) ((double *) &b)[0], vreinterpretq_f32_m128(a), 0)); #endif } // Copy the lower 32-bit integer in a to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si32 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) { return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); } // Copy the lower 64-bit integer in a to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) { return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); } // Copy the lower 64-bit integer in a to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) // Convert the signed 32-bit integer b to a double-precision (64-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_sd FORCE_INLINE __m128d _mm_cvtsi32_sd(__m128d a, int32_t b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else double bf = (double) b; return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); #endif } // Copy the lower 64-bit integer in a to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi128_si64x #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) // Copy 32-bit integer a to the lower elements of dst, and zero the upper // elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi32_si128 FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) { return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); } // Convert the signed 64-bit integer b to a double-precision (64-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_sd FORCE_INLINE __m128d _mm_cvtsi64_sd(__m128d a, int64_t b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vsetq_lane_f64((double) b, vreinterpretq_f64_m128d(a), 0)); #else double bf = (double) b; return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &bf, vreinterpretq_s64_m128d(a), 0)); #endif } // Copy 64-bit integer a to the lower element of dst, and zero the upper // element. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64_si128 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) { return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); } // Copy 64-bit integer a to the lower element of dst, and zero the upper // element. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_si128 #define _mm_cvtsi64x_si128(a) _mm_cvtsi64_si128(a) // Convert the signed 64-bit integer b to a double-precision (64-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtsi64x_sd #define _mm_cvtsi64x_sd(a, b) _mm_cvtsi64_sd(a, b) // Convert the lower single-precision (32-bit) floating-point element in b to a // double-precision (64-bit) floating-point element, store the result in the // lower element of dst, and copy the upper element from a to the upper element // of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtss_sd FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); #endif } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_epi32 FORCE_INLINE __m128i _mm_cvttpd_epi32(__m128d a) { double a0 = ((double *) &a)[0]; double a1 = ((double *) &a)[1]; return _mm_set_epi32(0, 0, (int32_t) a1, (int32_t) a0); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttpd_pi32 FORCE_INLINE __m64 _mm_cvttpd_pi32(__m128d a) { double a0 = ((double *) &a)[0]; double a1 = ((double *) &a)[1]; int32_t ALIGN_STRUCT(16) data[2] = {(int32_t) a0, (int32_t) a1}; return vreinterpret_m64_s32(vld1_s32(data)); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttps_epi32 FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) { return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); } // Convert the lower double-precision (64-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si32 FORCE_INLINE int32_t _mm_cvttsd_si32(__m128d a) { double ret = *((double *) &a); return (int32_t) ret; } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); #else double ret = *((double *) &a); return (int64_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvttsd_si64x #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) // Divide packed double-precision (64-bit) floating-point elements in a by // packed elements in b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_pd FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] / db[0]; c[1] = da[1] / db[1]; return vld1q_f32((float32_t *) c); #endif } // Divide the lower double-precision (64-bit) floating-point element in a by the // lower double-precision (64-bit) floating-point element in b, store the result // in the lower element of dst, and copy the upper element from a to the upper // element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_div_sd FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) float64x2_t tmp = vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_f64( vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); #else return _mm_move_sd(a, _mm_div_pd(a, b)); #endif } // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi16 // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) #define _mm_extract_epi16(a, imm) \ vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi16 // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, // __constrange(0,8) int imm) #define _mm_insert_epi16(a, b, imm) \ vreinterpretq_m128i_s16( \ vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))) // Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from memory into dst. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. 
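// Usage sketch (illustrative; ALIGN_STRUCT is the alignment helper used // throughout this header): given double ALIGN_STRUCT(16) mem[2] = {1.0, 2.0}, // _mm_load_pd(mem) yields { 1.0, 2.0 }.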
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd FORCE_INLINE __m128d _mm_load_pd(const double *p) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vld1q_f64(p)); #else const float *fp = (const float *) p; float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_pd1 #define _mm_load_pd1 _mm_load1_pd // Load a double-precision (64-bit) floating-point element from memory into the // lower of dst, and zero the upper element. mem_addr does not need to be // aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_sd FORCE_INLINE __m128d _mm_load_sd(const double *p) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); #else const float *fp = (const float *) p; float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif } // Load 128-bits of integer data from memory into dst. mem_addr must be aligned // on a 16-byte boundary or a general-protection exception may be generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load_si128 FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) { return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_load1_pd FORCE_INLINE __m128d _mm_load1_pd(const double *p) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); #else return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); #endif } // Load a double-precision (64-bit) floating-point element from memory into the // upper element of dst, and copy the lower element from a to dst. mem_addr does // not need to be aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadh_pd FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); #else return vreinterpretq_m128d_f32(vcombine_f32( vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); #endif } // Load 64-bit integer from memory into the first element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_epi64 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) { /* Load the lower 64 bits of the value pointed to by p into the * lower 64 bits of the result, zeroing the upper 64 bits of the result. */ return vreinterpretq_m128i_s32( vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); } // Load a double-precision (64-bit) floating-point element from memory into the // lower element of dst, and copy the upper element from a to dst. mem_addr does // not need to be aligned on any particular boundary. 
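// Usage sketch (illustrative values): with double lo = 9.0, // _mm_loadl_pd(_mm_set_pd(4.0, 3.0), &lo) yields { 9.0, 4.0 }, replacing only // the lower element.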
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadl_pd FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); #else return vreinterpretq_m128d_f32( vcombine_f32(vld1_f32((const float *) p), vget_high_f32(vreinterpretq_f32_m128d(a)))); #endif } // Load 2 double-precision (64-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadr_pd FORCE_INLINE __m128d _mm_loadr_pd(const double *p) { #if defined(__aarch64__) || defined(_M_ARM64) float64x2_t v = vld1q_f64(p); return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); #else int64x2_t v = vld1q_s64((const int64_t *) p); return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); #endif } // Load 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from memory into dst. mem_addr does not need to be aligned on any // particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_pd FORCE_INLINE __m128d _mm_loadu_pd(const double *p) { return _mm_load_pd(p); } // Load 128-bits of integer data from memory into dst. mem_addr does not need to // be aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si128 FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) { return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); } // Load unaligned 32-bit integer from memory into the first element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loadu_si32 FORCE_INLINE __m128i _mm_loadu_si32(const void *p) { return vreinterpretq_m128i_s32( vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate // signed 32-bit integers. Horizontally add adjacent pairs of intermediate // 32-bit integers, and pack the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_madd_epi16 FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) { int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); #if defined(__aarch64__) || defined(_M_ARM64) int32x4_t high = vmull_high_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b)); return vreinterpretq_m128i_s32(vpaddq_s32(low, high)); #else int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), vget_high_s16(vreinterpretq_s16_m128i(b))); int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); #endif } // Conditionally store 8-bit integer elements from a into memory using mask // (elements are not stored when the highest bit is not set in the corresponding // element) and a non-temporal memory hint. mem_addr does not need to be aligned // on any particular boundary.
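// Usage sketch (illustrative): only bytes whose mask byte has its most // significant bit set are written, e.g. with char buf[16] = {0}, // _mm_maskmoveu_si128(_mm_set1_epi8(7), _mm_set_epi8(-1, 0, 0, 0, 0, 0, 0, 0, // 0, 0, 0, 0, 0, 0, 0, -1), buf) writes only buf[0] and buf[15].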
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maskmoveu_si128 FORCE_INLINE void _mm_maskmoveu_si128(__m128i a, __m128i mask, char *mem_addr) { int8x16_t shr_mask = vshrq_n_s8(vreinterpretq_s8_m128i(mask), 7); __m128 b = _mm_load_ps((const float *) mem_addr); int8x16_t masked = vbslq_s8(vreinterpretq_u8_s8(shr_mask), vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128(b)); vst1q_s8((int8_t *) mem_addr, masked); } // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi16 FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu8 FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b, // and store packed maximum values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_pd FORCE_INLINE __m128d _mm_max_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64(vbslq_f64(vcgtq_f64(_a, _b), _a, _b)); #else return vreinterpretq_m128d_f64( vmaxq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) > (*(double *) &b0) ? a0 : b0; d[1] = (*(double *) &a1) > (*(double *) &b1) ? a1 : b1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b, store the maximum value in the lower element of dst, and copy the upper // element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_sd FORCE_INLINE __m128d _mm_max_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_max_pd(a, b)); #else double *da = (double *) &a; double *db = (double *) &b; double c[2] = {da[0] > db[0] ? da[0] : db[0], da[1]}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi16 FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. 
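// Usage sketch (illustrative): the comparison is unsigned, so // _mm_min_epu8(_mm_set1_epi8((signed char) 200), _mm_set1_epi8(100)) yields // 100 in every byte.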
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu8 FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b, // and store packed minimum values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_pd FORCE_INLINE __m128d _mm_min_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) #if SSE2NEON_PRECISE_MINMAX float64x2_t _a = vreinterpretq_f64_m128d(a); float64x2_t _b = vreinterpretq_f64_m128d(b); return vreinterpretq_m128d_f64(vbslq_f64(vcltq_f64(_a, _b), _a, _b)); #else return vreinterpretq_m128d_f64( vminq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #endif #else uint64_t a0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(a)); uint64_t a1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(a)); uint64_t b0 = (uint64_t) vget_low_u64(vreinterpretq_u64_m128d(b)); uint64_t b1 = (uint64_t) vget_high_u64(vreinterpretq_u64_m128d(b)); uint64_t d[2]; d[0] = (*(double *) &a0) < (*(double *) &b0) ? a0 : b0; d[1] = (*(double *) &a1) < (*(double *) &b1) ? a1 : b1; return vreinterpretq_m128d_u64(vld1q_u64(d)); #endif } // Compare the lower double-precision (64-bit) floating-point elements in a and // b, store the minimum value in the lower element of dst, and copy the upper // element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_sd FORCE_INLINE __m128d _mm_min_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_min_pd(a, b)); #else double *da = (double *) &a; double *db = (double *) &b; double c[2] = {da[0] < db[0] ? da[0] : db[0], da[1]}; return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) c)); #endif } // Copy the lower 64-bit integer in a to the lower element of dst, and zero the // upper element. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_epi64 FORCE_INLINE __m128i _mm_move_epi64(__m128i a) { return vreinterpretq_m128i_s64( vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); } // Move the lower double-precision (64-bit) floating-point element from b to the // lower element of dst, and copy the upper element from a to the upper element // of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_move_sd FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) { return vreinterpretq_m128d_f32( vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), vget_high_f32(vreinterpretq_f32_m128d(a)))); } // Create mask from the most significant bit of each 8-bit element in a, and // store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_epi8 FORCE_INLINE int _mm_movemask_epi8(__m128i a) { // Use increasingly wide shifts+adds to collect the sign bits // together. // Since the widening shifts would be rather confusing to follow in little // endian, everything will be illustrated in big endian order instead. This // has a different result - the bits would actually be reversed on a big // endian machine. // Starting input (only half the elements are shown): // 89 ff 1d c0 00 10 99 33 uint8x16_t input = vreinterpretq_u8_m128i(a); // Shift out everything but the sign bits with an unsigned shift right. 
// // Bytes of the vector:: // 89 ff 1d c0 00 10 99 33 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) // | | | | | | | | // 01 01 00 01 00 00 01 00 // // Bits of first important lane(s): // 10001001 (89) // \______ // | // 00000001 (01) uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); // Merge the even lanes together with a 16-bit unsigned shift right + add. // 'xx' represents garbage data which will be ignored in the final result. // In the important bytes, the add functions like a binary OR. // // 01 01 00 01 00 00 01 00 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) // \| \| \| \| // xx 03 xx 01 xx 00 xx 02 // // 00000001 00000001 (01 01) // \_______ | // \| // xxxxxxxx xxxxxx11 (xx 03) uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); // Repeat with a wider 32-bit shift + add. // xx 03 xx 01 xx 00 xx 02 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> // 14)) // \| \| // xx xx xx 0d xx xx xx 02 // // 00000011 00000001 (03 01) // \\_____ || // '----.\|| // xxxxxxxx xxxx1101 (xx 0d) uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); // Last, an even wider 64-bit shift + add to get our result in the low 8 bit // lanes. xx xx xx 0d xx xx xx 02 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> // 28)) // \| // xx xx xx xx xx xx xx d2 // // 00001101 00000010 (0d 02) // \ \___ | | // '---. \| | // xxxxxxxx 11010010 (xx d2) uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. // xx xx xx xx xx xx xx d2 // || return paired64[0] // d2 // Note: Little endian would return the correct value 4b (01001011) instead. return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); } // Set each bit of mask dst based on the most significant bit of the // corresponding packed double-precision (64-bit) floating-point element in a. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movemask_pd FORCE_INLINE int _mm_movemask_pd(__m128d a) { uint64x2_t input = vreinterpretq_u64_m128d(a); uint64x2_t high_bits = vshrq_n_u64(input, 63); return (int) (vgetq_lane_u64(high_bits, 0) | (vgetq_lane_u64(high_bits, 1) << 1)); } // Copy the lower 64-bit integer in a to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movepi64_pi64 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) { return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); } // Copy the 64-bit integer a to the lower element of dst, and zero the upper // element. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movpi64_epi64 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) { return vreinterpretq_m128i_s64( vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); } // Multiply the low unsigned 32-bit integers from each packed 64-bit element in // a and b, and store the unsigned 64-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epu32 FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) { // vmull_u32 upcasts instead of masking, so we downcast. uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); } // Multiply packed double-precision (64-bit) floating-point elements in a and b, // and store the results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_pd FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] * db[0]; c[1] = da[1] * db[1]; return vld1q_f32((float32_t *) c); #endif } // Multiply the lower double-precision (64-bit) floating-point element in a and // b, store the result in the lower element of dst, and copy the upper element // from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_mul_sd FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_mul_pd(a, b)); } // Multiply the low unsigned 32-bit integers from a and b, and store the // unsigned 64-bit result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_su32 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) { return vreinterpret_m64_u64(vget_low_u64( vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); } // Multiply the packed signed 16-bit integers in a and b, producing intermediate // 32-bit integers, and store the high 16 bits of the intermediate integers in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epi16 FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) { /* FIXME: issue with large values because of result saturation */ // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); return vreinterpretq_m128i_u16(r.val[1]); } // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhi_epu16 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) { uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab3210 = vmull_u16(a3210, b3210); #if defined(__aarch64__) || defined(_M_ARM64) uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); return vreinterpretq_m128i_u16(r); #else uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab7654 = vmull_u16(a7654, b7654); uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); return vreinterpretq_m128i_u16(r.val[1]); #endif } // Multiply the packed 16-bit integers in a and b, producing intermediate 32-bit // integers, and store the low 16 bits of the intermediate integers in dst. 
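// Usage sketch (illustrative): 300 * 400 = 120000 = 0x1D4C0, so this intrinsic // keeps the low half 0xD4C0 (-11072 as a signed 16-bit value), while // _mm_mulhi_epi16 above keeps the high half 0x0001.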
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi16 FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compute the bitwise OR of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_pd FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Compute the bitwise OR of 128 bits (representing integer data) in a and b, // and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_or_si128 FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Convert packed signed 16-bit integers from a and b to packed 8-bit integers // using signed saturation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi16 FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), vqmovn_s16(vreinterpretq_s16_m128i(b)))); } // Convert packed signed 32-bit integers from a and b to packed 16-bit integers // using signed saturation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packs_epi32 FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), vqmovn_s32(vreinterpretq_s32_m128i(b)))); } // Convert packed signed 16-bit integers from a and b to packed 8-bit integers // using unsigned saturation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi16 FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) { return vreinterpretq_m128i_u8( vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), vqmovun_s16(vreinterpretq_s16_m128i(b)))); } // Pause the processor. This is typically used in spin-wait loops and, depending // on the x86 processor, typical delays are in the 40-100 cycle range. The // 'yield' instruction isn't a good fit because it's effectively a nop on most // Arm cores. Experience with several databases has shown an 'isb' is a // reasonable approximation. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_pause FORCE_INLINE void _mm_pause(void) { #if defined(_MSC_VER) __isb(_ARM64_BARRIER_SY); #else __asm__ __volatile__("isb\n"); #endif } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce two // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of 64-bit elements in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sad_epu8 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) { uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); return vreinterpretq_m128i_u64(vpaddlq_u32(vpaddlq_u16(t))); } // Set packed 16-bit integers in dst with the supplied values.
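// Usage sketch (illustrative): arguments run from the highest element down to // the lowest, so _mm_set_epi16(7, 6, 5, 4, 3, 2, 1, 0) places 0 in the lowest // lane and 7 in the highest; _mm_setr_epi16 below takes the same values in the // opposite order.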
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi16 FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0) { int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; return vreinterpretq_m128i_s16(vld1q_s16(data)); } // Set packed 32-bit integers in dst with the supplied values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi32 FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) { int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; return vreinterpretq_m128i_s32(vld1q_s32(data)); } // Set packed 64-bit integers in dst with the supplied values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64 FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) { return _mm_set_epi64x(vget_lane_s64(i1, 0), vget_lane_s64(i2, 0)); } // Set packed 64-bit integers in dst with the supplied values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi64x FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) { return vreinterpretq_m128i_s64( vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); } // Set packed 8-bit integers in dst with the supplied values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_epi8 FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0) { int8_t ALIGN_STRUCT(16) data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } // Set packed double-precision (64-bit) floating-point elements in dst with the // supplied values. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) { double ALIGN_STRUCT(16) data[2] = {e0, e1}; #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); #else return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); #endif } // Broadcast double-precision (64-bit) floating-point value a to all elements of // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_pd1 #define _mm_set_pd1 _mm_set1_pd // Copy double-precision (64-bit) floating-point element a to the lower element // of dst, and zero the upper element. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set_sd FORCE_INLINE __m128d _mm_set_sd(double a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vsetq_lane_f64(a, vdupq_n_f64(0), 0)); #else return _mm_set_pd(0, a); #endif } // Broadcast 16-bit integer a to all elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi16 FORCE_INLINE __m128i _mm_set1_epi16(short w) { return vreinterpretq_m128i_s16(vdupq_n_s16(w)); } // Broadcast 32-bit integer a to all elements of dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi32 FORCE_INLINE __m128i _mm_set1_epi32(int _i) { return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); } // Broadcast 64-bit integer a to all elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64 FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) { return vreinterpretq_m128i_s64(vdupq_lane_s64(_i, 0)); } // Broadcast 64-bit integer a to all elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi64x FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) { return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); } // Broadcast 8-bit integer a to all elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_epi8 FORCE_INLINE __m128i _mm_set1_epi8(signed char w) { return vreinterpretq_m128i_s8(vdupq_n_s8(w)); } // Broadcast double-precision (64-bit) floating-point value a to all elements of // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_set1_pd FORCE_INLINE __m128d _mm_set1_pd(double d) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); #endif } // Set packed 16-bit integers in dst with the supplied values in reverse order. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi16 FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) { int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); } // Set packed 32-bit integers in dst with the supplied values in reverse order. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi32 FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) { int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; return vreinterpretq_m128i_s32(vld1q_s32(data)); } // Set packed 64-bit integers in dst with the supplied values in reverse order. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi64 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) { return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); } // Set packed 8-bit integers in dst with the supplied values in reverse order. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_epi8 FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15) { int8_t ALIGN_STRUCT(16) data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } // Set packed double-precision (64-bit) floating-point elements in dst with the // supplied values in reverse order. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setr_pd FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) { return _mm_set_pd(e0, e1); } // Return vector of type __m128d with all elements set to zero. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_pd FORCE_INLINE __m128d _mm_setzero_pd(void) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vdupq_n_f64(0)); #else return vreinterpretq_m128d_f32(vdupq_n_f32(0)); #endif } // Return vector of type __m128i with all elements set to zero. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_setzero_si128 FORCE_INLINE __m128i _mm_setzero_si128(void) { return vreinterpretq_m128i_s32(vdupq_n_s32(0)); } // Shuffle 32-bit integers in a using the control in imm8, and store the results // in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi32 // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, // __constrange(0,255) int imm) #if defined(_sse2neon_shuffle) #define _mm_shuffle_epi32(a, imm) \ __extension__({ \ int32x4_t _input = vreinterpretq_s32_m128i(a); \ int32x4_t _shuf = \ vshuffleq_s32(_input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ vreinterpretq_m128i_s32(_shuf); \ }) #else // generic #define _mm_shuffle_epi32(a, imm) \ _sse2neon_define1( \ __m128i, a, __m128i ret; switch (imm) { \ case _MM_SHUFFLE(1, 0, 3, 2): \ ret = _mm_shuffle_epi_1032(_a); \ break; \ case _MM_SHUFFLE(2, 3, 0, 1): \ ret = _mm_shuffle_epi_2301(_a); \ break; \ case _MM_SHUFFLE(0, 3, 2, 1): \ ret = _mm_shuffle_epi_0321(_a); \ break; \ case _MM_SHUFFLE(2, 1, 0, 3): \ ret = _mm_shuffle_epi_2103(_a); \ break; \ case _MM_SHUFFLE(1, 0, 1, 0): \ ret = _mm_shuffle_epi_1010(_a); \ break; \ case _MM_SHUFFLE(1, 0, 0, 1): \ ret = _mm_shuffle_epi_1001(_a); \ break; \ case _MM_SHUFFLE(0, 1, 0, 1): \ ret = _mm_shuffle_epi_0101(_a); \ break; \ case _MM_SHUFFLE(2, 2, 1, 1): \ ret = _mm_shuffle_epi_2211(_a); \ break; \ case _MM_SHUFFLE(0, 1, 2, 2): \ ret = _mm_shuffle_epi_0122(_a); \ break; \ case _MM_SHUFFLE(3, 3, 3, 2): \ ret = _mm_shuffle_epi_3332(_a); \ break; \ case _MM_SHUFFLE(0, 0, 0, 0): \ ret = _mm_shuffle_epi32_splat(_a, 0); \ break; \ case _MM_SHUFFLE(1, 1, 1, 1): \ ret = _mm_shuffle_epi32_splat(_a, 1); \ break; \ case _MM_SHUFFLE(2, 2, 2, 2): \ ret = _mm_shuffle_epi32_splat(_a, 2); \ break; \ case _MM_SHUFFLE(3, 3, 3, 3): \ ret = _mm_shuffle_epi32_splat(_a, 3); \ break; \ default: \ ret = _mm_shuffle_epi32_default(_a, (imm)); \ break; \ } _sse2neon_return(ret);) #endif // Shuffle double-precision (64-bit) floating-point elements using the control // in imm8, and store the results in dst. 
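// Usage sketch (illustrative): bit 0 of imm8 selects which element of a lands // in the lower result lane and bit 1 selects which element of b lands in the // upper lane, so _mm_shuffle_pd(a, b, 0x1) yields { a[1], b[0] }.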
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pd #ifdef _sse2neon_shuffle #define _mm_shuffle_pd(a, b, imm8) \ vreinterpretq_m128d_s64( \ vshuffleq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), \ imm8 & 0x1, ((imm8 & 0x2) >> 1) + 2)) #else #define _mm_shuffle_pd(a, b, imm8) \ _mm_castsi128_pd(_mm_set_epi64x( \ vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) #endif // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, // __constrange(0,255) int imm) #if defined(_sse2neon_shuffle) #define _mm_shufflehi_epi16(a, imm) \ __extension__({ \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = \ vshuffleq_s16(_input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128i_s16(_shuf); \ }) #else // generic #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) #endif // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, // __constrange(0,255) int imm) #if defined(_sse2neon_shuffle) #define _mm_shufflelo_epi16(a, imm) \ __extension__({ \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = vshuffleq_s16( \ _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ vreinterpretq_m128i_s16(_shuf); \ }) #else // generic #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) #endif // Shift packed 16-bit integers in a left by count while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi16 FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~15)) return _mm_setzero_si128(); int16x8_t vc = vdupq_n_s16((int16_t) c); return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); } // Shift packed 32-bit integers in a left by count while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi32 FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~31)) return _mm_setzero_si128(); int32x4_t vc = vdupq_n_s32((int32_t) c); return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); } // Shift packed 64-bit integers in a left by count while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sll_epi64 FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~63)) return _mm_setzero_si128(); int64x2_t vc = vdupq_n_s64((int64_t) c); return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); } // Shift packed 16-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. 
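// Usage sketch (illustrative): counts of 16 or more clear every lane to match // x86 behaviour, so _mm_slli_epi16(_mm_set1_epi16(1), 3) yields 8 in every // lane, while _mm_slli_epi16(_mm_set1_epi16(1), 16) yields 0.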
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi16 FORCE_INLINE __m128i _mm_slli_epi16(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~15)) return _mm_setzero_si128(); return vreinterpretq_m128i_s16( vshlq_s16(vreinterpretq_s16_m128i(a), vdupq_n_s16(imm))); } // Shift packed 32-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi32 FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~31)) return _mm_setzero_si128(); return vreinterpretq_m128i_s32( vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); } // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_epi64 FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) { if (_sse2neon_unlikely(imm & ~63)) return _mm_setzero_si128(); return vreinterpretq_m128i_s64( vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); } // Shift a left by imm8 bytes while shifting in zeros, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_slli_si128 #define _mm_slli_si128(a, imm) \ _sse2neon_define1( \ __m128i, a, int8x16_t ret; \ if (_sse2neon_unlikely(imm == 0)) ret = vreinterpretq_s8_m128i(_a); \ else if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ else ret = vextq_s8(vdupq_n_s8(0), vreinterpretq_s8_m128i(_a), \ ((imm <= 0 || imm > 15) ? 0 : (16 - imm))); \ _sse2neon_return(vreinterpretq_m128i_s8(ret));) // Compute the square root of packed double-precision (64-bit) floating-point // elements in a, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_pd FORCE_INLINE __m128d _mm_sqrt_pd(__m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vsqrtq_f64(vreinterpretq_f64_m128d(a))); #else double a0 = sqrt(((double *) &a)[0]); double a1 = sqrt(((double *) &a)[1]); return _mm_set_pd(a1, a0); #endif } // Compute the square root of the lower double-precision (64-bit) floating-point // element in b, store the result in the lower element of dst, and copy the // upper element from a to the upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sqrt_sd FORCE_INLINE __m128d _mm_sqrt_sd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return _mm_move_sd(a, _mm_sqrt_pd(b)); #else return _mm_set_pd(((double *) &a)[1], sqrt(((double *) &b)[0])); #endif } // Shift packed 16-bit integers in a right by count while shifting in sign bits, // and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi16 FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) { int64_t c = vgetq_lane_s64(count, 0); if (_sse2neon_unlikely(c & ~15)) return _mm_cmplt_epi16(a, _mm_setzero_si128()); return vreinterpretq_m128i_s16( vshlq_s16((int16x8_t) a, vdupq_n_s16((int) -c))); } // Shift packed 32-bit integers in a right by count while shifting in sign bits, // and store the results in dst. 
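// Usage sketch (illustrative): arithmetic shifts replicate the sign bit, so // _mm_sra_epi32(_mm_set1_epi32(-8), _mm_cvtsi32_si128(1)) yields -4 in every // lane, and counts of 32 or more fill each lane with its sign (0 or -1).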
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sra_epi32 FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) { int64_t c = vgetq_lane_s64(count, 0); if (_sse2neon_unlikely(c & ~31)) return _mm_cmplt_epi32(a, _mm_setzero_si128()); return vreinterpretq_m128i_s32( vshlq_s32((int32x4_t) a, vdupq_n_s32((int) -c))); } // Shift packed 16-bit integers in a right by imm8 while shifting in sign // bits, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi16 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) { const int count = (imm & ~15) ? 15 : imm; return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); } // Shift packed 32-bit integers in a right by imm8 while shifting in sign bits, // and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srai_epi32 // FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm) #define _mm_srai_epi32(a, imm) \ _sse2neon_define0( \ __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) == 0)) { \ ret = _a; \ } else if (_sse2neon_likely(0 < (imm) && (imm) < 32)) { \ ret = vreinterpretq_m128i_s32( \ vshlq_s32(vreinterpretq_s32_m128i(_a), vdupq_n_s32(-(imm)))); \ } else { \ ret = vreinterpretq_m128i_s32( \ vshrq_n_s32(vreinterpretq_s32_m128i(_a), 31)); \ } _sse2neon_return(ret);) // Shift packed 16-bit integers in a right by count while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi16 FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~15)) return _mm_setzero_si128(); int16x8_t vc = vdupq_n_s16(-(int16_t) c); return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); } // Shift packed 32-bit integers in a right by count while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi32 FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~31)) return _mm_setzero_si128(); int32x4_t vc = vdupq_n_s32(-(int32_t) c); return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); } // Shift packed 64-bit integers in a right by count while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srl_epi64 FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (_sse2neon_unlikely(c & ~63)) return _mm_setzero_si128(); int64x2_t vc = vdupq_n_s64(-(int64_t) c); return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); } // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi16 #define _mm_srli_epi16(a, imm) \ _sse2neon_define0( \ __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~15)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u16( \ vshlq_u16(vreinterpretq_u16_m128i(_a), vdupq_n_s16(-(imm)))); \ } _sse2neon_return(ret);) // Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. 
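// Usage sketch (illustrative): logical shifts always fill with zeros, so for // negative inputs they differ from the arithmetic forms above, e.g. // _mm_srli_epi32(_mm_set1_epi32(-8), 1) yields 0x7FFFFFFC in every lane.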
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi32 // FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm) #define _mm_srli_epi32(a, imm) \ _sse2neon_define0( \ __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~31)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u32( \ vshlq_u32(vreinterpretq_u32_m128i(_a), vdupq_n_s32(-(imm)))); \ } _sse2neon_return(ret);) // Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_epi64 #define _mm_srli_epi64(a, imm) \ _sse2neon_define0( \ __m128i, a, __m128i ret; if (_sse2neon_unlikely((imm) & ~63)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_u64( \ vshlq_u64(vreinterpretq_u64_m128i(_a), vdupq_n_s64(-(imm)))); \ } _sse2neon_return(ret);) // Shift a right by imm8 bytes while shifting in zeros, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_srli_si128 #define _mm_srli_si128(a, imm) \ _sse2neon_define1( \ __m128i, a, int8x16_t ret; \ if (_sse2neon_unlikely((imm) & ~15)) ret = vdupq_n_s8(0); \ else ret = vextq_s8(vreinterpretq_s8_m128i(_a), vdupq_n_s8(0), \ (imm > 15 ? 0 : imm)); \ _sse2neon_return(vreinterpretq_m128i_s8(ret));) // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary // or a general-protection exception may be generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); #else vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); #endif } // Store the lower double-precision (64-bit) floating-point element from a into // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_pd1 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); #else float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); #endif } // Store the lower double-precision (64-bit) floating-point element from a into // memory. mem_addr does not need to be aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_store_sd FORCE_INLINE void _mm_store_sd(double *mem_addr, __m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); #else vst1_u64((uint64_t *) mem_addr, vget_low_u64(vreinterpretq_u64_m128d(a))); #endif } // Store 128-bits of integer data from a into memory. mem_addr must be aligned // on a 16-byte boundary or a general-protection exception may be generated. 
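// Usage sketch (illustrative; the destination must be 16-byte aligned): given // int32_t ALIGN_STRUCT(16) out[4], // _mm_store_si128((__m128i *) out, _mm_set_epi32(3, 2, 1, 0)) stores 0 in // out[0] through 3 in out[3].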
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_store_si128 FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) { vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } // Store the lower double-precision (64-bit) floating-point element from a into // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#expand=9,526,5601&text=_mm_store1_pd #define _mm_store1_pd _mm_store_pd1 // Store the upper double-precision (64-bit) floating-point element from a into // memory. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeh_pd FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); #else vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); #endif } // Store 64-bit integer from the first element of a into memory. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_epi64 FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) { vst1_u64((uint64_t *) a, vget_low_u64(vreinterpretq_u64_m128i(b))); } // Store the lower double-precision (64-bit) floating-point element from a into // memory. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storel_pd FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); #else vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); #endif } // Store 2 double-precision (64-bit) floating-point elements from a into memory // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storer_pd FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) { float32x4_t f = vreinterpretq_f32_m128d(a); _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory. mem_addr does not need to be aligned on any // particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_pd FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) { _mm_store_pd(mem_addr, a); } // Store 128-bits of integer data from a into memory. mem_addr does not need to // be aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si128 FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) { vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } // Store 32-bit integer from the first element of a into memory. mem_addr does // not need to be aligned on any particular boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_storeu_si32 FORCE_INLINE void _mm_storeu_si32(void *p, __m128i a) { vst1q_lane_s32((int32_t *) p, vreinterpretq_s32_m128i(a), 0); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory using a non-temporal memory hint. 
mem_addr must // be aligned on a 16-byte boundary or a general-protection exception may be // generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_pd FORCE_INLINE void _mm_stream_pd(double *p, __m128d a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, (__m128d *) p); #elif defined(__aarch64__) || defined(_M_ARM64) vst1q_f64(p, vreinterpretq_f64_m128d(a)); #else vst1q_s64((int64_t *) p, vreinterpretq_s64_m128d(a)); #endif } // Store 128-bits of integer data from a into memory using a non-temporal memory // hint. mem_addr must be aligned on a 16-byte boundary or a general-protection // exception may be generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si128 FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, p); #else vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); #endif } // Store 32-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si32 FORCE_INLINE void _mm_stream_si32(int *p, int a) { vst1q_lane_s32((int32_t *) p, vdupq_n_s32(a), 0); } // Store 64-bit integer a into memory using a non-temporal hint to minimize // cache pollution. If the cache line containing address mem_addr is already in // the cache, the cache will be updated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_si64 FORCE_INLINE void _mm_stream_si64(__int64 *p, __int64 a) { vst1_s64((int64_t *) p, vdup_n_s64((int64_t) a)); } // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi16 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Subtract packed 32-bit integers in b from packed 32-bit integers in a, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi32 FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Subtract packed 64-bit integers in b from packed 64-bit integers in a, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi64 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) { return vreinterpretq_m128i_s64( vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); } // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_epi8 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Subtract packed double-precision (64-bit) floating-point elements in b from // packed double-precision (64-bit) floating-point elements in a, and store the // results in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_sub_pd FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] - db[0]; c[1] = da[1] - db[1]; return vld1q_f32((float32_t *) c); #endif } // Subtract the lower double-precision (64-bit) floating-point element in b from // the lower double-precision (64-bit) floating-point element in a, store the // result in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_sd FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_sub_pd(a, b)); } // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sub_si64 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) { return vreinterpret_m64_s64( vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); } // Subtract packed signed 16-bit integers in b from packed 16-bit integers in a // using saturation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi16 FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Subtract packed signed 8-bit integers in b from packed 8-bit integers in a // using saturation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epi8 FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Subtract packed unsigned 16-bit integers in b from packed unsigned 16-bit // integers in a using saturation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu16 FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Subtract packed unsigned 8-bit integers in b from packed unsigned 8-bit // integers in a using saturation, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_subs_epu8 FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } #define _mm_ucomieq_sd _mm_comieq_sd #define _mm_ucomige_sd _mm_comige_sd #define _mm_ucomigt_sd _mm_comigt_sd #define _mm_ucomile_sd _mm_comile_sd #define _mm_ucomilt_sd _mm_comilt_sd #define _mm_ucomineq_sd _mm_comineq_sd // Return vector of type __m128d with undefined elements. 
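// Note: the contents of the returned vector are unspecified and callers must
// not rely on any particular value. The implementation below intentionally
// returns an uninitialized variable (with -Wuninitialized suppressed for
// GCC/Clang); the MSVC path zero-initializes it instead.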
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_undefined_pd FORCE_INLINE __m128d _mm_undefined_pd(void) { #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128d a; #if defined(_MSC_VER) a = _mm_setzero_pd(); #endif return a; #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif } // Unpack and interleave 16-bit integers from the high half of a and b, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi16 FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s16( vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); int16x4x2_t result = vzip_s16(a1, b1); return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); #endif } // Unpack and interleave 32-bit integers from the high half of a and b, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi32 FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s32( vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); int32x2x2_t result = vzip_s32(a1, b1); return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); #endif } // Unpack and interleave 64-bit integers from the high half of a and b, and // store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi64 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s64( vzip2q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); #endif } // Unpack and interleave 8-bit integers from the high half of a and b, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_epi8 FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s8( vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); int8x8x2_t result = vzip_s8(a1, b1); return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); #endif } // Unpack and interleave double-precision (64-bit) floating-point elements from // the high half of a and b, and store the results in dst. 
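// Illustrative semantics: dst = { a[1], b[1] }, i.e. the upper element of each
// input, with a's upper element placed in the lower lane of dst. AArch64 maps
// this to a single vzip2q_f64; the ARMv7 path recombines the high 64-bit
// halves as integers.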
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpackhi_pd FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else return vreinterpretq_m128d_s64( vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), vget_high_s64(vreinterpretq_s64_m128d(b)))); #endif } // Unpack and interleave 16-bit integers from the low half of a and b, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi16 FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s16( vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); int16x4x2_t result = vzip_s16(a1, b1); return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); #endif } // Unpack and interleave 32-bit integers from the low half of a and b, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi32 FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s32( vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); int32x2x2_t result = vzip_s32(a1, b1); return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); #endif } // Unpack and interleave 64-bit integers from the low half of a and b, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi64 FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s64( vzip1q_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); #endif } // Unpack and interleave 8-bit integers from the low half of a and b, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_epi8 FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s8( vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); int8x8x2_t result = vzip_s8(a1, b1); return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); #endif } // Unpack and interleave double-precision (64-bit) floating-point elements from // the low half of a and b, and store the results in dst. 
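// Illustrative semantics: dst = { a[0], b[0] }, the lower element of each
// input.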
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_unpacklo_pd FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else return vreinterpretq_m128d_s64( vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), vget_low_s64(vreinterpretq_s64_m128d(b)))); #endif } // Compute the bitwise XOR of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_pd FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Compute the bitwise XOR of 128 bits (representing integer data) in a and b, // and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_xor_si128 FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } /* SSE3 */ // Alternatively add and subtract packed double-precision (64-bit) // floating-point elements in a to/from packed elements in b, and store the // results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_addsub_pd FORCE_INLINE __m128d _mm_addsub_pd(__m128d a, __m128d b) { _sse2neon_const __m128d mask = _mm_set_pd(1.0f, -1.0f); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vfmaq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b), vreinterpretq_f64_m128d(mask))); #else return _mm_add_pd(_mm_mul_pd(b, mask), a); #endif } // Alternatively add and subtract packed single-precision (32-bit) // floating-point elements in a to/from packed elements in b, and store the // results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=addsub_ps FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { _sse2neon_const __m128 mask = _mm_setr_ps(-1.0f, 1.0f, -1.0f, 1.0f); #if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_FMA) /* VFPv4+ */ return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(mask), vreinterpretq_f32_m128(b))); #else return _mm_add_ps(_mm_mul_ps(b, mask), a); #endif } // Horizontally add adjacent pairs of double-precision (64-bit) floating-point // elements in a and b, and pack the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pd FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[] = {da[0] + da[1], db[0] + db[1]}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } // Horizontally add adjacent pairs of single-precision (32-bit) floating-point // elements in a and b, and pack the results in dst. 
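// Illustrative semantics: dst = { a0+a1, a2+a3, b0+b1, b2+b3 }. AArch64 uses a
// single vpaddq_f32; the ARMv7 path applies vpadd_f32 to the low/high halves
// of each input and recombines the results.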
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_ps FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128_f32( vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32( vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); #endif } // Horizontally subtract adjacent pairs of double-precision (64-bit) // floating-point elements in a and b, and pack the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pd FORCE_INLINE __m128d _mm_hsub_pd(__m128d _a, __m128d _b) { #if defined(__aarch64__) || defined(_M_ARM64) float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64( vsubq_f64(vuzp1q_f64(a, b), vuzp2q_f64(a, b))); #else double *da = (double *) &_a; double *db = (double *) &_b; double c[] = {da[0] - da[1], db[0] - db[1]}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } // Horizontally subtract adjacent pairs of single-precision (32-bit) // floating-point elements in a and b, and pack the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_ps FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) { float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128_f32( vsubq_f32(vuzp1q_f32(a, b), vuzp2q_f32(a, b))); #else float32x4x2_t c = vuzpq_f32(a, b); return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); #endif } // Load 128-bits of integer data from unaligned memory into dst. This intrinsic // may perform better than _mm_loadu_si128 when the data crosses a cache line // boundary. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_lddqu_si128 #define _mm_lddqu_si128 _mm_loadu_si128 // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_loaddup_pd #define _mm_loaddup_pd _mm_load1_pd // Duplicate the low double-precision (64-bit) floating-point element from a, // and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movedup_pd FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64( vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_u64( vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); #endif } // Duplicate odd-indexed single-precision (32-bit) floating-point elements // from a, and store the results in dst. 
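// Illustrative semantics: dst = { a1, a1, a3, a3 }, i.e. each odd-indexed lane
// is duplicated into the even lane below it.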
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_movehdup_ps FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128_f32( vtrn2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); #elif defined(_sse2neon_shuffle) return vreinterpretq_m128_f32(vshuffleq_s32( vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); #else float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; return vreinterpretq_m128_f32(vld1q_f32(data)); #endif } // Duplicate even-indexed single-precision (32-bit) floating-point elements // from a, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_moveldup_ps FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128_f32( vtrn1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a))); #elif defined(_sse2neon_shuffle) return vreinterpretq_m128_f32(vshuffleq_s32( vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); #else float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; return vreinterpretq_m128_f32(vld1q_f32(data)); #endif } /* SSSE3 */ // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi16 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) { return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); } // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi32 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) { return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); } // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_epi8 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) { return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); } // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi16 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) { return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); } // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi32 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) { return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); } // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_abs_pi8 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) { return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); } // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift // the result right by imm8 bytes, and store the low 16 bytes in dst. 
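// Illustrative semantics for imm8 in [0, 16]: dst holds the bytes
// { b[imm8], ..., b[15], a[0], ..., a[imm8-1] }, so imm8 == 0 returns b and
// imm8 == 16 returns a; any imm8 above 31 produces all zeros. This mirrors the
// vextq_u8-based implementations below.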
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_epi8 #if defined(__GNUC__) && !defined(__clang__) #define _mm_alignr_epi8(a, b, imm) \ __extension__({ \ uint8x16_t _a = vreinterpretq_u8_m128i(a); \ uint8x16_t _b = vreinterpretq_u8_m128i(b); \ __m128i ret; \ if (_sse2neon_unlikely((imm) & ~31)) \ ret = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ else if (imm >= 16) \ ret = _mm_srli_si128(a, imm >= 16 ? imm - 16 : 0); \ else \ ret = \ vreinterpretq_m128i_u8(vextq_u8(_b, _a, imm < 16 ? imm : 0)); \ ret; \ }) #else #define _mm_alignr_epi8(a, b, imm) \ _sse2neon_define2( \ __m128i, a, b, uint8x16_t __a = vreinterpretq_u8_m128i(_a); \ uint8x16_t __b = vreinterpretq_u8_m128i(_b); __m128i ret; \ if (_sse2neon_unlikely((imm) & ~31)) ret = \ vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ else if (imm >= 16) ret = \ _mm_srli_si128(_a, imm >= 16 ? imm - 16 : 0); \ else ret = \ vreinterpretq_m128i_u8(vextq_u8(__b, __a, imm < 16 ? imm : 0)); \ _sse2neon_return(ret);) #endif // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift // the result right by imm8 bytes, and store the low 8 bytes in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_alignr_pi8 #define _mm_alignr_pi8(a, b, imm) \ _sse2neon_define2( \ __m64, a, b, __m64 ret; if (_sse2neon_unlikely((imm) >= 16)) { \ ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ } else { \ uint8x8_t tmp_low; \ uint8x8_t tmp_high; \ if ((imm) >= 8) { \ const int idx = (imm) -8; \ tmp_low = vreinterpret_u8_m64(_a); \ tmp_high = vdup_n_u8(0); \ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ } else { \ const int idx = (imm); \ tmp_low = vreinterpret_u8_m64(_b); \ tmp_high = vreinterpret_u8_m64(_a); \ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ } \ } _sse2neon_return(ret);) // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the // signed 16-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi16 FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); #else return vreinterpretq_m128i_s16( vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); #endif } // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the // signed 32-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_epi32 FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s32(vpaddq_s32(a, b)); #else return vreinterpretq_m128i_s32( vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); #endif } // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the // signed 16-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi16 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the // signed 32-bit results in dst. 
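// Illustrative semantics: dst = { a0+a1, b0+b1 } on 32-bit lanes.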
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadd_pi32 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) { return vreinterpret_m64_s32( vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); } // Horizontally add adjacent pairs of signed 16-bit integers in a and b using // saturation, and pack the signed 16-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_epi16 FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) { #if defined(__aarch64__) || defined(_M_ARM64) int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); return vreinterpretq_s64_s16( vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // Interleave using vshrn/vmovn // [a0|a2|a4|a6|b0|b2|b4|b6] // [a1|a3|a5|a7|b1|b3|b5|b7] int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); // Saturated add return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); #endif } // Horizontally add adjacent pairs of signed 16-bit integers in a and b using // saturation, and pack the signed 16-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hadds_pi16 FORCE_INLINE __m64 _mm_hadds_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpret_s64_s16(vqadd_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t res = vuzp_s16(a, b); return vreinterpret_s64_s16(vqadd_s16(res.val[0], res.val[1])); #endif } // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack // the signed 16-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi16 FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s16( vsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int16x8x2_t c = vuzpq_s16(a, b); return vreinterpretq_m128i_s16(vsubq_s16(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack // the signed 32-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_epi32 FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s32( vsubq_s32(vuzp1q_s32(a, b), vuzp2q_s32(a, b))); #else int32x4x2_t c = vuzpq_s32(a, b); return vreinterpretq_m128i_s32(vsubq_s32(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of 16-bit integers in a and b, and pack // the signed 16-bit results in dst. 
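// Illustrative semantics: dst = { a0-a1, a2-a3, b0-b1, b2-b3 } on 16-bit
// lanes.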
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsub_pi16 FORCE_INLINE __m64 _mm_hsub_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpret_m64_s16(vsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); return vreinterpret_m64_s16(vsub_s16(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of 32-bit integers in a and b, and pack // the signed 32-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_hsub_pi32 FORCE_INLINE __m64 _mm_hsub_pi32(__m64 _a, __m64 _b) { int32x2_t a = vreinterpret_s32_m64(_a); int32x2_t b = vreinterpret_s32_m64(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpret_m64_s32(vsub_s32(vuzp1_s32(a, b), vuzp2_s32(a, b))); #else int32x2x2_t c = vuzp_s32(a, b); return vreinterpret_m64_s32(vsub_s32(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b // using saturation, and pack the signed 16-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_epi16 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s16( vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int16x8x2_t c = vuzpq_s16(a, b); return vreinterpretq_m128i_s16(vqsubq_s16(c.val[0], c.val[1])); #endif } // Horizontally subtract adjacent pairs of signed 16-bit integers in a and b // using saturation, and pack the signed 16-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_hsubs_pi16 FORCE_INLINE __m64 _mm_hsubs_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpret_m64_s16(vqsub_s16(vuzp1_s16(a, b), vuzp2_s16(a, b))); #else int16x4x2_t c = vuzp_s16(a, b); return vreinterpret_m64_s16(vqsub_s16(c.val[0], c.val[1])); #endif } // Vertically multiply each unsigned 8-bit integer from a with the corresponding // signed 8-bit integer from b, producing intermediate signed 16-bit integers. // Horizontally add adjacent pairs of intermediate signed 16-bit integers, // and pack the saturated results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_epi16 FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) { #if defined(__aarch64__) || defined(_M_ARM64) uint8x16_t a = vreinterpretq_u8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b))); int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b))); return vreinterpretq_m128i_s16( vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); #else // This would be much simpler if x86 would choose to zero extend OR sign // extend, not both. This could probably be optimized better. uint16x8_t a = vreinterpretq_u16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); // Zero extend a int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); // Sign extend by shifting left then shifting right. 
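// Each 16-bit lane of b holds two packed signed bytes: shifting left by 8 then
// arithmetic-shifting right by 8 sign-extends the low byte (b_even), while an
// arithmetic shift right by 8 alone sign-extends the high byte (b_odd).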
int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); int16x8_t b_odd = vshrq_n_s16(b, 8); // multiply int16x8_t prod1 = vmulq_s16(a_even, b_even); int16x8_t prod2 = vmulq_s16(a_odd, b_odd); // saturated add return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); #endif } // Vertically multiply each unsigned 8-bit integer from a with the corresponding // signed 8-bit integer from b, producing intermediate signed 16-bit integers. // Horizontally add adjacent pairs of intermediate signed 16-bit integers, and // pack the saturated results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_maddubs_pi16 FORCE_INLINE __m64 _mm_maddubs_pi16(__m64 _a, __m64 _b) { uint16x4_t a = vreinterpret_u16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); // Zero extend a int16x4_t a_odd = vreinterpret_s16_u16(vshr_n_u16(a, 8)); int16x4_t a_even = vreinterpret_s16_u16(vand_u16(a, vdup_n_u16(0xff))); // Sign extend by shifting left then shifting right. int16x4_t b_even = vshr_n_s16(vshl_n_s16(b, 8), 8); int16x4_t b_odd = vshr_n_s16(b, 8); // multiply int16x4_t prod1 = vmul_s16(a_even, b_even); int16x4_t prod2 = vmul_s16(a_odd, b_odd); // saturated add return vreinterpret_m64_s16(vqadd_s16(prod1, prod2)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate // signed 32-bit integers. Shift right by 15 bits while rounding up, and store // the packed 16-bit integers in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_epi16 FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) { // Has issues due to saturation // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); // Multiply int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), vget_high_s16(vreinterpretq_s16_m128i(b))); // Rounding narrowing shift right // narrow = (int16_t)((mul + 16384) >> 15); int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); // Join together return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate // signed 32-bit integers. Truncate each intermediate integer to the 18 most // significant bits, round by adding 1, and store bits [16:1] to dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mulhrs_pi16 FORCE_INLINE __m64 _mm_mulhrs_pi16(__m64 a, __m64 b) { int32x4_t mul_extend = vmull_s16((vreinterpret_s16_m64(a)), (vreinterpret_s16_m64(b))); // Rounding narrowing shift right return vreinterpret_m64_s16(vrshrn_n_s32(mul_extend, 15)); } // Shuffle packed 8-bit integers in a according to shuffle control mask in the // corresponding 8-bit element of b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_epi8 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); #elif defined(__GNUC__) int8x16_t ret; // %e and %f represent the even and odd D registers // respectively. 
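// The two vtbl.8 lookups emulate PSHUFB on ARMv7: each one indexes the full
// 16-byte table split across two D registers, and any index byte with bit 7
// set (preserved by the 0x8F mask above) is out of range for the table and
// therefore produces zero, matching PSHUFB's zeroing behavior.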
__asm__ __volatile__( "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" : [ret] "=&w"(ret) : [tbl] "w"(tbl), [idx] "w"(idx_masked)); return vreinterpretq_m128i_s8(ret); #else // use this line if testing on aarch64 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; return vreinterpretq_m128i_s8( vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), vtbl2_s8(a_split, vget_high_u8(idx_masked)))); #endif } // Shuffle packed 8-bit integers in a according to shuffle control mask in the // corresponding 8-bit element of b, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_shuffle_pi8 FORCE_INLINE __m64 _mm_shuffle_pi8(__m64 a, __m64 b) { const int8x8_t controlMask = vand_s8(vreinterpret_s8_m64(b), vdup_n_s8((int8_t) (0x1 << 7 | 0x07))); int8x8_t res = vtbl1_s8(vreinterpret_s8_m64(a), controlMask); return vreinterpret_m64_s8(res); } // Negate packed 16-bit integers in a when the corresponding signed // 16-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi16 FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFF : 0 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 #if defined(__aarch64__) || defined(_M_ARM64) int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); #else int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); #endif // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative // 'a') based on ltMask int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); // res = masked & (~zeroMask) int16x8_t res = vbicq_s16(masked, zeroMask); return vreinterpretq_m128i_s16(res); } // Negate packed 32-bit integers in a when the corresponding signed // 32-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi32 FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFFFFFF : 0 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 #if defined(__aarch64__) || defined(_M_ARM64) int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); #else int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); #endif // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative // 'a') based on ltMask int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); // res = masked & (~zeroMask) int32x4_t res = vbicq_s32(masked, zeroMask); return vreinterpretq_m128i_s32(res); } // Negate packed 8-bit integers in a when the corresponding signed // 8-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. 
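// Illustrative semantics, per 8-bit lane:
// dst[i] = (b[i] < 0) ? -a[i] : ((b[i] == 0) ? 0 : a[i]).
// The same select-negate-or-zero pattern is used by all _mm_sign_* variants.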
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_epi8 FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) { int8x16_t a = vreinterpretq_s8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFF : 0 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); // (b == 0) ? 0xFF : 0 #if defined(__aarch64__) || defined(_M_ARM64) int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); #else int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); #endif // bitwise select either a or negative 'a' (vnegq_s8(a) return negative 'a') // based on ltMask int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); // res = masked & (~zeroMask) int8x16_t res = vbicq_s8(masked, zeroMask); return vreinterpretq_m128i_s8(res); } // Negate packed 16-bit integers in a when the corresponding signed 16-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi16 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFF : 0 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 #if defined(__aarch64__) || defined(_M_ARM64) int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); #else int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); #endif // bitwise select either a or negative 'a' (vneg_s16(a) return negative 'a') // based on ltMask int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); // res = masked & (~zeroMask) int16x4_t res = vbic_s16(masked, zeroMask); return vreinterpret_m64_s16(res); } // Negate packed 32-bit integers in a when the corresponding signed 32-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi32 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) { int32x2_t a = vreinterpret_s32_m64(_a); int32x2_t b = vreinterpret_s32_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFFFFFF : 0 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 #if defined(__aarch64__) || defined(_M_ARM64) int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); #else int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); #endif // bitwise select either a or negative 'a' (vneg_s32(a) return negative 'a') // based on ltMask int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); // res = masked & (~zeroMask) int32x2_t res = vbic_s32(masked, zeroMask); return vreinterpret_m64_s32(res); } // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer // in b is negative, and store the results in dst. Element in dst are zeroed out // when the corresponding element in b is zero. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_sign_pi8 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) { int8x8_t a = vreinterpret_s8_m64(_a); int8x8_t b = vreinterpret_s8_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFF : 0 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); // (b == 0) ? 
0xFF : 0 #if defined(__aarch64__) || defined(_M_ARM64) int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); #else int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); #endif // bitwise select either a or negative 'a' (vneg_s8(a) return negative 'a') // based on ltMask int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); // res = masked & (~zeroMask) int8x8_t res = vbic_s8(masked, zeroMask); return vreinterpret_m64_s8(res); } /* SSE4.1 */ // Blend packed 16-bit integers from a and b using control mask imm8, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_epi16 // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, // __constrange(0,255) int imm) #define _mm_blend_epi16(a, b, imm) \ _sse2neon_define2( \ __m128i, a, b, \ const uint16_t _mask[8] = \ _sse2neon_init(((imm) & (1 << 0)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 1)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 2)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 3)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 4)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 5)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 6)) ? (uint16_t) -1 : 0x0, \ ((imm) & (1 << 7)) ? (uint16_t) -1 : 0x0); \ uint16x8_t _mask_vec = vld1q_u16(_mask); \ uint16x8_t __a = vreinterpretq_u16_m128i(_a); \ uint16x8_t __b = vreinterpretq_u16_m128i(_b); _sse2neon_return( \ vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, __b, __a)));) // Blend packed double-precision (64-bit) floating-point elements from a and b // using control mask imm8, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_pd #define _mm_blend_pd(a, b, imm) \ _sse2neon_define2( \ __m128d, a, b, \ const uint64_t _mask[2] = \ _sse2neon_init(((imm) & (1 << 0)) ? ~UINT64_C(0) : UINT64_C(0), \ ((imm) & (1 << 1)) ? ~UINT64_C(0) : UINT64_C(0)); \ uint64x2_t _mask_vec = vld1q_u64(_mask); \ uint64x2_t __a = vreinterpretq_u64_m128d(_a); \ uint64x2_t __b = vreinterpretq_u64_m128d(_b); _sse2neon_return( \ vreinterpretq_m128d_u64(vbslq_u64(_mask_vec, __b, __a)));) // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blend_ps FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) { const uint32_t ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, ((imm8) & (1 << 1)) ? UINT32_MAX : 0, ((imm8) & (1 << 2)) ? UINT32_MAX : 0, ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; uint32x4_t mask = vld1q_u32(data); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } // Blend packed 8-bit integers from a and b using mask, and store the results in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_epi8 FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) { // Use a signed shift right to create a mask with the sign bit uint8x16_t mask = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); uint8x16_t a = vreinterpretq_u8_m128i(_a); uint8x16_t b = vreinterpretq_u8_m128i(_b); return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); } // Blend packed double-precision (64-bit) floating-point elements from a and b // using mask, and store the results in dst. 
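// Illustrative semantics: only the sign bit of each 64-bit mask element is
// consulted, so dst[i] = (mask[i] has its sign bit set) ? b[i] : a[i]. The
// implementation below broadcasts that sign bit across the lane with an
// arithmetic shift right by 63 and then performs a bitwise select.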
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_pd FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) { uint64x2_t mask = vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); #if defined(__aarch64__) || defined(_M_ARM64) float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); #else uint64x2_t a = vreinterpretq_u64_m128d(_a); uint64x2_t b = vreinterpretq_u64_m128d(_b); return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); #endif } // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_blendv_ps FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) { // Use a signed shift right to create a mask with the sign bit uint32x4_t mask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } // Round the packed double-precision (64-bit) floating-point elements in a up // to an integer value, and store the results as packed double-precision // floating-point elements in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_pd FORCE_INLINE __m128d _mm_ceil_pd(__m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndpq_f64(vreinterpretq_f64_m128d(a))); #else double *f = (double *) &a; return _mm_set_pd(ceil(f[1]), ceil(f[0])); #endif } // Round the packed single-precision (32-bit) floating-point elements in a up to // an integer value, and store the results as packed single-precision // floating-point elements in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ps FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { #if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); #else float *f = (float *) &a; return _mm_set_ps(ceilf(f[3]), ceilf(f[2]), ceilf(f[1]), ceilf(f[0])); #endif } // Round the lower double-precision (64-bit) floating-point element in b up to // an integer value, store the result as a double-precision floating-point // element in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_sd FORCE_INLINE __m128d _mm_ceil_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_ceil_pd(b)); } // Round the lower single-precision (32-bit) floating-point element in b up to // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. 
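// Illustrative semantics: dst = { ceil(b[0]), a[1], a[2], a[3] }, implemented
// below as _mm_move_ss(a, _mm_ceil_ps(b)).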
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_ceil_ss FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_ceil_ps(b)); } // Compare packed 64-bit integers in a and b for equality, and store the results // in dst FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_u64( vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); #else // ARMv7 lacks vceqq_u64 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); #endif } // Sign extend packed 16-bit integers in a to packed 32-bit integers, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi32 FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) { return vreinterpretq_m128i_s32( vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); } // Sign extend packed 16-bit integers in a to packed 64-bit integers, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi16_epi64 FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) { int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_s64(s64x2); } // Sign extend packed 32-bit integers in a to packed 64-bit integers, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi32_epi64 FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) { return vreinterpretq_m128i_s64( vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); } // Sign extend packed 8-bit integers in a to packed 16-bit integers, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi16 FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ return vreinterpretq_m128i_s16(s16x8); } // Sign extend packed 8-bit integers in a to packed 32-bit integers, and store // the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi32 FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ return vreinterpretq_m128i_s32(s32x4); } // Sign extend packed 8-bit integers in the low 8 bytes of a to packed 64-bit // integers, and store the results in dst. 
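// Illustrative semantics: only the two lowest bytes of a participate,
// dst = { (int64_t) (int8_t) byte0, (int64_t) (int8_t) byte1 }; the widening
// below is performed as three successive vmovl steps (8 -> 16 -> 32 -> 64
// bits).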
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepi8_epi64 FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_s64(s64x2); } // Zero extend packed unsigned 16-bit integers in a to packed 32-bit integers, // and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi32 FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) { return vreinterpretq_m128i_u32( vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); } // Zero extend packed unsigned 16-bit integers in a to packed 64-bit integers, // and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu16_epi64 FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) { uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_u64(u64x2); } // Zero extend packed unsigned 32-bit integers in a to packed 64-bit integers, // and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu32_epi64 FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) { return vreinterpretq_m128i_u64( vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); } // Zero extend packed unsigned 8-bit integers in a to packed 16-bit integers, // and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi16 FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx HGFE DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0H0G 0F0E 0D0C 0B0A */ return vreinterpretq_m128i_u16(u16x8); } // Zero extend packed unsigned 8-bit integers in a to packed 32-bit integers, // and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi32 FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ return vreinterpretq_m128i_u32(u32x4); } // Zero extend packed unsigned 8-bit integers in the low 8 bytes of a to packed // 64-bit integers, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cvtepu8_epi64 FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_u64(u64x2); } // Conditionally multiply the packed double-precision (64-bit) floating-point // elements in a and b using the high 4 bits in imm8, sum the four products, and // conditionally store the sum in dst using the low 4 bits of imm8. 
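// Illustrative example: for the two double-precision lanes only two products
// can contribute, selected by imm8 bits 4 and 5. With imm8 = 0x31 both bits
// are set, so sum = a[0]*b[0] + a[1]*b[1], and since only bit 0 of the low
// nibble is set, dst = { sum, 0.0 }.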
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_pd FORCE_INLINE __m128d _mm_dp_pd(__m128d a, __m128d b, const int imm) { // Generate mask value from constant immediate bit value const int64_t bit0Mask = imm & 0x01 ? UINT64_MAX : 0; const int64_t bit1Mask = imm & 0x02 ? UINT64_MAX : 0; #if !SSE2NEON_PRECISE_DP const int64_t bit4Mask = imm & 0x10 ? UINT64_MAX : 0; const int64_t bit5Mask = imm & 0x20 ? UINT64_MAX : 0; #endif // Conditional multiplication #if !SSE2NEON_PRECISE_DP __m128d mul = _mm_mul_pd(a, b); const __m128d mulMask = _mm_castsi128_pd(_mm_set_epi64x(bit5Mask, bit4Mask)); __m128d tmp = _mm_and_pd(mul, mulMask); #else #if defined(__aarch64__) || defined(_M_ARM64) double d0 = (imm & 0x10) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0) * vgetq_lane_f64(vreinterpretq_f64_m128d(b), 0) : 0; double d1 = (imm & 0x20) ? vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1) * vgetq_lane_f64(vreinterpretq_f64_m128d(b), 1) : 0; #else double d0 = (imm & 0x10) ? ((double *) &a)[0] * ((double *) &b)[0] : 0; double d1 = (imm & 0x20) ? ((double *) &a)[1] * ((double *) &b)[1] : 0; #endif __m128d tmp = _mm_set_pd(d1, d0); #endif // Sum the products #if defined(__aarch64__) || defined(_M_ARM64) double sum = vpaddd_f64(vreinterpretq_f64_m128d(tmp)); #else double sum = *((double *) &tmp) + *(((double *) &tmp) + 1); #endif // Conditionally store the sum const __m128d sumMask = _mm_castsi128_pd(_mm_set_epi64x(bit1Mask, bit0Mask)); __m128d res = _mm_and_pd(_mm_set_pd1(sum), sumMask); return res; } // Conditionally multiply the packed single-precision (32-bit) floating-point // elements in a and b using the high 4 bits in imm8, sum the four products, // and conditionally store the sum in dst using the low 4 bits of imm. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_dp_ps FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) { float32x4_t elementwise_prod = _mm_mul_ps(a, b); #if defined(__aarch64__) || defined(_M_ARM64) /* shortcuts */ if (imm == 0xFF) { return _mm_set1_ps(vaddvq_f32(elementwise_prod)); } if ((imm & 0x0F) == 0x0F) { if (!(imm & (1 << 4))) elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 0); if (!(imm & (1 << 5))) elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 1); if (!(imm & (1 << 6))) elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 2); if (!(imm & (1 << 7))) elementwise_prod = vsetq_lane_f32(0.0f, elementwise_prod, 3); return _mm_set1_ps(vaddvq_f32(elementwise_prod)); } #endif float s = 0.0f; if (imm & (1 << 4)) s += vgetq_lane_f32(elementwise_prod, 0); if (imm & (1 << 5)) s += vgetq_lane_f32(elementwise_prod, 1); if (imm & (1 << 6)) s += vgetq_lane_f32(elementwise_prod, 2); if (imm & (1 << 7)) s += vgetq_lane_f32(elementwise_prod, 3); const float32_t res[4] = { (imm & 0x1) ? s : 0.0f, (imm & 0x2) ? s : 0.0f, (imm & 0x4) ? s : 0.0f, (imm & 0x8) ? s : 0.0f, }; return vreinterpretq_m128_f32(vld1q_f32(res)); } // Extract a 32-bit integer from a, selected with imm8, and store the result in // dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi32 // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) #define _mm_extract_epi32(a, imm) \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) // Extract a 64-bit integer from a, selected with imm8, and store the result in // dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi64 // FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) #define _mm_extract_epi64(a, imm) \ vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) // Extract an 8-bit integer from a, selected with imm8, and store the result in // the lower element of dst. FORCE_INLINE int _mm_extract_epi8(__m128i a, // __constrange(0,16) int imm) // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_extract_epi8 #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) // Extracts the selected single-precision (32-bit) floating-point from a. // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) // Round the packed double-precision (64-bit) floating-point elements in a down // to an integer value, and store the results as packed double-precision // floating-point elements in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_pd FORCE_INLINE __m128d _mm_floor_pd(__m128d a) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128d_f64(vrndmq_f64(vreinterpretq_f64_m128d(a))); #else double *f = (double *) &a; return _mm_set_pd(floor(f[1]), floor(f[0])); #endif } // Round the packed single-precision (32-bit) floating-point elements in a down // to an integer value, and store the results as packed single-precision // floating-point elements in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ps FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { #if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); #else float *f = (float *) &a; return _mm_set_ps(floorf(f[3]), floorf(f[2]), floorf(f[1]), floorf(f[0])); #endif } // Round the lower double-precision (64-bit) floating-point element in b down to // an integer value, store the result as a double-precision floating-point // element in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_sd FORCE_INLINE __m128d _mm_floor_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_floor_pd(b)); } // Round the lower single-precision (32-bit) floating-point element in b down to // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_floor_ss FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_floor_ps(b)); } // Copy a to dst, and insert the 32-bit integer i into dst at the location // specified by imm8. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi32 // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, // __constrange(0,4) int imm) #define _mm_insert_epi32(a, b, imm) \ vreinterpretq_m128i_s32( \ vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))) // Copy a to dst, and insert the 64-bit integer i into dst at the location // specified by imm8. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi64 // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, // __constrange(0,2) int imm) #define _mm_insert_epi64(a, b, imm) \ vreinterpretq_m128i_s64( \ vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))) // Copy a to dst, and insert the lower 8-bit integer from i into dst at the // location specified by imm8. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_insert_epi8 // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, // __constrange(0,16) int imm) #define _mm_insert_epi8(a, b, imm) \ vreinterpretq_m128i_s8(vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))) // Copy a to tmp, then insert a single-precision (32-bit) floating-point // element from b into tmp using the control in imm8. Store tmp to dst using // the mask in imm8 (elements are zeroed out when the corresponding bit is set). // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=insert_ps #define _mm_insert_ps(a, b, imm8) \ _sse2neon_define2( \ __m128, a, b, \ float32x4_t tmp1 = \ vsetq_lane_f32(vgetq_lane_f32(_b, (imm8 >> 6) & 0x3), \ vreinterpretq_f32_m128(_a), 0); \ float32x4_t tmp2 = \ vsetq_lane_f32(vgetq_lane_f32(tmp1, 0), \ vreinterpretq_f32_m128(_a), ((imm8 >> 4) & 0x3)); \ const uint32_t data[4] = \ _sse2neon_init(((imm8) & (1 << 0)) ? UINT32_MAX : 0, \ ((imm8) & (1 << 1)) ? UINT32_MAX : 0, \ ((imm8) & (1 << 2)) ? UINT32_MAX : 0, \ ((imm8) & (1 << 3)) ? UINT32_MAX : 0); \ uint32x4_t mask = vld1q_u32(data); \ float32x4_t all_zeros = vdupq_n_f32(0); \ \ _sse2neon_return(vreinterpretq_m128_f32( \ vbslq_f32(mask, all_zeros, vreinterpretq_f32_m128(tmp2))));) // Compare packed signed 32-bit integers in a and b, and store packed maximum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi32 FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed signed 8-bit integers in a and b, and store packed maximum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epi8 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed unsigned 16-bit integers in a and b, and store packed maximum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu16 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Compare packed unsigned 32-bit integers in a and b, and store packed maximum // values in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_max_epu32 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); } // Compare packed signed 32-bit integers in a and b, and store packed minimum // values in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi32
FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s32(
        vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b)));
}

// Compare packed signed 8-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epi8
FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_s8(
        vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b)));
}

// Compare packed unsigned 16-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu16
FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u16(
        vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)));
}

// Compare packed unsigned 32-bit integers in a and b, and store packed minimum
// values in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_min_epu32
FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b)
{
    return vreinterpretq_m128i_u32(
        vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)));
}

// Horizontally compute the minimum amongst the packed unsigned 16-bit integers
// in a, store the minimum and index in dst, and zero the remaining bits in dst.
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_minpos_epu16
FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a)
{
    __m128i dst;
    uint16_t min, idx = 0;
#if defined(__aarch64__) || defined(_M_ARM64)
    // Find the minimum value
    min = vminvq_u16(vreinterpretq_u16_m128i(a));

    // Get the index of the minimum value
    static const uint16_t idxv[] = {0, 1, 2, 3, 4, 5, 6, 7};
    uint16x8_t minv = vdupq_n_u16(min);
    uint16x8_t cmeq = vceqq_u16(minv, vreinterpretq_u16_m128i(a));
    idx = vminvq_u16(vornq_u16(vld1q_u16(idxv), cmeq));
#else
    // Find the minimum value
    __m64 tmp;
    tmp = vreinterpret_m64_u16(
        vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)),
                 vget_high_u16(vreinterpretq_u16_m128i(a))));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    tmp = vreinterpret_m64_u16(
        vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp)));
    min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0);
    // Get the index of the minimum value
    int i;
    for (i = 0; i < 8; i++) {
        if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) {
            idx = (uint16_t) i;
            break;
        }
        a = _mm_srli_si128(a, 2);
    }
#endif
    // Generate result
    dst = _mm_setzero_si128();
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0));
    dst = vreinterpretq_m128i_u16(
        vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1));
    return dst;
}

// Compute the sum of absolute differences (SADs) of quadruplets of unsigned
// 8-bit integers in a compared to those in b, and store the 16-bit results in
// dst. Eight SADs are performed using one quadruplet from b and eight
// quadruplets from a. One quadruplet is selected from b starting at the
// offset specified in imm8. Eight quadruplets are formed from sequential 8-bit
// integers selected from a starting at the offset specified in imm8.
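// For example, imm8 = 0x5 selects the quadruplet b[4..7] (imm8[1:0] = 1) and
// forms the eight overlapping quadruplets starting at a[4] (imm8[2] = 1).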
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mpsadbw_epu8 FORCE_INLINE __m128i _mm_mpsadbw_epu8(__m128i a, __m128i b, const int imm) { uint8x16_t _a, _b; switch (imm & 0x4) { case 0: // do nothing _a = vreinterpretq_u8_m128i(a); break; case 4: _a = vreinterpretq_u8_u32(vextq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(a), 1)); break; default: #if defined(__GNUC__) || defined(__clang__) __builtin_unreachable(); #elif defined(_MSC_VER) __assume(0); #endif break; } switch (imm & 0x3) { case 0: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 0))); break; case 1: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 1))); break; case 2: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 2))); break; case 3: _b = vreinterpretq_u8_u32( vdupq_n_u32(vgetq_lane_u32(vreinterpretq_u32_m128i(b), 3))); break; default: #if defined(__GNUC__) || defined(__clang__) __builtin_unreachable(); #elif defined(_MSC_VER) __assume(0); #endif break; } int16x8_t c04, c15, c26, c37; uint8x8_t low_b = vget_low_u8(_b); c04 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a), low_b)); uint8x16_t _a_1 = vextq_u8(_a, _a, 1); c15 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_1), low_b)); uint8x16_t _a_2 = vextq_u8(_a, _a, 2); c26 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_2), low_b)); uint8x16_t _a_3 = vextq_u8(_a, _a, 3); c37 = vreinterpretq_s16_u16(vabdl_u8(vget_low_u8(_a_3), low_b)); #if defined(__aarch64__) || defined(_M_ARM64) // |0|4|2|6| c04 = vpaddq_s16(c04, c26); // |1|5|3|7| c15 = vpaddq_s16(c15, c37); int32x4_t trn1_c = vtrn1q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); int32x4_t trn2_c = vtrn2q_s32(vreinterpretq_s32_s16(c04), vreinterpretq_s32_s16(c15)); return vreinterpretq_m128i_s16(vpaddq_s16(vreinterpretq_s16_s32(trn1_c), vreinterpretq_s16_s32(trn2_c))); #else int16x4_t c01, c23, c45, c67; c01 = vpadd_s16(vget_low_s16(c04), vget_low_s16(c15)); c23 = vpadd_s16(vget_low_s16(c26), vget_low_s16(c37)); c45 = vpadd_s16(vget_high_s16(c04), vget_high_s16(c15)); c67 = vpadd_s16(vget_high_s16(c26), vget_high_s16(c37)); return vreinterpretq_m128i_s16( vcombine_s16(vpadd_s16(c01, c23), vpadd_s16(c45, c67))); #endif } // Multiply the low signed 32-bit integers from each packed 64-bit element in // a and b, and store the signed 64-bit results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mul_epi32 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) { // vmull_s32 upcasts instead of masking, so we downcast. int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); } // Multiply the packed 32-bit integers in a and b, producing intermediate 64-bit // integers, and store the low 32 bits of the intermediate integers in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_mullo_epi32 FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Convert packed signed 32-bit integers from a and b to packed 16-bit integers // using unsigned saturation, and store the results in dst. 
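// NEON's vqmovun_s32 performs exactly this signed-to-unsigned saturating
// narrowing (negative values clamp to 0, values above 0xFFFF clamp to 0xFFFF),
// so the emulation is a direct mapping.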
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_packus_epi32 FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), vqmovun_s32(vreinterpretq_s32_m128i(b)))); } // Round the packed double-precision (64-bit) floating-point elements in a using // the rounding parameter, and store the results as packed double-precision // floating-point elements in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_pd FORCE_INLINE __m128d _mm_round_pd(__m128d a, int rounding) { #if defined(__aarch64__) || defined(_M_ARM64) switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): return vreinterpretq_m128d_f64(vrndnq_f64(vreinterpretq_f64_m128d(a))); case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): return _mm_floor_pd(a); case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): return _mm_ceil_pd(a); case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): return vreinterpretq_m128d_f64(vrndq_f64(vreinterpretq_f64_m128d(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128d_f64(vrndiq_f64(vreinterpretq_f64_m128d(a))); } #else double *v_double = (double *) &a; if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { double res[2], tmp; for (int i = 0; i < 2; i++) { tmp = (v_double[i] < 0) ? -v_double[i] : v_double[i]; double roundDown = floor(tmp); // Round down value double roundUp = ceil(tmp); // Round up value double diffDown = tmp - roundDown; double diffUp = roundUp - tmp; if (diffDown < diffUp) { /* If it's closer to the round down value, then use it */ res[i] = roundDown; } else if (diffDown > diffUp) { /* If it's closer to the round up value, then use it */ res[i] = roundUp; } else { /* If it's equidistant between round up and round down value, * pick the one which is an even number */ double half = roundDown / 2; if (half != floor(half)) { /* If the round down value is odd, return the round up value */ res[i] = roundUp; } else { /* If the round up value is odd, return the round down value */ res[i] = roundDown; } } res[i] = (v_double[i] < 0) ? -res[i] : res[i]; } return _mm_set_pd(res[1], res[0]); } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_pd(a); } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_pd(a); } return _mm_set_pd(v_double[1] > 0 ? floor(v_double[1]) : ceil(v_double[1]), v_double[0] > 0 ? floor(v_double[0]) : ceil(v_double[0])); #endif } // Round the packed single-precision (32-bit) floating-point elements in a using // the rounding parameter, and store the results as packed single-precision // floating-point elements in dst. 
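// Without the ARMv8 directed-rounding instructions (vrndnq_f32 and friends),
// the fallback below reconstructs round-half-to-even by comparing each value
// against its truncation and picking the even candidate on exact ties.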
// software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) { #if (defined(__aarch64__) || defined(_M_ARM64)) || \ defined(__ARM_FEATURE_DIRECTED_ROUNDING) switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): return _mm_floor_ps(a); case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): return _mm_ceil_ps(a); case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); } #else float *v_float = (float *) &a; if (rounding == (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_NEAREST)) { uint32x4_t signmask = vdupq_n_u32(0x80000000); float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ int32x4_t r_trunc = vcvtq_s32_f32( vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ return vreinterpretq_m128_f32( vcvtq_f32_s32(vbslq_s32(is_delta_half, r_even, r_normal))); } else if (rounding == (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_DOWN)) { return _mm_floor_ps(a); } else if (rounding == (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC) || (rounding == _MM_FROUND_CUR_DIRECTION && _MM_GET_ROUNDING_MODE() == _MM_ROUND_UP)) { return _mm_ceil_ps(a); } return _mm_set_ps(v_float[3] > 0 ? floorf(v_float[3]) : ceilf(v_float[3]), v_float[2] > 0 ? floorf(v_float[2]) : ceilf(v_float[2]), v_float[1] > 0 ? floorf(v_float[1]) : ceilf(v_float[1]), v_float[0] > 0 ? floorf(v_float[0]) : ceilf(v_float[0])); #endif } // Round the lower double-precision (64-bit) floating-point element in b using // the rounding parameter, store the result as a double-precision floating-point // element in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_sd FORCE_INLINE __m128d _mm_round_sd(__m128d a, __m128d b, int rounding) { return _mm_move_sd(a, _mm_round_pd(b, rounding)); } // Round the lower single-precision (32-bit) floating-point element in b using // the rounding parameter, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. 
Rounding is done according to the // rounding[3:0] parameter, which can be one of: // (_MM_FROUND_TO_NEAREST_INT |_MM_FROUND_NO_EXC) // round to nearest, and // suppress exceptions // (_MM_FROUND_TO_NEG_INF |_MM_FROUND_NO_EXC) // round down, and // suppress exceptions // (_MM_FROUND_TO_POS_INF |_MM_FROUND_NO_EXC) // round up, and suppress // exceptions // (_MM_FROUND_TO_ZERO |_MM_FROUND_NO_EXC) // truncate, and suppress // exceptions _MM_FROUND_CUR_DIRECTION // use MXCSR.RC; see // _MM_SET_ROUNDING_MODE // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_round_ss FORCE_INLINE __m128 _mm_round_ss(__m128 a, __m128 b, int rounding) { return _mm_move_ss(a, _mm_round_ps(b, rounding)); } // Load 128-bits of integer data from memory into dst using a non-temporal // memory hint. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_stream_load_si128 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) { #if __has_builtin(__builtin_nontemporal_store) return __builtin_nontemporal_load(p); #else return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); #endif } // Compute the bitwise NOT of a and then AND with a 128-bit vector containing // all 1's, and return 1 if the result is zero, otherwise return 0. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_ones FORCE_INLINE int _mm_test_all_ones(__m128i a) { return (uint64_t) (vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == ~(uint64_t) 0; } // Compute the bitwise AND of 128 bits (representing integer data) in a and // mask, and return 1 if the result is zero, otherwise return 0. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_test_all_zeros FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) { int64x2_t a_and_mask = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); return !(vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and // mask, and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute // the bitwise NOT of a and then AND with mask, and set CF to 1 if the result is // zero, otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, // otherwise return 0. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=mm_test_mix_ones_zero // Note: Argument names may be wrong in the Intel intrinsics guide. FORCE_INLINE int _mm_test_mix_ones_zeros(__m128i a, __m128i mask) { uint64x2_t v = vreinterpretq_u64_m128i(a); uint64x2_t m = vreinterpretq_u64_m128i(mask); // find ones (set-bits) and zeros (clear-bits) under clip mask uint64x2_t ones = vandq_u64(m, v); uint64x2_t zeros = vbicq_u64(m, v); // If both 128-bit variables are populated (non-zero) then return 1. // For comparision purposes, first compact each var down to 32-bits. uint32x2_t reduced = vpmax_u32(vqmovn_u64(ones), vqmovn_u64(zeros)); // if folding minimum is non-zero then both vars must be non-zero return (vget_lane_u32(vpmin_u32(reduced, reduced), 0) != 0); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return the CF value. 
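// Equivalently, _mm_testc_si128 returns 1 exactly when (~a & b) is all zeros,
// i.e. when every bit set in b is also set in a.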
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testc_si128 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) { int64x2_t s64 = vbicq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return 1 if both the ZF and CF values are zero, // otherwise return 0. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testnzc_si128 #define _mm_testnzc_si128(a, b) _mm_test_mix_ones_zeros(a, b) // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return the ZF value. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_testz_si128 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) { int64x2_t s64 = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } /* SSE4.2 */ static const uint16_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask16b[8] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; static const uint8_t ALIGN_STRUCT(16) _sse2neon_cmpestr_mask8b[16] = { 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, 0x01, 0x02, 0x04, 0x08, 0x10, 0x20, 0x40, 0x80, }; /* specify the source data format */ #define _SIDD_UBYTE_OPS 0x00 /* unsigned 8-bit characters */ #define _SIDD_UWORD_OPS 0x01 /* unsigned 16-bit characters */ #define _SIDD_SBYTE_OPS 0x02 /* signed 8-bit characters */ #define _SIDD_SWORD_OPS 0x03 /* signed 16-bit characters */ /* specify the comparison operation */ #define _SIDD_CMP_EQUAL_ANY 0x00 /* compare equal any: strchr */ #define _SIDD_CMP_RANGES 0x04 /* compare ranges */ #define _SIDD_CMP_EQUAL_EACH 0x08 /* compare equal each: strcmp */ #define _SIDD_CMP_EQUAL_ORDERED 0x0C /* compare equal ordered */ /* specify the polarity */ #define _SIDD_POSITIVE_POLARITY 0x00 #define _SIDD_MASKED_POSITIVE_POLARITY 0x20 #define _SIDD_NEGATIVE_POLARITY 0x10 /* negate results */ #define _SIDD_MASKED_NEGATIVE_POLARITY \ 0x30 /* negate results only before end of string */ /* specify the output selection in _mm_cmpXstri */ #define _SIDD_LEAST_SIGNIFICANT 0x00 #define _SIDD_MOST_SIGNIFICANT 0x40 /* specify the output selection in _mm_cmpXstrm */ #define _SIDD_BIT_MASK 0x00 #define _SIDD_UNIT_MASK 0x40 /* Pattern Matching for C macros. * https://github.com/pfultz2/Cloak/wiki/C-Preprocessor-tricks,-tips,-and-idioms */ /* catenate */ #define SSE2NEON_PRIMITIVE_CAT(a, ...) a##__VA_ARGS__ #define SSE2NEON_CAT(a, b) SSE2NEON_PRIMITIVE_CAT(a, b) #define SSE2NEON_IIF(c) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_IIF_, c) /* run the 2nd parameter */ #define SSE2NEON_IIF_0(t, ...) __VA_ARGS__ /* run the 1st parameter */ #define SSE2NEON_IIF_1(t, ...) 
t #define SSE2NEON_COMPL(b) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_COMPL_, b) #define SSE2NEON_COMPL_0 1 #define SSE2NEON_COMPL_1 0 #define SSE2NEON_DEC(x) SSE2NEON_PRIMITIVE_CAT(SSE2NEON_DEC_, x) #define SSE2NEON_DEC_1 0 #define SSE2NEON_DEC_2 1 #define SSE2NEON_DEC_3 2 #define SSE2NEON_DEC_4 3 #define SSE2NEON_DEC_5 4 #define SSE2NEON_DEC_6 5 #define SSE2NEON_DEC_7 6 #define SSE2NEON_DEC_8 7 #define SSE2NEON_DEC_9 8 #define SSE2NEON_DEC_10 9 #define SSE2NEON_DEC_11 10 #define SSE2NEON_DEC_12 11 #define SSE2NEON_DEC_13 12 #define SSE2NEON_DEC_14 13 #define SSE2NEON_DEC_15 14 #define SSE2NEON_DEC_16 15 /* detection */ #define SSE2NEON_CHECK_N(x, n, ...) n #define SSE2NEON_CHECK(...) SSE2NEON_CHECK_N(__VA_ARGS__, 0, ) #define SSE2NEON_PROBE(x) x, 1, #define SSE2NEON_NOT(x) SSE2NEON_CHECK(SSE2NEON_PRIMITIVE_CAT(SSE2NEON_NOT_, x)) #define SSE2NEON_NOT_0 SSE2NEON_PROBE(~) #define SSE2NEON_BOOL(x) SSE2NEON_COMPL(SSE2NEON_NOT(x)) #define SSE2NEON_IF(c) SSE2NEON_IIF(SSE2NEON_BOOL(c)) #define SSE2NEON_EAT(...) #define SSE2NEON_EXPAND(...) __VA_ARGS__ #define SSE2NEON_WHEN(c) SSE2NEON_IF(c)(SSE2NEON_EXPAND, SSE2NEON_EAT) /* recursion */ /* deferred expression */ #define SSE2NEON_EMPTY() #define SSE2NEON_DEFER(id) id SSE2NEON_EMPTY() #define SSE2NEON_OBSTRUCT(...) __VA_ARGS__ SSE2NEON_DEFER(SSE2NEON_EMPTY)() #define SSE2NEON_EXPAND(...) __VA_ARGS__ #define SSE2NEON_EVAL(...) \ SSE2NEON_EVAL1(SSE2NEON_EVAL1(SSE2NEON_EVAL1(__VA_ARGS__))) #define SSE2NEON_EVAL1(...) \ SSE2NEON_EVAL2(SSE2NEON_EVAL2(SSE2NEON_EVAL2(__VA_ARGS__))) #define SSE2NEON_EVAL2(...) \ SSE2NEON_EVAL3(SSE2NEON_EVAL3(SSE2NEON_EVAL3(__VA_ARGS__))) #define SSE2NEON_EVAL3(...) __VA_ARGS__ #define SSE2NEON_REPEAT(count, macro, ...) \ SSE2NEON_WHEN(count) \ (SSE2NEON_OBSTRUCT(SSE2NEON_REPEAT_INDIRECT)()( \ SSE2NEON_DEC(count), macro, \ __VA_ARGS__) SSE2NEON_OBSTRUCT(macro)(SSE2NEON_DEC(count), \ __VA_ARGS__)) #define SSE2NEON_REPEAT_INDIRECT() SSE2NEON_REPEAT #define SSE2NEON_SIZE_OF_byte 8 #define SSE2NEON_NUMBER_OF_LANES_byte 16 #define SSE2NEON_SIZE_OF_word 16 #define SSE2NEON_NUMBER_OF_LANES_word 8 #define SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE(i, type) \ mtx[i] = vreinterpretq_m128i_##type(vceqq_##type( \ vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)), \ vreinterpretq_##type##_m128i(a))); #define SSE2NEON_FILL_LANE(i, type) \ vec_b[i] = \ vdupq_n_##type(vgetq_lane_##type(vreinterpretq_##type##_m128i(b), i)); #define PCMPSTR_RANGES(a, b, mtx, data_type_prefix, type_prefix, size, \ number_of_lanes, byte_or_word) \ do { \ SSE2NEON_CAT( \ data_type_prefix, \ SSE2NEON_CAT(size, \ SSE2NEON_CAT(x, SSE2NEON_CAT(number_of_lanes, _t)))) \ vec_b[number_of_lanes]; \ __m128i mask = SSE2NEON_IIF(byte_or_word)( \ vreinterpretq_m128i_u16(vdupq_n_u16(0xff)), \ vreinterpretq_m128i_u32(vdupq_n_u32(0xffff))); \ SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, SSE2NEON_FILL_LANE, \ SSE2NEON_CAT(type_prefix, size))) \ for (int i = 0; i < number_of_lanes; i++) { \ mtx[i] = SSE2NEON_CAT(vreinterpretq_m128i_u, \ size)(SSE2NEON_CAT(vbslq_u, size)( \ SSE2NEON_CAT(vreinterpretq_u, \ SSE2NEON_CAT(size, _m128i))(mask), \ SSE2NEON_CAT(vcgeq_, SSE2NEON_CAT(type_prefix, size))( \ vec_b[i], \ SSE2NEON_CAT( \ vreinterpretq_, \ SSE2NEON_CAT(type_prefix, \ SSE2NEON_CAT(size, _m128i(a))))), \ SSE2NEON_CAT(vcleq_, SSE2NEON_CAT(type_prefix, size))( \ vec_b[i], \ SSE2NEON_CAT( \ vreinterpretq_, \ SSE2NEON_CAT(type_prefix, \ SSE2NEON_CAT(size, _m128i(a))))))); \ } \ } while (0) #define PCMPSTR_EQ(a, b, mtx, size, number_of_lanes) \ do { \ 
SSE2NEON_EVAL(SSE2NEON_REPEAT(number_of_lanes, \ SSE2NEON_COMPARE_EQUAL_THEN_FILL_LANE, \ SSE2NEON_CAT(u, size))) \ } while (0) #define SSE2NEON_CMP_EQUAL_ANY_IMPL(type) \ static int _sse2neon_cmp_##type##_equal_any(__m128i a, int la, __m128i b, \ int lb) \ { \ __m128i mtx[16]; \ PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ return SSE2NEON_CAT( \ _sse2neon_aggregate_equal_any_, \ SSE2NEON_CAT( \ SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ type))))(la, lb, mtx); \ } #define SSE2NEON_CMP_RANGES_IMPL(type, data_type, us, byte_or_word) \ static int _sse2neon_cmp_##us##type##_ranges(__m128i a, int la, __m128i b, \ int lb) \ { \ __m128i mtx[16]; \ PCMPSTR_RANGES( \ a, b, mtx, data_type, us, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), byte_or_word); \ return SSE2NEON_CAT( \ _sse2neon_aggregate_ranges_, \ SSE2NEON_CAT( \ SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ SSE2NEON_CAT(x, SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, \ type))))(la, lb, mtx); \ } #define SSE2NEON_CMP_EQUAL_ORDERED_IMPL(type) \ static int _sse2neon_cmp_##type##_equal_ordered(__m128i a, int la, \ __m128i b, int lb) \ { \ __m128i mtx[16]; \ PCMPSTR_EQ(a, b, mtx, SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type)); \ return SSE2NEON_CAT( \ _sse2neon_aggregate_equal_ordered_, \ SSE2NEON_CAT( \ SSE2NEON_CAT(SSE2NEON_SIZE_OF_, type), \ SSE2NEON_CAT(x, \ SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type))))( \ SSE2NEON_CAT(SSE2NEON_NUMBER_OF_LANES_, type), la, lb, mtx); \ } static int _sse2neon_aggregate_equal_any_8x16(int la, int lb, __m128i mtx[16]) { int res = 0; int m = (1 << la) - 1; uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); uint8x16_t vec = vcombine_u8(t_lo, t_hi); for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u8( vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); mtx[j] = vreinterpretq_m128i_u8( vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); int tmp = _sse2neon_vaddvq_u8(vreinterpretq_u8_m128i(mtx[j])) ? 1 : 0; res |= (tmp << j); } return res; } static int _sse2neon_aggregate_equal_any_16x8(int la, int lb, __m128i mtx[16]) { int res = 0; int m = (1 << la) - 1; uint16x8_t vec = vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u16( vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); mtx[j] = vreinterpretq_m128i_u16( vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); int tmp = _sse2neon_vaddvq_u16(vreinterpretq_u16_m128i(mtx[j])) ? 
1 : 0; res |= (tmp << j); } return res; } /* clang-format off */ #define SSE2NEON_GENERATE_CMP_EQUAL_ANY(prefix) \ prefix##IMPL(byte) \ prefix##IMPL(word) /* clang-format on */ SSE2NEON_GENERATE_CMP_EQUAL_ANY(SSE2NEON_CMP_EQUAL_ANY_) static int _sse2neon_aggregate_ranges_16x8(int la, int lb, __m128i mtx[16]) { int res = 0; int m = (1 << la) - 1; uint16x8_t vec = vtstq_u16(vdupq_n_u16(m), vld1q_u16(_sse2neon_cmpestr_mask16b)); for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u16( vandq_u16(vec, vreinterpretq_u16_m128i(mtx[j]))); mtx[j] = vreinterpretq_m128i_u16( vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 15)); __m128i tmp = vreinterpretq_m128i_u32( vshrq_n_u32(vreinterpretq_u32_m128i(mtx[j]), 16)); uint32x4_t vec_res = vandq_u32(vreinterpretq_u32_m128i(mtx[j]), vreinterpretq_u32_m128i(tmp)); #if defined(__aarch64__) || defined(_M_ARM64) int t = vaddvq_u32(vec_res) ? 1 : 0; #else uint64x2_t sumh = vpaddlq_u32(vec_res); int t = vgetq_lane_u64(sumh, 0) + vgetq_lane_u64(sumh, 1); #endif res |= (t << j); } return res; } static int _sse2neon_aggregate_ranges_8x16(int la, int lb, __m128i mtx[16]) { int res = 0; int m = (1 << la) - 1; uint8x8_t vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); uint8x8_t t_lo = vtst_u8(vdup_n_u8(m & 0xff), vec_mask); uint8x8_t t_hi = vtst_u8(vdup_n_u8(m >> 8), vec_mask); uint8x16_t vec = vcombine_u8(t_lo, t_hi); for (int j = 0; j < lb; j++) { mtx[j] = vreinterpretq_m128i_u8( vandq_u8(vec, vreinterpretq_u8_m128i(mtx[j]))); mtx[j] = vreinterpretq_m128i_u8( vshrq_n_u8(vreinterpretq_u8_m128i(mtx[j]), 7)); __m128i tmp = vreinterpretq_m128i_u16( vshrq_n_u16(vreinterpretq_u16_m128i(mtx[j]), 8)); uint16x8_t vec_res = vandq_u16(vreinterpretq_u16_m128i(mtx[j]), vreinterpretq_u16_m128i(tmp)); int t = _sse2neon_vaddvq_u16(vec_res) ? 1 : 0; res |= (t << j); } return res; } #define SSE2NEON_CMP_RANGES_IS_BYTE 1 #define SSE2NEON_CMP_RANGES_IS_WORD 0 /* clang-format off */ #define SSE2NEON_GENERATE_CMP_RANGES(prefix) \ prefix##IMPL(byte, uint, u, prefix##IS_BYTE) \ prefix##IMPL(byte, int, s, prefix##IS_BYTE) \ prefix##IMPL(word, uint, u, prefix##IS_WORD) \ prefix##IMPL(word, int, s, prefix##IS_WORD) /* clang-format on */ SSE2NEON_GENERATE_CMP_RANGES(SSE2NEON_CMP_RANGES_) #undef SSE2NEON_CMP_RANGES_IS_BYTE #undef SSE2NEON_CMP_RANGES_IS_WORD static int _sse2neon_cmp_byte_equal_each(__m128i a, int la, __m128i b, int lb) { uint8x16_t mtx = vceqq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b)); int m0 = (la < lb) ? 0 : ((1 << la) - (1 << lb)); int m1 = 0x10000 - (1 << la); int tb = 0x10000 - (1 << lb); uint8x8_t vec_mask, vec0_lo, vec0_hi, vec1_lo, vec1_hi; uint8x8_t tmp_lo, tmp_hi, res_lo, res_hi; vec_mask = vld1_u8(_sse2neon_cmpestr_mask8b); vec0_lo = vtst_u8(vdup_n_u8(m0), vec_mask); vec0_hi = vtst_u8(vdup_n_u8(m0 >> 8), vec_mask); vec1_lo = vtst_u8(vdup_n_u8(m1), vec_mask); vec1_hi = vtst_u8(vdup_n_u8(m1 >> 8), vec_mask); tmp_lo = vtst_u8(vdup_n_u8(tb), vec_mask); tmp_hi = vtst_u8(vdup_n_u8(tb >> 8), vec_mask); res_lo = vbsl_u8(vec0_lo, vdup_n_u8(0), vget_low_u8(mtx)); res_hi = vbsl_u8(vec0_hi, vdup_n_u8(0), vget_high_u8(mtx)); res_lo = vbsl_u8(vec1_lo, tmp_lo, res_lo); res_hi = vbsl_u8(vec1_hi, tmp_hi, res_hi); res_lo = vand_u8(res_lo, vec_mask); res_hi = vand_u8(res_hi, vec_mask); int res = _sse2neon_vaddv_u8(res_lo) + (_sse2neon_vaddv_u8(res_hi) << 8); return res; } static int _sse2neon_cmp_word_equal_each(__m128i a, int la, __m128i b, int lb) { uint16x8_t mtx = vceqq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); int m0 = (la < lb) ? 
0 : ((1 << la) - (1 << lb)); int m1 = 0x100 - (1 << la); int tb = 0x100 - (1 << lb); uint16x8_t vec_mask = vld1q_u16(_sse2neon_cmpestr_mask16b); uint16x8_t vec0 = vtstq_u16(vdupq_n_u16(m0), vec_mask); uint16x8_t vec1 = vtstq_u16(vdupq_n_u16(m1), vec_mask); uint16x8_t tmp = vtstq_u16(vdupq_n_u16(tb), vec_mask); mtx = vbslq_u16(vec0, vdupq_n_u16(0), mtx); mtx = vbslq_u16(vec1, tmp, mtx); mtx = vandq_u16(mtx, vec_mask); return _sse2neon_vaddvq_u16(mtx); } #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE 1 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD 0 #define SSE2NEON_AGGREGATE_EQUAL_ORDER_IMPL(size, number_of_lanes, data_type) \ static int _sse2neon_aggregate_equal_ordered_##size##x##number_of_lanes( \ int bound, int la, int lb, __m128i mtx[16]) \ { \ int res = 0; \ int m1 = SSE2NEON_IIF(data_type)(0x10000, 0x100) - (1 << la); \ uint##size##x8_t vec_mask = SSE2NEON_IIF(data_type)( \ vld1_u##size(_sse2neon_cmpestr_mask##size##b), \ vld1q_u##size(_sse2neon_cmpestr_mask##size##b)); \ uint##size##x##number_of_lanes##_t vec1 = SSE2NEON_IIF(data_type)( \ vcombine_u##size(vtst_u##size(vdup_n_u##size(m1), vec_mask), \ vtst_u##size(vdup_n_u##size(m1 >> 8), vec_mask)), \ vtstq_u##size(vdupq_n_u##size(m1), vec_mask)); \ uint##size##x##number_of_lanes##_t vec_minusone = vdupq_n_u##size(-1); \ uint##size##x##number_of_lanes##_t vec_zero = vdupq_n_u##size(0); \ for (int j = 0; j < lb; j++) { \ mtx[j] = vreinterpretq_m128i_u##size(vbslq_u##size( \ vec1, vec_minusone, vreinterpretq_u##size##_m128i(mtx[j]))); \ } \ for (int j = lb; j < bound; j++) { \ mtx[j] = vreinterpretq_m128i_u##size( \ vbslq_u##size(vec1, vec_minusone, vec_zero)); \ } \ unsigned SSE2NEON_IIF(data_type)(char, short) *ptr = \ (unsigned SSE2NEON_IIF(data_type)(char, short) *) mtx; \ for (int i = 0; i < bound; i++) { \ int val = 1; \ for (int j = 0, k = i; j < bound - i && k < bound; j++, k++) \ val &= ptr[k * bound + j]; \ res += val << i; \ } \ return res; \ } /* clang-format off */ #define SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(prefix) \ prefix##IMPL(8, 16, prefix##IS_UBYTE) \ prefix##IMPL(16, 8, prefix##IS_UWORD) /* clang-format on */ SSE2NEON_GENERATE_AGGREGATE_EQUAL_ORDER(SSE2NEON_AGGREGATE_EQUAL_ORDER_) #undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UBYTE #undef SSE2NEON_AGGREGATE_EQUAL_ORDER_IS_UWORD /* clang-format off */ #define SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(prefix) \ prefix##IMPL(byte) \ prefix##IMPL(word) /* clang-format on */ SSE2NEON_GENERATE_CMP_EQUAL_ORDERED(SSE2NEON_CMP_EQUAL_ORDERED_) #define SSE2NEON_CMPESTR_LIST \ _(CMP_UBYTE_EQUAL_ANY, cmp_byte_equal_any) \ _(CMP_UWORD_EQUAL_ANY, cmp_word_equal_any) \ _(CMP_SBYTE_EQUAL_ANY, cmp_byte_equal_any) \ _(CMP_SWORD_EQUAL_ANY, cmp_word_equal_any) \ _(CMP_UBYTE_RANGES, cmp_ubyte_ranges) \ _(CMP_UWORD_RANGES, cmp_uword_ranges) \ _(CMP_SBYTE_RANGES, cmp_sbyte_ranges) \ _(CMP_SWORD_RANGES, cmp_sword_ranges) \ _(CMP_UBYTE_EQUAL_EACH, cmp_byte_equal_each) \ _(CMP_UWORD_EQUAL_EACH, cmp_word_equal_each) \ _(CMP_SBYTE_EQUAL_EACH, cmp_byte_equal_each) \ _(CMP_SWORD_EQUAL_EACH, cmp_word_equal_each) \ _(CMP_UBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ _(CMP_UWORD_EQUAL_ORDERED, cmp_word_equal_ordered) \ _(CMP_SBYTE_EQUAL_ORDERED, cmp_byte_equal_ordered) \ _(CMP_SWORD_EQUAL_ORDERED, cmp_word_equal_ordered) enum { #define _(name, func_suffix) name, SSE2NEON_CMPESTR_LIST #undef _ }; typedef int (*cmpestr_func_t)(__m128i a, int la, __m128i b, int lb); static cmpestr_func_t _sse2neon_cmpfunc_table[] = { #define _(name, func_suffix) _sse2neon_##func_suffix, SSE2NEON_CMPESTR_LIST #undef _ }; 
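// The table is indexed with (imm8 & 0x0f): bits [1:0] select the data format
// (_SIDD_UBYTE_OPS .. _SIDD_SWORD_OPS) and bits [3:2] select the comparison
// operation (_SIDD_CMP_EQUAL_ANY .. _SIDD_CMP_EQUAL_ORDERED), matching the
// enum order above. For example, (_SIDD_UBYTE_OPS | _SIDD_CMP_EQUAL_EACH),
// i.e. 0x08, selects _sse2neon_cmp_byte_equal_each, the strcmp-style compare.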
FORCE_INLINE int _sse2neon_sido_negative(int res, int lb, int imm8, int bound) { switch (imm8 & 0x30) { case _SIDD_NEGATIVE_POLARITY: res ^= 0xffffffff; break; case _SIDD_MASKED_NEGATIVE_POLARITY: res ^= (1 << lb) - 1; break; default: break; } return res & ((bound == 8) ? 0xFF : 0xFFFF); } FORCE_INLINE int _sse2neon_clz(unsigned int x) { #ifdef _MSC_VER unsigned long cnt = 0; if (_BitScanReverse(&cnt, x)) return 31 - cnt; return 32; #else return x != 0 ? __builtin_clz(x) : 32; #endif } FORCE_INLINE int _sse2neon_ctz(unsigned int x) { #ifdef _MSC_VER unsigned long cnt = 0; if (_BitScanForward(&cnt, x)) return cnt; return 32; #else return x != 0 ? __builtin_ctz(x) : 32; #endif } FORCE_INLINE int _sse2neon_ctzll(unsigned long long x) { #ifdef _MSC_VER unsigned long cnt; #if defined(SSE2NEON_HAS_BITSCAN64) if (_BitScanForward64(&cnt, x)) return (int) (cnt); #else if (_BitScanForward(&cnt, (unsigned long) (x))) return (int) cnt; if (_BitScanForward(&cnt, (unsigned long) (x >> 32))) return (int) (cnt + 32); #endif /* SSE2NEON_HAS_BITSCAN64 */ return 64; #else /* assume GNU compatible compilers */ return x != 0 ? __builtin_ctzll(x) : 64; #endif } #define SSE2NEON_MIN(x, y) (x) < (y) ? (x) : (y) #define SSE2NEON_CMPSTR_SET_UPPER(var, imm) \ const int var = (imm & 0x01) ? 8 : 16 #define SSE2NEON_CMPESTRX_LEN_PAIR(a, b, la, lb) \ int tmp1 = la ^ (la >> 31); \ la = tmp1 - (la >> 31); \ int tmp2 = lb ^ (lb >> 31); \ lb = tmp2 - (lb >> 31); \ la = SSE2NEON_MIN(la, bound); \ lb = SSE2NEON_MIN(lb, bound) // Compare all pairs of character in string a and b, // then aggregate the result. // As the only difference of PCMPESTR* and PCMPISTR* is the way to calculate the // length of string, we use SSE2NEON_CMP{I,E}STRX_GET_LEN to get the length of // string a and b. #define SSE2NEON_COMP_AGG(a, b, la, lb, imm8, IE) \ SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); \ SSE2NEON_##IE##_LEN_PAIR(a, b, la, lb); \ int r2 = (_sse2neon_cmpfunc_table[imm8 & 0x0f])(a, la, b, lb); \ r2 = _sse2neon_sido_negative(r2, lb, imm8, bound) #define SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8) \ return (r2 == 0) ? bound \ : ((imm8 & 0x40) ? (31 - _sse2neon_clz(r2)) \ : _sse2neon_ctz(r2)) #define SSE2NEON_CMPSTR_GENERATE_MASK(dst) \ __m128i dst = vreinterpretq_m128i_u8(vdupq_n_u8(0)); \ if (imm8 & 0x40) { \ if (bound == 8) { \ uint16x8_t tmp = vtstq_u16(vdupq_n_u16(r2), \ vld1q_u16(_sse2neon_cmpestr_mask16b)); \ dst = vreinterpretq_m128i_u16(vbslq_u16( \ tmp, vdupq_n_u16(-1), vreinterpretq_u16_m128i(dst))); \ } else { \ uint8x16_t vec_r2 = \ vcombine_u8(vdup_n_u8(r2), vdup_n_u8(r2 >> 8)); \ uint8x16_t tmp = \ vtstq_u8(vec_r2, vld1q_u8(_sse2neon_cmpestr_mask8b)); \ dst = vreinterpretq_m128i_u8( \ vbslq_u8(tmp, vdupq_n_u8(-1), vreinterpretq_u8_m128i(dst))); \ } \ } else { \ if (bound == 16) { \ dst = vreinterpretq_m128i_u16( \ vsetq_lane_u16(r2 & 0xffff, vreinterpretq_u16_m128i(dst), 0)); \ } else { \ dst = vreinterpretq_m128i_u8( \ vsetq_lane_u8(r2 & 0xff, vreinterpretq_u8_m128i(dst), 0)); \ } \ } \ return dst // Compare packed strings in a and b with lengths la and lb using the control // in imm8, and returns 1 if b did not contain a null character and the // resulting mask was zero, and 0 otherwise. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestra FORCE_INLINE int _mm_cmpestra(__m128i a, int la, __m128i b, int lb, const int imm8) { int lb_cpy = lb; SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); return !r2 & (lb_cpy > bound); } // Compare packed strings in a and b with lengths la and lb using the control in // imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrc FORCE_INLINE int _mm_cmpestrc(__m128i a, int la, __m128i b, int lb, const int imm8) { SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); return r2 != 0; } // Compare packed strings in a and b with lengths la and lb using the control // in imm8, and store the generated index in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestri FORCE_INLINE int _mm_cmpestri(__m128i a, int la, __m128i b, int lb, const int imm8) { SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); } // Compare packed strings in a and b with lengths la and lb using the control // in imm8, and store the generated mask in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrm FORCE_INLINE __m128i _mm_cmpestrm(__m128i a, int la, __m128i b, int lb, const int imm8) { SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); SSE2NEON_CMPSTR_GENERATE_MASK(dst); } // Compare packed strings in a and b with lengths la and lb using the control in // imm8, and returns bit 0 of the resulting bit mask. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestro FORCE_INLINE int _mm_cmpestro(__m128i a, int la, __m128i b, int lb, const int imm8) { SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPESTRX); return r2 & 1; } // Compare packed strings in a and b with lengths la and lb using the control in // imm8, and returns 1 if any character in a was null, and 0 otherwise. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrs FORCE_INLINE int _mm_cmpestrs(__m128i a, int la, __m128i b, int lb, const int imm8) { (void) a; (void) b; (void) lb; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); return la <= (bound - 1); } // Compare packed strings in a and b with lengths la and lb using the control in // imm8, and returns 1 if any character in b was null, and 0 otherwise. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpestrz FORCE_INLINE int _mm_cmpestrz(__m128i a, int la, __m128i b, int lb, const int imm8) { (void) a; (void) b; (void) la; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); return lb <= (bound - 1); } #define SSE2NEON_CMPISTRX_LENGTH(str, len, imm8) \ do { \ if (imm8 & 0x01) { \ uint16x8_t equal_mask_##str = \ vceqq_u16(vreinterpretq_u16_m128i(str), vdupq_n_u16(0)); \ uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ uint64_t matches_##str = \ vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ len = _sse2neon_ctzll(matches_##str) >> 3; \ } else { \ uint16x8_t equal_mask_##str = vreinterpretq_u16_u8( \ vceqq_u8(vreinterpretq_u8_m128i(str), vdupq_n_u8(0))); \ uint8x8_t res_##str = vshrn_n_u16(equal_mask_##str, 4); \ uint64_t matches_##str = \ vget_lane_u64(vreinterpret_u64_u8(res_##str), 0); \ len = _sse2neon_ctzll(matches_##str) >> 2; \ } \ } while (0) #define SSE2NEON_CMPISTRX_LEN_PAIR(a, b, la, lb) \ int la, lb; \ do { \ SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); \ SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); \ } while (0) // Compare packed strings with implicit lengths in a and b using the control in // imm8, and returns 1 if b did not contain a null character and the resulting // mask was zero, and 0 otherwise. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistra FORCE_INLINE int _mm_cmpistra(__m128i a, __m128i b, const int imm8) { SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); return !r2 & (lb >= bound); } // Compare packed strings with implicit lengths in a and b using the control in // imm8, and returns 1 if the resulting mask was non-zero, and 0 otherwise. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrc FORCE_INLINE int _mm_cmpistrc(__m128i a, __m128i b, const int imm8) { SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); return r2 != 0; } // Compare packed strings with implicit lengths in a and b using the control in // imm8, and store the generated index in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistri FORCE_INLINE int _mm_cmpistri(__m128i a, __m128i b, const int imm8) { SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); SSE2NEON_CMPSTR_GENERATE_INDEX(r2, bound, imm8); } // Compare packed strings with implicit lengths in a and b using the control in // imm8, and store the generated mask in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrm FORCE_INLINE __m128i _mm_cmpistrm(__m128i a, __m128i b, const int imm8) { SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); SSE2NEON_CMPSTR_GENERATE_MASK(dst); } // Compare packed strings with implicit lengths in a and b using the control in // imm8, and returns bit 0 of the resulting bit mask. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistro FORCE_INLINE int _mm_cmpistro(__m128i a, __m128i b, const int imm8) { SSE2NEON_COMP_AGG(a, b, la, lb, imm8, CMPISTRX); return r2 & 1; } // Compare packed strings with implicit lengths in a and b using the control in // imm8, and returns 1 if any character in a was null, and 0 otherwise. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrs FORCE_INLINE int _mm_cmpistrs(__m128i a, __m128i b, const int imm8) { (void) b; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); int la; SSE2NEON_CMPISTRX_LENGTH(a, la, imm8); return la <= (bound - 1); } // Compare packed strings with implicit lengths in a and b using the control in // imm8, and returns 1 if any character in b was null, and 0 otherwise. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_cmpistrz FORCE_INLINE int _mm_cmpistrz(__m128i a, __m128i b, const int imm8) { (void) a; SSE2NEON_CMPSTR_SET_UPPER(bound, imm8); int lb; SSE2NEON_CMPISTRX_LENGTH(b, lb, imm8); return lb <= (bound - 1); } // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers // in b for greater than. FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) || defined(_M_ARM64) return vreinterpretq_m128i_u64( vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else return vreinterpretq_m128i_s64(vshrq_n_s64( vqsubq_s64(vreinterpretq_s64_m128i(b), vreinterpretq_s64_m128i(a)), 63)); #endif } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 16-bit integer v, and stores the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u16 FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ (defined(_M_ARM64) && !defined(__clang__)) crc = __crc32ch(crc, v); #else crc = _mm_crc32_u8(crc, v & 0xff); crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 32-bit integer v, and stores the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u32 FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ (defined(_M_ARM64) && !defined(__clang__)) crc = __crc32cw(crc, v); #else crc = _mm_crc32_u16(crc, v & 0xffff); crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 64-bit integer v, and stores the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u64 FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #elif (defined(_M_ARM64) && !defined(__clang__)) crc = __crc32cd((uint32_t) crc, v); #else crc = _mm_crc32_u32((uint32_t) (crc), v & 0xffffffff); crc = _mm_crc32_u32((uint32_t) (crc), (v >> 32) & 0xffffffff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 8-bit integer v, and stores the result in dst. 
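// The bit-by-bit fallback below uses the reflected CRC-32C (Castagnoli)
// polynomial 0x82F63B78, the same CRC variant computed by the hardware
// crc32c instructions.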
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_crc32_u8 FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #elif ((__ARM_ARCH == 8) && defined(__ARM_FEATURE_CRC32)) || \ (defined(_M_ARM64) && !defined(__clang__)) crc = __crc32cb(crc, v); #else crc ^= v; for (int bit = 0; bit < 8; bit++) { if (crc & 1) crc = (crc >> 1) ^ UINT32_C(0x82f63b78); else crc = (crc >> 1); } #endif return crc; } /* AES */ #if !defined(__ARM_FEATURE_CRYPTO) && (!defined(_M_ARM64) || defined(__clang__)) /* clang-format off */ #define SSE2NEON_AES_SBOX(w) \ { \ w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ w(0x94), w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ w(0xb0), w(0x54), w(0xbb), w(0x16) \ } #define SSE2NEON_AES_RSBOX(w) \ { \ w(0x52), w(0x09), w(0x6a), w(0xd5), w(0x30), w(0x36), w(0xa5), \ w(0x38), w(0xbf), w(0x40), w(0xa3), w(0x9e), w(0x81), w(0xf3), \ w(0xd7), w(0xfb), w(0x7c), w(0xe3), w(0x39), w(0x82), w(0x9b), \ w(0x2f), w(0xff), w(0x87), w(0x34), w(0x8e), w(0x43), w(0x44), \ w(0xc4), w(0xde), w(0xe9), w(0xcb), w(0x54), w(0x7b), w(0x94), \ w(0x32), w(0xa6), w(0xc2), w(0x23), w(0x3d), w(0xee), w(0x4c), \ w(0x95), w(0x0b), 
w(0x42), w(0xfa), w(0xc3), w(0x4e), w(0x08), \ w(0x2e), w(0xa1), w(0x66), w(0x28), w(0xd9), w(0x24), w(0xb2), \ w(0x76), w(0x5b), w(0xa2), w(0x49), w(0x6d), w(0x8b), w(0xd1), \ w(0x25), w(0x72), w(0xf8), w(0xf6), w(0x64), w(0x86), w(0x68), \ w(0x98), w(0x16), w(0xd4), w(0xa4), w(0x5c), w(0xcc), w(0x5d), \ w(0x65), w(0xb6), w(0x92), w(0x6c), w(0x70), w(0x48), w(0x50), \ w(0xfd), w(0xed), w(0xb9), w(0xda), w(0x5e), w(0x15), w(0x46), \ w(0x57), w(0xa7), w(0x8d), w(0x9d), w(0x84), w(0x90), w(0xd8), \ w(0xab), w(0x00), w(0x8c), w(0xbc), w(0xd3), w(0x0a), w(0xf7), \ w(0xe4), w(0x58), w(0x05), w(0xb8), w(0xb3), w(0x45), w(0x06), \ w(0xd0), w(0x2c), w(0x1e), w(0x8f), w(0xca), w(0x3f), w(0x0f), \ w(0x02), w(0xc1), w(0xaf), w(0xbd), w(0x03), w(0x01), w(0x13), \ w(0x8a), w(0x6b), w(0x3a), w(0x91), w(0x11), w(0x41), w(0x4f), \ w(0x67), w(0xdc), w(0xea), w(0x97), w(0xf2), w(0xcf), w(0xce), \ w(0xf0), w(0xb4), w(0xe6), w(0x73), w(0x96), w(0xac), w(0x74), \ w(0x22), w(0xe7), w(0xad), w(0x35), w(0x85), w(0xe2), w(0xf9), \ w(0x37), w(0xe8), w(0x1c), w(0x75), w(0xdf), w(0x6e), w(0x47), \ w(0xf1), w(0x1a), w(0x71), w(0x1d), w(0x29), w(0xc5), w(0x89), \ w(0x6f), w(0xb7), w(0x62), w(0x0e), w(0xaa), w(0x18), w(0xbe), \ w(0x1b), w(0xfc), w(0x56), w(0x3e), w(0x4b), w(0xc6), w(0xd2), \ w(0x79), w(0x20), w(0x9a), w(0xdb), w(0xc0), w(0xfe), w(0x78), \ w(0xcd), w(0x5a), w(0xf4), w(0x1f), w(0xdd), w(0xa8), w(0x33), \ w(0x88), w(0x07), w(0xc7), w(0x31), w(0xb1), w(0x12), w(0x10), \ w(0x59), w(0x27), w(0x80), w(0xec), w(0x5f), w(0x60), w(0x51), \ w(0x7f), w(0xa9), w(0x19), w(0xb5), w(0x4a), w(0x0d), w(0x2d), \ w(0xe5), w(0x7a), w(0x9f), w(0x93), w(0xc9), w(0x9c), w(0xef), \ w(0xa0), w(0xe0), w(0x3b), w(0x4d), w(0xae), w(0x2a), w(0xf5), \ w(0xb0), w(0xc8), w(0xeb), w(0xbb), w(0x3c), w(0x83), w(0x53), \ w(0x99), w(0x61), w(0x17), w(0x2b), w(0x04), w(0x7e), w(0xba), \ w(0x77), w(0xd6), w(0x26), w(0xe1), w(0x69), w(0x14), w(0x63), \ w(0x55), w(0x21), w(0x0c), w(0x7d) \ } /* clang-format on */ /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ #define SSE2NEON_AES_H0(x) (x) static const uint8_t _sse2neon_sbox[256] = SSE2NEON_AES_SBOX(SSE2NEON_AES_H0); static const uint8_t _sse2neon_rsbox[256] = SSE2NEON_AES_RSBOX(SSE2NEON_AES_H0); #undef SSE2NEON_AES_H0 /* x_time function and matrix multiply function */ #if !defined(__aarch64__) && !defined(_M_ARM64) #define SSE2NEON_XT(x) (((x) << 1) ^ ((((x) >> 7) & 1) * 0x1b)) #define SSE2NEON_MULTIPLY(x, y) \ (((y & 1) * x) ^ ((y >> 1 & 1) * SSE2NEON_XT(x)) ^ \ ((y >> 2 & 1) * SSE2NEON_XT(SSE2NEON_XT(x))) ^ \ ((y >> 3 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))) ^ \ ((y >> 4 & 1) * SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(SSE2NEON_XT(x)))))) #endif // In the absence of crypto extensions, implement aesenc using regular NEON // intrinsics instead. See: // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and // for more information. 
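// A single aesenc round applies ShiftRows and SubBytes (which commute, so the
// code shifts first), then MixColumns, and finally XORs in the round key. On
// AArch64 the SubBytes step is done with four 64-byte table lookups
// (vqtbl4q_u8 / vqtbx4q_u8) into _sse2neon_sbox, and MixColumns with
// shift-and-XOR arithmetic in GF(2^8).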
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i RoundKey) { #if defined(__aarch64__) || defined(_M_ARM64) static const uint8_t shift_rows[] = { 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, }; static const uint8_t ror32by8[] = { 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, }; uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); /* shift rows */ w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); /* sub bytes */ // Here, we separate the whole 256-bytes table into 4 64-bytes tables, and // look up each of the table. After each lookup, we load the next table // which locates at the next 64-bytes. In the meantime, the index in the // table would be smaller than it was, so the index parameters of // `vqtbx4q_u8()` need to be added the same constant as the loaded tables. v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); // 'w-0x40' equals to 'vsubq_u8(w, vdupq_n_u8(0x40))' v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); /* mix columns */ w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); /* add round key */ return vreinterpretq_m128i_u8(w) ^ RoundKey; #else /* ARMv7-A implementation for a table-based AES */ #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ (((uint32_t) (b3) << 24) | ((uint32_t) (b2) << 16) | \ ((uint32_t) (b1) << 8) | (uint32_t) (b0)) // muliplying 'x' by 2 in GF(2^8) #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) // muliplying 'x' by 3 in GF(2^8) #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) #define SSE2NEON_AES_U0(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) #define SSE2NEON_AES_U1(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) #define SSE2NEON_AES_U2(p) \ SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) #define SSE2NEON_AES_U3(p) \ SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) // this generates a table containing every possible permutation of // shift_rows() and sub_bytes() with mix_columns(). 
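// Each 32-bit entry packs the S-box output of one input byte pre-multiplied by
// the MixColumns coefficients 1, 2 and 3 in GF(2^8) (the classic T-table
// construction), so a full round reduces to 16 byte-indexed lookups XORed
// together, followed by the round-key XOR.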
static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { SSE2NEON_AES_SBOX(SSE2NEON_AES_U0), SSE2NEON_AES_SBOX(SSE2NEON_AES_U1), SSE2NEON_AES_SBOX(SSE2NEON_AES_U2), SSE2NEON_AES_SBOX(SSE2NEON_AES_U3), }; #undef SSE2NEON_AES_B2W #undef SSE2NEON_AES_F2 #undef SSE2NEON_AES_F3 #undef SSE2NEON_AES_U0 #undef SSE2NEON_AES_U1 #undef SSE2NEON_AES_U2 #undef SSE2NEON_AES_U3 uint32_t x0 = _mm_cvtsi128_si32(a); // get a[31:0] uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); // get a[63:32] uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xAA)); // get a[95:64] uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); // get a[127:96] // finish the modulo addition step in mix_columns() __m128i out = _mm_set_epi32( (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), (aes_table[0][x0 & 0xff] ^ aes_table[1][(x1 >> 8) & 0xff] ^ aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); return _mm_xor_si128(out, RoundKey); #endif } // Perform one round of an AES decryption flow on data (state) in a using the // round key in RoundKey, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) { #if defined(__aarch64__) static const uint8_t inv_shift_rows[] = { 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, }; static const uint8_t ror32by8[] = { 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, }; uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); // inverse shift rows w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); // inverse sub bytes v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); // inverse mix columns // multiplying 'v' by 4 in GF(2^8) w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); v ^= w; v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); // muliplying 'v' by 2 in GF(2^8) w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); // add round key return vreinterpretq_m128i_u8(w) ^ RoundKey; #else /* ARMv7-A NEON implementation */ /* FIXME: optimized for NEON */ uint8_t i, e, f, g, h, v[4][4]; uint8_t *_a = (uint8_t *) &a; for (i = 0; i < 16; ++i) { v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; } // inverse mix columns for (i = 0; i < 4; ++i) { e = v[i][0]; f = v[i][1]; g = v[i][2]; h = v[i][3]; v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 
0x0e); } return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; #endif } // Perform the last round of an AES encryption flow on data (state) in a using // the round key in RoundKey, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { #if defined(__aarch64__) static const uint8_t shift_rows[] = { 0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb, }; uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); // shift rows w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); // sub bytes v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), w); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), w - 0x40); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), w - 0x80); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), w - 0xc0); // add round key return vreinterpretq_m128i_u8(v) ^ RoundKey; #else /* ARMv7-A implementation */ uint8_t v[16] = { _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 0)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 5)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 10)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 15)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 4)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 9)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 14)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 3)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 8)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 13)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 2)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 7)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 12)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 1)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 6)], _sse2neon_sbox[vgetq_lane_u8(vreinterpretq_u8_m128i(a), 11)], }; return vreinterpretq_m128i_u8(vld1q_u8(v)) ^ RoundKey; #endif } // Perform the last round of an AES decryption flow on data (state) in a using // the round key in RoundKey, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) { #if defined(__aarch64__) static const uint8_t inv_shift_rows[] = { 0x0, 0xd, 0xa, 0x7, 0x4, 0x1, 0xe, 0xb, 0x8, 0x5, 0x2, 0xf, 0xc, 0x9, 0x6, 0x3, }; uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(a); // inverse shift rows w = vqtbl1q_u8(w, vld1q_u8(inv_shift_rows)); // inverse sub bytes v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_rsbox), w); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x40), w - 0x40); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0x80), w - 0x80); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_rsbox + 0xc0), w - 0xc0); // add round key return vreinterpretq_m128i_u8(v) ^ RoundKey; #else /* ARMv7-A NEON implementation */ /* FIXME: optimized for NEON */ uint8_t v[4][4]; uint8_t *_a = (uint8_t *) &a; for (int i = 0; i < 16; ++i) { v[((i / 4) + (i % 4)) % 4][i % 4] = _sse2neon_rsbox[_a[i]]; } return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)) ^ RoundKey; #endif } // Perform the InvMixColumns transformation on a and store the result in dst. 
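/* Usage note (illustrative, not part of the original header): the x86 AES
 * decryption flow implemented by _mm_aesdec_si128() expects round keys that
 * have already been passed through InvMixColumns for the middle rounds (the
 * "Equivalent Inverse Cipher"). Given AES-128 encryption round keys
 * enc_rk[0..10] (key expansion not shown), the decryption keys are commonly
 * derived with this intrinsic as:
 *
 *   dec_rk[0] = enc_rk[10];
 *   for (int i = 1; i < 10; i++)
 *       dec_rk[i] = _mm_aesimc_si128(enc_rk[10 - i]);
 *   dec_rk[10] = enc_rk[0];
 */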
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) { #if defined(__aarch64__) static const uint8_t ror32by8[] = { 0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc, }; uint8x16_t v = vreinterpretq_u8_m128i(a); uint8x16_t w; // multiplying 'v' by 4 in GF(2^8) w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); w = (w << 1) ^ (uint8x16_t) (((int8x16_t) w >> 7) & 0x1b); v ^= w; v ^= (uint8x16_t) vrev32q_u16((uint16x8_t) w); // multiplying 'v' by 2 in GF(2^8) w = (v << 1) ^ (uint8x16_t) (((int8x16_t) v >> 7) & 0x1b); w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); return vreinterpretq_m128i_u8(w); #else /* ARMv7-A NEON implementation */ uint8_t i, e, f, g, h, v[4][4]; vst1q_u8((uint8_t *) v, vreinterpretq_u8_m128i(a)); for (i = 0; i < 4; ++i) { e = v[i][0]; f = v[i][1]; g = v[i][2]; h = v[i][3]; v[i][0] = SSE2NEON_MULTIPLY(e, 0x0e) ^ SSE2NEON_MULTIPLY(f, 0x0b) ^ SSE2NEON_MULTIPLY(g, 0x0d) ^ SSE2NEON_MULTIPLY(h, 0x09); v[i][1] = SSE2NEON_MULTIPLY(e, 0x09) ^ SSE2NEON_MULTIPLY(f, 0x0e) ^ SSE2NEON_MULTIPLY(g, 0x0b) ^ SSE2NEON_MULTIPLY(h, 0x0d); v[i][2] = SSE2NEON_MULTIPLY(e, 0x0d) ^ SSE2NEON_MULTIPLY(f, 0x09) ^ SSE2NEON_MULTIPLY(g, 0x0e) ^ SSE2NEON_MULTIPLY(h, 0x0b); v[i][3] = SSE2NEON_MULTIPLY(e, 0x0b) ^ SSE2NEON_MULTIPLY(f, 0x0d) ^ SSE2NEON_MULTIPLY(g, 0x09) ^ SSE2NEON_MULTIPLY(h, 0x0e); } return vreinterpretq_m128i_u8(vld1q_u8((uint8_t *) v)); #endif } // Assist in expanding the AES cipher key by computing steps towards generating // a round key for encryption cipher using data from a and an 8-bit round // constant specified in imm8, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 // // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. // This instruction generates a round key for AES encryption. See // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ // for details. FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { #if defined(__aarch64__) uint8x16_t _a = vreinterpretq_u8_m128i(a); uint8x16_t v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(_sse2neon_sbox), _a); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x40), _a - 0x40); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0x80), _a - 0x80); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(_sse2neon_sbox + 0xc0), _a - 0xc0); uint32x4_t v_u32 = vreinterpretq_u32_u8(v); uint32x4_t ror_v = vorrq_u32(vshrq_n_u32(v_u32, 8), vshlq_n_u32(v_u32, 24)); uint32x4_t ror_xor_v = veorq_u32(ror_v, vdupq_n_u32(rcon)); return vreinterpretq_m128i_u32(vtrn2q_u32(v_u32, ror_xor_v)); #else /* ARMv7-A NEON implementation */ uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0x55)); uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(a, 0xFF)); for (int i = 0; i < 4; ++i) { ((uint8_t *) &X1)[i] = _sse2neon_sbox[((uint8_t *) &X1)[i]]; ((uint8_t *) &X3)[i] = _sse2neon_sbox[((uint8_t *) &X3)[i]]; } return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); #endif } #undef SSE2NEON_AES_SBOX #undef SSE2NEON_AES_RSBOX #if defined(__aarch64__) #undef SSE2NEON_XT #undef SSE2NEON_MULTIPLY #endif #else /* __ARM_FEATURE_CRYPTO */ // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and // AESMC and then manually applying the real key as an xor operation. 
This // unfortunately means an additional xor op; the compiler should be able to // optimize this away for repeated calls however. See // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a // for more details. FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_u8(veorq_u8( vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), vreinterpretq_u8_m128i(b))); } // Perform one round of an AES decryption flow on data (state) in a using the // round key in RoundKey, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdec_si128 FORCE_INLINE __m128i _mm_aesdec_si128(__m128i a, __m128i RoundKey) { return vreinterpretq_m128i_u8(veorq_u8( vaesimcq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), vreinterpretq_u8_m128i(RoundKey))); } // Perform the last round of an AES encryption flow on data (state) in a using // the round key in RoundKey, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), RoundKey); } // Perform the last round of an AES decryption flow on data (state) in a using // the round key in RoundKey, and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesdeclast_si128 FORCE_INLINE __m128i _mm_aesdeclast_si128(__m128i a, __m128i RoundKey) { return vreinterpretq_m128i_u8( veorq_u8(vaesdq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)), vreinterpretq_u8_m128i(RoundKey))); } // Perform the InvMixColumns transformation on a and store the result in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aesimc_si128 FORCE_INLINE __m128i _mm_aesimc_si128(__m128i a) { return vreinterpretq_m128i_u8(vaesimcq_u8(vreinterpretq_u8_m128i(a))); } // Assist in expanding the AES cipher key by computing steps towards generating // a round key for encryption cipher using data from a and an 8-bit round // constant specified in imm8, and store the result in dst." // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_aeskeygenassist_si128 FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { // AESE does ShiftRows and SubBytes on A uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); #ifndef _MSC_VER uint8x16_t dest = { // Undo ShiftRows step from AESE and extract X1 and X3 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) }; uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); #else // We have to do this hack because MSVC is strictly adhering to the CPP // standard, in particular C++03 8.5.1 sub-section 15, which states that // unions must be initialized by their first member type. 
// As per the Windows ARM64 ABI, it is always little endian, so this works __n128 dest{ ((uint64_t) u8.n128_u8[0x4] << 0) | ((uint64_t) u8.n128_u8[0x1] << 8) | ((uint64_t) u8.n128_u8[0xE] << 16) | ((uint64_t) u8.n128_u8[0xB] << 24) | ((uint64_t) u8.n128_u8[0x1] << 32) | ((uint64_t) u8.n128_u8[0xE] << 40) | ((uint64_t) u8.n128_u8[0xB] << 48) | ((uint64_t) u8.n128_u8[0x4] << 56), ((uint64_t) u8.n128_u8[0xC] << 0) | ((uint64_t) u8.n128_u8[0x9] << 8) | ((uint64_t) u8.n128_u8[0x6] << 16) | ((uint64_t) u8.n128_u8[0x3] << 24) | ((uint64_t) u8.n128_u8[0x9] << 32) | ((uint64_t) u8.n128_u8[0x6] << 40) | ((uint64_t) u8.n128_u8[0x3] << 48) | ((uint64_t) u8.n128_u8[0xC] << 56)}; dest.n128_u32[1] = dest.n128_u32[1] ^ rcon; dest.n128_u32[3] = dest.n128_u32[3] ^ rcon; return dest; #endif } #endif /* Others */ // Perform a carry-less multiplication of two 64-bit integers, selected from a // and b according to imm8, and store the results in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_clmulepi64_si128 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) { uint64x2_t a = vreinterpretq_u64_m128i(_a); uint64x2_t b = vreinterpretq_u64_m128i(_b); switch (imm & 0x11) { case 0x00: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); case 0x01: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); case 0x10: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); case 0x11: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); default: abort(); } } FORCE_INLINE unsigned int _sse2neon_mm_get_denormals_zero_mode(void) { union { fpcr_bitfield field; #if defined(__aarch64__) || defined(_M_ARM64) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) || defined(_M_ARM64) r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif return r.field.bit24 ? _MM_DENORMALS_ZERO_ON : _MM_DENORMALS_ZERO_OFF; } // Count the number of bits set to 1 in unsigned 32-bit integer a, and // return that count in dst. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u32 FORCE_INLINE int _mm_popcnt_u32(unsigned int a) { #if defined(__aarch64__) || defined(_M_ARM64) #if __has_builtin(__builtin_popcount) return __builtin_popcount(a); #elif defined(_MSC_VER) return _CountOneBits(a); #else return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); #endif #else uint32_t count = 0; uint8x8_t input_val, count8x8_val; uint16x4_t count16x4_val; uint32x2_t count32x2_val; input_val = vld1_u8((uint8_t *) &a); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); vst1_u32(&count, count32x2_val); return count; #endif } // Count the number of bits set to 1 in unsigned 64-bit integer a, and // return that count in dst. 
// https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_popcnt_u64 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) { #if defined(__aarch64__) || defined(_M_ARM64) #if __has_builtin(__builtin_popcountll) return __builtin_popcountll(a); #elif defined(_MSC_VER) return _CountOneBits64(a); #else return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); #endif #else uint64_t count = 0; uint8x8_t input_val, count8x8_val; uint16x4_t count16x4_val; uint32x2_t count32x2_val; uint64x1_t count64x1_val; input_val = vld1_u8((uint8_t *) &a); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); count64x1_val = vpaddl_u32(count32x2_val); vst1_u64(&count, count64x1_val); return count; #endif } FORCE_INLINE void _sse2neon_mm_set_denormals_zero_mode(unsigned int flag) { // AArch32 Advanced SIMD arithmetic always uses the Flush-to-zero setting, // regardless of the value of the FZ bit. union { fpcr_bitfield field; #if defined(__aarch64__) || defined(_M_ARM64) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) || defined(_M_ARM64) r.value = _sse2neon_get_fpcr(); #else __asm__ __volatile__("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif r.field.bit24 = (flag & _MM_DENORMALS_ZERO_MASK) == _MM_DENORMALS_ZERO_ON; #if defined(__aarch64__) || defined(_M_ARM64) _sse2neon_set_fpcr(r.value); #else __asm__ __volatile__("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } // Return the current 64-bit value of the processor's time-stamp counter. // https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=rdtsc FORCE_INLINE uint64_t _rdtsc(void) { #if defined(__aarch64__) || defined(_M_ARM64) uint64_t val; /* According to ARM DDI 0487F.c, from Armv8.0 to Armv8.5 inclusive, the * system counter is at least 56 bits wide; from Armv8.6, the counter * must be 64 bits wide. So the system counter could be less than 64 * bits wide and it is attributed with the flag 'cap_user_time_short' * is true. */ #if defined(_MSC_VER) val = _ReadStatusReg(ARM64_SYSREG(3, 3, 14, 0, 2)); #else __asm__ __volatile__("mrs %0, cntvct_el0" : "=r"(val)); #endif return val; #else uint32_t pmccntr, pmuseren, pmcntenset; // Read the user mode Performance Monitoring Unit (PMU) // User Enable Register (PMUSERENR) access permissions. __asm__ __volatile__("mrc p15, 0, %0, c9, c14, 0" : "=r"(pmuseren)); if (pmuseren & 1) { // Allows reading PMUSERENR for user mode code. __asm__ __volatile__("mrc p15, 0, %0, c9, c12, 1" : "=r"(pmcntenset)); if (pmcntenset & 0x80000000UL) { // Is it counting? __asm__ __volatile__("mrc p15, 0, %0, c9, c13, 0" : "=r"(pmccntr)); // The counter is set up to count every 64th cycle return (uint64_t) (pmccntr) << 6; } } // Fallback to syscall as we can't enable PMUSERENR in user mode. 
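/* Note: unlike the PMU/CNTVCT paths above, this fallback returns wall-clock
 * microseconds since the Unix epoch rather than a cycle count, so its values
 * are on a different scale, have roughly 1 us resolution, and may jump if
 * the system clock is adjusted. It is only suitable for coarse relative
 * timing. */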
struct timeval tv; gettimeofday(&tv, NULL); return (uint64_t) (tv.tv_sec) * 1000000 + tv.tv_usec; #endif } #if defined(__GNUC__) || defined(__clang__) #pragma pop_macro("ALIGN_STRUCT") #pragma pop_macro("FORCE_INLINE") #endif #if defined(__GNUC__) && !defined(__clang__) #pragma GCC pop_options #endif #endif RenderKit-rkcommon-988718e/rkcommon/math/box.h000066400000000000000000000076361467524601100212640ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "range.h" #include "vec.h" namespace rkcommon { namespace math { // box declaration //////////////////////////////////////////////////////// template using box_t = range_t>; // box free functions ///////////////////////////////////////////////////// template inline scalar_t area(const box_t &b) { return b.size().product(); } template inline scalar_t area(const box_t &b) { const auto size = b.size(); return 2.f * (size.x * size.y + size.x * size.z + size.y * size.z); } /*! return the volume of the 3D box - undefined for empty boxes */ template inline scalar_t volume(const box_t &b) { return b.size().product(); } /*! computes whether two boxes are either touching OR overlapping; ie, the case where boxes just barely touch side-by side (even if they do not have any actual overlapping _volume_!) then this is still true */ template inline bool touchingOrOverlapping(const box_t &a, const box_t &b) { if (a.lower.x > b.upper.x) return false; if (a.lower.y > b.upper.y) return false; if (a.lower.z > b.upper.z) return false; if (b.lower.x > a.upper.x) return false; if (b.lower.y > a.upper.y) return false; if (b.lower.z > a.upper.z) return false; return true; } template inline bool touchingOrOverlapping(const box_t &a, const box_t &b) { if (a.lower.x > b.upper.x) return false; if (a.lower.y > b.upper.y) return false; if (b.lower.x > a.upper.x) return false; if (b.lower.y > a.upper.y) return false; return true; } /*! compute the intersection of two boxes */ template inline box_t intersectionOf(const box_t &a, const box_t &b) { return box_t(max(a.lower, b.lower), min(a.upper, b.upper)); } template inline bool disjoint(const box_t &a, const box_t &b) { return anyLessThan(a.upper, b.lower) || anyLessThan(b.upper, a.lower); } /*! 
returns the center of the box (not valid for empty boxes) */ template inline vec_t center(const box_t &b) { return b.center(); } template inline range_t intersectRayBox( const vec_t &org, const vec_t &dir, const box_t &box, const range_t &tRange = range_t(0, inf)) { const auto mins = (box.lower - org) * rcp_safe(dir); const auto maxs = (box.upper - org) * rcp_safe(dir); return range_t( reduce_max(vec_t(min(mins, maxs), tRange.lower)), reduce_min(vec_t(max(mins, maxs), tRange.upper))); } using box1i = range_t; using box2i = box_t; using box3i = box_t; using box4i = box_t; using box1f = range_t; using box2f = box_t; using box3f = box_t; using box4f = box_t; using box3fa = box_t; // this is just a renaming - in some cases the code reads cleaner if // we're talking about 'regions' than about boxes using region2i = box2i; } // namespace math } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/math/box.ih000066400000000000000000000403171467524601100214260ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "vec.ih" #ifndef ISPC namespace ispc { #endif // a 1-d int bounding box (ie, a range struct box1i { int32 lower; int32 upper; #ifndef ISPC box1i() = default; box1i(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box1i(const int32 v) : lower(v), upper(v) {} box1i(const int32 l, const int32 u) : lower(l), upper(u) {} #endif }; // a 1-d float bounding box (ie, a range struct box1f { float lower; float upper; #ifndef ISPC box1f() = default; box1f(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box1f(const float v) : lower(v), upper(v) {} box1f(const float l, const float u) : lower(l), upper(u) {} #endif }; // a 2-d float bounding box struct box2f { vec2f lower; vec2f upper; #ifndef ISPC box2f() = default; box2f(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box2f(const float v) : lower(v), upper(v) {} box2f(const float l, const float u) : lower(l), upper(u) {} #endif }; // a 2-d integer bounding box struct box2i { vec2i lower; vec2i upper; #ifndef ISPC box2i() = default; box2i(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box2i(const int v) : lower(v), upper(v) {} box2i(const int l, const int u) : lower(l), upper(u) {} #endif }; // a 3-d float bounding box struct box3f { vec3f lower; vec3f upper; #ifndef ISPC box3f() = default; box3f(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box3f(const vec3f v) : lower(v), upper(v) {} box3f(const vec3f l, const vec3f u) : lower(l), upper(u) {} #endif }; // a 3-d int bounding box struct box3i { vec3i lower; vec3i upper; #ifndef ISPC box3i() = default; box3i(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box3i(const int v) : lower(v), upper(v) {} box3i(const int l, const int u) : lower(l), upper(u) {} #endif }; // a 3-d float bounding box with aligned vec3f coordinates struct box3fa { vec3f lower; int32 align0; vec3f upper; int32 align1; #ifndef ISPC box3fa() = default; box3fa(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box3fa(const vec3f v) : lower(v), upper(v) {} box3fa(const vec3f l, const vec3f u) : lower(l), upper(u) {} #endif }; // a 4-d int bounding box struct box4i { vec4i lower; vec4i upper; #ifndef ISPC box4i() = default; box4i(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box4i(const vec4i v) : lower(v), upper(v) {} box4i(const vec4i l, const vec4i u) : lower(l), upper(u) {} #endif }; // a 4-d float bounding box struct box4f { vec4f lower; vec4f upper; #ifndef ISPC box4f() = default; box4f(const EmptyTy &) : 
lower(pos_inf), upper(neg_inf) {} box4f(const vec4f v) : lower(v), upper(v) {} box4f(const vec4f l, const vec4f u) : lower(l), upper(u) {} #endif }; // this is just a renaming - in some cases the code reads cleaner if we're // talking about 'regions' than about boxes typedef box1f range1f; typedef box2i range2i; // ------------------------------------------------------- // all box1f operations // ------------------------------------------------------- #define MAKE_BOX1F_uv(univary) \ inline univary box1f make_box1f(const univary float f) \ { \ univary box1f bb; \ bb.lower = bb.upper = f; \ return bb; \ } \ \ inline univary box1f make_box1f( \ const univary float lo, const univary float hi) \ { \ univary box1f bb; \ bb.lower = lo; \ bb.upper = hi; \ return bb; \ } \ \ inline univary float box_size(const univary box1f &bb) \ { \ return bb.upper - bb.lower; \ } \ \ inline univary box1f box_extend( \ const univary box1f &a, const univary box1f &b) \ { \ return make_box1f(min(a.lower, b.lower), max(a.upper, b.upper)); \ } \ \ inline univary bool isEmpty(const univary box1f &bb) \ { \ return bb.upper < bb.lower; \ } #ifdef ISPC MAKE_BOX1F_uv(uniform); MAKE_BOX1F_uv(varying); #else MAKE_BOX1F_uv(); #endif #undef MAKE_BOX1F_uv // ------------------------------------------------------- // box2 'constructors' // ------------------------------------------------------- #define MAKE_BOX_CONSTRUCTORS_uv_2T_fromVec2(univary, Tabb, otherT) \ inline univary box2##Tabb make_box2##Tabb( \ const univary vec2##otherT lower, const univary vec2##otherT upper) \ { \ univary box2##Tabb bb; \ bb.lower.x = lower.x; \ bb.lower.y = lower.y; \ bb.upper.x = upper.x; \ bb.upper.y = upper.y; \ return bb; \ } #define MAKE_BOX_CONSTRUCTORS_uv_2T_fromBox2(univary, Tabb, otherT) \ inline univary box2##Tabb make_box2##Tabb(const univary box2##otherT other) \ { \ univary box2##Tabb bb; \ bb.lower.x = other.lower.x; \ bb.lower.y = other.lower.y; \ bb.upper.x = other.upper.x; \ bb.upper.y = other.upper.y; \ return bb; \ } #define MAKE_BOX_CONSTRUCTORS_uv_2T_empty(univary, Tabb) \ inline univary box2##Tabb make_box2##Tabb##_empty() \ { \ return make_box2##Tabb(make_vec2##Tabb(inf), make_vec2##Tabb(neg_inf)); \ } #define MAKE_BOX_CONSTRUCTORS_uv_2T(univary, Tabb) \ MAKE_BOX_CONSTRUCTORS_uv_2T_fromVec2(univary, Tabb, f); \ MAKE_BOX_CONSTRUCTORS_uv_2T_fromVec2(univary, Tabb, i); \ MAKE_BOX_CONSTRUCTORS_uv_2T_fromBox2(univary, Tabb, f); \ MAKE_BOX_CONSTRUCTORS_uv_2T_fromBox2(univary, Tabb, i) #define MAKE_BOX_CONSTRUCTORS_uv_2(univary) \ MAKE_BOX_CONSTRUCTORS_uv_2T(univary, i); \ MAKE_BOX_CONSTRUCTORS_uv_2T(univary, f) #ifdef ISPC MAKE_BOX_CONSTRUCTORS_uv_2(uniform); MAKE_BOX_CONSTRUCTORS_uv_2(varying); MAKE_BOX_CONSTRUCTORS_uv_2T_empty(uniform, f); #else MAKE_BOX_CONSTRUCTORS_uv_2(); MAKE_BOX_CONSTRUCTORS_uv_2T_empty(, f); #endif #undef MAKE_BOX_CONSTRUCTORS_uv_2T_fromVec2 #undef MAKE_BOX_CONSTRUCTORS_uv_2T_fromBox2 #undef MAKE_BOX_CONSTRUCTORS_uv_2T_empty #undef MAKE_BOX_CONSTRUCTORS_uv_2T #undef MAKE_BOX_CONSTRUCTORS_uv_2 // ------------------------------------------------------- // box3 'constructors' // ------------------------------------------------------- #define MAKE_BOX_CONSTRUCTORS_uv_3T_fromVec3(univary, Tabb, otherT) \ inline univary box3##Tabb make_box3##Tabb( \ const univary vec3##otherT lower, const univary vec3##otherT upper) \ { \ univary box3##Tabb bb; \ bb.lower.x = lower.x; \ bb.lower.y = lower.y; \ bb.lower.z = lower.z; \ bb.upper.x = upper.x; \ bb.upper.y = upper.y; \ bb.upper.z = upper.z; \ return bb; \ } 
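/* Usage sketch (C++ side) for the box3 helpers generated in this header:
 * grow an empty box over a set of points and query its extent. `points` and
 * `count` are hypothetical caller-side data; the ISPC side uses the same
 * functions with the appropriate uniform/varying qualifiers.
 *
 *   box3f bounds = make_box3f_empty();
 *   for (int i = 0; i < count; i++)
 *       bounds = box_extend(bounds, points[i]);
 *   if (!isEmpty(bounds)) {
 *       vec3f extent = box_size(bounds); // upper - lower, per component
 *   }
 */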
#define MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3(univary, Tabb, otherT) \ inline univary box3##Tabb make_box3##Tabb(const univary box3##otherT other) \ { \ univary box3##Tabb bb; \ bb.lower.x = other.lower.x; \ bb.lower.y = other.lower.y; \ bb.lower.z = other.lower.z; \ bb.upper.x = other.upper.x; \ bb.upper.y = other.upper.y; \ bb.upper.z = other.upper.z; \ return bb; \ } #define MAKE_BOX_CONSTRUCTORS_uv_3T_empty(univary, Tabb) \ inline univary box3##Tabb make_box3##Tabb##_empty() \ { \ return make_box3##Tabb(make_vec3f(inf), make_vec3f(neg_inf)); \ } #define MAKE_BOX_CONSTRUCTORS_uv_3T(univary, Tabb) \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromVec3(univary, Tabb, f); \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromVec3(univary, Tabb, i); \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3(univary, Tabb, f); \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3(univary, Tabb, fa); \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3(univary, Tabb, i) #define MAKE_BOX_CONSTRUCTORS_uv_3(univary) \ MAKE_BOX_CONSTRUCTORS_uv_3T(univary, i); \ MAKE_BOX_CONSTRUCTORS_uv_3T(univary, f); \ MAKE_BOX_CONSTRUCTORS_uv_3T(univary, fa) #ifdef ISPC MAKE_BOX_CONSTRUCTORS_uv_3(uniform); MAKE_BOX_CONSTRUCTORS_uv_3(varying); MAKE_BOX_CONSTRUCTORS_uv_3T_empty(uniform, f); MAKE_BOX_CONSTRUCTORS_uv_3T_empty(uniform, fa); #else MAKE_BOX_CONSTRUCTORS_uv_3(); MAKE_BOX_CONSTRUCTORS_uv_3T_empty(, f); MAKE_BOX_CONSTRUCTORS_uv_3T_empty(, fa); #endif #undef MAKE_BOX_CONSTRUCTORS_uv_3T_fromVec3 #undef MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3 #undef MAKE_BOX_CONSTRUCTORS_uv_3T_empty #undef MAKE_BOX_CONSTRUCTORS_uv_3T #undef MAKE_BOX_CONSTRUCTORS_uv_3 // ------------------------------------------------------- // box 'operations' // ------------------------------------------------------- #define BOX_OPERATIONS_uv_N_T(univary, N, T) \ inline univary vec##N##T box_size(const univary box##N##T &bb) \ { \ return bb.upper - bb.lower; \ } \ \ inline univary bool isEmpty(const univary box##N##T &bb) \ { \ return anyLessThan(bb.upper, bb.lower); \ } \ \ inline univary box##N##T box_extend( \ const univary box##N##T bb, const univary vec##N##T v) \ { \ return make_box##N##T(min(bb.lower, v), max(bb.upper, v)); \ } \ \ inline univary box##N##T box_extend( \ const univary box##N##T bb, const univary box##N##T other) \ { \ return make_box##N##T( \ min(bb.lower, other.lower), max(bb.upper, other.upper)); \ } #define BOX_OPERATIONS_uv_3fa(univary) \ inline univary box3fa box_extend( \ const univary box3fa bb, const univary vec3f v) \ { \ return make_box3fa(min(bb.lower, v), max(bb.upper, v)); \ } \ \ inline univary box3fa box_extend( \ const univary box3fa bb, const univary box3fa other) \ { \ return make_box3fa( \ min(bb.lower, other.lower), max(bb.upper, other.upper)); \ } #define BOX_OPERATIONS_uv_N(univary, N) \ BOX_OPERATIONS_uv_N_T(univary, N, i); \ BOX_OPERATIONS_uv_N_T(univary, N, f) #define BOX_OPERATIONS_uv(univary) \ BOX_OPERATIONS_uv_N(univary, 2); \ BOX_OPERATIONS_uv_N(univary, 3); \ BOX_OPERATIONS_uv_3fa(univary) #ifdef ISPC BOX_OPERATIONS_uv(uniform); BOX_OPERATIONS_uv(varying); #else BOX_OPERATIONS_uv(); #endif #undef BOX_OPERATIONS_uv_N_T #undef BOX_OPERATIONS_uv_N #undef BOX_OPERATIONS_uv inline bool box_contains(const ISPC_UNIFORM box3f &bbox, const vec3f &p) { return p.x >= bbox.lower.x && p.y >= bbox.lower.y && p.z >= bbox.lower.z && p.x <= bbox.upper.x && p.y <= bbox.upper.y && p.z <= bbox.upper.z; } #ifdef ISPC inline void extend(uniform range1f &r, uniform float v) { r.lower = min(r.lower, v); r.upper = max(r.upper, v); } inline void extend(uniform range1f &r, varying float v) { r.lower 
= min(r.lower, reduce_min(v)); r.upper = max(r.upper, reduce_max(v)); } #endif inline void extend(range1f &r, float v) { r.lower = min(r.lower, v); r.upper = max(r.upper, v); } #ifndef ISPC } #endif RenderKit-rkcommon-988718e/rkcommon/math/constants.h000066400000000000000000000177071467524601100225100ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include "../platform.h" #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif #include // using cmath causes issues under Windows #include namespace rkcommon { namespace math { static const float one_over_255 = 1.0f / 255.0f; static struct ZeroTy { __forceinline operator double() const { return 0; } __forceinline operator float() const { return 0; } __forceinline operator long long() const { return 0; } __forceinline operator unsigned long long() const { return 0; } __forceinline operator long() const { return 0; } __forceinline operator unsigned long() const { return 0; } __forceinline operator int() const { return 0; } __forceinline operator unsigned int() const { return 0; } __forceinline operator short() const { return 0; } __forceinline operator unsigned short() const { return 0; } __forceinline operator char() const { return 0; } __forceinline operator unsigned char() const { return 0; } } zero MAYBE_UNUSED; static struct OneTy { __forceinline operator double() const { return 1; } __forceinline operator float() const { return 1; } __forceinline operator long long() const { return 1; } __forceinline operator unsigned long long() const { return 1; } __forceinline operator long() const { return 1; } __forceinline operator unsigned long() const { return 1; } __forceinline operator int() const { return 1; } __forceinline operator unsigned int() const { return 1; } __forceinline operator short() const { return 1; } __forceinline operator unsigned short() const { return 1; } __forceinline operator char() const { return 1; } __forceinline operator unsigned char() const { return 1; } } one MAYBE_UNUSED; static struct NegInfTy { __forceinline operator double() const { return -std::numeric_limits::infinity(); } __forceinline operator float() const { return -std::numeric_limits::infinity(); } __forceinline operator long long() const { return std::numeric_limits::min(); } __forceinline operator unsigned long long() const { return std::numeric_limits::min(); } __forceinline operator long() const { return std::numeric_limits::min(); } __forceinline operator unsigned long() const { return std::numeric_limits::min(); } __forceinline operator int() const { return std::numeric_limits::min(); } __forceinline operator unsigned int() const { return std::numeric_limits::min(); } __forceinline operator short() const { return std::numeric_limits::min(); } __forceinline operator unsigned short() const { return std::numeric_limits::min(); } __forceinline operator char() const { return std::numeric_limits::min(); } __forceinline operator unsigned char() const { return std::numeric_limits::min(); } } neg_inf MAYBE_UNUSED; static struct PosInfTy { __forceinline operator double() const { return std::numeric_limits::infinity(); } __forceinline operator float() const { return std::numeric_limits::infinity(); } __forceinline operator long long() const { return std::numeric_limits::max(); } __forceinline operator unsigned long long() const { return std::numeric_limits::max(); } __forceinline operator long() const { return std::numeric_limits::max(); } __forceinline operator unsigned long() 
const { return std::numeric_limits::max(); } __forceinline operator int() const { return std::numeric_limits::max(); } __forceinline operator unsigned int() const { return std::numeric_limits::max(); } __forceinline operator short() const { return std::numeric_limits::max(); } __forceinline operator unsigned short() const { return std::numeric_limits::max(); } __forceinline operator char() const { return std::numeric_limits::max(); } __forceinline operator unsigned char() const { return std::numeric_limits::max(); } } inf MAYBE_UNUSED, pos_inf MAYBE_UNUSED; static struct NaNTy { __forceinline operator double() const { return std::numeric_limits::quiet_NaN(); } __forceinline operator float() const { return std::numeric_limits::quiet_NaN(); } } nan MAYBE_UNUSED; static struct UlpTy { __forceinline operator double() const { return std::numeric_limits::epsilon(); } __forceinline operator float() const { return std::numeric_limits::epsilon(); } } ulp MAYBE_UNUSED; static struct PiTy { __forceinline operator double() const { return M_PI; } __forceinline operator float() const { return M_PI; } } pi MAYBE_UNUSED; static struct OneOverPiTy { __forceinline operator double() const { return M_1_PI; } __forceinline operator float() const { return M_1_PI; } } one_over_pi MAYBE_UNUSED; static struct TwoPiTy { __forceinline operator double() const { return 2.0 * M_PI; } __forceinline operator float() const { return 2.0 * M_PI; } } two_pi MAYBE_UNUSED; static struct HalfPiTy { __forceinline operator double() const { return M_PI_2; } __forceinline operator float() const { return M_PI_2; } } half_pi MAYBE_UNUSED; static struct OneOverTwoPiTy { __forceinline operator double() const { return 0.5 * M_1_PI; } __forceinline operator float() const { return 0.5 * M_1_PI; } } one_over_two_pi MAYBE_UNUSED; static struct FourPiTy { __forceinline operator double() const { return 4.0 * M_PI; } __forceinline operator float() const { return 4.0 * M_PI; } } four_pi MAYBE_UNUSED; static struct QuarterPiTy { __forceinline operator double() const { return M_PI_4; } __forceinline operator float() const { return M_PI_4; } } quarter_pi MAYBE_UNUSED; static struct OneOverFourPiTy { __forceinline operator double() const { return 0.25 * M_1_PI; } __forceinline operator float() const { return 0.25 * M_1_PI; } } one_over_four_pi MAYBE_UNUSED; static struct StepTy { } step MAYBE_UNUSED; static struct ReverseStepTy { } reverse_step MAYBE_UNUSED; static struct EmptyTy { } empty MAYBE_UNUSED; static struct FullTy { } full MAYBE_UNUSED; } // namespace math } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/math/math.ih000066400000000000000000000367771467524601100216060ustar00rootroot00000000000000// Copyright 2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #ifdef ISPC #define ISPC_UNIFORM uniform #define ISPC_VARYING varying #define ISPC_OR(a, b) or (a, b) #else #include #define ISPC_UNIFORM #define ISPC_VARYING #define ISPC_OR(a, b) (a || b) namespace ispc { using uint64 = uint64_t; using uint32 = uint32_t; using uint16 = uint16_t; using uint8 = uint8_t; using int64 = int64_t; using int32 = int32_t; using int16 = int16_t; using int8 = int8_t; #endif // ------------------------------------------------------------------ // Constants // ------------------------------------------------------------------ #define inf floatbits(0x7F800000) #define pos_inf floatbits(0x7F800000) #define neg_inf floatbits(0xFF800000) #define nan floatbits(0x7FBFFFFF) // smallest positive normal number 2^-126 ~ 1.17549435e-38 
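/* (0x1.0p-126f below is a C99 hexadecimal float literal equal to FLT_MIN;
 * divide_safe() and rcp_safe() further down clamp near-zero denominators to
 * +/-flt_min so they never produce an infinity.) */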
#define flt_min 0x1.0p-126f #define M_PI 3.14159265358979323846f #define pi 3.14159265358979323846f #define two_pi 6.283185307179586232f #define four_pi 12.566370614359172464f #define one_over_pi 0.31830988618379069122f #define one_over_two_pi 0.15915494309189534561f #define one_over_four_pi 0.079577471545947672804f #define one_over_two_pi_sqr 0.050660591821168885722f #define lntwo_over_two 0.346573590279972654709f #ifndef ISPC static struct OneTy { inline operator float() const { return 1.f; } } one; static struct EmptyTy { } empty; // Native math functions, precision implementation defined inline float sin(const float a) { return sycl::native::sin(a); } inline float cos(const float a) { return sycl::native::cos(a); } inline float tan(const float a) { return sycl::native::tan(a); } inline float rcp(const float a) { return sycl::native::recip(a); } inline float exp(const float a) { return sycl::native::exp(a); } inline float log(const float a) { return sycl::native::log(a); } inline float pow(float a, float b) { return sycl::native::powr(a, b); } inline float sqrt(const float a) { return sycl::native::sqrt(a); } inline float rsqrt(const float a) { return sycl::native::rsqrt(a); } inline void sincos(const float phi, float *sinPhi, float *cosPhi) { *sinPhi = sycl::native::sin(phi); *cosPhi = sycl::native::cos(phi); } inline float roundf(const float f) { return sycl::round(f); } // Math functions with precision guaranteed both on host and device inline float abs(const float a) { return sycl::fabs(a); } inline float floor(const float a) { return sycl::floor(a); } inline float ceil(const float a) { return sycl::ceil(a); } inline float acos(const float a) { return sycl::acos(a); } inline float atan(const float a) { return sycl::atan(a); } inline float atan2(const float a, const float b) { return sycl::atan2(a, b); } inline float isnan(const float a) { return sycl::isnan(a); } inline float nextafter(const float a, const float b) { return sycl::nextafter(a, b); } inline float floatbits(unsigned int a) { return sycl::bit_cast(a); } inline unsigned int intbits(float a) { return sycl::bit_cast(a); } inline unsigned int signbits(float a) { return sycl::signbit(a); } template ::value>::type> inline T min(const T &a, const T &b) { return sycl::min(a, b); } template ::value>::type> inline T max(const T &a, const T &b) { return sycl::max(a, b); } template ::value>::type> inline T extract(const T &t, int) { return t; } template ::value>::type> inline T exclusive_scan_add(const T &) { return 0; } template ::value>::type> inline T reduce_add(const T &t) { return t; } template ::value>::type> inline T reduce_max(const T &t) { return t; } template ::value>::type> inline T reduce_min(const T &t) { return t; } inline float half_to_float(sycl::half x) { return x; } #endif #define __define_functions(univary) \ inline univary float absf(const univary float f) \ { \ return abs(f); \ } \ /* c-style reciprocal. 
required since ispc 1.7 due to type changes in this \ * version */ \ inline univary float rcpf(const univary float f) \ { \ return rcp(f); \ } \ /* c-style square root */ \ inline univary float sqrtf(const univary float f) \ { \ return sqrt(f); \ } \ /* c-style reciprocal square root */ \ inline univary float rsqrtf(const univary float f) \ { \ return rsqrt(f); \ } \ /* square */ \ inline univary float sqr(const univary float f) \ { \ return f * f; \ } \ /* c-style square */ \ inline univary float sqrf(const univary float f) \ { \ return f * f; \ } \ /* c-style pow function */ \ inline univary float powf(const univary float a, const univary float b) \ { \ return pow(a, b); \ } \ /* c-style cos */ \ inline univary float cosf(const univary float f) \ { \ return cos(f); \ } \ /* c-style sin */ \ inline univary float sinf(const univary float f) \ { \ return sin(f); \ } \ /* c-style exp */ \ inline univary float expf(const univary float f) \ { \ return exp(f); \ } \ /* c-style log */ \ inline univary float logf(const univary float f) \ { \ return log(f); \ } \ inline univary float divide_safe(univary float f) \ { \ return 1.f / (abs(f) < flt_min ? (f >= 0.f ? flt_min : -flt_min) : f); \ } \ inline univary float rcp_safe(univary float f) \ { \ return rcpf(abs(f) < flt_min ? (f >= 0.f ? flt_min : -flt_min) : f); \ } \ inline univary float sqrt_safe(univary float f) \ { \ return sqrt(max(f, 0.0f)); \ } \ inline univary float clamp(const univary float v) \ { \ return max(0.0f, min(v, 1.0f)); \ } \ inline univary float clamp(const univary float v, \ const univary float lower, \ const univary float upper) \ { \ return max(lower, min(v, upper)); \ } \ inline univary int clamp( \ const univary int v, const univary int lower, const univary int upper) \ { \ return max(lower, min(v, upper)); \ } \ inline univary float frac(const univary float x) \ { \ return x - floor(x); \ } \ inline univary float deg2rad(const univary float x) \ { \ return x * 1.74532925199432957692e-2f; \ } \ inline univary float rad2deg(const univary float x) \ { \ return x * 5.72957795130823208768e1f; \ } #ifdef ISPC __define_functions(uniform); __define_functions(varying); #else __define_functions(); #endif inline float cos2sin(const float f) { return sqrt(max(0.f, 1.f - sqr(f))); } inline float sin2cos(const float f) { return cos2sin(f); } #ifdef ISPC inline float roundf(const float f) { return round(f); } inline uniform float roundf(const uniform float f) { return round(f); } inline uniform float nextafter(const uniform float a, const uniform float b) { // Match the behavior of the C99 math.h function if (a == b) return (b); // We will compute the smallest representable floating increment or decrement // around 'a' uniform float delta = (b > a) ? 
1.0f : -1.0f; // Iteratively compute the positive or negative increment while (a + 0.5f * delta != a) delta *= 0.5f; // Return the smallest number greater than 'a' or the largest number smaller // than 'a' return (a + delta); } #endif #define __define_lerp(univary, type) \ inline univary type lerp( \ univary float factor, univary type a, univary type b) \ { \ return (1.f - factor) * a + factor * b; \ } #define __define_lerp_type(univary) \ __define_lerp(univary, int8); \ __define_lerp(univary, int32); \ __define_lerp(univary, float); \ __define_lerp(univary, uint8); \ __define_lerp(univary, uint32) #ifdef ISPC __define_lerp_type(uniform); __define_lerp_type(varying); #else __define_lerp_type(); #endif #undef __define_lerp_type #undef __define_lerp // ------------------------------------------------------------------ // min4/max4, for all types // ------------------------------------------------------------------ #define __define_op4(univary, type, op) \ inline univary type op##4( \ univary type a, univary type b, univary type c, univary type d) \ { \ return op(a, op(b, op(c, d))); \ } #define __define_op4_op(univary, type) \ __define_op4(univary, type, min); \ __define_op4(univary, type, max) #define __define_op4_type(univary) \ __define_op4_op(univary, int8); \ __define_op4_op(univary, int32); \ __define_op4_op(univary, uint8); \ __define_op4_op(univary, uint32); \ __define_op4_op(univary, float) #ifdef ISPC __define_op4_type(uniform); __define_op4_type(varying); #else __define_op4_type(); #endif #undef __define_op4_type #undef __define_op4_op #undef __define_op4 #define SIMILAR_EPSILON .00001f #define __define_similar(univary) \ inline univary float similar(univary float a, univary float b) \ { \ return abs(a - b) <= SIMILAR_EPSILON; \ } #ifdef ISPC __define_similar(uniform); __define_similar(varying); #else __define_similar(); #endif #undef __define_similar #undef SIMILAR_EPSILON // convert 32bit unsigned int into float in [0..1] inline float to_float_unorm(uint32 a) { return a * 0x1.0p-32f; } #ifndef ISPC } #endif RenderKit-rkcommon-988718e/rkcommon/math/range.h000066400000000000000000000106611467524601100215600ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // stl #include // common #include "constants.h" #include "vec.h" namespace rkcommon { namespace math { using std::max; using std::min; /*! default implementatoin of 'anyLessThan' for scalar types, so we can make a ranges etc. Vec-types will overwrite that and test if _any_ dimension is less */ template inline bool anyLessThan(const TA &a, const TB &b) { return a < b; } template struct range_t { using bound_t = T; range_t() : lower(pos_inf), upper(neg_inf) {} range_t(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} range_t(const ZeroTy &) : lower(zero), upper(zero) {} range_t(const OneTy &) : lower(zero), upper(one) {} range_t(const T &t) : lower(t), upper(t) {} range_t(const T &_lower, const T &_upper) : lower(_lower), upper(_upper) { } range_t(const T *v) : lower(v[0]), upper(v[1]) {} template explicit range_t(const range_t &other) : lower(T(other.lower)), upper(T(other.upper)) { } inline T size() const { return upper - lower; } inline T center() const { return .5f * (lower + upper); } inline void extend(const T &t) { lower = min(lower, t); upper = max(upper, t); } inline void extend(const range_t &t) { lower = min(lower, t.lower); upper = max(upper, t.upper); } /*! 
take given value t, and 'clamp' it to 'this->'range; ie, if it already is inside the range return as is, otherwise move it to either lower or upper of this range. */ inline T clamp(const T &t) const { return max(lower, min(t, upper)); } /*! Try to parse given string into a range; and return if successful. if not, return defaultvalue */ static range_t fromString( const std::string &string, const range_t &defaultValue = rkcommon::math::empty); inline bool empty() const { return anyLessThan(upper, lower); } inline bool contains(const T &t) const { return !anyLessThan(t, lower) && !anyLessThan(upper, t); } inline operator T*() { return static_cast(&lower); } inline operator const T*() const { return static_cast(&lower); } T lower, upper; }; template inline std::ostream &operator<<(std::ostream &o, const range_t &r) { o << "[" << r.lower << "," << r.upper << "]"; return o; } /*! scale range, per dimension */ template inline range_t operator*(const range_t &range, const T &scale) { return range_t(range.lower * scale, range.upper * scale); } /*! scale range, per dimension */ template inline range_t operator*(const T &scale, const range_t &range) { return range_t(range.lower * scale, range.upper * scale); } /*! translate a range, per dimension */ template inline range_t operator+(const range_t &range, const T &translation) { return range_t(range.lower + translation, range.upper + translation); } /*! translate a range, per dimension */ template inline range_t operator+(const T &translation, const range_t &range) { return range_t(range.lower + translation, range.upper + translation); } // comparison operators /////////////////////////////////////////////////// template inline bool operator==(const range_t &a, const range_t &b) { return a.lower == b.lower && a.upper == b.upper; } template inline bool operator!=(const range_t &a, const range_t &b) { return !(a == b); } // range_t aliases //////////////////////////////////////////////////////// using range1f = range_t; using range2f = range_t; using range3f = range_t; using range4f = range_t; using range1i = range_t; using range2i = range_t; using range3i = range_t; using range4i = range_t; } // namespace math } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/math/rkmath.h000066400000000000000000000060271467524601100217530ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../platform.h" #include "constants.h" // std #include // std::min()/std::max() on Windows #include // Include vector intrinsics #ifndef RKCOMMON_NO_SIMD #if defined(_WIN32) #include #elif defined(__ARM_NEON) #include "arm/emulation.h" #else #include #include #endif #endif namespace rkcommon { namespace math { using std::cos; using std::sin; using std::tan; using std::max; using std::min; using std::fmod; __forceinline float sign(const float x) { return x < 0 ? -1.0f : 1.0f; } __forceinline float rcp(const float x) { #ifdef RKCOMMON_NO_SIMD return 1.f / x; #else const __m128 a = _mm_set_ss(x); const __m128 r = _mm_rcp_ss(a); return _mm_cvtss_f32( _mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); #endif } __forceinline double rcp(const double x) { return 1. / x; } template __forceinline T rcp_safe_t(const T x) { const T flt_min = std::numeric_limits::min(); return rcp(std::abs(x) < flt_min ? (x >= 0.f ? 
flt_min : -flt_min) : x); } __forceinline float rcp_safe(const float x) { return rcp_safe_t(x); } __forceinline double rcp_safe(const double x) { return rcp_safe_t(x); } __forceinline float rsqrt(const float x) { #ifdef RKCOMMON_NO_SIMD return 1.f / std::sqrt(x); #else const __m128 a = _mm_set_ss(x); const __m128 r = _mm_rsqrt_ss(a); const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); return _mm_cvtss_f32(c); #endif } __forceinline double rsqrt(const double x) { return 1. / std::sqrt(x); } template __forceinline T clamp(const T &x, const T &lower = T(zero), const T &upper = T(one)) { return max(min(x, upper), lower); } template __forceinline T deg2rad(const T &x) { return x * T(1.745329251994329576923690768489e-2); } __forceinline float madd(const float a, const float b, const float c) { return a * b + c; } template inline T lerp(const float factor, const T &a, const T &b) { return (1.f - factor) * a + factor * b; } template inline T divRoundUp(T a, T b) { return (a + b - 1) / b; } #define APPROXIMATE_SRGB inline float linear_to_srgb(const float f) { const float c = std::max(f, 0.f); #ifdef APPROXIMATE_SRGB return std::pow(c, 1.f / 2.2f); #else return c <= 0.0031308f ? 12.92f * c : std::pow(c, 1.f / 2.4f) * 1.055f - 0.055f; #endif } } // namespace math } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/math/vec.h000066400000000000000000001107531467524601100212440ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "constants.h" #include "rkmath.h" #include "../traits/rktraits.h" namespace rkcommon { // NOTE: only for identifying vec_t types at compile-time struct vec_base { }; // ------------------------------------------------------- // type traits relevant to vec_t<> type compile-time logic // ------------------------------------------------------- namespace traits { template struct is_vec { const static bool value = std::is_base_of::value; }; template struct is_valid_vec_constructor_type { const static bool value = std::is_constructible::value && !std::is_same::value && !is_vec::value; }; template using is_valid_vec_constructor_type_t = enable_if_t< is_valid_vec_constructor_type::value>; } // namespace traits namespace math { // vec_t<> types ////////////////////////////////////////////////////////// template > struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; }; template struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; vec_t() = default; vec_t(const scalar_t *v) : x(v[0]), y(v[1]) {} vec_t(scalar_t s) : x(s), y(s) {} template > vec_t(const OT &s) : x(s), y(s) { } vec_t(scalar_t x, scalar_t y) : x(x), y(y) {} template vec_t(const vec_t &o) : x(o.x), y(o.y) { } const T &operator[](const size_t idx) const { assert(idx < 2); return (&x)[idx]; } T &operator[](const size_t idx) { assert(idx < 2); return (&x)[idx]; } operator T *() { return &x; } operator const T *() const { return &x; } /*! return result of reduce_add() across all components */ scalar_t sum() const { return x + y; } /*! 
return result of reduce_mul() across all components */ scalar_t product() const { return x * y; } size_t long_product() const { return size_t(x) * size_t(y); } // conversion constructor to other types to enable static_cast template explicit operator vec_t() const { return vec_t(*this); } T x, y; }; template struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; vec_t() = default; vec_t(const scalar_t *v) : x(v[0]), y(v[1]), z(v[2]) {} vec_t(scalar_t s) : x(s), y(s), z(s) {} template > vec_t(const OT &s) : x(s), y(s), z(s) { } vec_t(scalar_t x, scalar_t y, scalar_t z) : x(x), y(y), z(z) {} template vec_t(const vec_t &o, scalar_t z) : x(o.x), y(o.y), z(z) { } template vec_t(const vec_t &o) : x(o.x), y(o.y), z(o.z) { } const T &operator[](const size_t axis) const { assert(axis < 3); return (&x)[axis]; } T &operator[](const size_t axis) { assert(axis < 3); return (&x)[axis]; } operator T *() { return &x; } operator const T *() const { return &x; } /*! return result of reduce_add() across all components */ scalar_t sum() const { return x + y + z; } /*! return result of reduce_mul() across all components */ scalar_t product() const { return x * y * z; } size_t long_product() const { return size_t(x) * size_t(y) * size_t(z); } // conversion constructor to other types to enable static_cast template explicit operator vec_t() const { return vec_t(*this); } T x, y, z; }; template struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; vec_t() = default; vec_t(const scalar_t *v) : x(v[0]), y(v[1]), z(v[2]) {} vec_t(scalar_t s) : x(s), y(s), z(s) {} template > vec_t(const OT &s) : x(s), y(s), z(s) { } vec_t(scalar_t x, scalar_t y, scalar_t z) : x(x), y(y), z(z) {} template vec_t(const vec_t &o, scalar_t z) : x(o.x), y(o.y), z(z) { } template vec_t(const vec_t &o) : x(o.x), y(o.y), z(o.z) { } const T &operator[](const size_t axis) const { assert(axis < 3); return (&x)[axis]; } T &operator[](const size_t axis) { assert(axis < 3); return (&x)[axis]; } operator T *() { return &x; } operator const T *() const { return &x; } /*! return result of reduce_add() across all components */ scalar_t sum() const { return x + y + z; } /*! return result of reduce_mul() across all components */ scalar_t product() const { return x * y * z; } size_t long_product() const { return size_t(x) * size_t(y) * size_t(z); } operator vec_t() const { return vec_t(x, y, z); } // conversion constructor to other types to enable static_cast template explicit operator vec_t() const { return vec_t(*this); } T x, y, z; T padding_; }; template struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; vec_t() = default; vec_t(const scalar_t *v) : x(v[0]), y(v[1]), z(v[2]), w(v[3]) {} vec_t(scalar_t s) : x(s), y(s), z(s), w(s) {} template > vec_t(const OT &s) : x(s), y(s), z(s), w(s) { } vec_t(scalar_t x, scalar_t y, scalar_t z, scalar_t w) : x(x), y(y), z(z), w(w) { } template vec_t(const vec_t &o1, const vec_t &o2) : x(o1.x), y(o1.y), z(o2.x), w(o2.y) { } template vec_t(const vec_t &o, scalar_t w) : x(o.x), y(o.y), z(o.z), w(w) { } template vec_t(const vec_t &o) : x(o.x), y(o.y), z(o.z), w(o.w) { } const T &operator[](const size_t idx) const { assert(idx < 4); return (&x)[idx]; } T &operator[](const size_t idx) { assert(idx < 4); return (&x)[idx]; } operator T *() { return &x; } operator const T *() const { return &x; } /*! return result of reduce_add() across all components */ scalar_t sum() const { return x + y + z + w; } /*! 
return result of reduce_mul() across all components */ scalar_t product() const { return x * y * z * w; } size_t long_product() const { return size_t(x) * size_t(y) * size_t(z) * size_t(w); } // conversion constructor to other types to enable static_cast template explicit operator vec_t() const { return vec_t(*this); } T x, y, z, w; }; // ------------------------------------------------------- // unary operators // ------------------------------------------------------- template inline vec_t operator-(const vec_t &v) { return vec_t(-v.x, -v.y); } template inline vec_t operator-(const vec_t &v) { return vec_t(-v.x, -v.y, -v.z); } template inline vec_t operator-(const vec_t &v) { return vec_t(-v.x, -v.y, -v.z); } template inline vec_t operator-(const vec_t &v) { return vec_t(-v.x, -v.y, -v.z, -v.w); } template inline vec_t operator+(const vec_t &v) { return vec_t(+v.x, +v.y); } template inline vec_t operator+(const vec_t &v) { return vec_t(+v.x, +v.y, +v.z); } template inline vec_t operator+(const vec_t &v) { return vec_t(+v.x, +v.y, +v.z); } template inline vec_t operator+(const vec_t &v) { return vec_t(+v.x, +v.y, +v.z, +v.w); } using std::abs; // ------------------------------------------------------- // unary functors // ------------------------------------------------------- #define unary_functor(op) \ template \ inline vec_t op(const vec_t &v) \ { \ return vec_t(op(v.x), op(v.y)); \ } \ template \ inline vec_t op(const vec_t &v) \ { \ return vec_t(op(v.x), op(v.y), op(v.z)); \ } \ template \ inline vec_t op(const vec_t &v) \ { \ return vec_t(op(v.x), op(v.y), op(v.z)); \ } \ template \ inline vec_t op(const vec_t &v) \ { \ return vec_t(op(v.x), op(v.y), op(v.z), op(v.w)); \ } // clang-format off unary_functor(rcp) unary_functor(rcp_safe) unary_functor(abs) unary_functor(sin) unary_functor(cos) // clang-format on #undef unary_functor // ------------------------------------------------------- // binary arithmetic operators // ------------------------------------------------------- #define binary_operator(name, op) \ /* "vec op vec" */ \ template \ inline vec_t name(const vec_t &a, const vec_t &b) \ { \ return vec_t(a.x op b.x, a.y op b.y); \ } \ \ template \ inline vec_t name(const vec_t &a, const vec_t &b) \ { \ return vec_t(a.x op b.x, a.y op b.y, a.z op b.z); \ } \ \ template \ inline vec_t name(const vec_t &a, const vec_t &b) \ { \ return vec_t(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \ } \ \ /* "vec op vec" (element types don't match) */ \ template > \ inline auto name(const vec_t &a, const vec_t &b) \ ->vec_t \ { \ using vector_t = vec_t; \ return vector_t(vector_t(a) op vector_t(b)); \ } \ \ /* "vec op scalar" */ \ template \ inline vec_t name(const vec_t &a, const T &b) \ { \ return vec_t(a.x op b, a.y op b); \ } \ \ template \ inline vec_t name(const vec_t &a, const T &b) \ { \ return vec_t(a.x op b, a.y op b, a.z op b); \ } \ \ template \ inline vec_t name(const vec_t &a, const T &b) \ { \ return vec_t(a.x op b, a.y op b, a.z op b, a.w op b); \ } \ \ /* "vec op U" (element types don't match) */ \ template > \ inline auto name(const vec_t &a, const U &b) \ ->vec_t \ { \ using scalar_t = decltype(T() op U()); \ using vector_t = vec_t; \ return vector_t(vector_t(a) op scalar_t(b)); \ } \ \ /* "scalar op vec" */ \ template \ inline vec_t name(const T &a, const vec_t &b) \ { \ return vec_t(a op b.x, a op b.y); \ } \ \ template \ inline vec_t name(const T &a, const vec_t &b) \ { \ return vec_t(a op b.x, a op b.y, a op b.z); \ } \ \ template \ inline vec_t name(const T &a, 
const vec_t &b) \ { \ return vec_t(a op b.x, a op b.y, a op b.z, a op b.w); \ } \ \ /* "T op vec" (element types don't match) */ \ template > \ inline auto name(const T &a, const vec_t &b) \ ->vec_t \ { \ using scalar_t = decltype(T() op U()); \ using vector_t = vec_t; \ return vector_t(scalar_t(a) op vector_t(b)); \ } // clang-format off binary_operator(operator+, +) binary_operator(operator-, -) binary_operator(operator*, *) binary_operator(operator/, /) binary_operator(operator%, %) // clang-format on #undef binary_operator // ------------------------------------------------------- // binary arithmetic assignment operators // ------------------------------------------------------- #define binary_operator(name, op) \ /* "vec op vec" */ \ template \ inline vec_t &name(vec_t &a, const vec_t &b) \ { \ a.x op b.x; \ a.y op b.y; \ return a; \ } \ \ template \ inline vec_t &name(vec_t &a, const vec_t &b) \ { \ a.x op b.x; \ a.y op b.y; \ a.z op b.z; \ return a; \ } \ \ template \ inline vec_t &name(vec_t &a, const vec_t &b) \ { \ a.x op b.x; \ a.y op b.y; \ a.z op b.z; \ a.w op b.w; \ return a; \ } \ \ /* "vec op scalar" */ \ template > \ inline vec_t &name(vec_t &a, const U &b) \ { \ a.x op b; \ a.y op b; \ return a; \ } \ \ template > \ inline vec_t &name(vec_t &a, const U &b) \ { \ a.x op b; \ a.y op b; \ a.z op b; \ return a; \ } \ \ template > \ inline vec_t &name(vec_t &a, const U &b) \ { \ a.x op b; \ a.y op b; \ a.z op b; \ a.w op b; \ return a; \ } // clang-format off binary_operator(operator+=, +=) binary_operator(operator-=, -=) binary_operator(operator*=, *=) binary_operator(operator/=, /=) binary_operator(operator%=, %=) // clang-format on #undef binary_operator // ------------------------------------------------------- // ternary operators (just for compatibility with old embree // ------------------------------------------------------- template inline vec_t madd(const vec_t &a, const vec_t &b, const vec_t &c) { return vec_t( madd(a.x, b.x, c.x), madd(a.y, b.y, c.y), madd(a.z, b.z, c.z)); } // ------------------------------------------------------- // comparison operators // ------------------------------------------------------- template inline bool operator==(const vec_t &a, const vec_t &b) { return a.x == b.x && a.y == b.y; } template inline bool operator==(const vec_t &a, const vec_t &b) { return a.x == b.x && a.y == b.y && a.z == b.z; } template inline bool operator==(const vec_t &a, const vec_t &b) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } template inline bool operator!=(const vec_t &a, const vec_t &b) { return !(a == b); } template inline bool operator!=(const vec_t &a, const vec_t &b) { return !(a == b); } template inline bool operator!=(const vec_t &a, const vec_t &b) { return !(a == b); } // 'anyLessThan' - return true if any component is less than the other vec's template inline bool anyLessThan(const vec_t &a, const vec_t &b) { return a.x < b.x || a.y < b.y; } template inline bool anyLessThan(const vec_t &a, const vec_t &b) { return a.x < b.x || a.y < b.y || a.z < b.z; } template inline bool anyLessThan(const vec_t &a, const vec_t &b) { return a.x < b.x || a.y < b.y || a.z < b.z || a.w < b.w; } // ------------------------------------------------------- // dot functions // ------------------------------------------------------- template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y; } template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z; } template inline T dot(const vec_t 
&a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z; } template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z; } template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z; } template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; } // ------------------------------------------------------- // length functions // ------------------------------------------------------- template inline T length(const vec_t &v) { return sqrt(dot(v, v)); } // ------------------------------------------------------- // cross product // ------------------------------------------------------- template inline vec_t cross(const vec_t &a, const vec_t &b) { return vec_t( a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); } // ------------------------------------------------------- // normalize() // ------------------------------------------------------- template inline vec_t normalize(const vec_t &v) { return v * rsqrt(dot(v, v)); } template inline vec_t safe_normalize(const vec_t &v) { return v * rsqrt(max(T(ulp), dot(v, v))); } // ------------------------------------------------------- // interpolation // ------------------------------------------------------- // barycentric interpolation template inline vec_t interpolate_uv(const vec_t &f, const vec_t &a, const vec_t &b, const vec_t &c) { return f.x * a + f.y * b + f.z * c; } // ------------------------------------------------------- // ostream operators // ------------------------------------------------------- template inline std::ostream &operator<<(std::ostream &o, const vec_t &v) { o << "(" << v.x << "," << v.y << ")"; return o; } template inline std::ostream &operator<<(std::ostream &o, const vec_t &v) { o << "(" << v.x << "," << v.y << "," << v.z << ")"; return o; } template inline std::ostream &operator<<(std::ostream &o, const vec_t &v) { o << "(" << v.x << "," << v.y << "," << v.z << "," << v.w << ")"; return o; } // "inherit" std::min/max/etc for basic types using std::max; using std::min; // ------------------------------------------------------- // binary functors // ------------------------------------------------------- #define define_functor(f) \ template \ inline vec_t f(const vec_t &a, const vec_t &b) \ { \ return vec_t(f(a.x, b.x), f(a.y, b.y)); \ } \ \ template \ inline vec_t f(const vec_t &a, const vec_t &b) \ { \ return vec_t(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \ } \ \ template \ inline vec_t f(const vec_t &a, const vec_t &b) \ { \ return vec_t(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \ } // clang-format off define_functor(min) define_functor(max) define_functor(divRoundUp) // clang-format on #undef define_functor // ------------------------------------------------------- // reductions // ------------------------------------------------------- template inline T reduce_add(const vec_t &v) { return v.x + v.y; } template inline T reduce_add(const vec_t &v) { return v.x + v.y + v.z; } template inline T reduce_add(const vec_t &v) { return v.x + v.y + v.z + v.w; } template inline T reduce_mul(const vec_t &v) { return v.x * v.y; } template inline T reduce_mul(const vec_t &v) { return v.x * v.y * v.z; } template inline T reduce_mul(const vec_t &v) { return v.x * v.y * v.z * v.w; } template inline T reduce_min(const vec_t &v) { return min(v.x, v.y); } template inline T reduce_min(const vec_t &v) { return min(min(v.x, v.y), v.z); } template inline T reduce_min(const vec_t &v) 
{ return min(min(v.x, v.y), min(v.z, v.w)); } template inline T reduce_max(const vec_t &v) { return max(v.x, v.y); } template inline T reduce_max(const vec_t &v) { return max(max(v.x, v.y), v.z); } template inline T reduce_max(const vec_t &v) { return max(max(v.x, v.y), max(v.z, v.w)); } // ------------------------------------------------------- // all vec2 variants // ------------------------------------------------------- typedef vec_t vec2uc; typedef vec_t vec2c; typedef vec_t vec2us; typedef vec_t vec2s; typedef vec_t vec2ui; typedef vec_t vec2i; typedef vec_t vec2ul; typedef vec_t vec2l; typedef vec_t vec2f; typedef vec_t vec2d; // ------------------------------------------------------- // all vec3 variants // ------------------------------------------------------- typedef vec_t vec3uc; typedef vec_t vec3c; typedef vec_t vec3us; typedef vec_t vec3s; typedef vec_t vec3ui; typedef vec_t vec3i; typedef vec_t vec3ul; typedef vec_t vec3l; typedef vec_t vec3f; typedef vec_t vec3d; typedef vec_t vec3fa; typedef vec_t vec3ia; // ------------------------------------------------------- // all vec4 variants // ------------------------------------------------------- typedef vec_t vec4uc; typedef vec_t vec4c; typedef vec_t vec4us; typedef vec_t vec4s; typedef vec_t vec4ui; typedef vec_t vec4i; typedef vec_t vec4ul; typedef vec_t vec4l; typedef vec_t vec4f; typedef vec_t vec4d; template inline size_t arg_max(const vec_t &v) { size_t maxIdx = 0; for (size_t i = 1; i < N; i++) if (v[i] > v[maxIdx]) maxIdx = i; return maxIdx; } inline vec4f linear_to_srgba(const vec4f c) { return vec4f(linear_to_srgb(c.x), linear_to_srgb(c.y), linear_to_srgb(c.z), std::max(c.w, 0.f)); // alpha is never gamma-corrected } inline uint32_t cvt_uint32(const float f) { return (uint32_t)round(255.f * clamp(f, 0.f, 1.f)); } inline uint32_t cvt_uint32(const vec4f &v) { return (cvt_uint32(v.x) << 0) | (cvt_uint32(v.y) << 8) | (cvt_uint32(v.z) << 16) | (cvt_uint32(v.w) << 24); } inline uint32_t linear_to_srgba8(const vec4f c) { return cvt_uint32(linear_to_srgba(c)); } } // namespace math } // namespace rkcommon /*! template specialization for std::less comparison operator; * we need those to be able to put vec's in std::map etc @{ */ /* Defining just operator< is prone to bugs, because a definition of an * ordering of vectors is a bit arbitrary and depends on the context. * For example, in box::extend we certainly want the element-wise min/max and * not the std::min/std::max made applicable by vec3f::operator<. 
*/ namespace std { template struct less> { inline bool operator()(const rkcommon::math::vec_t &a, const rkcommon::math::vec_t &b) const { return (a.x < b.x) || ((a.x == b.x) && (a.y < b.y)); } }; template struct less> { inline bool operator()(const rkcommon::math::vec_t &a, const rkcommon::math::vec_t &b) const { return (a.x < b.x) || ((a.x == b.x) && ((a.y < b.y) || ((a.y == b.y) && (a.z < b.z)))); } }; template struct less> { inline bool operator()(const rkcommon::math::vec_t &a, const rkcommon::math::vec_t &b) const { return (a.x < b.x) || ((a.x == b.x) && ((a.y < b.y) || ((a.y == b.y) && ((a.z < b.z) || ((a.z == b.z) && (a.w < b.w)))))); } }; } // namespace std RenderKit-rkcommon-988718e/rkcommon/math/vec.ih000066400000000000000000001302361467524601100214130ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "math.ih" #ifndef ISPC namespace ispc { #endif #ifdef ISPC #define __define_vectors(type, abb) \ struct vec2##abb \ { \ type x, y; \ }; \ struct vec3##abb \ { \ type x, y, z; \ }; \ struct vec4##abb \ { \ type x, y, z, w; \ } __define_vectors(int32, i); __define_vectors(uint32, ui); __define_vectors(uint8, uc); __define_vectors(float, f); __define_vectors(int64, l); __define_vectors(uint64, ul); #undef __define_vectors #else template struct vec_t { }; template struct vec_t { using scalar_t = T; vec_t() = default; vec_t(scalar_t s) : x(s), y(s) {} vec_t(scalar_t x, scalar_t y) : x(x), y(y) {} T x, y; }; template struct vec_t { using scalar_t = T; vec_t() = default; vec_t(scalar_t s) : x(s), y(s), z(s) {} vec_t(scalar_t x, scalar_t y, scalar_t z) : x(x), y(y), z(z) {} T x, y, z; }; template struct vec_t { using scalar_t = T; vec_t() = default; vec_t(scalar_t s) : x(s), y(s), z(s), w(s) {} vec_t(scalar_t x, scalar_t y, scalar_t z, scalar_t w) : x(x), y(y), z(z), w(w) {} T x, y, z, w; }; // vec2 variants typedef vec_t vec2uc; typedef vec_t vec2ui; typedef vec_t vec2i; typedef vec_t vec2l; typedef vec_t vec2ul; typedef vec_t vec2f; // vec3 variants typedef vec_t vec3uc; typedef vec_t vec3ui; typedef vec_t vec3i; typedef vec_t vec3l; typedef vec_t vec3ul; typedef vec_t vec3f; // vec4 variants typedef vec_t vec4uc; typedef vec_t vec4ui; typedef vec_t vec4i; typedef vec_t vec4l; typedef vec_t vec4ul; typedef vec_t vec4f; #endif // ============================================================================ /* defines all constructors "make_vec2[T]" for 2-vector type */ #define __define_ispc_constructors2(univary, abb, itype, iabb) \ inline univary vec2##abb make_vec2##abb( \ const univary itype x, const univary itype y) \ { \ univary vec2##abb ret; \ ret.x = x; \ ret.y = y; \ return ret; \ } \ inline univary vec2##abb make_vec2##abb(const univary itype x) \ { \ univary vec2##abb ret; \ ret.x = x; \ ret.y = x; \ return ret; \ } /* defines all constructors "make_vec3[T]" for 3-vector type */ #define __define_ispc_constructors3(univary, abb, itype, iabb) \ inline univary vec3##abb make_vec3##abb(const univary itype x) \ { \ univary vec3##abb ret; \ ret.x = x; \ ret.y = x; \ ret.z = x; \ return ret; \ } \ inline univary vec3##abb make_vec3##abb(const univary vec3##iabb v) \ { \ univary vec3##abb ret; \ ret.x = v.x; \ ret.y = v.y; \ ret.z = v.z; \ return ret; \ } \ inline univary vec3##abb make_vec3##abb( \ const univary itype x, const univary itype y, const univary itype z) \ { \ univary vec3##abb ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ return ret; \ } \ inline univary vec3##abb make_vec3##abb(const univary 
vec4##iabb v) \ { \ univary vec3##abb ret; \ ret.x = v.x; \ ret.y = v.y; \ ret.z = v.z; \ return ret; \ } /* defines all constructors "make_vec4[T]" for 4-vector type */ #define __define_ispc_constructors4(univary, abb, itype, iabb) \ /* construct vec4 from a single scalar */ \ inline univary vec4##abb make_vec4##abb(const univary itype f) \ { \ univary vec4##abb ret; \ ret.x = f; \ ret.y = f; \ ret.z = f; \ ret.w = f; \ return ret; \ } \ /* construct vec4 from 4 scalars */ \ inline univary vec4##abb make_vec4##abb(const univary itype x, \ const univary itype y, \ const univary itype z, \ const univary itype w) \ { \ univary vec4##abb ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ ret.w = w; \ return ret; \ } \ /* construct vec4 from another vec4 (of another type) */ \ inline univary vec4##abb make_vec4##abb(const univary vec4##iabb v) \ { \ univary vec4##abb ret; \ ret.x = v.x; \ ret.y = v.y; \ ret.z = v.z; \ ret.w = v.w; \ return ret; \ } #define __define_ispc_lift_constructors4(univary, type, abb) \ /* lift vec4 from vec3; fill in with 0es */ \ inline univary vec4##abb make_vec4##abb(const univary vec3##abb v) \ { \ univary vec4##abb ret; \ ret.x = (type)v.x; \ ret.y = (type)v.y; \ ret.z = (type)v.z; \ ret.w = (type)0; \ return ret; \ } #define __define_ispc_constructors_uv_t(univary, oabb, itype, iabb) \ __define_ispc_constructors2(univary, oabb, itype, iabb); \ __define_ispc_constructors3(univary, oabb, itype, iabb); \ __define_ispc_constructors4(univary, oabb, itype, iabb) #define __define_ispc_constructors_uv(univary, type, abb) \ __define_ispc_constructors_uv_t(univary, abb, int32, i); \ __define_ispc_constructors_uv_t(univary, abb, uint32, ui); \ __define_ispc_constructors_uv_t(univary, abb, uint8, uc); \ __define_ispc_constructors_uv_t(univary, abb, float, f); \ __define_ispc_lift_constructors4(univary, type, abb) #define __define_ispc_constructors(univary) \ __define_ispc_constructors_uv(univary, int32, i); \ __define_ispc_constructors_uv(univary, uint32, ui); \ __define_ispc_constructors_uv(univary, uint8, uc); \ __define_ispc_constructors_uv(univary, float, f) #ifdef ISPC __define_ispc_constructors(uniform); __define_ispc_constructors(varying); #else __define_ispc_constructors(); #endif #undef __define_ispc_constructors2 #undef __define_ispc_constructors3 #undef __define_ispc_constructors4 #undef __define_ispc_lift_constructors4 #undef __define_ispc_constructors_uv #undef __define_ispc_constructors // ============================================================================ // define 'lifted' binary operators (min/max/...) 
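// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header): the make_vec*()
// constructors generated above behave the same way in the C++ and ISPC
// compilation paths. The function name below is hypothetical and only uses
// declarations that appear earlier in this file.
inline vec3f example_make_vec()
{
  const vec3f a = make_vec3f(1.f, 2.f, 3.f); // construct from three scalars
  const vec3f b = make_vec3f(0.5f);          // broadcast a single scalar
  // component-wise access via .x / .y / .z
  return make_vec3f(a.x + b.x, a.y + b.y, a.z + b.z);
}
// ---------------------------------------------------------------------------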
#define __define_binary_fct_dims(univary_r, univary_a, univary_b, fct, abb) \ inline univary_r vec2##abb fct( \ const univary_a vec2##abb a, const univary_b vec2##abb b) \ { \ return make_vec2##abb(fct(a.x, b.x), fct(a.y, b.y)); \ } \ inline univary_r vec3##abb fct( \ const univary_a vec3##abb a, const univary_b vec3##abb b) \ { \ return make_vec3##abb(fct(a.x, b.x), fct(a.y, b.y), fct(a.z, b.z)); \ } \ inline univary_r vec4##abb fct( \ const univary_a vec4##abb a, const univary_b vec4##abb b) \ { \ return make_vec4##abb( \ fct(a.x, b.x), fct(a.y, b.y), fct(a.z, b.z), fct(a.w, b.w)); \ } #define __define_binary_fct_types(univary_r, univary_a, univary_b, fct) \ __define_binary_fct_dims(univary_r, univary_a, univary_b, fct, f); \ __define_binary_fct_dims(univary_r, univary_a, univary_b, fct, i); \ __define_binary_fct_dims(univary_r, univary_a, univary_b, fct, ui) #define __define_binary_fct(univary_r, univary_a, univary_b) \ __define_binary_fct_types(univary_r, univary_a, univary_b, min); \ __define_binary_fct_types(univary_r, univary_a, univary_b, max) #ifdef ISPC __define_binary_fct(uniform, uniform, uniform); __define_binary_fct(varying, varying, varying); __define_binary_fct(varying, varying, uniform); __define_binary_fct(varying, uniform, varying); #else __define_binary_fct(, , ); #endif #undef __define_binary_fct #undef __define_binary_fct_types #undef __define_binary_fct_dims // ============================================================================ #define __define_binary_operator_dims(uv, opname, op, abb, type) \ /* vec2##abb */ \ inline uv vec2##abb opname(const uv vec2##abb a, const uv vec2##abb b) \ { \ return make_vec2##abb(a.x op b.x, a.y op b.y); \ } \ inline uv vec2##abb opname(const uv vec2##abb a, const uv type b) \ { \ return make_vec2##abb(a.x op b, a.y op b); \ } \ inline uv vec2##abb opname(const uv type a, const uv vec2##abb b) \ { \ return make_vec2##abb(a op b.x, a op b.y); \ } \ /* vec3##abb */ \ inline uv vec3##abb opname(const uv vec3##abb a, const uv vec3##abb b) \ { \ return make_vec3##abb(a.x op b.x, a.y op b.y, a.z op b.z); \ } \ inline uv vec3##abb opname(const uv vec3##abb a, const uv type b) \ { \ return make_vec3##abb(a.x op b, a.y op b, a.z op b); \ } \ inline uv vec3##abb opname(const uv type a, const uv vec3##abb b) \ { \ return make_vec3##abb(a op b.x, a op b.y, a op b.z); \ } \ /* vec4##abb */ \ inline uv vec4##abb opname(const uv vec4##abb a, const uv vec4##abb b) \ { \ return make_vec4##abb(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \ } \ inline uv vec4##abb opname(const uv vec4##abb a, const uv type b) \ { \ return make_vec4##abb(a.x op b, a.y op b, a.z op b, a.w op b); \ } \ inline uv vec4##abb opname(const uv type a, const uv vec4##abb b) \ { \ return make_vec4##abb(a op b.x, a op b.y, a op b.z, a op b.w); \ } #define __define_binary_operator_types(uv, opname, op) \ __define_binary_operator_dims(uv, opname, op, f, float); \ __define_binary_operator_dims(uv, opname, op, i, int32); \ __define_binary_operator_dims(uv, opname, op, ui, uint32) // define 'regular' operators #define __define_binary_operator(uv) \ __define_binary_operator_types(uv, operator+, +); \ __define_binary_operator_types(uv, operator-, -); \ __define_binary_operator_types(uv, operator*, *); \ __define_binary_operator_types(uv, operator/, /) #ifdef ISPC __define_binary_operator(uniform); __define_binary_operator(varying); #else __define_binary_operator(); #endif #undef __define_binary_operator #undef __define_binary_operator_types #undef __define_binary_operator_dims // 
============================================================================ #define __define_comp_fn(univary) \ inline univary bool eq(const univary vec2f a, const univary vec2f b) \ { \ return a.x == b.x && a.y == b.y; \ } \ inline univary bool eq(const univary vec3f a, const univary float b) \ { \ return a.x == b && a.y == b && a.z == b; \ } \ inline univary bool eq(const univary vec3f a, const univary vec3f b) \ { \ return a.x == b.x && a.y == b.y && a.z == b.z; \ } \ inline univary bool ne(const univary vec2f a, const univary vec2f b) \ { \ return !eq(a, b); \ } \ inline univary bool ne(const univary vec3f a, const univary float b) \ { \ return !eq(a, b); \ } \ inline univary bool ne(const univary vec3f a, const univary vec3f b) \ { \ return !eq(a, b); \ } \ inline univary vec3f neg(const univary vec3f v) \ { \ return make_vec3f(-v.x, -v.y, -v.z); \ } #ifdef ISPC __define_comp_fn(uniform); __define_comp_fn(varying); #else __define_comp_fn(); #endif #undef __define_comp_fn // ------------------------------------------------------------------ // anyLessThan() // ------------------------------------------------------------------ #define __define_anyLessThan(univary, abb) \ inline univary bool anyLessThan( \ const univary vec2##abb &a, const univary vec2##abb &b) \ { \ return ISPC_OR(a.x < b.x, a.y < b.y); \ } \ inline univary bool anyLessThan( \ const univary vec3##abb &a, const univary vec3##abb &b) \ { \ return ISPC_OR(a.x < b.x, ISPC_OR(a.y < b.y, a.z < b.z)); \ } #define __define_anyLessThan_type(univary) \ __define_anyLessThan(univary, f); \ __define_anyLessThan(univary, i); #ifdef ISPC __define_anyLessThan_type(uniform); __define_anyLessThan_type(varying); #else __define_anyLessThan_type(); #endif #undef __define_anyLessThan_type #undef __define_anyLessThan // ------------------------------------------------------------------ // dot product // ------------------------------------------------------------------ #define __define_dot_product(univary) \ /*! 
computes 3D dot product for vec3fs */ \ inline univary float dot(const univary vec3f a, const univary vec3f b) \ { \ return a.x * b.x + a.y * b.y + a.z * b.z; \ } \ inline univary float length(const univary vec3f a) \ { \ return sqrtf(dot(a, a)); \ } \ inline univary float distance(const univary vec3f a, const univary vec3f b) \ { \ return length(a - b); \ } #ifdef ISPC __define_dot_product(uniform); __define_dot_product(varying); #else __define_dot_product(); #endif #undef __define_dot_product // ------------------------------------------------------------------ // cross product // ------------------------------------------------------------------ #define __define_cross(univary_r, univary_a, univary_b) \ inline univary_r vec3f cross( \ const univary_a vec3f &a, const univary_b vec3f &b) \ { \ return make_vec3f( \ a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); \ } #ifdef ISPC __define_cross(uniform, uniform, uniform); __define_cross(varying, varying, varying); __define_cross(varying, varying, uniform); __define_cross(varying, uniform, varying); #else __define_cross(, , ); #endif #undef __define_cross // ------------------------------------------------------------------ // rotate // ------------------------------------------------------------------ #ifdef ISPC /* rotates vector around axis for *all-uniform* vec3fs */ inline uniform vec3f rotate( const uniform vec3f &v, const uniform vec3f &axis, uniform float theta) { return v * cos(theta) + cross(axis, v) * sin(theta) + axis * dot(axis, v) * (1.f - cos(theta)); } #endif /* rotates vector around axis for vec3fs that produce varying results */ inline vec3f rotate(const vec3f &v, const vec3f &axis, float theta) { return v * cos(theta) + cross(axis, v) * sin(theta) + axis * dot(axis, v) * (1.f - cos(theta)); } // ------------------------------------------------------------------ // normalize // ------------------------------------------------------------------ #ifdef ISPC /* compute and return normalized version of uniform vec3f passed to this fct */ inline uniform vec3f normalize(const uniform vec3f &v) { return v * (1.f / sqrt(dot(v, v))); } #endif /* compute and return normalized version of varying vec3f passed to this fct */ inline vec3f normalize(const vec3f v) { return v * (1.f / sqrt(dot(v, v))); } /* compute and return normalized version of varying vec3f passed to this fct */ inline vec3f normalize(const vec3f v, float &len) { len = sqrtf(dot(v, v)); return v * rcpf(len); } inline vec3f safe_normalize(const vec3f v) { return v * (1.f / sqrtf(max(flt_min, dot(v, v)))); } /* differentiated normalization */ inline vec3f dnormalize(const vec3f &p, const vec3f &dp) { const float pp = dot(p, p); const float pdp = dot(p, dp); return (pp * dp - pdp * p) * rcp(pp) * rsqrt(pp); } // ------------------------------------------------------------------ // unary functions // ------------------------------------------------------------------ #define __define_unary_fct_dims(univary, fct) \ inline univary vec2f fct(const univary vec2f v) \ { \ return make_vec2f(fct(v.x), fct(v.y)); \ } \ inline univary vec3f fct(const univary vec3f v) \ { \ return make_vec3f(fct(v.x), fct(v.y), fct(v.z)); \ } \ inline univary vec4f fct(const univary vec4f v) \ { \ return make_vec4f(fct(v.x), fct(v.y), fct(v.z), fct(v.w)); \ } #define __define_unary_fct(univary) \ __define_unary_fct_dims(univary, abs); \ __define_unary_fct_dims(univary, absf); \ __define_unary_fct_dims(univary, rcpf); \ __define_unary_fct_dims(univary, expf); \ 
__define_unary_fct_dims(univary, logf); \ __define_unary_fct_dims(univary, floor); \ __define_unary_fct_dims(univary, divide_safe); \ __define_unary_fct_dims(univary, rcp); \ __define_unary_fct_dims(univary, rcp_safe); \ __define_unary_fct_dims(univary, exp); \ __define_unary_fct_dims(univary, frac); \ __define_unary_fct_dims(univary, sqr); \ __define_unary_fct_dims(univary, sqrt); \ __define_unary_fct_dims(univary, sqrt_safe) #ifdef ISPC __define_unary_fct(uniform); __define_unary_fct(varying); #else __define_unary_fct(); #endif #undef __define_unary_fct #undef __define_unary_fct_dims // ------------------------------------------------------------------ // lerp // ------------------------------------------------------------------ #define __define_lerp_fn(univary, abb) \ inline univary vec2##abb lerp(univary float factor, \ const univary vec2##abb a, \ const univary vec2##abb b) \ { \ return make_vec2##abb(lerp(factor, a.x, b.x), lerp(factor, a.y, b.y)); \ } \ inline univary vec2##abb lerp(univary vec2f factor, \ const univary vec2##abb a, \ const univary vec2##abb b) \ { \ return make_vec2##abb(lerp(factor.x, a.x, b.x), lerp(factor.y, a.y, b.y)); \ } \ inline univary vec3##abb lerp(univary float factor, \ const univary vec3##abb a, \ const univary vec3##abb b) \ { \ return make_vec3##abb(lerp(factor, a.x, b.x), \ lerp(factor, a.y, b.y), \ lerp(factor, a.z, b.z)); \ } \ inline univary vec3##abb lerp(univary vec3f factor, \ const univary vec3##abb a, \ const univary vec3##abb b) \ { \ return make_vec3##abb(lerp(factor.x, a.x, b.x), \ lerp(factor.y, a.y, b.y), \ lerp(factor.z, a.z, b.z)); \ } \ inline univary vec4##abb lerp(univary float factor, \ const univary vec4##abb a, \ const univary vec4##abb b) \ { \ return make_vec4##abb(lerp(factor, a.x, b.x), \ lerp(factor, a.y, b.y), \ lerp(factor, a.z, b.z), \ lerp(factor, a.w, b.w)); \ } \ inline univary vec4##abb lerp(univary vec4f factor, \ const univary vec4##abb a, \ const univary vec4##abb b) \ { \ return make_vec4##abb(lerp(factor.x, a.x, b.x), \ lerp(factor.y, a.y, b.y), \ lerp(factor.z, a.z, b.z), \ lerp(factor.w, a.w, b.w)); \ } #define __define_lerp_type(univary) \ __define_lerp_fn(univary, f); \ __define_lerp_fn(univary, i); \ __define_lerp_fn(univary, ui); \ __define_lerp_fn(univary, uc) #ifdef ISPC __define_lerp_type(varying); __define_lerp_type(uniform); #else __define_lerp_type(); #endif #undef __define_lerp_type #undef __define_lerp_fn // ------------------------------------------------------------------ // interpolate // ------------------------------------------------------------------ #define __define_interpolate_fn(univary, type) \ inline type interpolate(const vec3f &f, \ const univary type a, \ const univary type b, \ const univary type c) \ { \ return f.x * a + f.y * b + f.z * c; \ } #define __define_interpolate_type(univary) \ __define_interpolate_fn(univary, vec2f); \ __define_interpolate_fn(univary, vec3f); \ __define_interpolate_fn(univary, vec4f); #ifdef ISPC __define_interpolate_type(varying); __define_interpolate_type(uniform); #else __define_interpolate_type(); #endif #undef __define_interpolate_type #undef __define_interpolate_fn // ------------------------------------------------------------------ // clamp // ------------------------------------------------------------------ inline vec3f clamp(const vec3f &a) { return (make_vec3f(clamp(a.x), clamp(a.y), clamp(a.z))); } #define __define_clamp_dims(univary_v, univary_l, abb) \ inline univary_v vec2##abb clamp(const univary_v vec2##abb &a, \ const univary_l 
vec2##abb &b, \ const univary_l vec2##abb &c) \ { \ return (make_vec2##abb(clamp(a.x, b.x, c.x), clamp(a.y, b.y, c.y))); \ } \ inline univary_v vec3##abb clamp(const univary_v vec3##abb &a, \ const univary_l vec3##abb &b, \ const univary_l vec3##abb &c) \ { \ return (make_vec3##abb( \ clamp(a.x, b.x, c.x), clamp(a.y, b.y, c.y), clamp(a.z, b.z, c.z))); \ } #define __define_clamp_types(univary_v, univary_l) \ __define_clamp_dims(univary_v, univary_l, f); \ __define_clamp_dims(univary_v, univary_l, i) #ifdef ISPC __define_clamp_types(varying, varying); __define_clamp_types(uniform, uniform); __define_clamp_types(varying, uniform); #else __define_clamp_types(, ); #endif #undef __define_clamp_types #undef __define_clamp_dims #define __define_reduce_op_dims(univary, op, abb, type) \ inline univary type reduce_##op(const univary vec3##abb &a) \ { \ return op(op(a.x, a.y), a.z); \ } \ inline univary type reduce_##op(const univary vec4##abb &a) \ { \ return op(op(a.x, a.y), op(a.z, a.w)); \ } #define __define_reduce_op_types(univary, op) \ __define_reduce_op_dims(univary, op, i, int) \ __define_reduce_op_dims(univary, op, f, float) #define __define_reduce_op(univary) \ __define_reduce_op_types(univary, min) __define_reduce_op_types(univary, max) #ifdef ISPC __define_reduce_op(varying); __define_reduce_op(uniform); #else __define_reduce_op(); #endif #undef __define_reduce_op #undef __define_reduce_op_types #undef __define_reduce_op_dims // ------------------------------------------------------------------ // other // ------------------------------------------------------------------ #define __define_other(univary) \ inline univary vec4f make_vec4f( \ const univary vec3f rgb, const univary float a) \ { \ return make_vec4f(rgb.x, rgb.y, rgb.z, a); \ } \ inline univary vec3f to_float(const univary vec3i &a) \ { \ return make_vec3f(a); \ } \ inline univary vec3i to_int(const univary vec3f &a) \ { \ return make_vec3i(a); \ } \ inline univary vec3i operator>>(const univary vec3i &a, const univary int b) \ { \ return (make_vec3i(a.x >> b, a.y >> b, a.z >> b)); \ } \ inline univary vec3i operator<<(const univary vec3i &a, const univary int b) \ { \ return (make_vec3i(a.x << b, a.y << b, a.z << b)); \ } \ inline univary vec3i bitwise_AND( \ const univary vec3i &a, const univary int b) \ { \ return (make_vec3i(a.x & b, a.y & b, a.z & b)); \ } \ inline univary vec3f powf(const univary vec3f v, const univary float f) \ { \ return make_vec3f(powf(v.x, f), powf(v.y, f), powf(v.z, f)); \ } \ inline univary float reduce_mul(const univary vec3f &a) \ { \ return a.x * a.y * a.z; \ } \ inline univary float reduce_add(const univary vec3f &a) \ { \ return a.x + a.y + a.z; \ } \ inline univary float reduce_add(const univary vec4f &a) \ { \ return (a.x + a.y) + (a.z + a.w); \ } \ inline univary float reduce_avg(const univary vec3f &a) \ { \ return reduce_add(a) * (1.0f / 3.0f); \ } \ inline univary float luminance(const univary vec3f &c) \ { \ return 0.212671f * c.x + 0.715160f * c.y + 0.072169f * c.z; \ } \ inline univary bool isnan(const univary vec3f v) \ { \ return isnan(v.x + v.y + v.z); \ } #ifdef ISPC __define_other(varying); __define_other(uniform); #else __define_other(); #endif #undef __define_other // The next machine representable number from 'a' in the direction of 'b' inline ISPC_UNIFORM vec3f nextafter( const ISPC_UNIFORM vec3i &a, const ISPC_UNIFORM vec3i &b) { return (make_vec3f( nextafter(a.x, b.x), nextafter(a.y, b.y), nextafter(a.z, b.z))); } inline vec2i make_vec2i(const vec2f &a) { return 
make_vec2i((int)a.x, (int)a.y); } inline vec2i to_int(const vec2f &a) { return make_vec2i(a); } inline vec2f to_float_unorm(const vec2ui &a) { return make_vec2f(to_float_unorm(a.x), to_float_unorm(a.y)); } inline vec3f to_float_unorm(const vec3ui &a) { return make_vec3f( to_float_unorm(a.x), to_float_unorm(a.y), to_float_unorm(a.z)); } inline vec3f floatbits(const vec3i &a) { return make_vec3f(floatbits(a.x), floatbits(a.y), floatbits(a.z)); } inline vec3ui intbits(const vec3f &a) { return make_vec3ui(intbits(a.x), intbits(a.y), intbits(a.z)); } inline vec3f pow(const vec3f &a, const float b) { return make_vec3f(pow(a.x, b), pow(a.y, b), pow(a.z, b)); } inline vec4f pow(const vec4f &a, const float b) { return make_vec4f(pow(a.x, b), pow(a.y, b), pow(a.z, b), pow(a.w, b)); } // ------------------------------------------------------- // float / int conversion functions // ------------------------------------------------------- /* convert float-color into rgba-uint format, i.e. normalized fixed-point * round to nearest, see "2.3.5 Fixed-Point Data Conversions" of OpenGL 4.6 */ inline uint32 cvt_uint32(const float f) { return (uint32)roundf(255.f * clamp(f)); } inline uint32 cvt_uint32(const vec4f &v) { return (cvt_uint32(v.x) << 0) | (cvt_uint32(v.y) << 8) | (cvt_uint32(v.z) << 16) | (cvt_uint32(v.w) << 24); } inline uint32 cvt_uint32(const vec3f &v) { return (cvt_uint32(v.x) << 0) | (cvt_uint32(v.y) << 8) | (cvt_uint32(v.z) << 16); } inline uint32 cvt_uint16(const float f) { return (uint32)roundf(65535.f * clamp(f)); } // ------------------------------------------------------- // sRGB conversion functions // ------------------------------------------------------- #define APPROXIMATE_SRGB inline float linear_to_srgb(const float f) { const float c = max(f, 0.f); #ifdef APPROXIMATE_SRGB return pow(c, 1.f / 2.2f); #else return c <= 0.0031308f ? 12.92f * c : pow(c, 1.f / 2.4f) * 1.055f - 0.055f; #endif } inline vec4f linear_to_srgba(const vec4f c) { return make_vec4f(linear_to_srgb(c.x), linear_to_srgb(c.y), linear_to_srgb(c.z), max(c.w, 0.f)); // alpha is never gamma-corrected } inline uint32 linear_to_srgba8(const vec4f c) { #if 1 return cvt_uint32(linear_to_srgba(c)); #else // TODO use ISPC's float_to_srgb8 once it is fixed (issue #1198) return (float_to_srgb8(c.x) << 0) | (float_to_srgb8(c.y) << 8) | (float_to_srgb8(c.z) << 16) | ((uint32)clamp(c.w, 0.f, 1.f) << 24); // alpha is never gamma-corrected #endif } inline float srgb_to_linear(const float f) { const float c = max(f, 0.f); #ifdef APPROXIMATE_SRGB return pow(c, 2.2f); #else return c <= 0.04045f ? c / 12.92f : pow((c + 0.055f) / 1.055f, 2.4f); #endif } inline vec4f srgba_to_linear(const vec4f c) { return make_vec4f(srgb_to_linear(c.x), srgb_to_linear(c.y), srgb_to_linear(c.z), max(c.w, 0.f)); // alpha is never gamma-corrected } // TODO implement srgba8_to_linear with a 256 entry LUT #undef APPROXIMATE_SRGB #ifndef ISPC } #endif RenderKit-rkcommon-988718e/rkcommon/memory/000077500000000000000000000000001467524601100206665ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/memory/DeletedUniquePtr.h000066400000000000000000000012031467524601100242560ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include namespace rkcommon { namespace memory { template using DeletedUniquePtr = std::unique_ptr>; template inline DeletedUniquePtr make_deleted_unique(DELETE_FCN &&deleter, Args &&... 
args) { return DeletedUniquePtr(new T(std::forward(args)...), deleter); } } // namespace memory } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/memory/IntrusivePtr.h000066400000000000000000000106561467524601100235250ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include namespace rkcommon { namespace memory { class RefCountedObject { public: RefCountedObject() = default; virtual ~RefCountedObject() = default; RefCountedObject(const RefCountedObject &) = delete; RefCountedObject &operator=(const RefCountedObject &) = delete; RefCountedObject(RefCountedObject &&) = delete; RefCountedObject &operator=(RefCountedObject &&) = delete; void refInc() const; void refDec() const; long long useCount() const; private: mutable std::atomic refCounter{1}; }; // Inlined definitions // inline void RefCountedObject::refInc() const { refCounter++; } inline void RefCountedObject::refDec() const { if ((--refCounter) == 0) delete this; } inline long long RefCountedObject::useCount() const { return refCounter.load(); } /////////////////////////////////////////////////////////////////////////// // Pointer to a RefCountedObject //////////////////////////////////////// /////////////////////////////////////////////////////////////////////////// template class IntrusivePtr { static_assert(std::is_base_of::value, "IntrusivePtr can only be used with objects derived " "from RefCountedObject"); public: T *ptr{nullptr}; IntrusivePtr() = default; ~IntrusivePtr(); IntrusivePtr(const IntrusivePtr &input); IntrusivePtr(IntrusivePtr &&input); template IntrusivePtr(const IntrusivePtr &input); IntrusivePtr(T *const input); IntrusivePtr &operator=(const IntrusivePtr &input); IntrusivePtr &operator=(IntrusivePtr &&input); IntrusivePtr &operator=(T *input); operator bool() const; T &operator*() const; T *operator->() const; }; // Inlined definitions // template inline IntrusivePtr::~IntrusivePtr() { if (ptr) ptr->refDec(); } template inline IntrusivePtr::IntrusivePtr(const IntrusivePtr &input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } template inline IntrusivePtr::IntrusivePtr(IntrusivePtr &&input) : ptr(input.ptr) { input.ptr = nullptr; } template template inline IntrusivePtr::IntrusivePtr(const IntrusivePtr &input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } template inline IntrusivePtr::IntrusivePtr(T *const input) : ptr(input) { if (ptr) ptr->refInc(); } template inline IntrusivePtr &IntrusivePtr::operator=( const IntrusivePtr &input) { if (input.ptr) input.ptr->refInc(); if (ptr) ptr->refDec(); ptr = input.ptr; return *this; } template inline IntrusivePtr &IntrusivePtr::operator=(IntrusivePtr &&input) { if (ptr) ptr->refDec(); ptr = input.ptr; input.ptr = nullptr; return *this; } template inline IntrusivePtr &IntrusivePtr::operator=(T *input) { if (input) input->refInc(); if (ptr) ptr->refDec(); ptr = input; return *this; } template inline IntrusivePtr::operator bool() const { return ptr != nullptr; } template inline T &IntrusivePtr::operator*() const { return *ptr; } template inline T *IntrusivePtr::operator->() const { return ptr; } // Inlined operators ////////////////////////////////////////////////////// template inline bool operator<(const IntrusivePtr &a, const IntrusivePtr &b) { return a.ptr < b.ptr; } template bool operator==(const IntrusivePtr &a, const IntrusivePtr &b) { return a.ptr == b.ptr; } template bool operator!=(const IntrusivePtr &a, const IntrusivePtr &b) { return a.ptr != b.ptr; } } // namespace memory 
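// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header). It relies only
// on the RefCountedObject / IntrusivePtr declarations above; the type and
// function names here are hypothetical.
struct ExampleNode : public memory::RefCountedObject
{
  int value{0};
};

inline long long exampleIntrusivePtrUse()
{
  // A RefCountedObject starts with a reference count of 1, and wrapping the
  // raw pointer in an IntrusivePtr increments it to 2, so the initial
  // reference is dropped once ownership has been handed to the smart pointer.
  memory::IntrusivePtr<ExampleNode> node(new ExampleNode);
  node->refDec();

  memory::IntrusivePtr<ExampleNode> alias = node; // shared ownership, count 2
  alias->value = 42;
  return node->useCount(); // 2 while both pointers are alive
}
// ---------------------------------------------------------------------------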
} // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/memory/RefCount.h000066400000000000000000000005471467524601100225720ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "./IntrusivePtr.h" namespace rkcommon { namespace memory { // Type aliases for backward compatibility template using Ref = IntrusivePtr; using RefCount = RefCountedObject; } // namespace memory } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/memory/malloc.cpp000066400000000000000000000024411467524601100226420ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "malloc.h" #if defined(RKCOMMON_TASKING_TBB) #define __TBB_NO_IMPLICIT_LINKAGE 1 #define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 #include "tbb/scalable_allocator.h" #else #ifdef _WIN32 #include #elif defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__)) #include #else #include #endif #endif namespace rkcommon { namespace memory { void *alignedMalloc(size_t size, size_t align) { assert((align & (align - 1)) == 0); #if defined(RKCOMMON_TASKING_TBB) return scalable_aligned_malloc(size, align); #else #ifdef _WIN32 return _aligned_malloc(size, align); #elif defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__)) void *ptr = nullptr; return (posix_memalign(&ptr, align, size) == 0) ? ptr : nullptr; #else // __UNIX__ return _mm_malloc(size, align); #endif #endif } void alignedFree(void *ptr) { #if defined(RKCOMMON_TASKING_TBB) scalable_aligned_free(ptr); #else #ifdef _WIN32 _aligned_free(ptr); #elif defined(__APPLE__) && (defined(__aarch64__) || defined(__arm64__)) free(ptr); #else // __UNIX__ _mm_free(ptr); #endif #endif } } // namespace memory } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/memory/malloc.h000066400000000000000000000022401467524601100223040ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" namespace rkcommon { namespace memory { #define ALIGN_PTR(ptr, alignment) \ ((((size_t)ptr) + alignment - 1) & ((size_t) - (ssize_t)alignment)) /*! 
aligned allocation */ RKCOMMON_INTERFACE void *alignedMalloc(size_t size, size_t align = 64); RKCOMMON_INTERFACE void alignedFree(void *ptr); template __forceinline T *alignedMalloc(size_t nElements, size_t align = 64) { return (T *)alignedMalloc(nElements * sizeof(T), align); } inline bool isAligned(void *ptr, int alignment = 64) { return reinterpret_cast(ptr) % alignment == 0; } // NOTE(jda) - can't use function wrapped alloca solution as Clang won't // inline a function containing alloca()...but works w/ gcc+icc #if 0 template __forceinline T* stackBuffer(size_t nElements) { return static_cast(alloca(sizeof(T) * nElements)); } #else #define STACK_BUFFER(TYPE, nElements) (TYPE *)alloca(sizeof(TYPE) * nElements) #endif } // namespace memory } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/networking/000077500000000000000000000000001467524601100215455ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/networking/DataStreaming.cpp000066400000000000000000000045161467524601100250020ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "DataStreaming.h" #include "../common.h" #include namespace rkcommon { namespace networking { BufferWriter::BufferWriter() : buffer(std::make_shared>()) { } void BufferWriter::write(const void *mem, size_t size) { const size_t bsize = buffer->size(); buffer->resize(buffer->size() + size, 0); if (mem && size > 0) std::memcpy(buffer->begin() + bsize, mem, size); } BufferReader::BufferReader( const std::shared_ptr> &buf) : buffer(buf) { } void BufferReader::read(void *mem, size_t size) { if (cursor + size > buffer->size()) throw std::runtime_error("Attempt to read past end of BufferReader!"); if (mem && size > 0) std::memcpy(mem, buffer->begin() + cursor, size); cursor += size; } bool BufferReader::end() { return cursor >= buffer->size(); } void WriteSizeCalculator::write(const void *, size_t size) { writtenSize += size; } FixedBufferWriter::FixedBufferWriter(size_t size) : buffer(std::make_shared>(size)) { } void FixedBufferWriter::write(const void *mem, size_t size) { if (cursor + size >= buffer->size()) { throw std::runtime_error( "FixedBufferWriter::write size exceeds buffer"); } if (mem && size > 0) std::memcpy(buffer->begin() + cursor, mem, size); cursor += size; } void *FixedBufferWriter::reserve(size_t size) { if (cursor + size >= buffer->size()) { throw std::runtime_error( "FixedBufferWriter::reserve size exceeds buffer"); } void *mem = buffer->begin() + cursor; cursor += size; return mem; } std::shared_ptr::View> FixedBufferWriter::getWrittenView() { return std::make_shared::View>( buffer, 0, cursor); } size_t FixedBufferWriter::available() const { return buffer->size() - cursor; } size_t FixedBufferWriter::capacity() const { return buffer->size(); } } // namespace networking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/networking/DataStreaming.h000066400000000000000000000130051467524601100244400ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "../utility/AbstractArray.h" #include "../utility/ArrayView.h" #include "../utility/FixedArray.h" #include "../utility/FixedArrayView.h" #include "../utility/OwnedArray.h" #include namespace rkcommon { namespace networking { /*! 
abstraction of an object that we can serailize/write (raw) data into */ struct RKCOMMON_INTERFACE WriteStream { virtual ~WriteStream() = default; virtual void write(const void *mem, size_t size) = 0; virtual void flush() {} }; /*! abstraction of an object that we can read (raw) data from to then de-serialize into work objects */ struct RKCOMMON_INTERFACE ReadStream { virtual ~ReadStream() = default; virtual void read(void *mem, size_t size) = 0; virtual bool end() = 0; }; struct RKCOMMON_INTERFACE BufferWriter : WriteStream { BufferWriter(); void write(const void *mem, size_t size) override; std::shared_ptr> buffer; }; struct RKCOMMON_INTERFACE BufferReader : ReadStream { BufferReader(const std::shared_ptr> &buf); void read(void *mem, size_t size) override; /* Get a view of the buffer at the current cursor with the desired number of * elements. This creates a view, not a copy of the data, so the underlying * buffer must be kept valid while the view is in use. The cursor will be * advanced to the data following this view */ template std::shared_ptr> getView(size_t count); bool end() override; size_t cursor = 0; const std::shared_ptr> buffer; }; /*! Utility which behaves as a write stream, but just computes the number of * bytes which have been written to it */ struct RKCOMMON_INTERFACE WriteSizeCalculator : public WriteStream { void write(const void *mem, size_t size) override; size_t writtenSize = 0; }; /*! Buffer writer for writing to a fixed size output buffer. The cursor * points to the next location to write at. Trying to write more than the * fixed buffer's size will throw an error */ struct RKCOMMON_INTERFACE FixedBufferWriter : public WriteStream { FixedBufferWriter() = default; FixedBufferWriter(size_t size); void write(const void *mem, size_t size) override; // Reserve space in the buffer and return the pointer to the start of it void *reserve(size_t size); // Get a view of the region written so far of the buffer std::shared_ptr::View> getWrittenView(); // Get the space available to write in the buffer size_t available() const; // Get the underlying buffer size being written to size_t capacity() const; size_t cursor = 0; std::shared_ptr> buffer; }; /*! generic stream operators into/out of streams, for raw data blocks */ template inline WriteStream &operator<<(WriteStream &buf, const T &rh) { buf.write((const byte_t *)&rh, sizeof(T)); return buf; } template inline ReadStream &operator>>(ReadStream &buf, T &rh) { buf.read((byte_t *)&rh, sizeof(T)); return buf; } /*! @{ stream operators into/out of read/write streams, for std::vectors * of non-POD types*/ template inline WriteStream &operator<<(WriteStream &buf, const std::vector &rh) { const size_t sz = rh.size(); buf << sz; for (const auto &x : rh) buf << x; return buf; } template inline ReadStream &operator>>(ReadStream &buf, std::vector &rh) { size_t sz; buf >> sz; rh.resize(sz); for (size_t i = 0; i < sz; ++i) buf >> rh[i]; return buf; } /*! @} */ /*! @{ stream operators into/out of read/write streams, for AbstractArray */ template inline WriteStream &operator<<(WriteStream &buf, const utility::AbstractArray &rh) { const size_t sz = rh.size(); buf << sz; buf.write((const byte_t *)rh.data(), sizeof(T) * sz); return buf; } /*! @} */ /*! 
@{ serialize operations for strings */ inline WriteStream &operator<<(WriteStream &buf, const std::string &rh) { const size_t sz = rh.size(); buf << sz; buf.write((const void *)rh.data(), sz); return buf; } inline WriteStream &operator<<(WriteStream &buf, const char *rh) { const size_t sz = std::strlen(rh); buf << sz; buf.write((const void *)rh, sz); return buf; } inline ReadStream &operator>>(ReadStream &buf, std::string &rh) { size_t sz; buf >> sz; rh.resize(sz); buf.read((void *)rh.data(), sz); return buf; } template std::shared_ptr> BufferReader::getView(size_t count) { const size_t size = count * sizeof(T); if (cursor + size > buffer->size()) { throw std::runtime_error("Attempt to read past end of BufferReader!"); } auto view = std::make_shared>(buffer->begin() + cursor, size); cursor += size; return view; } } // namespace networking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/networking/Fabric.cpp000066400000000000000000000004201467524601100234330ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include #include #include "../common.h" #include "../utility/AbstractArray.h" #include "Fabric.h" namespace rkcommon { namespace networking { Fabric::Fabric() {} } } RenderKit-rkcommon-988718e/rkcommon/networking/Fabric.h000066400000000000000000000025621467524601100231110ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include "../common.h" #include "../utility/AbstractArray.h" namespace rkcommon { namespace networking { /*! abstraction for a physical fabric that can transmit data - sockets, mpi, etc */ struct RKCOMMON_INTERFACE Fabric { Fabric(); virtual ~Fabric() = default; // Broadcast the data to all clients on the other end of the fabric // TODO: only makes sense to call on the root rank, so maybe a separate // "send" fabric ? virtual void sendBcast( std::shared_ptr> buf) = 0; virtual void flushBcastSends() = 0; // Receive a broadcast of data from the fabric sender // TODO: only makes sense to call on the receivers, so maybe a separate // "recv" fabric ? virtual void recvBcast(utility::AbstractArray &buf) = 0; // Send data to a specific rank in the fabric (callable on any rank) virtual void send(std::shared_ptr> buf, int rank) = 0; // Receive data from a specific rank on the fabric (callable on any rank) virtual void recv(utility::AbstractArray &buf, int rank) = 0; }; } // namespace networking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/os/000077500000000000000000000000001467524601100177775ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/os/FileName.cpp000066400000000000000000000107061467524601100221670ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "FileName.h" namespace rkcommon { #ifdef _WIN32 const char path_sep = '\\'; #else const char path_sep = '/'; #endif /*! create an empty filename */ FileName::FileName() {} /*! create a valid filename from a string */ FileName::FileName(const char *in) { filename = in; for (size_t i = 0; i < filename.size(); i++) if (filename[i] == '\\' || filename[i] == '/') filename[i] = path_sep; while (!filename.empty() && filename[filename.size() - 1] == path_sep) filename.resize(filename.size() - 1); } /*! 
create a valid filename from a string */ FileName::FileName(const std::string &in) { filename = in; for (size_t i = 0; i < filename.size(); i++) if (filename[i] == '\\' || filename[i] == '/') filename[i] = path_sep; while (!filename.empty() && filename[filename.size() - 1] == path_sep) filename.resize(filename.size() - 1); } /*! returns path to home folder */ FileName FileName::homeFolder() { #ifdef _WIN32 const char *home = getenv("UserProfile"); #else const char *home = getenv("HOME"); #endif if (home) return home; return ""; } /*! returns the canonical absolute path to filename */ FileName FileName::canonical() { /* pre-C++17 implementation of std::filesystem::canonical */ char *cTemp = nullptr; #ifdef _WIN32 cTemp = _fullpath(NULL, filename.c_str(), 0); #else // POSIX cTemp = realpath(filename.c_str(), NULL); #endif rkcommon::FileName canonical(cTemp ? cTemp : ""); free(cTemp); return canonical; } /*! returns the path */ std::string FileName::path() const { size_t pos = filename.find_last_of(path_sep); if (pos == std::string::npos) return ""; return filename.substr(0, pos + 1); } /*! returns the basename */ std::string FileName::base() const { size_t pos = filename.find_last_of(path_sep); if (pos == std::string::npos) return filename; return filename.substr(pos + 1); } /*! returns the extension */ std::string FileName::ext() const { size_t pos = filename.find_last_of('.'); if (pos == std::string::npos) return ""; return filename.substr(pos + 1); } /*! returns the extension */ FileName FileName::dropExt() const { size_t pos = filename.find_last_of('.'); if (pos == std::string::npos) return filename; return filename.substr(0, pos); } /*! returns the basename without extension */ std::string FileName::name() const { size_t start = filename.find_last_of(path_sep); if (start == std::string::npos) start = 0; else start++; size_t end = filename.find_last_of('.'); if (end == std::string::npos || end < start) end = filename.size(); return filename.substr(start, end - start); } /*! replaces the extension */ FileName FileName::setExt(const std::string &ext) const { size_t start = filename.find_last_of(path_sep); if (start == std::string::npos) start = 0; else start++; size_t end = filename.find_last_of('.'); if (end == std::string::npos || end < start) return FileName(filename + ext); return FileName(filename.substr(0, end) + ext); } /*! adds the extension */ FileName FileName::addExt(const std::string &ext) const { return FileName(filename + ext); } /*! concatenates two filenames to this/other */ FileName FileName::operator+(const FileName &other) const { if (filename == "") return FileName(other); else return FileName(filename + path_sep + other.filename); } /*! concatenates two filenames to this/other */ FileName FileName::operator+(const std::string &other) const { return operator+(FileName(other)); } /*! removes the base from a filename (if possible) */ FileName FileName::operator-(const FileName &base) const { size_t pos = filename.find_first_of(base); if (pos == std::string::npos) return *this; return FileName(filename.substr(pos + 1)); } /*! == operator */ bool operator==(const FileName &a, const FileName &b) { return a.filename == b.filename; } /*! != operator */ bool operator!=(const FileName &a, const FileName &b) { return a.filename != b.filename; } /*! 
output operator */ std::ostream &operator<<(std::ostream &cout, const FileName &filename) { return cout << filename.filename; } } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/os/FileName.h000066400000000000000000000053611467524601100216350ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" namespace rkcommon { /*! Convenience class for handling file names and paths. */ class FileName { public: /*! create an empty filename */ RKCOMMON_INTERFACE FileName(); /*! create a valid filename from a string */ RKCOMMON_INTERFACE FileName(const char *filename); /*! create a valid filename from a string */ RKCOMMON_INTERFACE FileName(const std::string &filename); /*! returns path to home folder */ RKCOMMON_INTERFACE static FileName homeFolder(); /*! auto convert into a string */ RKCOMMON_INTERFACE operator std::string() const { return filename; } /*! returns a string of the filename */ RKCOMMON_INTERFACE const std::string &str() const { return filename; } /*! returns a c-string of the filename */ RKCOMMON_INTERFACE const char *c_str() const { return filename.c_str(); } /*! returns the canonical absolute path to filename */ /*! pre-C++17 implementation of std::filesystem::canonical */ RKCOMMON_INTERFACE FileName canonical(); /*! returns the path of a filename with separator at the end */ RKCOMMON_INTERFACE std::string path() const; /*! returns the file of a filename */ RKCOMMON_INTERFACE std::string base() const; /*! returns the base of a filename without extension */ RKCOMMON_INTERFACE std::string name() const; /*! returns the file extension */ RKCOMMON_INTERFACE std::string ext() const; /*! drops the file extension */ RKCOMMON_INTERFACE FileName dropExt() const; /*! replaces the file extension */ RKCOMMON_INTERFACE FileName setExt(const std::string &ext = "") const; /*! adds file extension */ RKCOMMON_INTERFACE FileName addExt(const std::string &ext = "") const; /*! concatenates two filenames to this/other */ RKCOMMON_INTERFACE FileName operator+(const FileName &other) const; /*! concatenates two filenames to this/other */ RKCOMMON_INTERFACE FileName operator+(const std::string &other) const; /*! removes the base from a filename (if possible) */ RKCOMMON_INTERFACE FileName operator-(const FileName &base) const; /*! == operator */ RKCOMMON_INTERFACE friend bool operator==(const FileName &a, const FileName &b); /*! != operator */ RKCOMMON_INTERFACE friend bool operator!=(const FileName &a, const FileName &b); /*! 
output operator */ RKCOMMON_INTERFACE friend std::ostream &operator<<( std::ostream &cout, const FileName &filename); private: std::string filename; }; } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/os/library.cpp000066400000000000000000000177461467524601100221660ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "library.h" #include "FileName.h" #include #ifndef _WIN32 #include #include #endif namespace { std::string directory_from_path(const std::string &path) { // Remove the filename from the path const size_t lastPathSep = path.find_last_of("/\\"); if (lastPathSep == std::string::npos) throw std::runtime_error("could not get absolute path of module directory"); return path.substr(0, lastPathSep + 1); } std::string library_location(const void *address) { // implementation taken from OIDN module.cpp if (address == nullptr) throw std::runtime_error("library_location(): NULL address provided"); #if defined(_WIN32) // Get the handle of the module which contains the address HMODULE module; const DWORD flags = GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT; if (!GetModuleHandleExA(flags, reinterpret_cast(address), &module)) throw std::runtime_error("GetModuleHandleExA failed"); // Get the path of the module // Since we don't know the length of the path, we use a buffer of increasing // size DWORD pathSize = MAX_PATH + 1; for (;;) { std::vector path(pathSize); DWORD result = GetModuleFileNameA(module, path.data(), pathSize); if (result == 0) throw std::runtime_error("GetModuleFileNameA failed"); else if (result < pathSize) return directory_from_path(path.data()); else pathSize *= 2; } #else // dladdr should return an absolute path on Linux except for the main // executable On macOS it should always return an absolute path Dl_info info; if (dladdr(address, &info)) { // Check whether the path is absolute if (info.dli_fname && info.dli_fname[0] == '/') return directory_from_path(info.dli_fname); } #if defined(__APPLE__) // This shouldn't happen throw std::runtime_error("failed to get absolute path with dladdr"); #else // We failed to get an absolute path, so we try to parse /proc/self/maps std::ifstream file("/proc/self/maps"); if (!file) throw std::runtime_error("could not open /proc/self/maps"); // Parse the lines for (std::string lineStr; std::getline(file, lineStr);) { std::istringstream line(lineStr); // Parse the address range uintptr_t addressBegin, addressEnd; line >> std::hex; line >> addressBegin; if (line.get() != '-') continue; // parse error line >> addressEnd; if (!isspace(line.peek()) || !line) continue; // parse error // Check whether the address is in this range if (reinterpret_cast(address) < addressBegin || reinterpret_cast(address) >= addressEnd) continue; // Skip the permissions, offset, device, inode std::string str; for (int i = 0; i < 4; ++i) line >> str; // Parse the path line >> std::ws; if (!std::getline(line, str)) continue; // no path or parse error // Check whether the path is absolute if (str[0] == '/') return directory_from_path(str); } throw std::runtime_error("could not find address in /proc/self/maps"); #endif #endif } } // namespace namespace rkcommon { Library::Library( const void *anchorAddress, const std::string &name, const Version &version) : libraryName(name), libraryVersion(version) { bool success = false; try { success = loadLibrary(anchorAddress); } catch (const std::exception &e) { // handle exceptions from e.g. 
library_location() throw std::runtime_error( "Load of " + name + " failed due to: '" + e.what() + "'"); } if (!success) { throw std::runtime_error( "Load of " + name + " failed due to: '" + errorMessage + "'"); } } Library::Library(void *const _lib) : libraryName(""), lib(_lib), freeLibOnDelete(false) { } bool Library::loadLibrary(const void *anchorAddress) { std::string file = libraryName; std::string errorMsg; std::string libLocation = anchorAddress != nullptr ? library_location(anchorAddress) : std::string(); #ifdef _WIN32 std::string fullName = libLocation + file + ".dll"; lib = LoadLibrary(fullName.c_str()); if (lib == nullptr) { DWORD err = GetLastError(); LPTSTR lpMsgBuf; FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR)&lpMsgBuf, 0, NULL); errorMsg = lpMsgBuf; LocalFree(lpMsgBuf); } #else std::string versionStr; for (int i: libraryVersion) versionStr += "." + std::to_string(i); std::string fullName = libLocation + "lib" + file; #if defined(__MACOSX__) || defined(__APPLE__) fullName += versionStr + ".dylib"; #else fullName += ".so" + versionStr; #endif lib = dlopen(fullName.c_str(), RTLD_LAZY | RTLD_LOCAL); if (lib == nullptr) errorMsg = dlerror(); #endif if (lib == nullptr) { errorMessage = "could not open module lib " + libraryName + ": " + errorMsg; return false; } return true; } Library::~Library() { /* Only dlclose/free libraries if we're not running through addrsan * so that the shared library symbols remain accessible to addrsan * at exit (see https://github.com/google/sanitizers/issues/89) */ #ifndef RKCOMMON_ADDRSAN if (freeLibOnDelete && lib) { #ifdef _WIN32 FreeLibrary((HMODULE)lib); #else dlclose(lib); #endif } #endif } void *Library::getSymbol(const std::string &sym) const { #ifdef _WIN32 return GetProcAddress((HMODULE)lib, sym.c_str()); #else return dlsym(lib, sym.c_str()); #endif } std::unique_ptr LibraryRepository::instance; LibraryRepository *LibraryRepository::getInstance() { if (instance.get() == nullptr) instance = std::unique_ptr(new LibraryRepository); return instance.get(); } void LibraryRepository::cleanupInstance() { LibraryRepository::instance.reset(); } LibraryRepository::~LibraryRepository() { // Close libraries in the opposite order that they were opened while (!repo.empty()) { repo.pop_back(); } } void LibraryRepository::add(const void *anchorAddress, const std::string &name, const Library::Version &version) { if (libraryExists(name)) return; // lib already loaded. 
repo.push_back(rkcommon::make_unique( anchorAddress, name, version)); } void LibraryRepository::remove(const std::string &name) { auto lib = findLibrary(name); if (lib != repo.end()) { repo.erase(lib); } } void *LibraryRepository::getSymbol(const std::string &name) const { void *sym = nullptr; for (auto lib = repo.cbegin(); sym == nullptr && lib != repo.end(); ++lib) { sym = (*lib)->getSymbol(name); } return sym; } bool LibraryRepository::libraryExists(const std::string &name) const { return findLibrary(name) != repo.end(); } LibraryRepository::const_library_iterator_t LibraryRepository::findLibrary( const std::string &name) const { auto fnd = std::find_if( repo.begin(), repo.end(), [&](const std::unique_ptr &l) { return l->libraryName == name; }); return fnd; } LibraryRepository::library_iterator_t LibraryRepository::findLibrary( const std::string &name) { auto fnd = std::find_if( repo.begin(), repo.end(), [&](const std::unique_ptr &l) { return l->libraryName == name; }); return fnd; } } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/os/library.h000066400000000000000000000041201467524601100216110ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "../common.h" // std #include #include #include namespace rkcommon { class RKCOMMON_INTERFACE Library { public: using Version = std::vector; // Opens a shared library; anchorAddress = nullptr will disable anchored loads Library(const void *anchorAddress, const std::string &name, const Version &version); ~Library(); // Returns address of a symbol from the library void *getSymbol(const std::string &sym) const; private: Library(void *const lib); bool loadLibrary(const void *anchorAddress); std::string libraryName; Version libraryVersion; std::string errorMessage; void *lib{nullptr}; bool freeLibOnDelete{true}; friend class LibraryRepository; template friend inline std::unique_ptr make_unique(Args &&... 
args); }; class RKCOMMON_INTERFACE LibraryRepository { public: static LibraryRepository *getInstance(); static void cleanupInstance(); ~LibraryRepository(); LibraryRepository(const LibraryRepository &) = delete; LibraryRepository &operator=(const LibraryRepository &) = delete; // add/remove a library to/from the repo void add(const void *anchorAddress, const std::string &name, const Library::Version &version = {}); void remove(const std::string &name); // Returns address of a symbol from any library in the repo void *getSymbol(const std::string &sym) const; bool libraryExists(const std::string &name) const; private: using const_library_iterator_t = std::vector>::const_iterator; using library_iterator_t = std::vector>::iterator; const_library_iterator_t findLibrary(const std::string &name) const; library_iterator_t findLibrary(const std::string &name); static std::unique_ptr instance; LibraryRepository() = default; std::vector> repo; }; } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/platform.h000066400000000000000000000223641467524601100213620ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _WIN32 #include #ifndef NOMINMAX #define NOMINMAX #endif #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif #include #undef NOMINMAX #undef WIN32_LEAN_AND_MEAN #endif //////////////////////////////////////////////////////////////////////////////// /// Macros //////////////////////////////////////////////////////////////////////////////// #ifdef _WIN32 #undef __noinline #define __noinline __declspec(noinline) //#define __forceinline __forceinline //#define __restrict __restrict #ifdef __INTEL_COMPILER #define __restrict__ __restrict #else #define __restrict__ //__restrict // causes issues with MSVC #endif #define __thread __declspec(thread) #define __aligned(...) __declspec(align(__VA_ARGS__)) //#define __FUNCTION__ __FUNCTION__ #define debugbreak() __debugbreak() #else #undef __noinline #undef __forceinline #define __noinline __attribute__((noinline)) #define __forceinline inline __attribute__((always_inline)) //#define __restrict __restrict //#define __thread __thread #define __aligned(...) 
__attribute__((aligned(__VA_ARGS__))) #define __FUNCTION__ __PRETTY_FUNCTION__ #define debugbreak() asm("int $3") #endif #ifdef __GNUC__ #define MAYBE_UNUSED __attribute__((unused)) #else #define MAYBE_UNUSED #endif #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #define likely(expr) (expr) #define unlikely(expr) (expr) #else #define likely(expr) __builtin_expect((bool)(expr), true) #define unlikely(expr) __builtin_expect((bool)(expr), false) #endif //////////////////////////////////////////////////////////////////////////////// /// Error handling and debugging //////////////////////////////////////////////////////////////////////////////// /* debug printing macros */ #define STRING(x) #x #define TOSTRING(x) STRING(x) #define CODE_LOCATION __FILE__ " (" TOSTRING(__LINE__) ")" #define PING \ { \ std::stringstream msg; \ msg << CODE_LOCATION << ": " << __FUNCTION__ << std::endl; \ std::cout << msg.str(); \ } #define PRINT(x) \ { \ std::stringstream msg; \ msg << STRING(x) << " = " << (x) << std::endl; \ std::cout << msg.str(); \ } #define PRINT2(x, y) \ { \ std::stringstream msg; \ msg << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) \ << std::endl; \ std::cout << msg.str(); \ } #define PRINT3(x, y, z) \ { \ std::stringstream msg; \ msg << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) \ << ", " << STRING(z) << " = " << (z) << std::endl; \ std::cout << msg.str(); \ } #define PRINT4(x, y, z, w) \ { \ std::stringstream msg; \ msg << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) \ << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " \ << (w) << std::endl; \ std::cout << msg.str(); \ } #define THROW_RUNTIME_ERROR(str) \ throw std::runtime_error(std::string(__FILE__) + " (" + \ std::to_string((long long)__LINE__) + \ "): " + std::string(str)) #define FATAL(x) THROW_RUNTIME_ERROR(x) #define WARNING(x) std::cerr << "Warning:" << std::string(x) << std::endl #define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") // NOTE(jda) - These macros are used to construct the last UNUSED(...) macro, // used to mark a variable number of arguments as unused so the // compiler doesn't warn when -Wextra (gcc/clang/icc) is used. Only // works with 1 to 5 passed arguments. #define UNUSED_1(x) (void)x #define UNUSED_2(x, y) \ UNUSED_1(x); \ UNUSED_1(y) #define UNUSED_3(x, ...) UNUSED_2(x, UNUSED_2(__VA_ARGS__)) #define UNUSED_4(x, ...) UNUSED_2(x, UNUSED_3(__VA_ARGS__)) #define UNUSED_5(x, ...) UNUSED_2(x, UNUSED_4(__VA_ARGS__)) // NUM_ARGS(...) evaluates to the literal number of the passed-in arguments. #define _NUM_ARGS2(X, X5, X4, X3, X2, X1, N, ...) N #define NUM_ARGS(...) _NUM_ARGS2(0, __VA_ARGS__, 5, 4, 3, 2, 1, 0) #define _UNUSED_N3(N, ...) UNUSED_##N(__VA_ARGS__) #define _UNUSED_N2(N, ...) _UNUSED_N3(N, __VA_ARGS__) #define UNUSED(...) 
_UNUSED_N2(NUM_ARGS(__VA_ARGS__), __VA_ARGS__) #if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) #define __X86_64__ #endif //////////////////////////////////////////////////////////////////////////////// /// Basic Types //////////////////////////////////////////////////////////////////////////////// /* windows does not have ssize_t */ #ifdef __WIN32 #ifdef __X86_64__ typedef int64_t ssize_t; #else typedef int32_t ssize_t; #endif #endif //////////////////////////////////////////////////////////////////////////////// /// Disable some compiler warnings //////////////////////////////////////////////////////////////////////////////// #if defined(__INTEL_COMPILER) #pragma warning( \ disable : 265) // floating-point operation result is out of range #pragma warning( \ disable : 383) // value copied to temporary, reference to temporary used #pragma warning(disable : 869) // parameter was never referenced #pragma warning(disable : 981) // operands are evaluated in unspecified order #pragma warning( \ disable : 1418) // external function definition with no prior declaration #pragma warning(disable : 1419) // external declaration in primary source file #pragma warning(disable : 1572) // floating-point equality and inequality // comparisons are unreliable #pragma warning(disable : 94) // the size of an array must be greater than zero #pragma warning(disable : 1599) // declaration hides parameter #pragma warning(disable : 424) // extra ";" ignored #pragma warning(disable : 2196) // routine is both "inline" and "noinline" #pragma warning(disable : 177) // label was declared but never referenced #pragma warning(disable : 114) // function was referenced but not defined #endif #if defined(_MSC_VER) #pragma warning(disable : 4200) // nonstandard extension used : zero-sized // array in struct/union #pragma warning(disable : 4800) // forcing value to bool 'true' or 'false' // (performance warning) #pragma warning(disable : 4267) // '=' : conversion from 'size_t' to 'unsigned // long', possible loss of data #pragma warning(disable : 4244) // 'argument' : conversion from 'ssize_t' to // 'unsigned int', possible loss of data #pragma warning( \ disable : 4355) // 'this' : used in base member initializer list #pragma warning(disable : 391) // '<=' : signed / unsigned mismatch #pragma warning(disable : 4018) // '<' : signed / unsigned mismatch #pragma warning( \ disable : 4305) // 'initializing' : truncation from 'double' to 'float' #pragma warning(disable : 4068) // unknown pragma #pragma warning(disable : 4146) // unary minus operator applied to unsigned // type, result still unsigned #pragma warning(disable : 4838) // conversion from 'unsigned int' to 'const // int' requires a narrowing conversion) #pragma warning( \ disable : 4227) // anachronism used : qualifiers on reference are ignored #pragma warning( \ disable : 4251) // class 'type1' needs to have dll-interface // to be used by clients of class 'type2' #endif #if defined(__clang__) && !defined(__INTEL_COMPILER) #pragma clang diagnostic ignored "-Wunknown-pragmas" #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wreorder" #pragma clang diagnostic ignored "-Wmicrosoft" #pragma clang diagnostic ignored "-Wunused-private-field" #pragma clang diagnostic ignored "-Wunused-local-typedef" #pragma clang diagnostic ignored "-Wunused-function" #endif RenderKit-rkcommon-988718e/rkcommon/rkcommon.rc000066400000000000000000000037041467524601100215350ustar00rootroot00000000000000’ž// Copyright 2016 Intel Corporation // 
SPDX-License-Identifier: Apache-2.0 #include "rkcommon/version.h" 1 VERSIONINFO FILEVERSION RKCOMMON_VERSION_MAJOR,RKCOMMON_VERSION_MINOR,RKCOMMON_VERSION_PATCH,0 PRODUCTVERSION RKCOMMON_VERSION_MAJOR,RKCOMMON_VERSION_MINOR,RKCOMMON_VERSION_PATCH,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x1L #else FILEFLAGS 0x0L #endif FILEOS 0x40004L FILETYPE 0x2L FILESUBTYPE 0x0L BEGIN BLOCK "StringFileInfo" BEGIN BLOCK "040904b0" BEGIN VALUE "CompanyName", "Intel" VALUE "FileDescription", "Intel® oneAPI Rendering Toolkit Common Library" VALUE "FileVersion", RKCOMMON_VERSION VALUE "ProductVersion", RKCOMMON_VERSION VALUE "LegalCopyright", "© 2009 Intel Corporation" VALUE "InternalName", "rkCommon" VALUE "ProductName", "rkCommon" END END BLOCK "VarFileInfo" BEGIN VALUE "Translation", 0x409, 1200 END END RenderKit-rkcommon-988718e/rkcommon/tasking/000077500000000000000000000000001467524601100210165ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/tasking/AsyncLoop.h000066400000000000000000000106301467524601100230760ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include #include #include "../traits/rktraits.h" #include "schedule.h" #include "tasking_system_init.h" namespace rkcommon { namespace tasking { /*! This calls a given function in a continuous loop on a background thread owned by AsyncLoop. While it is running, the function it was constructed with is called over and over in a loop. When stopped, the thread is put to sleep until it is started again. An AsyncLoop has to be explicitly started, it is not automatically started on construction. */ class AsyncLoop { public: enum LaunchMethod { AUTO = 0, THREAD = 1, TASK = 2 }; template AsyncLoop(LOOP_BODY_FCN &&fcn, LaunchMethod m = AUTO); ~AsyncLoop(); void start(); void stop(); private: // Struct shared with the background thread to avoid dangling ptrs or // tricky synchronization when destroying the AsyncLoop and scheduling // threads with TBB, since we don't have a join point to sync with // the running thread struct AsyncLoopData { std::atomic threadShouldBeAlive{true}; std::atomic shouldBeRunning{false}; std::atomic insideLoopBody{false}; std::condition_variable runningCond; std::mutex runningMutex; }; std::shared_ptr loop; std::thread backgroundThread; }; // Inlined members // ////////////////////////////////////////////////////////// template inline AsyncLoop::AsyncLoop(LOOP_BODY_FCN &&fcn, AsyncLoop::LaunchMethod m) : loop(nullptr) { static_assert(traits::has_operator_method::value, "rkcommon::AsyncLoop() requires the implementation of " "method 'void LOOP_BODY_FCN::operator()' in order to " "construct the loop instance."); std::shared_ptr l = std::make_shared(); loop = l; auto mainLoop = [l, fcn]() { while (l->threadShouldBeAlive) { if (!l->threadShouldBeAlive) return; if (l->shouldBeRunning) { l->insideLoopBody = true; fcn(); l->insideLoopBody = false; } else { std::unique_lock lock(l->runningMutex); l->runningCond.wait(lock, [&] { return l->shouldBeRunning.load() || !l->threadShouldBeAlive.load(); }); } } }; if (m == AUTO) m = tasking::numTaskingThreads() > 4 ? TASK : THREAD; if (m == THREAD) backgroundThread = std::thread(mainLoop); else // m == TASK tasking::schedule(mainLoop); } inline AsyncLoop::~AsyncLoop() { // Note that the mutex here is still required even though these vars // are atomic, because we need to sync with the condition variable waiting // state on the async thread. 
Otherwise we might signal and the thread // will miss it, since it wasn't watching. { std::unique_lock lock(loop->runningMutex); loop->threadShouldBeAlive = false; loop->shouldBeRunning = false; } loop->runningCond.notify_one(); if (backgroundThread.joinable()) { backgroundThread.join(); } } inline void AsyncLoop::start() { if (!loop->shouldBeRunning) { // Note that the mutex here is still required even though these vars // are atomic, because we need to sync with the condition variable // waiting state on the async thread. Otherwise we might signal and the // thread will miss it, since it wasn't watching. { std::unique_lock lock(loop->runningMutex); loop->shouldBeRunning = true; } loop->runningCond.notify_one(); } } inline void AsyncLoop::stop() { if (loop->shouldBeRunning) { loop->shouldBeRunning = false; while (loop->insideLoopBody.load()) { std::this_thread::yield(); } } } } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/AsyncTask.h000066400000000000000000000020771467524601100230750ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "detail/async_task.inl" #include #include namespace rkcommon { namespace tasking { template struct AsyncTask { AsyncTask(std::function fcn) : taskImpl([this, fcn]() { retValue = fcn(); jobFinished = true; }) { } virtual ~AsyncTask() noexcept { wait(); } bool finished() const { return jobFinished; } bool valid() const { return jobFinished; } void wait() { taskImpl.wait(); } T get() { if (!jobFinished) wait(); return retValue; } private: // declaration before taskImpl: ensure initialization before task finishes std::atomic jobFinished{false}; detail::AsyncTaskImpl> taskImpl; T retValue; }; } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/async.h000066400000000000000000000025401467524601100223050ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include "schedule.h" namespace rkcommon { namespace tasking { template #if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703 using operator_return_t = std::invoke_result_t; #else using operator_return_t = typename std::result_of::type; #endif // NOTE(jda) - This abstraction takes a lambda which should take captured // variables by *value* to ensure no captured references race // with the task itself. 
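// Illustrative usage sketch (not part of the original header), assuming the
// caller captures its inputs by value as the note above recommends; the
// variable names below are placeholders for this example only:
//
//   int x = 21;
//   auto fut = rkcommon::tasking::async([x]() { return x * 2; });
//   int result = fut.get(); // blocks until the scheduled task has run; yields 42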
template inline auto async(TASK_T &&fcn) -> std::future> { static_assert(traits::has_operator_method::value, "rkcommon::tasking::async() requires the implementation of" "method 'RETURN_T TASK_T::operator()', where RETURN_T " "is the return value of the passed in task."); using package_t = std::packaged_task()>; auto task = new package_t(std::forward(fcn)); auto future = task->get_future(); schedule([=]() { (*task)(); delete task; }); return future; } } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/detail/000077500000000000000000000000001467524601100222605ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/tasking/detail/TaskSys.cpp000066400000000000000000000022571467524601100243730ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "TaskSys.h" // ospray #include "../../platform.h" namespace rkcommon { namespace tasking { namespace detail { // TaskSys definitions ////////////////////////////////////////////////// static std::unique_ptr g_ts; // Interface definitions //////////////////////////////////////////////// void initTaskSystemInternal(int nThreads) { g_ts = std::unique_ptr(new enki::TaskScheduler()); if (nThreads < 1) nThreads = enki::GetNumHardwareThreads(); g_ts->Initialize(nThreads); } void shutdownTaskSystemInternal() { g_ts.reset(); } int numThreadsTaskSystemInternal() { return g_ts->GetNumTaskThreads(); } void scheduleTaskInternal(Task *task) { if (g_ts.get() == nullptr) initTaskSystemInternal(-1); g_ts->AddTaskSetToPipe(task); } void waitInternal(Task *task) { g_ts->WaitforTask(task); } } // namespace detail } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/detail/TaskSys.h000066400000000000000000000025241467524601100240350ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../../common.h" // enkiTS #include "enkiTS/TaskScheduler.h" namespace rkcommon { namespace tasking { namespace detail { // Public interface to the tasking system /////////////////////////////// using Task = enki::ITaskSet; void initTaskSystemInternal(int numThreads = -1); void shutdownTaskSystemInternal(); int numThreadsTaskSystemInternal(); void RKCOMMON_INTERFACE scheduleTaskInternal(Task *task); void RKCOMMON_INTERFACE waitInternal(Task *task); template inline void parallel_for_internal(int nTasks, TASK_T &&fcn) { struct LocalTask : public Task { const TASK_T &t; LocalTask(int nunTasks, TASK_T &&fcn) : Task(nunTasks), t(std::forward(fcn)) { } ~LocalTask() override = default; void ExecuteRange(enki::TaskSetPartition tp, uint32_t) override { for (auto i = tp.start; i < tp.end; ++i) t(i); } }; LocalTask task(nTasks, std::forward(fcn)); scheduleTaskInternal(&task); waitInternal(&task); } } // namespace detail } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/detail/async_task.inl000066400000000000000000000037711467524601100251330ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #if defined(RKCOMMON_TASKING_TBB) #define __TBB_NO_IMPLICIT_LINKAGE 1 #define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 #include #elif defined(RKCOMMON_TASKING_OMP) #include #elif defined(RKCOMMON_TASKING_INTERNAL) #include "TaskSys.h" #endif namespace rkcommon { namespace tasking { namespace detail { template struct AsyncTaskImpl { AsyncTaskImpl(TASK_T &&fcn); void wait(); private: #if 
defined(RKCOMMON_TASKING_TBB) tbb::task_group taskGroup; #elif defined(RKCOMMON_TASKING_OMP) std::thread thread; #elif defined(RKCOMMON_TASKING_INTERNAL) struct LocalTask : public enki::ITaskSet { TASK_T t; LocalTask(TASK_T &&fcn) : t(std::forward(fcn)) {} void ExecuteRange(enki::TaskSetPartition, uint32_t) override { t(); } }; LocalTask task; #endif }; // Inlined definitions // //////////////////////////////////////////////////// template inline AsyncTaskImpl::AsyncTaskImpl(TASK_T &&fcn) #if defined(RKCOMMON_TASKING_TBB) { taskGroup.run(std::forward(fcn)); } #elif defined(RKCOMMON_TASKING_OMP) : thread(std::forward(fcn)) { } #elif defined(RKCOMMON_TASKING_INTERNAL) : task(std::forward(fcn)) { detail::scheduleTaskInternal(&task); } #else { fcn(); } #endif template inline void AsyncTaskImpl::wait() { #if defined(RKCOMMON_TASKING_TBB) taskGroup.wait(); #elif defined(RKCOMMON_TASKING_OMP) if (thread.joinable()) thread.join(); #elif defined(RKCOMMON_TASKING_INTERNAL) detail::waitInternal(&task); #endif } } // namespace detail } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/detail/enkiTS/000077500000000000000000000000001467524601100234555ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/tasking/detail/enkiTS/LockLessMultiReadPipe.h000066400000000000000000000255201467524601100277760ustar00rootroot00000000000000// Copyright (c) 2013 Doug Binks // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. #pragma once #include #include #include #ifndef ENKI_ASSERT #include #define ENKI_ASSERT(x) assert(x) #endif namespace enki { // LockLessMultiReadPipe - Single writer, multiple reader thread safe pipe using (semi) lockless programming // Readers can only read from the back of the pipe // The single writer can write to the front of the pipe, and read from both ends (a writer can be a reader) // for many of the principles used here, see http://msdn.microsoft.com/en-us/library/windows/desktop/ee418650(v=vs.85).aspx // Note: using log2 sizes so we do not need to clamp (multi-operation) // T is the contained type // Note this is not true lockless as the use of flags as a form of lock state. 
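// Illustrative usage sketch (not part of the original source), assuming a
// user-defined copyable type Job and a pipe of 2^8 = 256 entries; only the
// single writer thread may call the Writer* methods, while any thread may
// call ReaderTryReadBack. MakeJob() and Process() are hypothetical helpers:
//
//   enki::LockLessMultiReadPipe<8, Job> pipe;
//   Job in = MakeJob();
//   bool queued = pipe.WriterTryWriteFront( in );   // writer thread only
//   Job out;
//   if( pipe.ReaderTryReadBack( &out ) ) { Process( out ); }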
template class LockLessMultiReadPipe { public: LockLessMultiReadPipe(); ~LockLessMultiReadPipe() {} // ReaderTryReadBack returns false if we were unable to read // This is thread safe for both multiple readers and the writer bool ReaderTryReadBack( T* pOut ); // WriterTryReadFront returns false if we were unable to read // This is thread safe for the single writer, but should not be called by readers bool WriterTryReadFront( T* pOut ); // WriterTryWriteFront returns false if we were unable to write // This is thread safe for the single writer, but should not be called by readers bool WriterTryWriteFront( const T& in ); // IsPipeEmpty() is a utility function, not intended for general use // Should only be used very prudently. bool IsPipeEmpty() const { return 0 == m_WriteIndex.load( std::memory_order_relaxed ) - m_ReadCount.load( std::memory_order_relaxed ); } void Clear() { m_WriteIndex = 0; m_ReadIndex = 0; m_ReadCount = 0; memset( (void*)m_Flags, 0, sizeof( m_Flags ) ); } private: const static uint32_t ms_cSize = ( 1 << cSizeLog2 ); const static uint32_t ms_cIndexMask = ms_cSize - 1; const static uint32_t FLAG_INVALID = 0xFFFFFFFF; // 32bit for CAS const static uint32_t FLAG_CAN_WRITE = 0x00000000; // 32bit for CAS const static uint32_t FLAG_CAN_READ = 0x11111111; // 32bit for CAS T m_Buffer[ ms_cSize ]; // read and write indexes allow fast access to the pipe, but actual access // controlled by the access flags. std::atomic m_WriteIndex; std::atomic m_ReadCount; std::atomic m_Flags[ ms_cSize ]; std::atomic m_ReadIndex; }; template inline LockLessMultiReadPipe::LockLessMultiReadPipe() : m_WriteIndex(0) , m_ReadCount(0) , m_ReadIndex(0) { ENKI_ASSERT( cSizeLog2 < 32 ); memset( (void*)m_Flags, 0, sizeof( m_Flags ) ); } template inline bool LockLessMultiReadPipe::ReaderTryReadBack( T* pOut ) { uint32_t actualReadIndex; uint32_t readCount = m_ReadCount.load( std::memory_order_relaxed ); // We get hold of read index for consistency // and do first pass starting at read count uint32_t readIndexToUse = readCount; while(true) { uint32_t writeIndex = m_WriteIndex.load( std::memory_order_relaxed ); // power of two sizes ensures we can use a simple calc without modulus uint32_t numInPipe = writeIndex - readCount; if( 0 == numInPipe ) { return false; } if( readIndexToUse >= writeIndex ) { readIndexToUse = m_ReadIndex.load( std::memory_order_relaxed ); } // power of two sizes ensures we can perform AND for a modulus actualReadIndex = readIndexToUse & ms_cIndexMask; // Multiple potential readers mean we should check if the data is valid, // using an atomic compare exchange uint32_t previous = FLAG_CAN_READ; bool bSuccess = m_Flags[ actualReadIndex ].compare_exchange_strong( previous, FLAG_INVALID, std::memory_order_acq_rel, std::memory_order_relaxed ); if( bSuccess ) { break; } ++readIndexToUse; // Update read count readCount = m_ReadCount.load( std::memory_order_relaxed ); } // we update the read index using an atomic add, as we've only read one piece of data. 
// this ensure consistency of the read index, and the above loop ensures readers // only read from unread data m_ReadCount.fetch_add(1, std::memory_order_relaxed ); // now read data, ensuring we do so after above reads & CAS *pOut = m_Buffer[ actualReadIndex ]; m_Flags[ actualReadIndex ].store( FLAG_CAN_WRITE, std::memory_order_release ); return true; } template inline bool LockLessMultiReadPipe::WriterTryReadFront( T* pOut ) { uint32_t writeIndex = m_WriteIndex.load( std::memory_order_relaxed ); uint32_t frontReadIndex = writeIndex; // Multiple potential readers mean we should check if the data is valid, // using an atomic compare exchange - which acts as a form of lock (so not quite lockless really). uint32_t actualReadIndex = 0; while(true) { uint32_t readCount = m_ReadCount.load( std::memory_order_relaxed ); // power of two sizes ensures we can use a simple calc without modulus uint32_t numInPipe = writeIndex - readCount; if( 0 == numInPipe ) { m_ReadIndex.store( readCount, std::memory_order_release ); return false; } --frontReadIndex; actualReadIndex = frontReadIndex & ms_cIndexMask; uint32_t previous = FLAG_CAN_READ; bool success = m_Flags[ actualReadIndex ].compare_exchange_strong( previous, FLAG_INVALID, std::memory_order_acq_rel, std::memory_order_relaxed ); if( success ) { break; } else if( m_ReadIndex.load( std::memory_order_acquire ) >= frontReadIndex ) { return false; } } // now read data, ensuring we do so after above reads & CAS *pOut = m_Buffer[ actualReadIndex ]; m_Flags[ actualReadIndex ].store( FLAG_CAN_WRITE, std::memory_order_relaxed ); m_WriteIndex.store(writeIndex-1, std::memory_order_relaxed); return true; } template inline bool LockLessMultiReadPipe::WriterTryWriteFront( const T& in ) { // The writer 'owns' the write index, and readers can only reduce // the amount of data in the pipe. // We get hold of both values for consistency and to reduce false sharing // impacting more than one access uint32_t writeIndex = m_WriteIndex; // power of two sizes ensures we can perform AND for a modulus uint32_t actualWriteIndex = writeIndex & ms_cIndexMask; // a reader may still be reading this item, as there are multiple readers if( m_Flags[ actualWriteIndex ].load(std::memory_order_acquire) != FLAG_CAN_WRITE ) { return false; // still being read, so have caught up with tail. 
} // as we are the only writer we can update the data without atomics // whilst the write index has not been updated m_Buffer[ actualWriteIndex ] = in; m_Flags[ actualWriteIndex ].store( FLAG_CAN_READ, std::memory_order_release ); m_WriteIndex.fetch_add(1, std::memory_order_relaxed); return true; } // Lockless multiwriter intrusive list // Type T must implement T* volatile pNext; template class LocklessMultiWriteIntrusiveList { std::atomic pHead; T tail; public: LocklessMultiWriteIntrusiveList() : pHead( &tail ) { tail.pNext = NULL; } bool IsListEmpty() const { return pHead == &tail; } // Add - safe to perform from any thread void WriterWriteFront( T* pNode_ ) { ENKI_ASSERT( pNode_ ); pNode_->pNext = NULL; T* pPrev = pHead.exchange( pNode_ ); pPrev->pNext = pNode_; } // Remove - only thread safe for owner T* ReaderReadBack() { T* pTailPlus1 = tail.pNext; if( pTailPlus1 ) { T* pTailPlus2 = pTailPlus1->pNext; if( pTailPlus2 ) { //not head tail.pNext = pTailPlus2; } else { tail.pNext = NULL; T* pCompare = pTailPlus1; // we need preserve pTailPlus1 as compare will alter it on failure // pTailPlus1 is the head, attempt swap with tail if( !pHead.compare_exchange_strong( pCompare, &tail ) ) { // pCompare receives the revised pHead on failure. // pTailPlus1 is no longer the head, so pTailPlus1->pNext should be non NULL while( (T*)NULL == pTailPlus1->pNext ) {;} // wait for pNext to be updated as head may have just changed. tail.pNext = pTailPlus1->pNext.load(); pTailPlus1->pNext = NULL; } } } return pTailPlus1; } }; } RenderKit-rkcommon-988718e/rkcommon/tasking/detail/enkiTS/TaskScheduler.cpp000066400000000000000000001612651467524601100267350ustar00rootroot00000000000000// Copyright (c) 2013 Doug Binks // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. 
#include "TaskScheduler.h" #include "LockLessMultiReadPipe.h" #include #if defined __i386__ || defined __x86_64__ #include "x86intrin.h" #elif defined _WIN32 #include #endif using namespace enki; #if defined(ENKI_CUSTOM_ALLOC_FILE_AND_LINE) #define ENKI_FILE_AND_LINE __FILE__, __LINE__ #else namespace { const char* gc_File = ""; const uint32_t gc_Line = 0; } #define ENKI_FILE_AND_LINE gc_File, gc_Line #endif // UWP and MinGW don't have GetActiveProcessorCount #if defined(_WIN64) \ && !defined(__MINGW32__) \ && !(defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_PC_APP || WINAPI_FAMILY == WINAPI_FAMILY_PHONE_APP)) #define ENKI_USE_WINDOWS_PROCESSOR_API #endif #ifdef ENKI_USE_WINDOWS_PROCESSOR_API #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif #ifndef NOMINMAX #define NOMINMAX #endif #include #endif uint32_t enki::GetNumHardwareThreads() { #ifdef ENKI_USE_WINDOWS_PROCESSOR_API return GetActiveProcessorCount(ALL_PROCESSOR_GROUPS); #else return std::thread::hardware_concurrency(); #endif } namespace enki { static constexpr int32_t gc_TaskStartCount = 2; static constexpr int32_t gc_TaskAlmostCompleteCount = 1; // GetIsComplete() will return false, but execution is done and about to complete static constexpr uint32_t gc_PipeSizeLog2 = 8; static constexpr uint32_t gc_SpinCount = 10; static constexpr uint32_t gc_SpinBackOffMultiplier = 100; static constexpr uint32_t gc_MaxNumInitialPartitions = 8; static constexpr uint32_t gc_MaxStolenPartitions = 1 << gc_PipeSizeLog2; static constexpr uint32_t gc_CacheLineSize = 64; // awaiting std::hardware_constructive_interference_size } // thread_local not well supported yet by some older C++11 compilers. // For XCode before version 8 thread_local is not defined, so add to your compile defines: ENKI_THREAD_LOCAL __thread #ifndef ENKI_THREAD_LOCAL #if defined(_MSC_VER) && _MSC_VER <= 1800 #define ENKI_THREAD_LOCAL __declspec(thread) // Removed below as XCode supports thread_local since version 8 // #elif __APPLE__ // // Apple thread_local currently not implemented in XCode before version 8 despite it being in Clang. 
// #define ENKI_THREAD_LOCAL __thread #else #define ENKI_THREAD_LOCAL thread_local #endif #endif // each software thread gets its own copy of gtl_threadNum, so this is safe to use as a static variable static ENKI_THREAD_LOCAL uint32_t gtl_threadNum = enki::NO_THREAD_NUM; namespace enki { struct SubTaskSet { ITaskSet* pTask; TaskSetPartition partition; }; // we derive class TaskPipe rather than typedef to get forward declaration working easily class TaskPipe : public LockLessMultiReadPipe {}; enum ThreadState : int32_t { ENKI_THREAD_STATE_NONE, // shouldn't get this value ENKI_THREAD_STATE_NOT_LAUNCHED, // for debug purposes - indicates enki task thread not yet launched ENKI_THREAD_STATE_RUNNING, ENKI_THREAD_STATE_PRIMARY_REGISTERED, // primary thread is the one enkiTS was initialized on ENKI_THREAD_STATE_EXTERNAL_REGISTERED, ENKI_THREAD_STATE_EXTERNAL_UNREGISTERED, ENKI_THREAD_STATE_WAIT_TASK_COMPLETION, ENKI_THREAD_STATE_WAIT_NEW_TASKS, ENKI_THREAD_STATE_WAIT_NEW_PINNED_TASKS, ENKI_THREAD_STATE_STOPPED, }; struct ThreadArgs { uint32_t threadNum; TaskScheduler* pTaskScheduler; }; struct alignas(enki::gc_CacheLineSize) ThreadDataStore { semaphoreid_t* pWaitNewPinnedTaskSemaphore = nullptr; std::atomic threadState = { ENKI_THREAD_STATE_NONE }; uint32_t rndSeed = 0; char prevent_false_Share[ enki::gc_CacheLineSize - sizeof(std::atomic) - sizeof(semaphoreid_t*) - sizeof( uint32_t ) ]; // required to prevent alignment padding warning }; constexpr size_t SIZEOFTHREADDATASTORE = sizeof( ThreadDataStore ); // for easier inspection static_assert( SIZEOFTHREADDATASTORE == enki::gc_CacheLineSize, "ThreadDataStore may exhibit false sharing" ); class PinnedTaskList : public LocklessMultiWriteIntrusiveList {}; semaphoreid_t* SemaphoreCreate(); void SemaphoreDelete( semaphoreid_t* pSemaphore_ ); void SemaphoreWait( semaphoreid_t& semaphoreid ); void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ); } namespace { SubTaskSet SplitTask( SubTaskSet& subTask_, uint32_t rangeToSplit_ ) { SubTaskSet splitTask = subTask_; uint32_t rangeLeft = subTask_.partition.end - subTask_.partition.start; rangeToSplit_ = std::min( rangeToSplit_, rangeLeft ); splitTask.partition.end = subTask_.partition.start + rangeToSplit_; subTask_.partition.start = splitTask.partition.end; return splitTask; } #if ( defined _WIN32 && ( defined _M_IX86 || defined _M_X64 ) ) || ( defined __i386__ || defined __x86_64__ ) // Note: see https://software.intel.com/en-us/articles/a-common-construct-to-avoid-the-contention-of-threads-architecture-agnostic-spin-wait-loops void SpinWait( uint32_t spinCount_ ) { uint64_t end = __rdtsc() + spinCount_; while( __rdtsc() < end ) { _mm_pause(); } } #else void SpinWait( uint32_t spinCount_ ) { while( spinCount_ ) { // TODO: may have NOP or yield equiv --spinCount_; } } #endif void SafeCallback( ProfilerCallbackFunc func_, uint32_t threadnum_ ) { if( func_ != nullptr ) { func_( threadnum_ ); } } } ENKITS_API void* enki::DefaultAllocFunc( size_t align_, size_t size_, void* userData_, const char* file_, int line_ ) { (void)userData_; (void)file_; (void)line_; void* pRet; #ifdef _WIN32 pRet = (void*)_aligned_malloc( size_, align_ ); #else pRet = nullptr; if( align_ <= size_ && align_ <= alignof(int64_t) ) { // no need for alignment, use malloc pRet = malloc( size_ ); } else { int retval = posix_memalign( &pRet, align_, size_ ); (void)retval; // unused } #endif return pRet; } ENKITS_API void enki::DefaultFreeFunc( void* ptr_, size_t size_, void* userData_, const char* file_, int line_ ) { 
(void)size_; (void)userData_; (void)file_; (void)line_; #ifdef _WIN32 _aligned_free( ptr_ ); #else free( ptr_ ); #endif } bool TaskScheduler::RegisterExternalTaskThread() { bool bRegistered = false; while( !bRegistered && m_NumExternalTaskThreadsRegistered < (int32_t)m_Config.numExternalTaskThreads ) { for(uint32_t thread = GetNumFirstExternalTaskThread(); thread < GetNumFirstExternalTaskThread() + m_Config.numExternalTaskThreads; ++thread ) { ThreadState threadStateExpected = ENKI_THREAD_STATE_EXTERNAL_UNREGISTERED; if( m_pThreadDataStore[thread].threadState.compare_exchange_strong( threadStateExpected, ENKI_THREAD_STATE_EXTERNAL_REGISTERED ) ) { ++m_NumExternalTaskThreadsRegistered; gtl_threadNum = thread; bRegistered = true; break; } } } return bRegistered; } bool TaskScheduler::RegisterExternalTaskThread( uint32_t threadNumToRegister_ ) { ENKI_ASSERT( threadNumToRegister_ >= GetNumFirstExternalTaskThread() ); ENKI_ASSERT( threadNumToRegister_ < ( GetNumFirstExternalTaskThread() + m_Config.numExternalTaskThreads ) ); ThreadState threadStateExpected = ENKI_THREAD_STATE_EXTERNAL_UNREGISTERED; if( m_pThreadDataStore[threadNumToRegister_].threadState.compare_exchange_strong( threadStateExpected, ENKI_THREAD_STATE_EXTERNAL_REGISTERED ) ) { ++m_NumExternalTaskThreadsRegistered; gtl_threadNum = threadNumToRegister_; return true; } return false; } void TaskScheduler::DeRegisterExternalTaskThread() { ENKI_ASSERT( gtl_threadNum != enki::NO_THREAD_NUM ); ENKI_ASSERT( gtl_threadNum >= GetNumFirstExternalTaskThread() ); ThreadState threadState = m_pThreadDataStore[gtl_threadNum].threadState.load( std::memory_order_acquire ); ENKI_ASSERT( threadState == ENKI_THREAD_STATE_EXTERNAL_REGISTERED ); if( threadState == ENKI_THREAD_STATE_EXTERNAL_REGISTERED ) { --m_NumExternalTaskThreadsRegistered; m_pThreadDataStore[gtl_threadNum].threadState.store( ENKI_THREAD_STATE_EXTERNAL_UNREGISTERED, std::memory_order_release ); gtl_threadNum = enki::NO_THREAD_NUM; } } uint32_t TaskScheduler::GetNumRegisteredExternalTaskThreads() { return m_NumExternalTaskThreadsRegistered; } void TaskScheduler::TaskingThreadFunction( const ThreadArgs& args_ ) { uint32_t threadNum = args_.threadNum; TaskScheduler* pTS = args_.pTaskScheduler; gtl_threadNum = threadNum; pTS->m_pThreadDataStore[threadNum].threadState.store( ENKI_THREAD_STATE_RUNNING, std::memory_order_release ); SafeCallback( pTS->m_Config.profilerCallbacks.threadStart, threadNum ); uint32_t spinCount = 0; uint32_t hintPipeToCheck_io = threadNum + 1; // does not need to be clamped. while( pTS->GetIsRunningInt() ) { if( !pTS->TryRunTask( threadNum, hintPipeToCheck_io ) ) { // no tasks, will spin then wait ++spinCount; if( spinCount > gc_SpinCount ) { pTS->WaitForNewTasks( threadNum ); } else { uint32_t spinBackoffCount = spinCount * gc_SpinBackOffMultiplier; SpinWait( spinBackoffCount ); } } else { spinCount = 0; // have run a task so reset spin count. 
} } pTS->m_NumInternalTaskThreadsRunning.fetch_sub( 1, std::memory_order_release ); pTS->m_pThreadDataStore[threadNum].threadState.store( ENKI_THREAD_STATE_STOPPED, std::memory_order_release ); SafeCallback( pTS->m_Config.profilerCallbacks.threadStop, threadNum ); } void TaskScheduler::StartThreads() { if( m_bHaveThreads ) { return; } m_NumThreads = m_Config.numTaskThreadsToCreate + m_Config.numExternalTaskThreads + 1; for( int priority = 0; priority < TASK_PRIORITY_NUM; ++priority ) { m_pPipesPerThread[ priority ] = NewArray( m_NumThreads, ENKI_FILE_AND_LINE ); m_pPinnedTaskListPerThread[ priority ] = NewArray( m_NumThreads, ENKI_FILE_AND_LINE ); } m_pNewTaskSemaphore = SemaphoreNew(); m_pTaskCompleteSemaphore = SemaphoreNew(); // we create one less thread than m_NumThreads as the main thread counts as one m_pThreadDataStore = NewArray( m_NumThreads, ENKI_FILE_AND_LINE ); m_pThreads = NewArray( m_NumThreads, ENKI_FILE_AND_LINE ); m_bRunning = true; m_bWaitforAllCalled = false; m_bShutdownRequested = false; // current thread is primary enkiTS thread m_pThreadDataStore[0].threadState = ENKI_THREAD_STATE_PRIMARY_REGISTERED; gtl_threadNum = 0; for( uint32_t thread = GetNumFirstExternalTaskThread(); thread < m_Config.numExternalTaskThreads + GetNumFirstExternalTaskThread(); ++thread ) { m_pThreadDataStore[thread].threadState = ENKI_THREAD_STATE_EXTERNAL_UNREGISTERED; } for( uint32_t thread = m_Config.numExternalTaskThreads + GetNumFirstExternalTaskThread(); thread < m_NumThreads; ++thread ) { m_pThreadDataStore[thread].threadState = ENKI_THREAD_STATE_NOT_LAUNCHED; } // Create Wait New Pinned Task Semaphores and init rndSeed for( uint32_t threadNum = 0; threadNum < m_NumThreads; ++threadNum ) { m_pThreadDataStore[threadNum].pWaitNewPinnedTaskSemaphore = SemaphoreNew(); m_pThreadDataStore[threadNum].rndSeed = threadNum; } // only launch threads once all thread states are set for( uint32_t thread = m_Config.numExternalTaskThreads + GetNumFirstExternalTaskThread(); thread < m_NumThreads; ++thread ) { m_pThreads[thread] = std::thread( TaskingThreadFunction, ThreadArgs{ thread, this } ); ++m_NumInternalTaskThreadsRunning; } // ensure we have sufficient tasks to equally fill either all threads including main // or just the threads we've launched, this is outside the first init as we want to be able // to runtime change it if( 1 == m_NumThreads ) { m_NumPartitions = 1; m_NumInitialPartitions = 1; } else { // There could be more threads than hardware threads if external threads are // being intended for blocking functionality such as io etc. // We only need to partition for a maximum of the available processor parallelism. uint32_t numThreadsToPartitionFor = std::min( m_NumThreads, GetNumHardwareThreads() ); m_NumPartitions = numThreadsToPartitionFor * (numThreadsToPartitionFor - 1); // ensure m_NumPartitions, m_NumInitialPartitions non zero, can happen if m_NumThreads > 1 && GetNumHardwareThreads() == 1 m_NumPartitions = std::max( m_NumPartitions, (uint32_t)1 ); m_NumInitialPartitions = std::max( numThreadsToPartitionFor - 1, (uint32_t)1 ); m_NumInitialPartitions = std::min( m_NumInitialPartitions, gc_MaxNumInitialPartitions ); } #ifdef ENKI_USE_WINDOWS_PROCESSOR_API // x64 bit Windows may support >64 logical processors using processor groups, and only allocate threads to a default group. 
// We need to detect this and distribute threads accordingly if( GetNumHardwareThreads() > 64 && // only have processor groups if > 64 hardware threads std::thread::hardware_concurrency() < GetNumHardwareThreads() && // if std::thread sees > 64 hardware threads no need to distribute std::thread::hardware_concurrency() < m_NumThreads ) // no need to distribute if number of threads requested lower than std::thread sees { uint32_t numProcessorGroups = GetActiveProcessorGroupCount(); GROUP_AFFINITY mainThreadAffinity; BOOL success = GetThreadGroupAffinity( GetCurrentThread(), &mainThreadAffinity ); ENKI_ASSERT( success ); if( success ) { uint32_t mainProcessorGroup = mainThreadAffinity.Group; uint32_t currLogicalProcess = GetActiveProcessorCount( (WORD)mainProcessorGroup ); // we start iteration at end of current process group's threads // If more threads are created than there are logical processors then we still want to distribute them evenly amongst groups // so we iterate continuously around the groups until we reach m_NumThreads uint32_t group = 0; while( currLogicalProcess < m_NumThreads ) { ++group; // start at group 1 since we set currLogicalProcess to start of next group uint32_t currGroup = ( group + mainProcessorGroup ) % numProcessorGroups; // we start at mainProcessorGroup, go round in circles uint32_t groupNumLogicalProcessors = GetActiveProcessorCount( (WORD)currGroup ); ENKI_ASSERT( groupNumLogicalProcessors <= 64 ); uint64_t GROUPMASK = 0xFFFFFFFFFFFFFFFFULL >> (64-groupNumLogicalProcessors); // group mask should not have 1's where there are no processors for( uint32_t groupLogicalProcess = 0; ( groupLogicalProcess < groupNumLogicalProcessors ) && ( currLogicalProcess < m_NumThreads ); ++groupLogicalProcess, ++currLogicalProcess ) { if( currLogicalProcess > m_Config.numExternalTaskThreads + GetNumFirstExternalTaskThread() ) { auto thread_handle = m_pThreads[currLogicalProcess].native_handle(); // From https://learn.microsoft.com/en-us/windows/win32/procthread/processor-groups // If a thread is assigned to a different group than the process, the process's affinity is updated to include the thread's affinity // and the process becomes a multi-group process. GROUP_AFFINITY threadAffinity; success = GetThreadGroupAffinity( thread_handle, &threadAffinity ); ENKI_ASSERT(success); (void)success; if( threadAffinity.Group != currGroup ) { threadAffinity.Group = (WORD)currGroup; threadAffinity.Mask = GROUPMASK; success = SetThreadGroupAffinity( thread_handle, &threadAffinity, nullptr ); ENKI_ASSERT( success ); (void)success; } } } } } } #endif m_bHaveThreads = true; } void TaskScheduler::StopThreads( bool bWait_ ) { // we set m_bWaitforAllCalled to true to ensure any task which loop using this status exit m_bWaitforAllCalled.store( true, std::memory_order_release ); // set status m_bShutdownRequested.store( true, std::memory_order_release ); m_bRunning.store( false, std::memory_order_release ); if( m_bHaveThreads ) { // wait for threads to quit before deleting data while( bWait_ && m_NumInternalTaskThreadsRunning ) { // keep firing event to ensure all threads pick up state of m_bRunning WakeThreadsForNewTasks(); for( uint32_t threadId = 0; threadId < m_NumThreads; ++threadId ) { // send wait for new pinned tasks signal to ensure any waiting are awoken SemaphoreSignal( *m_pThreadDataStore[ threadId ].pWaitNewPinnedTaskSemaphore, 1 ); } } // detach threads starting with thread GetNumFirstExternalTaskThread() (as 0 is initialization thread). 
for( uint32_t thread = m_Config.numExternalTaskThreads + GetNumFirstExternalTaskThread(); thread < m_NumThreads; ++thread ) { ENKI_ASSERT( m_pThreads[thread].joinable() ); m_pThreads[thread].join(); } // delete any Wait New Pinned Task Semaphores for( uint32_t threadNum = 0; threadNum < m_NumThreads; ++threadNum ) { SemaphoreDelete( m_pThreadDataStore[threadNum].pWaitNewPinnedTaskSemaphore ); } DeleteArray( m_pThreadDataStore, m_NumThreads, ENKI_FILE_AND_LINE ); DeleteArray( m_pThreads, m_NumThreads, ENKI_FILE_AND_LINE ); m_pThreadDataStore = 0; m_pThreads = 0; SemaphoreDelete( m_pNewTaskSemaphore ); m_pNewTaskSemaphore = 0; SemaphoreDelete( m_pTaskCompleteSemaphore ); m_pTaskCompleteSemaphore = 0; m_bHaveThreads = false; m_NumThreadsWaitingForNewTasks = 0; m_NumThreadsWaitingForTaskCompletion = 0; m_NumInternalTaskThreadsRunning = 0; m_NumExternalTaskThreadsRegistered = 0; for( int priority = 0; priority < TASK_PRIORITY_NUM; ++priority ) { DeleteArray( m_pPipesPerThread[ priority ], m_NumThreads, ENKI_FILE_AND_LINE ); m_pPipesPerThread[ priority ] = NULL; DeleteArray( m_pPinnedTaskListPerThread[ priority ], m_NumThreads, ENKI_FILE_AND_LINE ); m_pPinnedTaskListPerThread[ priority ] = NULL; } m_NumThreads = 0; } } bool TaskScheduler::TryRunTask( uint32_t threadNum_, uint32_t& hintPipeToCheck_io_ ) { for( int priority = 0; priority < TASK_PRIORITY_NUM; ++priority ) { if( TryRunTask( threadNum_, priority, hintPipeToCheck_io_ ) ) { return true; } } return false; } static inline uint32_t RotateLeft( uint32_t value, int32_t count ) { return ( value << count ) | ( value >> ( 32 - count )); } /* xxHash variant based on documentation on https://github.com/Cyan4973/xxHash/blob/eec5700f4d62113b47ee548edbc4746f61ffb098/doc/xxhash_spec.md Copyright (c) Yann Collet Permission is granted to copy and distribute this document for any purpose and without charge, including translations into other languages and incorporation into compilations, provided that the copyright notice and this notice are preserved, and that any substantive changes or deletions from the original are clearly marked. Distribution of this document is unlimited. */ static inline uint32_t Hash32( uint32_t in_ ) { static const uint32_t PRIME32_1 = 2654435761U; // 0b10011110001101110111100110110001 static const uint32_t PRIME32_2 = 2246822519U; // 0b10000101111010111100101001110111 static const uint32_t PRIME32_3 = 3266489917U; // 0b11000010101100101010111000111101 static const uint32_t PRIME32_4 = 668265263U; // 0b00100111110101001110101100101111 static const uint32_t PRIME32_5 = 374761393U; // 0b00010110010101100110011110110001 static const uint32_t SEED = 0; // can configure seed if needed // simple hash of nodes, does not check if nodePool is compressed or not. uint32_t acc = SEED + PRIME32_5; // add node types to map, and also ensure that fully empty nodes are well distributed by hashing the pointer. 
acc += in_; acc = acc ^ (acc >> 15); acc = acc * PRIME32_2; acc = acc ^ (acc >> 13); acc = acc * PRIME32_3; acc = acc ^ (acc >> 16); return acc; } bool TaskScheduler::TryRunTask( uint32_t threadNum_, uint32_t priority_, uint32_t& hintPipeToCheck_io_ ) { // Run any tasks for this thread RunPinnedTasks( threadNum_, priority_ ); // check for tasks SubTaskSet subTask; bool bHaveTask = m_pPipesPerThread[ priority_ ][ threadNum_ ].WriterTryReadFront( &subTask ); uint32_t threadToCheckStart = hintPipeToCheck_io_ % m_NumThreads; uint32_t threadToCheck = threadToCheckStart; uint32_t checkCount = 0; if( !bHaveTask ) { bHaveTask = m_pPipesPerThread[ priority_ ][ threadToCheck ].ReaderTryReadBack( &subTask ); if( !bHaveTask ) { // To prevent many threads checking the same task pipe for work we pseudorandomly distribute // the starting thread which we start checking for tasks to run uint32_t& rndSeed = m_pThreadDataStore[threadNum_].rndSeed; ++rndSeed; uint32_t threadToCheckOffset = Hash32( rndSeed * threadNum_ ); while( !bHaveTask && checkCount < m_NumThreads ) { threadToCheck = ( threadToCheckOffset + checkCount ) % m_NumThreads; if( threadToCheck != threadNum_ && threadToCheckOffset != threadToCheckStart ) { bHaveTask = m_pPipesPerThread[ priority_ ][ threadToCheck ].ReaderTryReadBack( &subTask ); } ++checkCount; } } } if( bHaveTask ) { // update hint, will preserve value unless actually got task from another thread. hintPipeToCheck_io_ = threadToCheck; uint32_t partitionSize = subTask.partition.end - subTask.partition.start; if( subTask.pTask->m_RangeToRun < partitionSize ) { SubTaskSet taskToRun = SplitTask( subTask, subTask.pTask->m_RangeToRun ); uint32_t rangeToSplit = subTask.pTask->m_RangeToRun; if( threadNum_ != threadToCheck ) { // task was stolen from another thread // in order to ensure other threads can get enough work we need to split into larger ranges // these larger splits are then stolen and split themselves // otherwise other threads must keep stealing from this thread, which may stall when pipe is full rangeToSplit = std::max( rangeToSplit, (subTask.partition.end - subTask.partition.start) / gc_MaxStolenPartitions ); } SplitAndAddTask( threadNum_, subTask, rangeToSplit ); taskToRun.pTask->ExecuteRange( taskToRun.partition, threadNum_ ); int prevCount = taskToRun.pTask->m_RunningCount.fetch_sub(1,std::memory_order_acq_rel ); if( gc_TaskStartCount == prevCount ) { TaskComplete( taskToRun.pTask, true, threadNum_ ); } } else { // the task has already been divided up by AddTaskSetToPipe, so just run it subTask.pTask->ExecuteRange( subTask.partition, threadNum_ ); int prevCount = subTask.pTask->m_RunningCount.fetch_sub(1,std::memory_order_acq_rel ); if( gc_TaskStartCount == prevCount ) { TaskComplete( subTask.pTask, true, threadNum_ ); } } } return bHaveTask; } void TaskScheduler::TaskComplete( ICompletable* pTask_, bool bWakeThreads_, uint32_t threadNum_ ) { // It must be impossible for a thread to enter the sleeping wait prior to the load of m_WaitingForTaskCount // in this function, so we introduce a gc_TaskAlmostCompleteCount to prevent this. ENKI_ASSERT( gc_TaskAlmostCompleteCount == pTask_->m_RunningCount.load( std::memory_order_acquire ) ); bool bCallWakeThreads = bWakeThreads_ && pTask_->m_WaitingForTaskCount.load( std::memory_order_acquire ); Dependency* pDependent = pTask_->m_pDependents; // Do not access pTask_ below this line unless we have dependencies. 
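    // (Once m_RunningCount is stored as 0 below, a thread waiting on this task may observe it as
    // complete and destroy it, which is why m_pDependents was copied out above.)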
pTask_->m_RunningCount.store( 0, std::memory_order_release ); if( bCallWakeThreads ) { WakeThreadsForTaskCompletion(); } while( pDependent ) { // access pTaskToRunOnCompletion member data before incrementing m_DependenciesCompletedCount so // they do not get deleted when another thread completes the pTaskToRunOnCompletion int32_t dependenciesCount = pDependent->pTaskToRunOnCompletion->m_DependenciesCount; // get temp copy of pDependent so OnDependenciesComplete can delete task if needed. Dependency* pDependentCurr = pDependent; pDependent = pDependent->pNext; int32_t prevDeps = pDependentCurr->pTaskToRunOnCompletion->m_DependenciesCompletedCount.fetch_add( 1, std::memory_order_release ); ENKI_ASSERT( prevDeps < dependenciesCount ); if( dependenciesCount == ( prevDeps + 1 ) ) { // reset dependencies // only safe to access pDependentCurr here after above fetch_add because this is the thread // which calls OnDependenciesComplete after store with memory_order_release pDependentCurr->pTaskToRunOnCompletion->m_DependenciesCompletedCount.store( 0, std::memory_order_release ); pDependentCurr->pTaskToRunOnCompletion->OnDependenciesComplete( this, threadNum_ ); } } } bool TaskScheduler::HaveTasks( uint32_t threadNum_ ) { for( int priority = 0; priority < TASK_PRIORITY_NUM; ++priority ) { for( uint32_t thread = 0; thread < m_NumThreads; ++thread ) { if( !m_pPipesPerThread[ priority ][ thread ].IsPipeEmpty() ) { return true; } } if( !m_pPinnedTaskListPerThread[ priority ][ threadNum_ ].IsListEmpty() ) { return true; } } return false; } void TaskScheduler::WaitForNewTasks( uint32_t threadNum_ ) { // We don't want to suspend this thread if there are task threads // with pinned tasks suspended, as it could result in this thread // being unsuspended and not the thread with pinned tasks if( WakeSuspendedThreadsWithPinnedTasks( threadNum_ ) ) { return; } // We increment the number of threads waiting here in order // to ensure that the check for tasks occurs after the increment // to prevent a task being added after a check, then the thread waiting. // This will occasionally result in threads being mistakenly awoken, // but they will then go back to sleep. m_NumThreadsWaitingForNewTasks.fetch_add( 1, std::memory_order_acquire ); ThreadState prevThreadState = m_pThreadDataStore[threadNum_].threadState.load( std::memory_order_relaxed ); m_pThreadDataStore[threadNum_].threadState.store( ENKI_THREAD_STATE_WAIT_NEW_TASKS, std::memory_order_seq_cst ); if( HaveTasks( threadNum_ ) ) { m_NumThreadsWaitingForNewTasks.fetch_sub( 1, std::memory_order_release ); } else { SafeCallback( m_Config.profilerCallbacks.waitForNewTaskSuspendStart, threadNum_ ); SemaphoreWait( *m_pNewTaskSemaphore ); SafeCallback( m_Config.profilerCallbacks.waitForNewTaskSuspendStop, threadNum_ ); } m_pThreadDataStore[threadNum_].threadState.store( prevThreadState, std::memory_order_release ); } void TaskScheduler::WaitForTaskCompletion( const ICompletable* pCompletable_, uint32_t threadNum_ ) { // We don't want to suspend this thread if there are task threads // with pinned tasks suspended, as the completable could be a pinned task // or it could be waiting on one. 
if( WakeSuspendedThreadsWithPinnedTasks( threadNum_ ) ) { return; } m_NumThreadsWaitingForTaskCompletion.fetch_add( 1, std::memory_order_acq_rel ); pCompletable_->m_WaitingForTaskCount.fetch_add( 1, std::memory_order_acq_rel ); ThreadState prevThreadState = m_pThreadDataStore[threadNum_].threadState.load( std::memory_order_relaxed ); m_pThreadDataStore[threadNum_].threadState.store( ENKI_THREAD_STATE_WAIT_TASK_COMPLETION, std::memory_order_seq_cst ); // do not wait on semaphore if task in gc_TaskAlmostCompleteCount state. if( gc_TaskAlmostCompleteCount >= pCompletable_->m_RunningCount.load( std::memory_order_acquire ) || HaveTasks( threadNum_ ) ) { m_NumThreadsWaitingForTaskCompletion.fetch_sub( 1, std::memory_order_acq_rel ); } else { SafeCallback( m_Config.profilerCallbacks.waitForTaskCompleteSuspendStart, threadNum_ ); std::atomic_thread_fence(std::memory_order_acquire); SemaphoreWait( *m_pTaskCompleteSemaphore ); if( !pCompletable_->GetIsComplete() ) { // This thread which may not the one which was supposed to be awoken WakeThreadsForTaskCompletion(); } SafeCallback( m_Config.profilerCallbacks.waitForTaskCompleteSuspendStop, threadNum_ ); } m_pThreadDataStore[threadNum_].threadState.store( prevThreadState, std::memory_order_release ); pCompletable_->m_WaitingForTaskCount.fetch_sub( 1, std::memory_order_acq_rel ); } void TaskScheduler::WakeThreadsForNewTasks() { int32_t waiting = m_NumThreadsWaitingForNewTasks.load( std::memory_order_relaxed ); while( waiting > 0 && !m_NumThreadsWaitingForNewTasks.compare_exchange_weak(waiting, 0, std::memory_order_release, std::memory_order_relaxed ) ) {} if( waiting > 0 ) { SemaphoreSignal( *m_pNewTaskSemaphore, waiting ); } // We also wake tasks waiting for completion as they can run tasks WakeThreadsForTaskCompletion(); } void TaskScheduler::WakeThreadsForTaskCompletion() { // m_NumThreadsWaitingForTaskCompletion can go negative as this indicates that // we signalled more threads than the number which ended up waiting int32_t waiting = m_NumThreadsWaitingForTaskCompletion.load( std::memory_order_relaxed ); while( waiting > 0 && !m_NumThreadsWaitingForTaskCompletion.compare_exchange_weak(waiting, 0, std::memory_order_release, std::memory_order_relaxed ) ) {} if( waiting > 0 ) { SemaphoreSignal( *m_pTaskCompleteSemaphore, waiting ); } } bool TaskScheduler::WakeSuspendedThreadsWithPinnedTasks( uint32_t threadNum_ ) { for( uint32_t t = 1; t < m_NumThreads; ++t ) { // distribute thread checks more evenly by starting at our thread number rather than 0. 
uint32_t thread = ( threadNum_ + t ) % m_NumThreads; ThreadState state = m_pThreadDataStore[ thread ].threadState.load( std::memory_order_acquire ); ENKI_ASSERT( state != ENKI_THREAD_STATE_NONE ); if( state == ENKI_THREAD_STATE_WAIT_NEW_TASKS || state == ENKI_THREAD_STATE_WAIT_TASK_COMPLETION ) { // thread is suspended, check if it has pinned tasks for( int priority = 0; priority < TASK_PRIORITY_NUM; ++priority ) { if( !m_pPinnedTaskListPerThread[ priority ][ thread ].IsListEmpty() ) { WakeThreadsForNewTasks(); return true; } } } } return false; } void TaskScheduler::SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_, uint32_t rangeToSplit_ ) { int32_t numAdded = 0; int32_t numNewTasksSinceNotification = 0; int32_t numRun = 0; int32_t upperBoundNumToAdd = 2 + (int32_t)( ( subTask_.partition.end - subTask_.partition.start ) / rangeToSplit_ ); // ensure that an artificial completion is not registered whilst adding tasks by incrementing count subTask_.pTask->m_RunningCount.fetch_add( upperBoundNumToAdd, std::memory_order_acquire ); while( subTask_.partition.start != subTask_.partition.end ) { SubTaskSet taskToAdd = SplitTask( subTask_, rangeToSplit_ ); // add the partition to the pipe ++numAdded; ++numNewTasksSinceNotification; if( !m_pPipesPerThread[ subTask_.pTask->m_Priority ][ threadNum_ ].WriterTryWriteFront( taskToAdd ) ) { --numAdded; // we were unable to add the task if( numNewTasksSinceNotification > 1 ) { WakeThreadsForNewTasks(); } numNewTasksSinceNotification = 0; // alter range to run the appropriate fraction if( taskToAdd.pTask->m_RangeToRun < taskToAdd.partition.end - taskToAdd.partition.start ) { taskToAdd.partition.end = taskToAdd.partition.start + taskToAdd.pTask->m_RangeToRun; ENKI_ASSERT( taskToAdd.partition.end <= taskToAdd.pTask->m_SetSize ); subTask_.partition.start = taskToAdd.partition.end; } taskToAdd.pTask->ExecuteRange( taskToAdd.partition, threadNum_ ); ++numRun; } } int32_t countToRemove = upperBoundNumToAdd - numAdded; ENKI_ASSERT( countToRemove > 0 ); int prevCount = subTask_.pTask->m_RunningCount.fetch_sub( countToRemove, std::memory_order_acq_rel ); if( countToRemove-1 + gc_TaskStartCount == prevCount ) { TaskComplete( subTask_.pTask, false, threadNum_ ); } // WakeThreadsForNewTasks also calls WakeThreadsForTaskCompletion() so do not need to do so above WakeThreadsForNewTasks(); } TaskSchedulerConfig TaskScheduler::GetConfig() const { return m_Config; } void TaskScheduler::AddTaskSetToPipeInt( ITaskSet* pTaskSet_, uint32_t threadNum_ ) { ENKI_ASSERT( pTaskSet_->m_RunningCount == gc_TaskStartCount ); ThreadState prevThreadState = m_pThreadDataStore[threadNum_].threadState.load( std::memory_order_relaxed ); m_pThreadDataStore[threadNum_].threadState.store( ENKI_THREAD_STATE_RUNNING, std::memory_order_relaxed ); std::atomic_thread_fence(std::memory_order_acquire); // divide task up and add to pipe pTaskSet_->m_RangeToRun = pTaskSet_->m_SetSize / m_NumPartitions; pTaskSet_->m_RangeToRun = std::max( pTaskSet_->m_RangeToRun, pTaskSet_->m_MinRange ); // Note: if m_SetSize is < m_RangeToRun this will be handled by SplitTask and so does not need to be handled here uint32_t rangeToSplit = pTaskSet_->m_SetSize / m_NumInitialPartitions; rangeToSplit = std::max( rangeToSplit, pTaskSet_->m_MinRange ); SubTaskSet subTask; subTask.pTask = pTaskSet_; subTask.partition.start = 0; subTask.partition.end = pTaskSet_->m_SetSize; SplitAndAddTask( threadNum_, subTask, rangeToSplit ); int prevCount = pTaskSet_->m_RunningCount.fetch_sub(1, std::memory_order_acq_rel ); if( 
gc_TaskStartCount == prevCount ) { TaskComplete( pTaskSet_, true, threadNum_ ); } m_pThreadDataStore[threadNum_].threadState.store( prevThreadState, std::memory_order_release ); } void TaskScheduler::AddTaskSetToPipe( ITaskSet* pTaskSet_ ) { ENKI_ASSERT( pTaskSet_->m_RunningCount == 0 ); InitDependencies( pTaskSet_ ); pTaskSet_->m_RunningCount.store( gc_TaskStartCount, std::memory_order_relaxed ); AddTaskSetToPipeInt( pTaskSet_, gtl_threadNum ); } void TaskScheduler::AddPinnedTaskInt( IPinnedTask* pTask_ ) { ENKI_ASSERT( pTask_->m_RunningCount == gc_TaskStartCount ); m_pPinnedTaskListPerThread[ pTask_->m_Priority ][ pTask_->threadNum ].WriterWriteFront( pTask_ ); ThreadState statePinnedTaskThread = m_pThreadDataStore[ pTask_->threadNum ].threadState.load( std::memory_order_acquire ); if( statePinnedTaskThread == ENKI_THREAD_STATE_WAIT_NEW_PINNED_TASKS ) { SemaphoreSignal( *m_pThreadDataStore[ pTask_->threadNum ].pWaitNewPinnedTaskSemaphore, 1 ); } else { WakeThreadsForNewTasks(); } } void TaskScheduler::AddPinnedTask( IPinnedTask* pTask_ ) { ENKI_ASSERT( pTask_->m_RunningCount == 0 ); InitDependencies( pTask_ ); pTask_->m_RunningCount = gc_TaskStartCount; AddPinnedTaskInt( pTask_ ); } void TaskScheduler::InitDependencies( ICompletable* pCompletable_ ) { // go through any dependencies and set their running count so they show as not complete // and increment dependency count if( pCompletable_->m_RunningCount.load( std::memory_order_relaxed ) ) { // already initialized return; } Dependency* pDependent = pCompletable_->m_pDependents; while( pDependent ) { InitDependencies( pDependent->pTaskToRunOnCompletion ); pDependent->pTaskToRunOnCompletion->m_RunningCount.store( gc_TaskStartCount, std::memory_order_relaxed ); pDependent = pDependent->pNext; } } void TaskScheduler::RunPinnedTasks() { ENKI_ASSERT( gtl_threadNum != enki::NO_THREAD_NUM ); uint32_t threadNum = gtl_threadNum; ThreadState prevThreadState = m_pThreadDataStore[threadNum].threadState.load( std::memory_order_relaxed ); m_pThreadDataStore[threadNum].threadState.store( ENKI_THREAD_STATE_RUNNING, std::memory_order_relaxed ); std::atomic_thread_fence(std::memory_order_acquire); for( int priority = 0; priority < TASK_PRIORITY_NUM; ++priority ) { RunPinnedTasks( threadNum, priority ); } m_pThreadDataStore[threadNum].threadState.store( prevThreadState, std::memory_order_release ); } void TaskScheduler::RunPinnedTasks( uint32_t threadNum_, uint32_t priority_ ) { IPinnedTask* pPinnedTaskSet = NULL; do { pPinnedTaskSet = m_pPinnedTaskListPerThread[ priority_ ][ threadNum_ ].ReaderReadBack(); if( pPinnedTaskSet ) { pPinnedTaskSet->Execute(); pPinnedTaskSet->m_RunningCount.fetch_sub(1,std::memory_order_acq_rel); TaskComplete( pPinnedTaskSet, true, threadNum_ ); } } while( pPinnedTaskSet ); } void TaskScheduler::WaitforTask( const ICompletable* pCompletable_, enki::TaskPriority priorityOfLowestToRun_ ) { ENKI_ASSERT( gtl_threadNum != enki::NO_THREAD_NUM ); uint32_t threadNum = gtl_threadNum; uint32_t hintPipeToCheck_io = threadNum + 1; // does not need to be clamped. 
// waiting for a task is equivalent to 'running' for thread state purpose as we may run tasks whilst waiting ThreadState prevThreadState = m_pThreadDataStore[threadNum].threadState.load( std::memory_order_relaxed ); m_pThreadDataStore[threadNum].threadState.store( ENKI_THREAD_STATE_RUNNING, std::memory_order_relaxed ); std::atomic_thread_fence(std::memory_order_acquire); if( pCompletable_ && !pCompletable_->GetIsComplete() ) { SafeCallback( m_Config.profilerCallbacks.waitForTaskCompleteStart, threadNum ); // We need to ensure that the task we're waiting on can complete even if we're the only thread, // so we clamp the priorityOfLowestToRun_ to no smaller than the task we're waiting for priorityOfLowestToRun_ = std::max( priorityOfLowestToRun_, pCompletable_->m_Priority ); uint32_t spinCount = 0; while( !pCompletable_->GetIsComplete() && GetIsRunningInt() ) { ++spinCount; for( int priority = 0; priority <= priorityOfLowestToRun_; ++priority ) { if( TryRunTask( threadNum, priority, hintPipeToCheck_io ) ) { spinCount = 0; // reset spin as ran a task break; } } if( spinCount > gc_SpinCount ) { WaitForTaskCompletion( pCompletable_, threadNum ); spinCount = 0; } else { uint32_t spinBackoffCount = spinCount * gc_SpinBackOffMultiplier; SpinWait( spinBackoffCount ); } } SafeCallback( m_Config.profilerCallbacks.waitForTaskCompleteStop, threadNum ); } else if( nullptr == pCompletable_ ) { for( int priority = 0; priority <= priorityOfLowestToRun_; ++priority ) { if( TryRunTask( threadNum, priority, hintPipeToCheck_io ) ) { break; } } } m_pThreadDataStore[threadNum].threadState.store( prevThreadState, std::memory_order_release ); } class TaskSchedulerWaitTask : public IPinnedTask { void Execute() override { // do nothing } }; void TaskScheduler::WaitforAll() { ENKI_ASSERT( gtl_threadNum != enki::NO_THREAD_NUM ); m_bWaitforAllCalled.store( true, std::memory_order_release ); bool bHaveTasks = true; uint32_t ourThreadNum = gtl_threadNum; uint32_t hintPipeToCheck_io = ourThreadNum + 1; // does not need to be clamped. bool otherThreadsRunning = false; // account for this thread uint32_t spinCount = 0; TaskSchedulerWaitTask dummyWaitTask; dummyWaitTask.threadNum = 0; while( GetIsRunningInt() && ( bHaveTasks || otherThreadsRunning ) ) { bHaveTasks = TryRunTask( ourThreadNum, hintPipeToCheck_io ); ++spinCount; if( bHaveTasks ) { spinCount = 0; // reset spin as ran a task } if( spinCount > gc_SpinCount ) { // find a running thread and add a dummy wait task int32_t countThreadsToCheck = m_NumThreads - 1; bool bHaveThreadToWaitOn = false; do { --countThreadsToCheck; dummyWaitTask.threadNum = ( dummyWaitTask.threadNum + 1 ) % m_NumThreads; // We can only add a pinned task to wait on if we find an enki Task Thread which isn't this thread. // Otherwise, we have to busy wait. 
if( dummyWaitTask.threadNum != ourThreadNum && dummyWaitTask.threadNum > m_Config.numExternalTaskThreads ) { ThreadState state = m_pThreadDataStore[ dummyWaitTask.threadNum ].threadState.load( std::memory_order_acquire ); if( state == ENKI_THREAD_STATE_RUNNING || state == ENKI_THREAD_STATE_WAIT_TASK_COMPLETION ) { bHaveThreadToWaitOn = true; break; } } } while( countThreadsToCheck ); if( bHaveThreadToWaitOn ) { ENKI_ASSERT( dummyWaitTask.threadNum != ourThreadNum ); AddPinnedTask( &dummyWaitTask ); WaitforTask( &dummyWaitTask ); } spinCount = 0; } else { uint32_t spinBackoffCount = spinCount * gc_SpinBackOffMultiplier; SpinWait( spinBackoffCount ); } // count threads running otherThreadsRunning = false; for(uint32_t thread = 0; thread < m_NumThreads && !otherThreadsRunning; ++thread ) { // ignore our thread if( thread != ourThreadNum ) { switch( m_pThreadDataStore[thread].threadState.load( std::memory_order_acquire ) ) { case ENKI_THREAD_STATE_NONE: ENKI_ASSERT(false); break; case ENKI_THREAD_STATE_NOT_LAUNCHED: case ENKI_THREAD_STATE_RUNNING: case ENKI_THREAD_STATE_WAIT_TASK_COMPLETION: otherThreadsRunning = true; break; case ENKI_THREAD_STATE_WAIT_NEW_PINNED_TASKS: otherThreadsRunning = true; SemaphoreSignal( *m_pThreadDataStore[thread].pWaitNewPinnedTaskSemaphore, 1 ); break; case ENKI_THREAD_STATE_PRIMARY_REGISTERED: case ENKI_THREAD_STATE_EXTERNAL_REGISTERED: case ENKI_THREAD_STATE_EXTERNAL_UNREGISTERED: case ENKI_THREAD_STATE_WAIT_NEW_TASKS: case ENKI_THREAD_STATE_STOPPED: break; } } } if( !otherThreadsRunning ) { // check there are no tasks for(uint32_t thread = 0; thread < m_NumThreads && !otherThreadsRunning; ++thread ) { // ignore our thread if( thread != ourThreadNum ) { otherThreadsRunning = HaveTasks( thread ); } } } } m_bWaitforAllCalled.store( false, std::memory_order_release ); } void TaskScheduler::WaitforAllAndShutdown() { m_bWaitforAllCalled.store( true, std::memory_order_release ); m_bShutdownRequested.store( true, std::memory_order_release ); if( m_bHaveThreads ) { WaitforAll(); StopThreads(true); } } void TaskScheduler::ShutdownNow() { m_bWaitforAllCalled.store( true, std::memory_order_release ); m_bShutdownRequested.store( true, std::memory_order_release ); if( m_bHaveThreads ) { StopThreads(true); } } void TaskScheduler::WaitForNewPinnedTasks() { ENKI_ASSERT( gtl_threadNum != enki::NO_THREAD_NUM ); uint32_t threadNum = gtl_threadNum; ThreadState prevThreadState = m_pThreadDataStore[threadNum].threadState.load( std::memory_order_relaxed ); m_pThreadDataStore[threadNum].threadState.store( ENKI_THREAD_STATE_WAIT_NEW_PINNED_TASKS, std::memory_order_seq_cst ); // check if have tasks inside threadState change but before waiting bool bHavePinnedTasks = false; for( int priority = 0; priority < TASK_PRIORITY_NUM; ++priority ) { if( !m_pPinnedTaskListPerThread[ priority ][ threadNum ].IsListEmpty() ) { bHavePinnedTasks = true; break; } } if( !bHavePinnedTasks ) { SafeCallback( m_Config.profilerCallbacks.waitForNewTaskSuspendStart, threadNum ); SemaphoreWait( *m_pThreadDataStore[threadNum].pWaitNewPinnedTaskSemaphore ); SafeCallback( m_Config.profilerCallbacks.waitForNewTaskSuspendStop, threadNum ); } m_pThreadDataStore[threadNum].threadState.store( prevThreadState, std::memory_order_release ); } uint32_t TaskScheduler::GetNumTaskThreads() const { return m_NumThreads; } uint32_t TaskScheduler::GetThreadNum() const { return gtl_threadNum; } template T* TaskScheduler::NewArray( size_t num_, const char* file_, int line_ ) { T* pRet = (T*)m_Config.customAllocator.alloc( 
alignof(T), num_*sizeof(T), m_Config.customAllocator.userData, file_, line_ ); if( !std::is_trivial::value ) { T* pCurr = pRet; for( size_t i = 0; i < num_; ++i ) { void* pBuffer = pCurr; pCurr = new(pBuffer) T; ++pCurr; } } return pRet; } template void TaskScheduler::DeleteArray( T* p_, size_t num_, const char* file_, int line_ ) { if( !std::is_trivially_destructible::value ) { size_t i = num_; while(i) { p_[--i].~T(); } } m_Config.customAllocator.free( p_, sizeof(T)*num_, m_Config.customAllocator.userData, file_, line_ ); } template T* TaskScheduler::New( const char* file_, int line_, Args&&... args_ ) { T* pRet = this->Alloc( file_, line_ ); return new(pRet) T( std::forward(args_)... ); } template< typename T > void TaskScheduler::Delete( T* p_, const char* file_, int line_ ) { p_->~T(); this->Free(p_, file_, line_ ); } template< typename T > T* TaskScheduler::Alloc( const char* file_, int line_ ) { T* pRet = (T*)m_Config.customAllocator.alloc( alignof(T), sizeof(T), m_Config.customAllocator.userData, file_, line_ ); return pRet; } template< typename T > void TaskScheduler::Free( T* p_, const char* file_, int line_ ) { m_Config.customAllocator.free( p_, sizeof(T), m_Config.customAllocator.userData, file_, line_ ); } TaskScheduler::TaskScheduler() : m_pPipesPerThread() , m_pPinnedTaskListPerThread() , m_NumThreads(0) , m_pThreadDataStore(NULL) , m_pThreads(NULL) , m_bRunning(false) , m_NumInternalTaskThreadsRunning(0) , m_NumThreadsWaitingForNewTasks(0) , m_NumThreadsWaitingForTaskCompletion(0) , m_NumPartitions(0) , m_pNewTaskSemaphore(NULL) , m_pTaskCompleteSemaphore(NULL) , m_NumInitialPartitions(0) , m_bHaveThreads(false) , m_NumExternalTaskThreadsRegistered(0) { } TaskScheduler::~TaskScheduler() { StopThreads( true ); // Stops threads, waiting for them. } void TaskScheduler::Initialize( uint32_t numThreadsTotal_ ) { ENKI_ASSERT( numThreadsTotal_ >= 1 ); StopThreads( true ); // Stops threads, waiting for them. m_Config.numTaskThreadsToCreate = numThreadsTotal_ - 1; m_Config.numExternalTaskThreads = 0; StartThreads();} void TaskScheduler::Initialize( TaskSchedulerConfig config_ ) { StopThreads( true ); // Stops threads, waiting for them. m_Config = config_; StartThreads(); } void TaskScheduler::Initialize() { Initialize( std::thread::hardware_concurrency() ); } // Semaphore implementation #ifdef _WIN32 #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif #ifndef NOMINMAX #define NOMINMAX #endif #include namespace enki { struct semaphoreid_t { HANDLE sem; }; inline void SemaphoreCreate( semaphoreid_t& semaphoreid ) { #ifdef _XBOX_ONE semaphoreid.sem = CreateSemaphoreExW( NULL, 0, MAXLONG, NULL, 0, SEMAPHORE_ALL_ACCESS ); #else semaphoreid.sem = CreateSemaphore( NULL, 0, MAXLONG, NULL ); #endif } inline void SemaphoreClose( semaphoreid_t& semaphoreid ) { CloseHandle( semaphoreid.sem ); } inline void SemaphoreWait( semaphoreid_t& semaphoreid ) { DWORD retval = WaitForSingleObject( semaphoreid.sem, INFINITE ); ENKI_ASSERT( retval != WAIT_FAILED ); (void)retval; // only needed for ENKI_ASSERT } inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ) { if( countWaiting ) { ReleaseSemaphore( semaphoreid.sem, countWaiting, NULL ); } } } #elif defined(__MACH__) // OS X does not have POSIX semaphores // Mach semaphores can now only be created by the kernel // Named semaphores work, but would require unique name construction to ensure // they are isolated to this process. // Dispatch semaphores appear to be the way other developers use OSX Semaphores, e.g. 
Boost // However the API could change // OSX below 10.6 does not support dispatch, but I do not have an earlier OSX version // to test alternatives #include namespace enki { struct semaphoreid_t { dispatch_semaphore_t sem; }; inline void SemaphoreCreate( semaphoreid_t& semaphoreid ) { semaphoreid.sem = dispatch_semaphore_create(0); } inline void SemaphoreClose( semaphoreid_t& semaphoreid ) { dispatch_release( semaphoreid.sem ); } inline void SemaphoreWait( semaphoreid_t& semaphoreid ) { dispatch_semaphore_wait( semaphoreid.sem, DISPATCH_TIME_FOREVER ); } inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ) { while( countWaiting-- > 0 ) { dispatch_semaphore_signal( semaphoreid.sem ); } } } #else // POSIX #include #include namespace enki { struct semaphoreid_t { sem_t sem; }; inline void SemaphoreCreate( semaphoreid_t& semaphoreid ) { int err = sem_init( &semaphoreid.sem, 0, 0 ); ENKI_ASSERT( err == 0 ); (void)err; } inline void SemaphoreClose( semaphoreid_t& semaphoreid ) { sem_destroy( &semaphoreid.sem ); } inline void SemaphoreWait( semaphoreid_t& semaphoreid ) { while( sem_wait( &semaphoreid.sem ) == -1 && errno == EINTR ) {} } inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ) { while( countWaiting-- > 0 ) { sem_post( &semaphoreid.sem ); } } } #endif semaphoreid_t* TaskScheduler::SemaphoreNew() { semaphoreid_t* pSemaphore = this->Alloc( ENKI_FILE_AND_LINE ); SemaphoreCreate( *pSemaphore ); return pSemaphore; } void TaskScheduler::SemaphoreDelete( semaphoreid_t* pSemaphore_ ) { SemaphoreClose( *pSemaphore_ ); this->Free( pSemaphore_, ENKI_FILE_AND_LINE ); } void TaskScheduler::SetCustomAllocator( CustomAllocator customAllocator_ ) { m_Config.customAllocator = customAllocator_; } Dependency::Dependency( const ICompletable* pDependencyTask_, ICompletable* pTaskToRunOnCompletion_ ) : pTaskToRunOnCompletion( pTaskToRunOnCompletion_ ) , pDependencyTask( pDependencyTask_ ) , pNext( pDependencyTask->m_pDependents ) { ENKI_ASSERT( pDependencyTask->GetIsComplete() ); ENKI_ASSERT( pTaskToRunOnCompletion->GetIsComplete() ); pDependencyTask->m_pDependents = this; ++pTaskToRunOnCompletion->m_DependenciesCount; } Dependency::Dependency( Dependency&& rhs_ ) noexcept { pDependencyTask = rhs_.pDependencyTask; pTaskToRunOnCompletion = rhs_.pTaskToRunOnCompletion; pNext = rhs_.pNext; if( rhs_.pDependencyTask ) { ENKI_ASSERT( rhs_.pTaskToRunOnCompletion ); ENKI_ASSERT( rhs_.pDependencyTask->GetIsComplete() ); ENKI_ASSERT( rhs_.pTaskToRunOnCompletion->GetIsComplete() ); Dependency** ppDependent = &(pDependencyTask->m_pDependents); while( *ppDependent ) { if( &rhs_ == *ppDependent ) { *ppDependent = this; break; } ppDependent = &((*ppDependent)->pNext); } } } Dependency::~Dependency() { ClearDependency(); } void Dependency::SetDependency( const ICompletable* pDependencyTask_, ICompletable* pTaskToRunOnCompletion_ ) { ClearDependency(); ENKI_ASSERT( pDependencyTask_->GetIsComplete() ); ENKI_ASSERT( pTaskToRunOnCompletion_->GetIsComplete() ); pDependencyTask = pDependencyTask_; pTaskToRunOnCompletion = pTaskToRunOnCompletion_; pNext = pDependencyTask->m_pDependents; pDependencyTask->m_pDependents = this; ++pTaskToRunOnCompletion->m_DependenciesCount; } void Dependency::ClearDependency() { if( pDependencyTask ) { ENKI_ASSERT( pTaskToRunOnCompletion ); ENKI_ASSERT( pDependencyTask->GetIsComplete() ); ENKI_ASSERT( pTaskToRunOnCompletion->GetIsComplete() ); ENKI_ASSERT( pTaskToRunOnCompletion->m_DependenciesCount > 0 ); Dependency* pDependent = 
pDependencyTask->m_pDependents; --pTaskToRunOnCompletion->m_DependenciesCount; if( this == pDependent ) { pDependencyTask->m_pDependents = pDependent->pNext; } else { while( pDependent ) { Dependency* pPrev = pDependent; pDependent = pDependent->pNext; if( this == pDependent ) { pPrev->pNext = pDependent->pNext; break; } } } } pDependencyTask = NULL; pDependencyTask = NULL; pNext = NULL; } RenderKit-rkcommon-988718e/rkcommon/tasking/detail/enkiTS/TaskScheduler.h000066400000000000000000000705111467524601100263730ustar00rootroot00000000000000// Copyright (c) 2013 Doug Binks // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. #pragma once #include #include #include #include #include // ENKITS_TASK_PRIORITIES_NUM can be set from 1 to 5. // 1 corresponds to effectively no priorities. #ifndef ENKITS_TASK_PRIORITIES_NUM #define ENKITS_TASK_PRIORITIES_NUM 3 #endif #ifndef ENKITS_API #if defined(_WIN32) && defined(ENKITS_BUILD_DLL) // Building enkiTS as a DLL #define ENKITS_API __declspec(dllexport) #elif defined(_WIN32) && defined(ENKITS_DLL) // Using enkiTS as a DLL #define ENKITS_API __declspec(dllimport) #elif defined(__GNUC__) && defined(ENKITS_BUILD_DLL) // Building enkiTS as a shared library #define ENKITS_API __attribute__((visibility("default"))) #else #define ENKITS_API #endif #endif // Define ENKI_CUSTOM_ALLOC_FILE_AND_LINE (at project level) to get file and line report in custom allocators, // this is default in Debug - to turn off define ENKI_CUSTOM_ALLOC_NO_FILE_AND_LINE #ifndef ENKI_CUSTOM_ALLOC_FILE_AND_LINE #if defined(_DEBUG ) && !defined(ENKI_CUSTOM_ALLOC_NO_FILE_AND_LINE) #define ENKI_CUSTOM_ALLOC_FILE_AND_LINE #endif #endif #ifndef ENKI_ASSERT #include #define ENKI_ASSERT(x) assert(x) #endif #if (!defined(_MSVC_LANG) && __cplusplus >= 201402L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) #define ENKI_DEPRECATED [[deprecated]] #else #define ENKI_DEPRECATED #endif namespace enki { struct TaskSetPartition { uint32_t start; uint32_t end; }; class TaskScheduler; class TaskPipe; class PinnedTaskList; class Dependency; struct ThreadArgs; struct ThreadDataStore; struct SubTaskSet; struct semaphoreid_t; static constexpr uint32_t NO_THREAD_NUM = 0xFFFFFFFF; ENKITS_API uint32_t GetNumHardwareThreads(); enum TaskPriority { TASK_PRIORITY_HIGH = 0, #if ( ENKITS_TASK_PRIORITIES_NUM > 3 ) TASK_PRIORITY_MED_HI, #endif #if ( ENKITS_TASK_PRIORITIES_NUM > 2 ) TASK_PRIORITY_MED, #endif #if ( ENKITS_TASK_PRIORITIES_NUM > 4 ) TASK_PRIORITY_MED_LO, #endif #if ( ENKITS_TASK_PRIORITIES_NUM > 1 ) TASK_PRIORITY_LOW, #endif TASK_PRIORITY_NUM }; // ICompletable is a base class used to check for completion. // Can be used with dependencies to wait for their completion. 
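// Illustrative sketch (hypothetical TaskA/TaskB task-set types, 'ts' = an initialized TaskScheduler):
//     TaskA a; TaskB b; enki::Dependency dep;
//     b.SetDependency( dep, &a );   // b is launched automatically once a completes
//     ts.AddTaskSetToPipe( &a );    // only the head of the chain is added explicitly
//     ts.WaitforTask( &b );         // waiting on the tail waits for the whole chain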
// Derive from ITaskSet or IPinnedTask for running parallel tasks. class ICompletable { public: bool GetIsComplete() const { return 0 == m_RunningCount.load( std::memory_order_acquire ); } virtual ~ICompletable(); // Dependency helpers, see Dependencies.cpp void SetDependency( Dependency& dependency_, const ICompletable* pDependencyTask_ ); template void SetDependenciesArr( D& dependencyArray_ , const T(&taskArray_)[SIZE] ); template void SetDependenciesArr( D& dependencyArray_, std::initializer_list taskpList_ ); template void SetDependenciesArr( D(&dependencyArray_)[SIZE], const T(&taskArray_)[SIZE] ); template void SetDependenciesArr( D(&dependencyArray_)[SIZE], std::initializer_list taskpList_ ); template void SetDependenciesVec( D& dependencyVec_, const T(&taskArray_)[SIZE] ); template void SetDependenciesVec( D& dependencyVec_, std::initializer_list taskpList_ ); TaskPriority m_Priority = TASK_PRIORITY_HIGH; protected: // Deriving from an ICompletable and overriding OnDependenciesComplete is advanced use. // If you do override OnDependenciesComplete() call: // ICompletable::OnDependenciesComplete( pTaskScheduler_, threadNum_ ); // in your implementation. virtual void OnDependenciesComplete( TaskScheduler* pTaskScheduler_, uint32_t threadNum_ ); private: friend class TaskScheduler; friend class Dependency; std::atomic m_RunningCount = {0}; std::atomic m_DependenciesCompletedCount = {0}; int32_t m_DependenciesCount = 0; mutable std::atomic m_WaitingForTaskCount = {0}; mutable Dependency* m_pDependents = NULL; }; // Subclass ITaskSet to create tasks. // TaskSets can be re-used, but check completion first. class ITaskSet : public ICompletable { public: ITaskSet() = default; ITaskSet( uint32_t setSize_ ) : m_SetSize( setSize_ ) {} ITaskSet( uint32_t setSize_, uint32_t minRange_ ) : m_SetSize( setSize_ ) , m_MinRange( minRange_ ) , m_RangeToRun(minRange_) {} // Execute range should be overloaded to process tasks. It will be called with a // range_ where range.start >= 0; range.start < range.end; and range.end < m_SetSize; // The range values should be mapped so that linearly processing them in order is cache friendly // i.e. neighbouring values should be close together. // threadnum_ should not be used for changing processing of data, its intended purpose // is to allow per-thread data buckets for output. virtual void ExecuteRange( TaskSetPartition range_, uint32_t threadnum_ ) = 0; // Set Size - usually the number of data items to be processed, see ExecuteRange. Defaults to 1 uint32_t m_SetSize = 1; // Min Range - Minimum size of TaskSetPartition range when splitting a task set into partitions. // Designed for reducing scheduling overhead by preventing set being // divided up too small. Ranges passed to ExecuteRange will *not* be a multiple of this, // only attempts to deliver range sizes larger than this most of the time. // This should be set to a value which results in computation effort of at least 10k // clock cycles to minimize task scheduler overhead. // NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple // of m_MinRange. // Also known as grain size in literature. uint32_t m_MinRange = 1; private: friend class TaskScheduler; void OnDependenciesComplete( TaskScheduler* pTaskScheduler_, uint32_t threadNum_ ) final; uint32_t m_RangeToRun = 1; }; // Subclass IPinnedTask to create tasks which can be run on a given thread only. 
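// Illustrative sketch (hypothetical IOTask type and thread choice, 'ts' = an initialized TaskScheduler):
//     struct IOTask : enki::IPinnedTask {
//         IOTask( uint32_t threadNum_ ) : IPinnedTask( threadNum_ ) {}
//         void Execute() override { /* work which must stay on the chosen thread */ }
//     };
//     IOTask ioTask( 1 );            // pin to task thread 1
//     ts.AddPinnedTask( &ioTask );   // runs when thread 1 calls RunPinnedTasks() or picks up tasks
//     ts.WaitforTask( &ioTask );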
class IPinnedTask : public ICompletable { public: IPinnedTask() = default; IPinnedTask( uint32_t threadNum_ ) : threadNum(threadNum_) {} // default is to run a task on main thread // IPinnedTask needs to be non-abstract for intrusive list functionality. // Should never be called as is, should be overridden. virtual void Execute() { ENKI_ASSERT(false); } uint32_t threadNum = 0; // thread to run this pinned task on std::atomic pNext = {NULL}; private: void OnDependenciesComplete( TaskScheduler* pTaskScheduler_, uint32_t threadNum_ ) final; }; // TaskSet - a utility task set for creating tasks based on std::function. typedef std::function TaskSetFunction; class TaskSet : public ITaskSet { public: TaskSet() = default; TaskSet( TaskSetFunction func_ ) : m_Function( std::move(func_) ) {} TaskSet( uint32_t setSize_, TaskSetFunction func_ ) : ITaskSet( setSize_ ), m_Function( std::move(func_) ) {} void ExecuteRange( TaskSetPartition range_, uint32_t threadnum_ ) override { m_Function( range_, threadnum_ ); } TaskSetFunction m_Function; }; // LambdaPinnedTask - a utility pinned task for creating tasks based on std::func. typedef std::function PinnedTaskFunction; class LambdaPinnedTask : public IPinnedTask { public: LambdaPinnedTask() = default; LambdaPinnedTask( PinnedTaskFunction func_ ) : m_Function( std::move(func_) ) {} LambdaPinnedTask( uint32_t threadNum_, PinnedTaskFunction func_ ) : IPinnedTask( threadNum_ ), m_Function( std::move(func_) ) {} void Execute() override { m_Function(); } PinnedTaskFunction m_Function; }; class Dependency { public: Dependency() = default; Dependency( const Dependency& ) = delete; ENKITS_API Dependency( Dependency&& ) noexcept; ENKITS_API Dependency( const ICompletable* pDependencyTask_, ICompletable* pTaskToRunOnCompletion_ ); ENKITS_API ~Dependency(); ENKITS_API void SetDependency( const ICompletable* pDependencyTask_, ICompletable* pTaskToRunOnCompletion_ ); ENKITS_API void ClearDependency(); ICompletable* GetTaskToRunOnCompletion() { return pTaskToRunOnCompletion; } const ICompletable* GetDependencyTask() { return pDependencyTask; } private: friend class TaskScheduler; friend class ICompletable; ICompletable* pTaskToRunOnCompletion = NULL; const ICompletable* pDependencyTask = NULL; Dependency* pNext = NULL; }; // TaskScheduler implements several callbacks intended for profilers typedef void (*ProfilerCallbackFunc)( uint32_t threadnum_ ); struct ProfilerCallbacks { ProfilerCallbackFunc threadStart; ProfilerCallbackFunc threadStop; ProfilerCallbackFunc waitForNewTaskSuspendStart; // thread suspended waiting for new tasks ProfilerCallbackFunc waitForNewTaskSuspendStop; // thread unsuspended ProfilerCallbackFunc waitForTaskCompleteStart; // thread waiting for task completion ProfilerCallbackFunc waitForTaskCompleteStop; // thread stopped waiting ProfilerCallbackFunc waitForTaskCompleteSuspendStart; // thread suspended waiting task completion ProfilerCallbackFunc waitForTaskCompleteSuspendStop; // thread unsuspended }; // Custom allocator, set in TaskSchedulerConfig. 
Also see ENKI_CUSTOM_ALLOC_FILE_AND_LINE for file_ and line_ typedef void* (*AllocFunc)( size_t align_, size_t size_, void* userData_, const char* file_, int line_ ); typedef void (*FreeFunc)( void* ptr_, size_t size_, void* userData_, const char* file_, int line_ ); ENKITS_API void* DefaultAllocFunc( size_t align_, size_t size_, void* userData_, const char* file_, int line_ ); ENKITS_API void DefaultFreeFunc( void* ptr_, size_t size_, void* userData_, const char* file_, int line_ ); struct CustomAllocator { AllocFunc alloc = DefaultAllocFunc; FreeFunc free = DefaultFreeFunc; void* userData = nullptr; }; // TaskSchedulerConfig - configuration struct for advanced Initialize struct TaskSchedulerConfig { // numTaskThreadsToCreate - Number of tasking threads the task scheduler will create. Must be > 0. // Defaults to GetNumHardwareThreads()-1 threads as thread which calls initialize is thread 0. uint32_t numTaskThreadsToCreate = GetNumHardwareThreads()-1; // numExternalTaskThreads - Advanced use. Number of external threads which need to use TaskScheduler API. // See TaskScheduler::RegisterExternalTaskThread() for usage. // Defaults to 0. The thread used to initialize the TaskScheduler can also use the TaskScheduler API. // Thus there are (numTaskThreadsToCreate + numExternalTaskThreads + 1) able to use the API, with this // defaulting to the number of hardware threads available to the system. uint32_t numExternalTaskThreads = 0; ProfilerCallbacks profilerCallbacks = {}; CustomAllocator customAllocator; }; class TaskScheduler { public: ENKITS_API TaskScheduler(); ENKITS_API ~TaskScheduler(); // Call an Initialize function before adding tasks. // Initialize() will create GetNumHardwareThreads()-1 tasking threads, which is // sufficient to fill the system when including the main thread. // Initialize can be called multiple times - it will wait for completion // before re-initializing. ENKITS_API void Initialize(); // Initialize( numThreadsTotal_ ) // will create numThreadsTotal_-1 threads, as thread 0 is // the thread on which the initialize was called. // numThreadsTotal_ must be > 0 ENKITS_API void Initialize( uint32_t numThreadsTotal_ ); // Initialize with advanced TaskSchedulerConfig settings. See TaskSchedulerConfig. ENKITS_API void Initialize( TaskSchedulerConfig config_ ); // Get config. Can be called before Initialize to get the defaults. ENKITS_API TaskSchedulerConfig GetConfig() const; // while( !GetIsShutdownRequested() ) {} can be used in tasks which loop, to check if enkiTS has been requested to shutdown. // If GetIsShutdownRequested() returns true should then exit. Not required for finite tasks // Safe to use with WaitforAllAndShutdown() and ShutdownNow() where this will be set // Not safe to use with WaitforAll(), use GetIsWaitforAllCalled() instead. inline bool GetIsShutdownRequested() const { return m_bShutdownRequested.load( std::memory_order_acquire ); } // while( !GetIsWaitforAllCalled() ) {} can be used in tasks which loop, to check if WaitforAll() has been called. // If GetIsWaitforAllCalled() returns false should then exit. Not required for finite tasks // This is intended to be used with code which calls WaitforAll(). // This is also set when the task manager is shutting down, so no need to have an additional check for GetIsShutdownRequested() inline bool GetIsWaitforAllCalled() const { return m_bWaitforAllCalled.load( std::memory_order_acquire ); } // Adds the TaskSet to pipe and returns if the pipe is not full. // If the pipe is full, pTaskSet is run. 
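    // Illustrative usage with the TaskSet lambda wrapper (hypothetical 'data' buffer and Process()
    // function, 'ts' = an initialized TaskScheduler):
    //     enki::TaskSet task( 1024, [&]( enki::TaskSetPartition range_, uint32_t threadnum_ ) {
    //         for( uint32_t i = range_.start; i < range_.end; ++i ) { data[i] = Process( data[i] ); }
    //     } );
    //     ts.AddTaskSetToPipe( &task );
    //     ts.WaitforTask( &task );    // task sets can be reused once complete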
// should only be called from main thread, or within a task ENKITS_API void AddTaskSetToPipe( ITaskSet* pTaskSet_ ); // Thread 0 is main thread, otherwise use threadNum // Pinned tasks can be added from any thread ENKITS_API void AddPinnedTask( IPinnedTask* pTask_ ); // This function will run any IPinnedTask* for current thread, but not run other // Main thread should call this or use a wait to ensure its tasks are run. ENKITS_API void RunPinnedTasks(); // Runs the TaskSets in pipe until true == pTaskSet->GetIsComplete(); // Should only be called from thread which created the task scheduler, or within a task. // If called with 0 it will try to run tasks, and return if none are available. // To run only a subset of tasks, set priorityOfLowestToRun_ to a high priority. // Default is lowest priority available. // Only wait for child tasks of the current task otherwise a deadlock could occur. // WaitforTask will exit if ShutdownNow() is called even if pCompletable_ is not complete. ENKITS_API void WaitforTask( const ICompletable* pCompletable_, enki::TaskPriority priorityOfLowestToRun_ = TaskPriority(TASK_PRIORITY_NUM - 1) ); // Waits for all task sets to complete - not guaranteed to work unless we know we // are in a situation where tasks aren't being continuously added. // If you are running tasks which loop, make sure to check GetIsWaitforAllCalled() and exit // WaitforAll will exit if ShutdownNow() is called even if there are still tasks to run or currently running ENKITS_API void WaitforAll(); // Waits for all task sets to complete and shutdown threads - not guaranteed to work unless we know we // are in a situation where tasks aren't being continuously added. // This function can be safely called even if TaskScheduler::Initialize() has not been called. ENKITS_API void WaitforAllAndShutdown(); // Shutdown threads without waiting for all tasks to complete. // Intended to be used to exit an application quickly. // This function can be safely called even if TaskScheduler::Initialize() has not been called. // This function will still wait for any running tasks to exit before the task threads exit. // ShutdownNow will cause tasks which have been added to the scheduler but not completed // to be in an undefined state in which should not be re-launched. ENKITS_API void ShutdownNow(); // Waits for the current thread to receive a PinnedTask. // Will not run any tasks - use with RunPinnedTasks(). // Can be used with both ExternalTaskThreads or with an enkiTS tasking thread to create // a thread which only runs pinned tasks. If enkiTS threads are used can create // extra enkiTS task threads to handle non-blocking computation via normal tasks. ENKITS_API void WaitForNewPinnedTasks(); // Returns the number of threads created for running tasks + number of external threads // plus 1 to account for the thread used to initialize the task scheduler. // Equivalent to config values: numTaskThreadsToCreate + numExternalTaskThreads + 1. // It is guaranteed that GetThreadNum() < GetNumTaskThreads() ENKITS_API uint32_t GetNumTaskThreads() const; // Returns the current task threadNum. // Will return 0 for thread which initialized the task scheduler, // and NO_THREAD_NUM for all other non-enkiTS threads which have not been registered ( see RegisterExternalTaskThread() ), // and < GetNumTaskThreads() for all registered and internal enkiTS threads. 
// It is guaranteed that GetThreadNum() < GetNumTaskThreads() unless it is NO_THREAD_NUM ENKITS_API uint32_t GetThreadNum() const; // Call on a thread to register the thread to use the TaskScheduling API. // This is implicitly done for the thread which initializes the TaskScheduler // Intended for developers who have threads who need to call the TaskScheduler API // Returns true if successful, false if not. // Can only have numExternalTaskThreads registered at any one time, which must be set // at initialization time. ENKITS_API bool RegisterExternalTaskThread(); // As RegisterExternalTaskThread() but explicitly requests a given thread number. // threadNumToRegister_ must be >= GetNumFirstExternalTaskThread() // and < ( GetNumFirstExternalTaskThread() + numExternalTaskThreads ). ENKITS_API bool RegisterExternalTaskThread( uint32_t threadNumToRegister_ ); // Call on a thread on which RegisterExternalTaskThread has been called to deregister that thread. ENKITS_API void DeRegisterExternalTaskThread(); // Get the number of registered external task threads. ENKITS_API uint32_t GetNumRegisteredExternalTaskThreads(); // Get the thread number of the first external task thread. This thread // is not guaranteed to be registered, but threads are registered in order // from GetNumFirstExternalTaskThread() up to ( GetNumFirstExternalTaskThread() + numExternalTaskThreads ) // Note that if numExternalTaskThreads == 0 a for loop using this will be valid: // for( uint32_t externalThreadNum = GetNumFirstExternalTaskThread(); // externalThreadNum < ( GetNumFirstExternalTaskThread() + numExternalTaskThreads // ++externalThreadNum ) { // do something with externalThreadNum } inline static constexpr uint32_t GetNumFirstExternalTaskThread() { return 1; } // ------------- Start DEPRECATED Functions ------------- // DEPRECATED: use GetIsShutdownRequested() instead of GetIsRunning() in external code // while( GetIsRunning() ) {} can be used in tasks which loop, to check if enkiTS has been shutdown. // If GetIsRunning() returns false should then exit. Not required for finite tasks. ENKI_DEPRECATED inline bool GetIsRunning() const { return !GetIsShutdownRequested(); } // DEPRECATED - WaitforTaskSet, deprecated interface use WaitforTask. ENKI_DEPRECATED inline void WaitforTaskSet( const ICompletable* pCompletable_ ) { WaitforTask( pCompletable_ ); } // DEPRECATED - GetProfilerCallbacks. Use TaskSchedulerConfig instead. // Returns the ProfilerCallbacks structure so that it can be modified to // set the callbacks. Should be set prior to initialization. 
ENKI_DEPRECATED inline ProfilerCallbacks* GetProfilerCallbacks() { return &m_Config.profilerCallbacks; } // ------------- End DEPRECATED Functions ------------- private: friend class ICompletable; friend class ITaskSet; friend class IPinnedTask; static void TaskingThreadFunction( const ThreadArgs& args_ ); bool HaveTasks( uint32_t threadNum_ ); void WaitForNewTasks( uint32_t threadNum_ ); void WaitForTaskCompletion( const ICompletable* pCompletable_, uint32_t threadNum_ ); void RunPinnedTasks( uint32_t threadNum_, uint32_t priority_ ); bool TryRunTask( uint32_t threadNum_, uint32_t& hintPipeToCheck_io_ ); bool TryRunTask( uint32_t threadNum_, uint32_t priority_, uint32_t& hintPipeToCheck_io_ ); void StartThreads(); void StopThreads( bool bWait_ ); void SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_, uint32_t rangeToSplit_ ); void WakeThreadsForNewTasks(); void WakeThreadsForTaskCompletion(); bool WakeSuspendedThreadsWithPinnedTasks( uint32_t threadNum_ ); void InitDependencies( ICompletable* pCompletable_ ); inline bool GetIsRunningInt() const { return m_bRunning.load( std::memory_order_acquire ); } ENKITS_API void TaskComplete( ICompletable* pTask_, bool bWakeThreads_, uint32_t threadNum_ ); ENKITS_API void AddTaskSetToPipeInt( ITaskSet* pTaskSet_, uint32_t threadNum_ ); ENKITS_API void AddPinnedTaskInt( IPinnedTask* pTask_ ); template< typename T > T* NewArray( size_t num_, const char* file_, int line_ ); template< typename T > void DeleteArray( T* p_, size_t num_, const char* file_, int line_ ); template T* New( const char* file_, int line_, Args&&... args_ ); template< typename T > void Delete( T* p_, const char* file_, int line_ ); template< typename T > T* Alloc( const char* file_, int line_ ); template< typename T > void Free( T* p_, const char* file_, int line_ ); semaphoreid_t* SemaphoreNew(); void SemaphoreDelete( semaphoreid_t* pSemaphore_ ); TaskPipe* m_pPipesPerThread[ TASK_PRIORITY_NUM ]; PinnedTaskList* m_pPinnedTaskListPerThread[ TASK_PRIORITY_NUM ]; uint32_t m_NumThreads; ThreadDataStore* m_pThreadDataStore; std::thread* m_pThreads; std::atomic m_bRunning; std::atomic m_bShutdownRequested; std::atomic m_bWaitforAllCalled; std::atomic m_NumInternalTaskThreadsRunning; std::atomic m_NumThreadsWaitingForNewTasks; std::atomic m_NumThreadsWaitingForTaskCompletion; uint32_t m_NumPartitions; semaphoreid_t* m_pNewTaskSemaphore; semaphoreid_t* m_pTaskCompleteSemaphore; uint32_t m_NumInitialPartitions; bool m_bHaveThreads; TaskSchedulerConfig m_Config; std::atomic m_NumExternalTaskThreadsRegistered; TaskScheduler( const TaskScheduler& nocopy_ ); TaskScheduler& operator=( const TaskScheduler& nocopy_ ); protected: void SetCustomAllocator( CustomAllocator customAllocator_ ); // for C interface }; inline void ICompletable::OnDependenciesComplete( TaskScheduler* pTaskScheduler_, uint32_t threadNum_ ) { m_RunningCount.fetch_sub( 1, std::memory_order_acq_rel ); pTaskScheduler_->TaskComplete( this, true, threadNum_ ); } inline void ITaskSet::OnDependenciesComplete( TaskScheduler* pTaskScheduler_, uint32_t threadNum_ ) { pTaskScheduler_->AddTaskSetToPipeInt( this, threadNum_ ); } inline void IPinnedTask::OnDependenciesComplete( TaskScheduler* pTaskScheduler_, uint32_t threadNum_ ) { (void)threadNum_; pTaskScheduler_->AddPinnedTaskInt( this ); } inline ICompletable::~ICompletable() { ENKI_ASSERT( GetIsComplete() ); // this task is still waiting to run Dependency* pDependency = m_pDependents; while( pDependency ) { Dependency* pNext = pDependency->pNext; pDependency->pDependencyTask 
= NULL; pDependency->pNext = NULL; pDependency = pNext; } } inline void ICompletable::SetDependency( Dependency& dependency_, const ICompletable* pDependencyTask_ ) { ENKI_ASSERT( pDependencyTask_ != this ); dependency_.SetDependency( pDependencyTask_, this ); } template void ICompletable::SetDependenciesArr( D& dependencyArray_ , const T(&taskArray_)[SIZE] ) { static_assert( std::tuple_size::value >= SIZE, "Size of dependency array too small" ); for( int i = 0; i < SIZE; ++i ) { dependencyArray_[i].SetDependency( &taskArray_[i], this ); } } template void ICompletable::SetDependenciesArr( D& dependencyArray_, std::initializer_list taskpList_ ) { ENKI_ASSERT( std::tuple_size::value >= taskpList_.size() ); int i = 0; for( auto pTask : taskpList_ ) { dependencyArray_[i++].SetDependency( pTask, this ); } } template void ICompletable::SetDependenciesArr( D(&dependencyArray_)[SIZE], const T(&taskArray_)[SIZE] ) { for( int i = 0; i < SIZE; ++i ) { dependencyArray_[i].SetDependency( &taskArray_[i], this ); } } template void ICompletable::SetDependenciesArr( D(&dependencyArray_)[SIZE], std::initializer_list taskpList_ ) { ENKI_ASSERT( SIZE >= taskpList_.size() ); int i = 0; for( auto pTask : taskpList_ ) { dependencyArray_[i++].SetDependency( pTask, this ); } } template void ICompletable::SetDependenciesVec( D& dependencyVec_, const T(&taskArray_)[SIZE] ) { dependencyVec_.resize( SIZE ); for( int i = 0; i < SIZE; ++i ) { dependencyVec_[i].SetDependency( &taskArray_[i], this ); } } template void ICompletable::SetDependenciesVec( D& dependencyVec_, std::initializer_list taskpList_ ) { dependencyVec_.resize( taskpList_.size() ); int i = 0; for( auto pTask : taskpList_ ) { dependencyVec_[i++].SetDependency( pTask, this ); } } } RenderKit-rkcommon-988718e/rkcommon/tasking/detail/parallel_for.inl000066400000000000000000000022131467524601100254240ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #ifdef RKCOMMON_TASKING_TBB # define __TBB_NO_IMPLICIT_LINKAGE 1 # define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 # include #elif defined(RKCOMMON_TASKING_INTERNAL) # include "TaskSys.h" #endif namespace rkcommon { namespace tasking { namespace detail { template inline void parallel_for_impl(INDEX_T nTasks, TASK_T&& fcn) { #ifdef RKCOMMON_TASKING_TBB tbb::parallel_for(INDEX_T(0), nTasks, std::forward(fcn)); #elif defined(RKCOMMON_TASKING_OMP) # pragma omp parallel for schedule(dynamic) for (INDEX_T taskIndex = 0; taskIndex < nTasks; ++taskIndex) { fcn(taskIndex); } #elif defined(RKCOMMON_TASKING_INTERNAL) detail::parallel_for_internal(nTasks, std::forward(fcn)); #else // Debug (no tasking system) for (INDEX_T taskIndex = 0; taskIndex < nTasks; ++taskIndex) { fcn(taskIndex); } #endif } } // ::rkcommon::tasking::detail } // ::rkcommon::tasking } // ::rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/detail/schedule.inl000066400000000000000000000016061467524601100245630ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #ifdef RKCOMMON_TASKING_TBB # define __TBB_NO_IMPLICIT_LINKAGE 1 # define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 # include "tbb/task_arena.h" #elif defined(RKCOMMON_TASKING_OMP) # include #elif defined(RKCOMMON_TASKING_INTERNAL) # include "TaskSys.h" #endif namespace rkcommon { namespace tasking { namespace detail { template inline void schedule_impl(TASK_T fcn) { #ifdef RKCOMMON_TASKING_TBB tbb::task_arena ta = tbb::task_arena(tbb::task_arena::attach()); 
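      // enqueue() submits fcn to that arena and returns without waiting for it to execute.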
ta.enqueue(fcn); #elif defined(RKCOMMON_TASKING_OMP) std::thread thread(fcn); thread.detach(); #else// Internal & Debug --> synchronous! fcn(); #endif } } // ::rkcommon::tasking::detail } // ::rkcommon::tasking } // ::rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/detail/tasking_system_init.cpp000066400000000000000000000064371467524601100270650ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "../tasking_system_init.h" // tasking system internals #if defined(RKCOMMON_TASKING_TBB) #define __TBB_NO_IMPLICIT_LINKAGE 1 #define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 #define TBB_PREVIEW_GLOBAL_CONTROL 1 #include #elif defined(RKCOMMON_TASKING_OMP) #include #elif defined(RKCOMMON_TASKING_INTERNAL) #include "TaskSys.h" #endif // std #include // intrinsics #ifndef RKCOMMON_NO_SIMD #if !defined(__ARM_NEON) #include #elif !defined(_WIN32) #include "math/arm/emulation.h" #endif /* normally defined in pmmintrin.h, but we always need this */ #if !defined(_MM_SET_DENORMALS_ZERO_MODE) #define _MM_DENORMALS_ZERO_ON (0x0040) #define _MM_DENORMALS_ZERO_OFF (0x0000) #define _MM_DENORMALS_ZERO_MASK (0x0040) #define _MM_SET_DENORMALS_ZERO_MODE(x) \ (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) #endif #else #if !defined(_MM_SET_DENORMALS_ZERO_MODE) #define _MM_SET_FLUSH_ZERO_MODE(x) \ do { \ } while (0) #define _MM_SET_DENORMALS_ZERO_MODE(x) \ do { \ } while (0) #endif #endif // rkcommon #include "../../common.h" namespace rkcommon { namespace tasking { struct tasking_system_handle { tasking_system_handle(int numThreads) : numThreads(numThreads) { #if defined(RKCOMMON_TASKING_TBB) if (numThreads > 0) tbb_gc = make_unique( tbb::global_control::max_allowed_parallelism, numThreads); #elif defined(RKCOMMON_TASKING_OMP) if (numThreads > 0) omp_set_num_threads(numThreads); #elif defined(RKCOMMON_TASKING_INTERNAL) detail::initTaskSystemInternal(numThreads <= 0 ? -1 : numThreads); #endif } ~tasking_system_handle() { #if defined(RKCOMMON_TASKING_TBB) tbb_gc.reset(); #elif defined(RKCOMMON_TASKING_INTERNAL) detail::shutdownTaskSystemInternal(); #endif } int num_threads() { #if defined(RKCOMMON_TASKING_TBB) return tbb::global_control::active_value( tbb::global_control::max_allowed_parallelism); #elif defined(RKCOMMON_TASKING_OMP) return omp_get_max_threads(); #elif defined(RKCOMMON_TASKING_INTERNAL) return detail::numThreadsTaskSystemInternal(); #else return 1; #endif } int numThreads{-1}; #if defined(RKCOMMON_TASKING_TBB) std::unique_ptr tbb_gc; #endif }; static std::unique_ptr g_tasking_handle; void initTaskingSystem(int numThreads, bool flushDenormals) { if (flushDenormals) { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); } g_tasking_handle = make_unique(numThreads); } void shutdownTaskingSystem() { g_tasking_handle.reset(); } int numTaskingThreads() { if (!g_tasking_handle.get()) return 0; else return g_tasking_handle->num_threads(); } } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/parallel_for.h000066400000000000000000000062711467524601100236370ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../traits/rktraits.h" #include "detail/parallel_for.inl" #include namespace rkcommon { namespace tasking { // NOTE(jda) - This abstraction wraps "fork-join" parallelism, with an // implied synchronizsation after all of the tasks have run. 
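  // Illustrative usage (hypothetical 'in'/'out' buffers and transform() function):
  //     rkcommon::tasking::parallel_for(numItems, [&](size_t i) { out[i] = transform(in[i]); });
  // Every index in [0, numItems) has been processed by the time the call returns.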
template inline void parallel_for(INDEX_T nTasks, TASK_T &&fcn) { using namespace traits; static_assert(is_valid_index::value, "rkcommon::tasking::parallel_for() requires the type" " INDEX_T to be unsigned char, short, int, uint, long," " long long, unsigned long long or size_t."); static_assert(has_operator_method_matching_param::value, "rkcommon::tasking::parallel_for() requires the " "implementation of method " "'void TASK_T::operator(P taskIndex), where P is of " "type INDEX_T [first parameter of parallel_for()]."); detail::parallel_for_impl(nTasks, std::forward(fcn)); } // NOTE(jda) - Allow serial version of parallel_for() without the need to // change the entire tasking system backend template inline void serial_for(INDEX_T nTasks, const TASK_T &fcn) { using namespace traits; static_assert(is_valid_index::value, "rkcommon::tasking::serial_for() requires the type" " INDEX_T to be unsigned char, short, int, uint, long," " long long, unsigned long long or size_t."); static_assert(has_operator_method_matching_param::value, "rkcommon::tasking::serial_for() requires the " "implementation of method " "'void TASK_T::operator(P taskIndex), where P is of " "type INDEX_T [first parameter of serial_for()]."); for (INDEX_T taskIndex = 0; taskIndex < nTasks; ++taskIndex) fcn(taskIndex); } /* NOTE(iw) - This abstraction extends the 'parallel_for' to mixed parallel/serial: we logically view the domain of N input tasks as grouped into roundUp(N/M) blocks of (at most) M items each; then 'itearte over the N/M blocks in parallel, and process each block serailly */ template inline void parallel_in_blocks_of(INDEX_T nTasks, TASK_T &&fcn) { using namespace traits; static_assert(is_valid_index::value, "rkcommon::tasking::parallel_for() requires the type" " INDEX_T to be unsigned char, short, int, uint, long," " or size_t."); INDEX_T numBlocks = (nTasks + BLOCK_SIZE - 1) / BLOCK_SIZE; parallel_for(numBlocks, [&](INDEX_T blockID) { INDEX_T begin = blockID * (INDEX_T)BLOCK_SIZE; INDEX_T end = std::min(begin + (INDEX_T)BLOCK_SIZE, nTasks); fcn(begin, end); }); } } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/parallel_foreach.h000066400000000000000000000020071467524601100244510ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "parallel_for.h" #include namespace rkcommon { namespace tasking { template inline void parallel_foreach(ITERATOR_T begin, ITERATOR_T end, TASK_T &&f) { using ITERATOR_KIND = typename std::iterator_traits::iterator_category; static_assert( std::is_same::value, "rkcommon::tasking::parallel_foreach() requires random-" "access iterators!"); const size_t count = std::distance(begin, end); auto *v = &(*begin); parallel_for(count, [&](size_t i) { f(v[i]); }); } template inline void parallel_foreach(CONTAINER_T &&c, TASK_T &&f) { parallel_foreach(std::begin(c), std::end(c), std::forward(f)); } } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/schedule.h000066400000000000000000000017371467524601100227730ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../traits/rktraits.h" #include "detail/schedule.inl" namespace rkcommon { namespace tasking { // NOTE(jda) - This abstraction takes a lambda which should take captured // variables by *value* to ensure no captured references race // with the task itself. 
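    // Example (illustrative sketch, not part of the original header; processJob()
    // is a hypothetical application function):
    //
    //   int jobID = 42;
    //   rkcommon::tasking::schedule([jobID]() { processJob(jobID); });  // capture by value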
// NOTE(jda) - No priority is associated with this call, but could be added // later with a hint enum, using a default value for the // priority to not require specifying it. template inline void schedule(TASK_T fcn) { static_assert(traits::has_operator_method::value, "rkcommon::tasking::schedule() requires the " "implementation of method 'void TASK_T::operator()'."); detail::schedule_impl(std::move(fcn)); } } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tasking/tasking_system_init.h000066400000000000000000000007151467524601100252610ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" namespace rkcommon { namespace tasking { void RKCOMMON_INTERFACE initTaskingSystem(int numThreads = -1, bool flushDenormals = false); void RKCOMMON_INTERFACE shutdownTaskingSystem(); int RKCOMMON_INTERFACE numTaskingThreads(); } // namespace tasking } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tracing/000077500000000000000000000000001467524601100210055ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/tracing/Tracing.cpp000066400000000000000000000254661467524601100231150ustar00rootroot00000000000000// Copyright 2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include #include #include #include #include #include #include #ifdef _WIN32 #include #endif // We force the define on here to build the right header // at compile time, but apps that build with profiling off // will see the empty defines #define RKCOMMON_ENABLE_PROFILING #include "Tracing.h" #define THREAD_EVENT_CHUNK_SIZE 8192 namespace rkcommon { namespace tracing { using namespace std::chrono; static std::unique_ptr traceRecorder = rkcommon::make_unique(); static thread_local std::shared_ptr threadEventList = nullptr; std::ostream &operator<<(std::ostream &os, const EventType &ty) { switch (ty) { case EventType::INVALID: os << "INVALID"; break; case EventType::BEGIN: os << "B"; break; case EventType::END: os << "E"; break; case EventType::MARKER: os << "i"; break; case EventType::COUNTER: os << "C"; break; default: break; } return os; } TraceEvent::TraceEvent() { #ifdef __linux__ rusage usage; getrusage(RUSAGE_SELF, &usage); ru_utime = usage.ru_utime; ru_stime = usage.ru_stime; #endif time = steady_clock::now(); } TraceEvent::TraceEvent(const EventType ty) : TraceEvent() { type = ty; } TraceEvent::TraceEvent(const EventType type, const char *n, const char *c) : TraceEvent(type) { name = n; category = c; } TraceEvent::TraceEvent( const EventType type, const char *name, const uint64_t value) : TraceEvent(type, name, nullptr) { counterValue = value; } void ThreadEventList::beginEvent(const char *name, const char *category) { getCurrentEventList().push_back(TraceEvent( EventType::BEGIN, getCachedString(name), getCachedString(category))); } void ThreadEventList::endEvent() { getCurrentEventList().push_back(TraceEvent(EventType::END)); } void ThreadEventList::setMarker(const char *name, const char *category) { getCurrentEventList().push_back(TraceEvent( EventType::MARKER, getCachedString(name), getCachedString(category))); } void ThreadEventList::setCounter(const char *name, const uint64_t counterValue) { getCurrentEventList().push_back( TraceEvent(EventType::COUNTER, getCachedString(name), counterValue)); } std::vector &ThreadEventList::getCurrentEventList() { if (events.empty() || events.back().size() >= THREAD_EVENT_CHUNK_SIZE) { events.push_back(std::vector()); 
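        // Start a new fixed-capacity chunk; reserving up front keeps push_back()
        // from reallocating while the chunk fills.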
events.back().reserve(THREAD_EVENT_CHUNK_SIZE); } return events.back(); } const char *ThreadEventList::getCachedString(const char *str) { if (!str) { return nullptr; } // Lookup string in the uniqueEventNames list, since most strings are likely // to just be static/constant data strings (e.g., rkTraceBeginEvent("X")) // this caching is just based on the pointer and we skip doing more // expensive string comparison. Dynamically generated strings will likely // have different ptrs, though this will be wrong if some memory is // re-used with different text content. auto fnd = stringCache.find(str); if (fnd == stringCache.end()) { auto en = std::make_shared(str); stringCache[str] = en; return en->c_str(); } return fnd->second->c_str(); } std::shared_ptr TraceRecorder::getThreadTraceList( const std::thread::id &id) { std::lock_guard lock(threadTraceMutex); auto fnd = threadTrace.find(id); if (fnd == threadTrace.end()) { auto threadEventList = std::make_shared(); threadTrace[id] = threadEventList; return threadEventList; } return fnd->second; } void TraceRecorder::saveLog(const char *logFile, const char *processName) { std::lock_guard lock(threadTraceMutex); // chrome:://tracing / ui.perfetto.dev takes a JSON array of events, but to // keep dependencies down we don't need a JSON library to produce this simple // format std::ofstream fout(logFile); #ifdef _WIN32 const int pid = _getpid(); #else const int pid = getpid(); #endif fout << "["; // Emit metadata about the process name if (processName) { // Emit metadata event for the thread's ID/name fout << "{" << "\"ph\": \"M\"," << "\"pid\":" << pid << "," << "\"tid\":" << 0 << "," << "\"name\":" << "\"process_name\"," << "\"args\":{\"name\":\"" << processName << "\"}" << "},"; } // Go through each thread and output its data // We renumber thread IDs here because chrome:://tracing UI doesn't display // the true thread ID numbers well int nextTid = 0; for (const auto &trace : threadTrace) { const std::thread::id tid = trace.first; // Emit metadata event for the thread's ID/name fout << "{" << "\"ph\": \"M\"," << "\"pid\":" << pid << "," << "\"tid\":" << nextTid << "," << "\"name\":" << "\"thread_name\"," << "\"args\":{\"name\":\""; if (!trace.second->threadName.empty()) { fout << trace.second->threadName << "\"}"; } else { fout << tid << "\"}"; } fout << "},"; // Track the begin events so that when we hit an end we can compute CPU % // and other stats to include std::stack beginEvents; for (const auto &evtChunk : trace.second->events) { for (const auto &evt : evtChunk) { if (evt.type == EventType::INVALID) { std::cerr << "Got invalid event type!?\n"; } if (evt.type == EventType::BEGIN) { beginEvents.push(&evt); } if (evt.type == EventType::END && beginEvents.empty()) { std::cerr << "Tracing Error: Too many rkTraceEndEvent calls!\n"; break; } const uint64_t timestamp = std::chrono::duration_cast( evt.time.time_since_epoch()) .count(); fout << "{" << "\"ph\": \"" << evt.type << "\"," << "\"pid\":" << pid << "," << "\"tid\":" << nextTid << "," << "\"ts\":" << timestamp << "," << "\"name\":\"" << (evt.name ? 
evt.name : "") << "\""; if (evt.type != EventType::END && evt.category) { fout << ",\"cat\":\"" << evt.category << "\""; } // Compute CPU utilization % over the begin/end interval for end events float utilization = 0.f; uint64_t duration = 0; const TraceEvent *begin = nullptr; if (evt.type == EventType::END) { begin = beginEvents.top(); utilization = cpuUtilization(*begin, evt); duration = std::chrono::duration_cast( evt.time - begin->time) .count(); fout << ",\"args\":{\"cpuUtilization\":" << utilization << "}"; beginEvents.pop(); } else if (evt.type == EventType::COUNTER) { fout << ",\"args\":{\"value\":" << evt.counterValue << "}"; } fout << "},"; // For each end event also emit an update of the CPU % utilization // counter for events that were long enough to reasonably measure // utilization. CPU % is emitted at the time of the beginning of the // event to display the counter properly over the interval if (evt.type == EventType::END && duration > 100 && begin) { const uint64_t beginTimestamp = std::chrono::duration_cast( begin->time.time_since_epoch()) .count(); fout << "{" << "\"ph\": \"C\"," << "\"pid\":" << pid << "," << "\"tid\":" << nextTid << "," << "\"ts\":" << beginTimestamp << "," << "\"name\":\"cpuUtilization\"," << "\"cat\":\"builtin\"," << "\"args\":{\"value\":" << utilization << "}},"; } } } if (!beginEvents.empty()) { std::cerr << "Tracing Error: Missing end for some events!\n"; while (!beginEvents.empty()) { std::cerr << "\t" << beginEvents.top()->name << "\n"; beginEvents.pop(); } } ++nextTid; } // We need to remove the last , we output to ensure the JSON array is correct // Overwrite it with the ] character. fout.seekp(-1, std::ios::cur); fout << "]"; } float cpuUtilization(const TraceEvent &start, const TraceEvent &end) { #ifdef __linux__ const double elapsed_cpu = end.ru_utime.tv_sec + end.ru_stime.tv_sec - (start.ru_utime.tv_sec + start.ru_stime.tv_sec) + 1e-6f * (end.ru_utime.tv_usec + end.ru_stime.tv_usec - (start.ru_utime.tv_usec + start.ru_stime.tv_usec)); const double elapsed_wall = duration_cast>(end.time - start.time).count(); return elapsed_cpu / elapsed_wall * 100.0; #else return -1.f; #endif } std::string getProcStatus() { // Note: this file doesn't exist on OS X, would we want some alternative to // fetch this info? std::ifstream file("/proc/self/status"); if (!file.is_open()) { return ""; } return std::string( std::istreambuf_iterator(file), std::istreambuf_iterator()); } void getProcMemUse(uint64_t &virtMem, uint64_t &resMem) { virtMem = 0; resMem = 0; #ifdef __linux__ // TODO: Windows? 
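  // /proc/self/statm reports sizes in pages: the first field is the total
  // program (virtual) size, the second is the resident set size.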
FILE *file = std::fopen("/proc/self/statm", "r"); if (file) { // These values are measured in pages if (std::fscanf(file, "%lu %lu", &virtMem, &resMem) == 2) { const int pageSize = getpagesize(); virtMem *= pageSize; resMem *= pageSize; } std::fclose(file); } #endif } void initThreadEventList() { if (!threadEventList) { threadEventList = traceRecorder->getThreadTraceList(std::this_thread::get_id()); } } void beginEvent(const char *name, const char *category) { initThreadEventList(); threadEventList->beginEvent(name, category); } void endEvent() { // Begin takes care of getting the threadEventList set // in thread_local storage so we can assume it exists here threadEventList->endEvent(); } void setMarker(const char *name, const char *category) { initThreadEventList(); threadEventList->setMarker(name, category); } void setCounter(const char *name, uint64_t value) { initThreadEventList(); threadEventList->setCounter(name, value); } void recordMemUse() { initThreadEventList(); uint64_t virtMem = 0; uint64_t resMem = 0; getProcMemUse(virtMem, resMem); threadEventList->setCounter("rkTraceVirtMem_B", virtMem); threadEventList->setCounter("rkTraceRssMem_B", resMem); } void setThreadName(const char *name) { initThreadEventList(); threadEventList->threadName = name; } void saveLog(const char *logFile, const char *processName) { traceRecorder->saveLog(logFile, processName); } } // namespace tracing } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/tracing/Tracing.h000066400000000000000000000074561467524601100225610ustar00rootroot00000000000000// Copyright 2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include #include #include #include #include #include #ifdef __linux__ #include #include #include #include #include #endif #include "rkcommon/common.h" namespace rkcommon { namespace tracing { enum class EventType { INVALID, BEGIN, END, MARKER, COUNTER }; struct RKCOMMON_INTERFACE TraceEvent { EventType type = EventType::INVALID; // Refers to a string in the thread's stringCache, nullptr for end events const char *name = nullptr; // Refers to the event category in the thread's stringCache, may be null const char *category = nullptr; #ifdef __linux__ timeval ru_utime; timeval ru_stime; #endif std::chrono::steady_clock::time_point time; uint64_t counterValue = 0; TraceEvent(); TraceEvent(const EventType type); TraceEvent(const EventType type, const char *name, const char *category); TraceEvent( const EventType type, const char *name, const uint64_t counterValue); }; struct RKCOMMON_INTERFACE ThreadEventList { // We store events in chunks to reduce memory copy // costs when when tracking very large numbers of events std::list> events; std::string threadName; // Applications are typically running a rendering loop, emitting // the same event name repeatedly. 
If these names are inline // strings they will have the same pointer and we can cache // them in a map to reduce string copying costs and overhead // Note: the string is wrapped in a shared/unique ptr // to guard against copy ctor use when adding to the map which would // invalidate the pointer to the string data std::unordered_map> stringCache; void beginEvent(const char *name, const char *category); void endEvent(); void setMarker(const char *name, const char *category); void setCounter(const char *name, const uint64_t value); private: std::vector &getCurrentEventList(); const char *getCachedString(const char *str); }; class RKCOMMON_INTERFACE TraceRecorder { std::unordered_map> threadTrace; std::mutex threadTraceMutex; public: /* Get the thread trace list, creating it if this is the first time * this thread has requested its list. This call locks the TraceRecorder, * so threads cache the returned value in thread_local storage to avoid * calling this each event. */ std::shared_ptr getThreadTraceList( const std::thread::id &id); void saveLog(const char *logFile, const char *processName); }; float cpuUtilization(const TraceEvent &start, const TraceEvent &end); std::string getProcStatus(); // Begin an event, must be paired with an end event. Name is required, // category is optional void beginEvent(const char *name, const char *category); void endEvent(); // Set a marker in the trace timeline, e.g., for things that have no duration // Name is required, category is optional void setMarker(const char *name, const char *category); // Counter values are displayed per-process by chrome:://tracing // but are recorded per-thread without synchronization void setCounter(const char *name, uint64_t value); // Record the built-in counters traceVirtMem and traceRssMem tracking the // virtual and resident memory sizes respectively void recordMemUse(); void setThreadName(const char *name); void saveLog(const char *logFile, const char *processName); } // namespace tracing } // namespace rkcommon #ifdef RKCOMMON_ENABLE_PROFILING #define RKCOMMON_IF_TRACING_ENABLED(CMD) CMD #else #define RKCOMMON_IF_TRACING_ENABLED(CMD) #endif RenderKit-rkcommon-988718e/rkcommon/traits/000077500000000000000000000000001467524601100206645ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/traits/rktraits.h000066400000000000000000000114401467524601100227000ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include namespace rkcommon { namespace traits { using byte_t = unsigned char; // C++14 traits for C++11 ///////////////////////////////////////////////// template using enable_if_t = typename std::enable_if::type; // Helper operators /////////////////////////////////////////////////////// template std::true_type operator==(const T &, const Arg &); // type 'T' having '==' operator ////////////////////////////////////////// template struct HasOperatorEqualsT { enum { value = !std::is_same::value }; }; template using HasOperatorEquals = typename std::enable_if::value, TYPE>::type; template using NoOperatorEquals = typename std::enable_if::value, TYPE>::type; // type 'T' (decayed) is a valid parallel_for() index type //////////////// template struct is_valid_index { using TYPE = typename std::decay::type; enum { value = std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value }; }; // type 'T' implementing T::operator() 
//////////////////////////////////// // NOTE(jda) - This checks at compile time if T implements the method // 'void T::operator()'. template struct has_operator_method { using TASK_T = typename std::decay::type; template class checker; template static std::true_type test(checker *); template static std::false_type test(...); using type = decltype(test(nullptr)); static const bool value = std::is_same::value; }; // type 'T' implementing T::operator(P) with P being integral ///////////// #ifdef _WIN32 template using has_operator_method_matching_param = has_operator_method; #else // NOTE(jda) - This checks at compile time if T implements the method // 'void T::operator(P taskIndex)', where P matches the second // template parameter 'EXPECTED_PARAM_T' template struct has_operator_method_matching_param { using TASK_T = typename std::decay::type; template using t_param = void (TASK_T::*)(P) const; using operator_t = decltype(&TASK_T::operator()); using valid_param = std::is_same, operator_t>; static const bool value = has_operator_method::value && valid_param::value; }; #endif // type 'DERIVED' (decayed) comes from 'BASE' ///////////////////////////// template using is_base_of_t = enable_if_t< std::is_base_of::type>::value>; // type 'T' (decayed) is a class/struct /////////////////////////////////// template using is_class_t = enable_if_t::type>::value>; // type 'T1' and 'T2' are not the same //////////////////////////////////// template using is_not_same_t = enable_if_t::value>; // If a single type is convertible to another ///////////////////////////// template using can_convert = std::is_convertible; template using can_convert_t = enable_if_t::value>; // type 'T' is arithmetic ///////////////////////////////////////////////// template using is_arithmetic_t = enable_if_t::value>; // type 'T1' and 'T2' are not the same and arithmetic ///////////////////// template using is_not_same_and_arithmetic_t = enable_if_t::value && std::is_arithmetic::value && std::is_arithmetic::value>; } // namespace traits } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/000077500000000000000000000000001467524601100210615ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/utility/AbstractArray.h000066400000000000000000000050751467524601100240030ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include #include #include namespace rkcommon { namespace utility { /* 'AbstractArray' implements an array interface on a pointer * to data which may be owned by the object. 
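     * Concrete subclasses (e.g., ArrayView, FixedArray, OwnedArray) decide
     * whether the data is owned or merely viewed.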
*/ template struct AbstractArray { virtual ~AbstractArray() = default; size_t size() const; T &operator[](size_t offset) const; // with bounds checking T &at(size_t offset) const; operator bool() const; explicit operator T *() const; T *data() const; T *begin() const; T *end() const; const T *cbegin() const; const T *cend() const; protected: // Can only be constructed by child classes AbstractArray() = default; // Called by children to initialize the ptr/numItems values void setPtr(T *ptr, size_t numItems); private: T *ptr{nullptr}; size_t numItems{0}; }; // Inlined definitions //////////////////////////////////////////////////// template inline size_t AbstractArray::size() const { return numItems; } template inline T &AbstractArray::operator[](size_t offset) const { return *(begin() + offset); } template inline T &AbstractArray::at(size_t offset) const { if (offset >= size()) throw std::runtime_error("ArrayView: out of bounds access!"); return *(begin() + offset); } template inline AbstractArray::operator bool() const { return size() != 0; } template inline AbstractArray::operator T *() const { return begin(); } template inline T *AbstractArray::data() const { return begin(); } template inline T *AbstractArray::begin() const { return ptr; } template inline T *AbstractArray::end() const { return ptr + size(); } template inline const T *AbstractArray::cbegin() const { return begin(); } template inline const T *AbstractArray::cend() const { return end(); } template inline void AbstractArray::setPtr(T *ptr, size_t numItems) { this->ptr = numItems > 0 ? ptr : nullptr; this->numItems = numItems; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/Any.h000066400000000000000000000156371467524601100217750ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include "../common.h" #include "../traits/rktraits.h" #include "demangle.h" namespace rkcommon { namespace utility { /* 'Any' implements a single item container which erases its type (can hold * any value which is copyable). The value can be extracted successfully * only if the correct type is queried for the held value, where an * exception is thrown otherwise. Similar (but perhaps not identical to) * 'boost::any' or C++17's 'std::any'. 
* * Example: * * Any myAny = 1; // myAny is an 'int' w/ value of '1' * int value = myAny.get(); // get value of '1' out of myAny * char bad = myAny.get(); // throws exception */ struct Any { Any() = default; Any(const Any ©); template Any(T value); ~Any() = default; Any &operator=(const Any &rhs); template Any &operator=(T rhs); bool operator==(const Any &rhs) const; bool operator!=(const Any &rhs) const; template T &get(); template const T &get() const; template bool is() const; bool valid() const; std::string toString() const; private: // Helper types // struct handle_base { virtual ~handle_base() = default; virtual handle_base *clone() const = 0; virtual const std::type_info &valueTypeID() const = 0; virtual bool isSame(handle_base *other) const = 0; virtual void *data() = 0; }; template struct handle : public handle_base { handle(T value); handle_base *clone() const override; const std::type_info &valueTypeID() const override; bool isSame(handle_base *other) const override; void *data() override; T value; // NOTE(jda) - Use custom type trait to select a real implementation of // isSame(), or one that always returns 'false' if the // template type 'T' does not implement operator==() with // itself. template inline traits::HasOperatorEquals //<-- substitues to 'bool' isSameImpl(handle_base *other) const; template inline traits::NoOperatorEquals //<-- substitutes to 'bool' isSameImpl(handle_base *other) const; }; // Data members // std::unique_ptr currentValue; }; // Inlined Any definitions //////////////////////////////////////////////// template inline Any::Any(T value) : currentValue(new handle::type>( std::forward(value))) { static_assert(std::is_copy_constructible::value && std::is_copy_assignable::value, "Any can only be constructed with copyable values!"); } inline Any::Any(const Any ©) : currentValue(copy.valid() ? copy.currentValue->clone() : nullptr) { } inline Any &Any::operator=(const Any &rhs) { Any temp(rhs); currentValue = std::move(temp.currentValue); return *this; } template inline Any &Any::operator=(T rhs) { static_assert(std::is_copy_constructible::value && std::is_copy_assignable::value, "Any can only be assigned values which are copyable!"); currentValue = std::unique_ptr( new handle::type>( std::forward(rhs))); return *this; } inline bool Any::operator==(const Any &rhs) const { return currentValue->isSame(rhs.currentValue.get()); } inline bool Any::operator!=(const Any &rhs) const { return !(*this == rhs); } template inline T &Any::get() { if (!valid()) throw std::runtime_error("Can't query value from an empty Any!"); if (is()) return *(static_cast(currentValue->data())); else { std::stringstream msg; msg << "Incorrect type queried for Any!" << '\n'; msg << " queried type == " << nameOf() << '\n'; msg << " current type == " << demangle(currentValue->valueTypeID().name()) << '\n'; throw std::runtime_error(msg.str()); } } template inline const T &Any::get() const { if (!valid()) throw std::runtime_error("Can't query value from an empty Any!"); if (is()) return *(static_cast(currentValue->data())); else { std::stringstream msg; msg << "Incorrect type queried for Any!" 
<< '\n'; msg << " queried type == " << nameOf() << '\n'; msg << " current type == " << demangle(currentValue->valueTypeID().name()) << '\n'; throw std::runtime_error(msg.str()); } } template inline bool Any::is() const { return valid() && (strcmp(typeid(T).name(), currentValue->valueTypeID().name()) == 0); } inline bool Any::valid() const { return currentValue.get() != nullptr; } inline std::string Any::toString() const { std::stringstream retval; retval << "Any : (currently holds value of type) --> " << demangle(currentValue->valueTypeID().name()); return retval.str(); } template inline Any::handle::handle(T v) : value(std::move(v)) { } template inline Any::handle_base *Any::handle::clone() const { return new handle(value); } template inline const std::type_info &Any::handle::valueTypeID() const { return typeid(T); } template inline bool Any::handle::isSame(Any::handle_base *other) const { return isSameImpl(other); } template template inline traits::HasOperatorEquals Any::handle::isSameImpl( Any::handle_base *other) const { handle *otherHandle = dynamic_cast *>(other); return (otherHandle != nullptr) && (otherHandle->value == this->value); } template template inline traits::NoOperatorEquals Any::handle::isSameImpl( Any::handle_base *) const { return false; } template inline void *Any::handle::data() { return &value; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/ArgumentList.h000066400000000000000000000074441467524601100236610ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 /*! \file ArgumentList.h Defines an interface for storing - and consuming - command line parameters The way this is supposed to work is that the app creates an arglist from the ac/av parameters passed to it, then individual modules can scan through this class, check if they recognize any, parse those, and 'consume' them (it, take them off this list), thus indicating that those have been properly processed. This in particular allows an app to determine if any command lin eparameters have _not_ been processed by any modules, which typically indicates a user having used a deprecated way of specifying an input parameter (or simply, had a typo :-/ ). */ #pragma once #include #include namespace rkcommon { namespace utility { /*! class that abstracts command line arguments */ struct ArgumentList { /*! initialize a new argument list. note that we will _drop_ av[0], as this isn't actually an argument */ ArgumentList(int ac, const char **av); /*! return (a copy of) the idx'th argument. Note that unlike 'real' ac/av numbering of the args starts at 0, not 1 (because we drop the binary name, and only store arguments) */ std::string operator[](const int idx) const; /*! return number of arguments still in list */ int size() const; /*! return number of arguments still in list */ bool empty() const; /*! remove given number of arguments at given index in list */ void remove(int where, int howMany = 1); private: std::vector arg; }; /*! helper abstraction for any class that can wants to arguments - rather than having to manually iterate over all arguments, this class allows any class that is derived from it to simply call ArgsParser::parseAndConsume(), and do all its detection of parsing of command line arguments by overriding 'tryConsume()' */ struct ArgumentsParser { virtual ~ArgumentsParser() = default; /*! check if given arg is one of ours. 
if so, consume it (and all its successive parameters that depend on it, and return the total number of arguments consumed */ virtual int tryConsume(ArgumentList &argList, int argID) = 0; /*! This function goes over an argument list, and calls 'tryConsume()' for every argument, then takes those that _have_ been indicated as 'yes, we rcongized those' from the argument list. (usually does not have to be overridden) */ void parseAndRemove(ArgumentList &args); }; // ------------------------------------------------------------------ // (header-only) implementatoin section from here on: // ------------------------------------------------------------------ inline ArgumentList::ArgumentList(int ac, const char **av) { for (int i = 1; i < ac; i++) arg.push_back(av[i]); } inline std::string ArgumentList::operator[](int idx) const { return arg.at(idx); } inline int ArgumentList::size() const { return static_cast(arg.size()); } inline bool ArgumentList::empty() const { return arg.empty(); } inline void ArgumentList::remove(int where, int howMany) { for (int i = 0; i < howMany; i++) arg.erase(arg.begin() + where, arg.begin() + where + 1); } inline void ArgumentsParser::parseAndRemove(ArgumentList &argList) { for (int argID = 0; argID < argList.size(); /*no-op*/) { int numConsumed = tryConsume(argList, argID); if (numConsumed == 0) ++argID; else argList.remove(argID, numConsumed); } } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/ArrayView.h000066400000000000000000000046421467524601100231510ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "AbstractArray.h" #include #include namespace rkcommon { namespace utility { /* 'ArrayView' implements an array interface on a pointer to data which * is *NOT* owned by ArrayView. If you want ArrayView to own data, then * instead use std::array or std::vector. 
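     *
     * Example (illustrative sketch, not part of the original comment):
     *
     *   std::vector<float> values(16, 1.f);
     *   rkcommon::utility::ArrayView<float> view(values);  // non-owning view
     *   float sum = 0.f;
     *   for (auto v : view)
     *     sum += v;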
*/ template struct ArrayView : public AbstractArray { ArrayView() = default; ~ArrayView() override = default; template ArrayView(std::array &init); ArrayView(std::vector &init); explicit ArrayView(T *data, size_t size); void reset(); void reset(T *data, size_t size); template ArrayView &operator=(std::array &rhs); ArrayView &operator=(std::vector &rhs); }; // Inlined ArrayView definitions ////////////////////////////////////////// template inline ArrayView::ArrayView(T *_data, size_t _size) { AbstractArray::setPtr(_data, _size); } template template inline ArrayView::ArrayView(std::array &init) { AbstractArray::setPtr(init.data(), init.size()); } template inline ArrayView::ArrayView(std::vector &init) { AbstractArray::setPtr(init.data(), init.size()); } template inline void ArrayView::reset() { AbstractArray::setPtr(nullptr, 0); } template inline void ArrayView::reset(T *_data, size_t _size) { AbstractArray::setPtr(_data, _size); } template template inline ArrayView &ArrayView::operator=(std::array &rhs) { AbstractArray::setPtr(rhs.data(), rhs.size()); return *this; } template inline ArrayView &ArrayView::operator=(std::vector &rhs) { AbstractArray::setPtr(rhs.data(), rhs.size()); return *this; } // ArrayView utility functions //////////////////////////////////////////// template inline ArrayView make_ArrayView(T *data, size_t size) { return ArrayView(data, size); } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/CodeTimer.h000066400000000000000000000036221467524601100231100ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // std #include namespace rkcommon { namespace utility { /*! Helper class that assists with timing a region of code. */ struct CodeTimer { void start(); void stop(); double seconds() const; double milliseconds() const; double perSecond() const; double secondsSmoothed() const; double millisecondsSmoothed() const; double perSecondSmoothed() const; private: double smooth_nom{0.0}; double smooth_den{0.0}; std::chrono::time_point frameEndTime; std::chrono::time_point frameStartTime; }; // Inlined CodeTimer definitions ////////////////////////////////////////// inline void CodeTimer::start() { frameStartTime = std::chrono::steady_clock::now(); } inline void CodeTimer::stop() { frameEndTime = std::chrono::steady_clock::now(); smooth_nom = smooth_nom * 0.8f + seconds(); smooth_den = smooth_den * 0.8f + 1.f; } inline double CodeTimer::seconds() const { auto diff = frameEndTime - frameStartTime; return std::chrono::duration(diff).count(); } inline double CodeTimer::milliseconds() const { auto diff = frameEndTime - frameStartTime; return std::chrono::duration(diff).count(); } inline double CodeTimer::perSecond() const { return 1.0 / seconds(); } inline double CodeTimer::secondsSmoothed() const { return 1.0 / perSecondSmoothed(); } inline double CodeTimer::millisecondsSmoothed() const { return secondsSmoothed() * 1000.0; } inline double CodeTimer::perSecondSmoothed() const { return smooth_den / smooth_nom; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/DataView.h000066400000000000000000000022511467524601100227360ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" namespace rkcommon { namespace utility { template struct DataView { DataView() = default; ~DataView() = default; DataView(const void *data, size_t stride = sizeof(T)); void reset(const 
void *data, size_t stride = sizeof(T)); const T &operator[](size_t index) const; protected: const byte_t *ptr{nullptr}; size_t stride{1}; }; // Inlined member definitions // /////////////////////////////////////////////// template inline DataView::DataView(const void *_data, size_t _stride) : ptr(static_cast(_data)), stride(_stride) { } template inline void DataView::reset(const void *_data, size_t _stride) { ptr = static_cast(_data); stride = _stride; } template inline const T &DataView::operator[](size_t index) const { return *reinterpret_cast(ptr + (index * stride)); } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/DoubleBufferedValue.h000066400000000000000000000041431467524601100251060ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include namespace rkcommon { namespace utility { /*! This class represents two values which are double buffered. This is useful if one thread wants to work on a piece of data while another "uses" it. Then at some point, the caller can swap() the front and back values, where front() and back() references will be exchanged. Example: A rendering thread wants to work on a framebuffer while a GUI thread wants to continuously draw the latest complete framebuffer. Once the new frame is ready, they are swapped. NOTE: This isn't thread safe! Any references to front() and back() must be synchronized with when swap() gets called. */ template class DoubleBufferedValue { public: // This assumes that T is default constructable. If you want to use this // abstraction with non default constructable types, you will need to add // additional constructors. DoubleBufferedValue() = default; ~DoubleBufferedValue() = default; T &front(); const T &front() const; T &back(); const T &back() const; void swap(); private: int front_value{0}; int back_value{1}; T values[2]; }; // Inlined members //////////////////////////////////////////////////////// template inline T &DoubleBufferedValue::front() { return values[front_value]; } template inline const T &DoubleBufferedValue::front() const { return values[front_value]; } template inline T &DoubleBufferedValue::back() { return values[back_value]; } template inline const T &DoubleBufferedValue::back() const { return values[back_value]; } template inline void DoubleBufferedValue::swap() { std::swap(front_value, back_value); } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/FixedArray.h000066400000000000000000000061721467524601100232760ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "AbstractArray.h" #include #include #include namespace rkcommon { namespace utility { template struct FixedArrayView; /* 'FixedArray' implements an array interface on a pointer to * data which is owned by the FixedArray. 
The array is not * initialized on creation and cannot be resized, though it can * be recreated with a new size */ template struct FixedArray : public AbstractArray { using View = FixedArrayView; FixedArray() = default; ~FixedArray() override = default; explicit FixedArray(size_t size); explicit FixedArray(T *data, size_t size); template FixedArray(std::array &init); FixedArray(std::vector &init); template FixedArray &operator=(std::array &rhs); FixedArray &operator=(std::vector &rhs); private: // We use a shared ptr to actually manage lifetime the data lifetime std::shared_ptr array = nullptr; }; // Inlined FixedArray definitions ///////////////////////////////////////// template inline FixedArray::FixedArray(size_t _size) : array(std::shared_ptr(new T[_size], std::default_delete())) { AbstractArray::setPtr(array.get(), _size); } template inline FixedArray::FixedArray(T *_data, size_t _size) : FixedArray(_size) { // Note: // UB in memcpy if // 1) source / destination are NULL, even if _size is 0 // 2) buffers overlap // 3) destination is too small. // We catch the first case here, and the others are impossible // since we just allocated the destination. if (_data && _size > 0) std::memcpy(array.get(), _data, _size * sizeof(T)); } template template inline FixedArray::FixedArray(std::array &init) : FixedArray(init.data(), init.size()) { } template inline FixedArray::FixedArray(std::vector &init) : FixedArray(init.data(), init.size()) { } template template inline FixedArray &FixedArray::operator=(std::array &rhs) { array = std::shared_ptr(new T[rhs.size()], std::default_delete()); AbstractArray::setPtr(array.get(), rhs.size()); if (rhs.data() && rhs.size() > 0) std::memcpy(array.get(), rhs.data(), rhs.size() * sizeof(T)); return *this; } template inline FixedArray &FixedArray::operator=(std::vector &rhs) { array = std::shared_ptr(new T[rhs.size()], std::default_delete()); AbstractArray::setPtr(array.get(), rhs.size()); if (rhs.data() && rhs.size() > 0) std::memcpy(array.get(), rhs.data(), rhs.size() * sizeof(T)); return *this; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/FixedArrayView.h000066400000000000000000000025151467524601100241260ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "AbstractArray.h" #include "FixedArrayView.h" namespace rkcommon { namespace utility { /* 'FixedArrayView' implements an array interface on a pointer to * data which is owned by the FixedArrayView. 
The array is not * initialized on creation and cannot be resized, though it can * be recreated with a new size */ template struct FixedArrayView : public AbstractArray { FixedArrayView() = default; ~FixedArrayView() override = default; FixedArrayView(std::shared_ptr> &data, size_t offset, size_t size); private: // The underlying array from the fixed array being viewed, to keep // the data alive for the view's lifetime std::shared_ptr> data; }; // Inlined FixedArrayView definitions template FixedArrayView::FixedArrayView(std::shared_ptr> &_data, size_t offset, size_t size) : data(_data) { AbstractArray::setPtr(data->begin() + offset, size); } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/Observer.h000066400000000000000000000045611467524601100230270ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "TimeStamp.h" #include #include namespace rkcommon { namespace utility { struct Observer; // Something that an observer looks at // // NOTE(jda) - This can either be used as a base class or as a stand-alone // member of a class. It is up to the user to decide best how to // use this abstraction. struct Observable { Observable() = default; virtual ~Observable(); void notifyObservers(); private: friend Observer; void registerObserver(Observer &newObserver); void removeObserver(Observer &toRemove); TimeStamp lastNotified; std::vector observers; }; // Something that looks an an observable instance. // // NOTE(jda) - I think this makes more sense for objects to hold an instance // of an Observer and avoid _being_ and observer. struct Observer { Observer(Observable &observee); ~Observer(); bool wasNotified(); private: friend Observable; TimeStamp lastObserved; Observable *observee{nullptr}; }; // Inlined definitions //////////////////////////////////////////////////// // Observable // inline Observable::~Observable() { for (auto *observer : observers) observer->observee = nullptr; } inline void Observable::notifyObservers() { lastNotified.renew(); } inline void Observable::registerObserver(Observer &newObserver) { observers.push_back(&newObserver); } inline void Observable::removeObserver(Observer &toRemove) { auto &o = observers; o.erase(std::remove(o.begin(), o.end(), &toRemove), o.end()); } // Observer // inline Observer::Observer(Observable &_observee) : observee(&_observee) { observee->registerObserver(*this); } inline Observer::~Observer() { if (observee) observee->removeObserver(*this); } inline bool Observer::wasNotified() { if (!observee) return false; bool notified = lastObserved < observee->lastNotified; if (notified) lastObserved.renew(); return notified; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/OnScopeExit.h000066400000000000000000000015641467524601100234400ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include "../traits/rktraits.h" namespace rkcommon { namespace utility { /* Execute a given function when a scope exits */ struct OnScopeExit { template OnScopeExit(FCN_T &&_fcn); ~OnScopeExit(); private: std::function fcn; }; // Inlined OnScopeExit definitions //////////////////////////////////////// template inline OnScopeExit::OnScopeExit(FCN_T &&_fcn) { static_assert(traits::has_operator_method::value, "FCN_T must implement operator() with no arguments!"); fcn = std::forward(_fcn); } inline OnScopeExit::~OnScopeExit() 
{ fcn(); } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/Optional.h000066400000000000000000000244541467524601100230300ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "../traits/rktraits.h" #include namespace rkcommon { namespace utility { /* 'Optional' implements a single item container which only _may_ have a * value in it. Use Optional<>::value_or() as a way to get either the value * or some default if the value doesn't exist --> makes it easier to read * "branchy" code. * * NOTE: Similar (but perhaps not identical to) 'boost::optional' or C++17's * 'std::optional'. * * Example: * * Optional myOpt; // 'myOpt' doesn't contain a value * assert(!myOpt.has_value()); // true. * assert(!myOpt); // true, shorthand for has_value(). * * myOpt = 5; // 'myOpt' now has a valid value, above * // asserts no longer true. * * assert(myOpt.value()); // true. * assert(*myOpt == 5); // true, shorthand for value(). * assert(myOpt.value_or(3) // true because myOpt has a value. * == 5); * * myOpt.reset(); // destroy held value. * assert(myOpt.value_or(3) // now true because myOpt had value * == 3); // removed by reset(). */ template struct Optional { // Members in C++17 specified std::optional interface in C++11 // Optional() = default; Optional(const Optional &other); template Optional(const Optional &other); Optional(Optional &&other); template Optional(Optional &&other); #if 0 // NOTE(jda) - can't get this to NOT conflict with copy/move ctors... template Optional(Args&&... args); #else Optional(const T &value); #endif ~Optional(); Optional &operator=(const Optional &other); Optional &operator=(Optional &&other); template Optional &operator=(U &&value); template Optional &operator=(const Optional &other); template Optional &operator=(Optional &&other); const T *operator->() const; T *operator->(); const T &operator*() const; T &operator*(); bool has_value() const; explicit operator bool() const; const T &value() const; T &value(); template T value_or(U &&default_value) const; void reset(); template T &emplace(Args &&... args); // Extra members // std::string toString() const; private: // Helper functions // void default_construct_storage_if_needed(); // Data members // std::array storage; bool hasValue{false}; }; // Inlined Optional definitions /////////////////////////////////////////// template inline Optional::Optional(const Optional &other) : Optional() { if (other.has_value()) *this = other.value(); } template template inline Optional::Optional(const Optional &other) : Optional() { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " parameter of an instance being copied-from be" " convertible to the type parameter of the destination" " Optional<>."); if (other.has_value()) *this = other.value(); } template inline Optional::Optional(Optional &&other) : Optional() { if (other.has_value()) { reset(); value() = std::move(other.value()); hasValue = true; } } template template inline Optional::Optional(Optional &&other) : Optional() { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " parameter of an instance being copied-from be" " convertible to the type parameter of the destination" " Optional<>."); if (other.has_value()) { reset(); value() = std::move(other.value()); hasValue = true; } } #if 0 // NOTE(jda) - see comment in declaration... 
template template inline Optional::Optional(Args&&... args) { emplace(std::forward(args)...); } #else template inline Optional::Optional(const T &value) { emplace(value); } #endif template inline Optional::~Optional() { reset(); } template inline Optional &Optional::operator=(const Optional &other) { default_construct_storage_if_needed(); value() = other.value(); hasValue = true; return *this; } template inline Optional &Optional::operator=(Optional &&other) { default_construct_storage_if_needed(); value() = std::move(other.value()); hasValue = true; return *this; } template template inline Optional &Optional::operator=(U &&rhs) { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " being assigned from be convertible to the type parameter" " of the destination Optional<>."); default_construct_storage_if_needed(); this->value() = rhs; hasValue = true; return *this; } template template inline Optional &Optional::operator=(const Optional &other) { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " parameter of an instance being copied-from be" " convertible to the type parameter of the destination" " Optional<>."); default_construct_storage_if_needed(); value() = other.value(); hasValue = true; return *this; } template template inline Optional &Optional::operator=(Optional &&other) { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " parameter of an instance being moved-from be" " convertible to the type parameter of the destination" " Optional<>."); default_construct_storage_if_needed(); value() = other.value(); hasValue = true; return *this; } template inline const T *Optional::operator->() const { return &value(); } template inline T *Optional::operator->() { return &value(); } template inline const T &Optional::operator*() const { return value(); } template inline T &Optional::operator*() { return value(); } template inline bool Optional::has_value() const { return hasValue; } template inline Optional::operator bool() const { return has_value(); } template inline const T &Optional::value() const { return *(reinterpret_cast(storage.data())); } template inline T &Optional::value() { return *(reinterpret_cast(storage.data())); } template template inline T Optional::value_or(U &&default_value) const { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type given" " to value_or() to be convertible to type T, the type" " parameter of Optional<>."); return has_value() ? value() : static_cast(std::forward(default_value)); } template inline void Optional::reset() { if (!std::is_trivially_destructible::value && has_value()) value().~T(); hasValue = false; } template template inline T &Optional::emplace(Args &&... 
args) { reset(); new (storage.data()) T(std::forward(args)...); hasValue = true; return value(); } template inline std::string Optional::toString() const { return "rkcommon::utility::Optional"; } template inline void Optional::default_construct_storage_if_needed() { if (!has_value()) new (storage.data()) T(); } // Comparison functions /////////////////////////////////////////////////// template inline bool operator==(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs == *rhs); } template inline bool operator!=(const Optional &lhs, const Optional &rhs) { return !(lhs == rhs); } template inline bool operator<(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs < *rhs); } template inline bool operator<=(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs <= *rhs); } template inline bool operator>(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs > *rhs); } template inline bool operator>=(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs >= *rhs); } template inline Optional make_optional(Args &&... args) { Optional ret; ret.emplace(std::forward(args)...); return ret; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/OwnedArray.h000066400000000000000000000054441467524601100233140ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "AbstractArray.h" #include #include namespace rkcommon { namespace utility { /* 'OwnedArray' implements an array interface on a pointer to * data which is owned by the OwnedArray. */ template struct OwnedArray : public AbstractArray { OwnedArray() = default; ~OwnedArray() override = default; template OwnedArray(std::array &init); OwnedArray(std::vector &init); explicit OwnedArray(T *data, size_t size); template OwnedArray &operator=(std::array &rhs); OwnedArray &operator=(std::vector &rhs); void reset(); void reset(T *_data, size_t _size); void resize(size_t size, const T &val); private: std::vector dataBuf; }; // Inlined OwnedArray definitions ///////////////////////////////////////// template inline OwnedArray::OwnedArray(T *_data, size_t _size) : dataBuf(_data, _data + _size) { AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } template template inline OwnedArray::OwnedArray(std::array &init) : dataBuf(init.begin(), init.end()) { AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } template inline OwnedArray::OwnedArray(std::vector &init) : dataBuf(init) { AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } template template inline OwnedArray &OwnedArray::operator=(std::array &rhs) { dataBuf = std::vector(rhs.begin(), rhs.end()); AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); return *this; } template inline OwnedArray &OwnedArray::operator=(std::vector &rhs) { dataBuf = std::vector(rhs.begin(), rhs.end()); AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); return *this; } template inline void OwnedArray::reset() { dataBuf.clear(); dataBuf.shrink_to_fit(); AbstractArray::setPtr(nullptr, 0); } template inline void OwnedArray::reset(T *_data, size_t _size) { dataBuf = std::vector(_data, _data + _size); AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } template inline void OwnedArray::resize(size_t size, const T &val) { dataBuf.resize(size, val); AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } } // namespace utility } // namespace rkcommon 
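// Example usage of OwnedArray (illustrative sketch, not part of the original
// file): the array copies the given range and owns its storage.
//
//   std::vector<int> src{1, 2, 3};
//   rkcommon::utility::OwnedArray<int> owned(src.data(), src.size());
//   src.clear();           // 'owned' still holds its own copy
//   int first = owned[0];  // == 1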
RenderKit-rkcommon-988718e/rkcommon/utility/ParameterizedObject.cpp000066400000000000000000000022701467524601100255110ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "ParameterizedObject.h" #include namespace rkcommon { namespace utility { ParameterizedObject::Param::Param(const std::string &_name) : name(_name) {} void ParameterizedObject::removeParam(const std::string &name) { auto foundParam = std::find_if( paramList.begin(), paramList.end(), [&](const std::shared_ptr &p) { return p->name == name; }); if (foundParam != paramList.end()) { paramList.erase(foundParam); } } ParameterizedObject::Param *ParameterizedObject::findParam( const std::string &name, bool addIfNotExist) { auto foundParam = std::find_if( paramList.begin(), paramList.end(), [&](const std::shared_ptr &p) { return p->name == name; }); if (foundParam != paramList.end()) return foundParam->get(); else if (addIfNotExist) { paramList.push_back(std::make_shared(name)); return paramList[paramList.size() - 1].get(); } else return nullptr; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/ParameterizedObject.h000066400000000000000000000064531467524601100251650ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // stl #include // rkcommon #include "Any.h" namespace rkcommon { namespace utility { /*! \brief defines a basic object whose lifetime is managed by ospray */ struct RKCOMMON_INTERFACE ParameterizedObject { ParameterizedObject() = default; virtual ~ParameterizedObject() = default; /*! \brief container for _any_ sort of parameter an app can assign to an ospray object */ struct RKCOMMON_INTERFACE Param { Param(const std::string &name); ~Param() = default; template void set(const T &v); utility::Any data; std::string name; bool query = false; }; /*! \brief check if a given parameter is available */ bool hasParam(const std::string &name); /*! set a parameter with given name to given value, create param if not * existing */ template void setParam(const std::string &name, const T &t); template T getParam(const std::string &name, T valIfNotFound); void removeParam(const std::string &name); void resetAllParamQueryStatus(); protected: Param *findParam(const std::string &name, bool addIfNotExist = false); std::vector>::iterator params_begin(); std::vector>::iterator params_end(); private: // Data members // /*! 
\brief list of parameters attached to this object */ // NOTE(jda) - Use std::shared_ptr because copy/move of a // ParameterizedObject would end up copying parameters, where // destruction of each copy should only result in freeing the // parameters *once* std::vector> paramList; }; // Inlined ParameterizedObject definitions //////////////////////////////// template inline void ParameterizedObject::Param::set(const T &v) { data = v; } inline bool ParameterizedObject::hasParam(const std::string &name) { return findParam(name, false) != nullptr; } template inline void ParameterizedObject::setParam(const std::string &name, const T &t) { findParam(name, true)->set(t); } template inline T ParameterizedObject::getParam(const std::string &name, T valIfNotFound) { Param *param = findParam(name); if (!param) return valIfNotFound; if (!param->data.is()) return valIfNotFound; param->query = true; return param->data.get(); } inline void ParameterizedObject::resetAllParamQueryStatus() { for (auto p = params_begin(); p != params_end(); ++p) (*p)->query = false; } inline std::vector>::iterator ParameterizedObject::params_begin() { return paramList.begin(); } inline std::vector>::iterator ParameterizedObject::params_end() { return paramList.end(); } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/PseudoURL.cpp000066400000000000000000000102421467524601100234060ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 /*! \file PseudoURL: splits a 'pseudo-url' of the form '://[:name=value]*' into its components of 'type' (e.g, 'points', 'lines', etc), filename, and 'name=value' argument pairs (e.g., 'format=xyzrgb') */ #include "PseudoURL.h" namespace rkcommon { namespace utility { void tokenize(const std::string &str, const char delim, std::vector &tokens) { size_t prev = 0; size_t fnd = str.find(delim); for (; fnd != std::string::npos; prev = fnd + 1, fnd = str.find(delim, prev)) { // Discard repeated tokens in the string, e.g. tokeninzing a::c::b on // ':' should just return a, c, b if (fnd - prev > 1) { tokens.push_back(str.substr(prev, fnd - prev)); } } // Grab the last token in the string, if the string didn't terminate with // a delimiter if (str.size() - prev > 1) { tokens.push_back(str.substr(prev)); } } /*! 
constructor - parse the given string into its components */ PseudoURL::PseudoURL(const std::string &inputString) { std::string tmp = inputString; const size_t separator = tmp.find("://"); if (separator != std::string::npos) { // separator specified: cut off 'type' before that separator, // and reset 'tmp' to everything behind it type = tmp.substr(0, separator); tmp = tmp.substr(separator + 3); } else { // no separator -> empty type specifier string, tmp returns // un-modified type = ""; } /* now, split remainder into its colon-separated components (the first of those is the filename, all other ones are params */ std::vector colonSeparatedComponents; tokenize(tmp, ':', colonSeparatedComponents); if (colonSeparatedComponents.empty()) // degenerate case of "type://" - return empty filename and // empty params return; fileName = colonSeparatedComponents[0]; for (size_t arg_it = 1; arg_it < colonSeparatedComponents.size(); arg_it++) { std::string arg = colonSeparatedComponents[arg_it]; const size_t equalSign = arg.find('='); if (equalSign != std::string::npos) { params.push_back(std::make_pair(arg.substr(0, equalSign), arg.substr(equalSign + 1))); } else { params.push_back(std::make_pair(arg, std::string(""))); } } } /*! return the parsed type. may we empty string if none was specified */ std::string PseudoURL::getType() const { return type; } /*! return the parsed file name specifier. cannot be empty string */ std::string PseudoURL::getFileName() const { return fileName; } /*! return value for given parameters name, or throw an exception if not specified */ std::string PseudoURL::getValue(const std::string &name) const { /* note(iw) - we do _not_ do a immediate 'return' upon the first param with mathcin gname we find so as to ensure that we use the _last_ time any parameter was written. it's more intuitive to have the last value override earlier ones, but i didn't want the parser (ie, constructor) to mess with the input data (maybe in some cases a class using this _wants_ to have multiple instances of the same parameter!?), so let's fix that here */ int found = -1; for (size_t i = 0; i < params.size(); i++) { if (params[i].first == name) found = i; } if (found < 0) { throw std::runtime_error( "PseudoURL::getValue queried value of " "not-specified parameter"); } return params[found].second; } /*! check if the given parameter was specified */ bool PseudoURL::hasParam(const std::string &name) { for (auto ¶m : params) if (param.first == name) return true; return false; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/PseudoURL.h000066400000000000000000000040311467524601100230520ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once /*! \file PseudoURL: splits a 'pseudo-url' of the form '://[:name=value]*' into its components of 'type' (e.g, 'points', 'slines', etc), filename, and 'name=value' argument pairs (e.g., 'format=xyzrgb') */ #include #include #include "../common.h" namespace rkcommon { namespace utility { //! \brief Tokenize the string passed on the desired delimiter void tokenize(const std::string &str, const char delim, std::vector &tokens); /* a pseudo-url is of the form '://[:name=value]*' into its components of 'type' (e.g, 'points', 'lines', etc), filename, and 'name=value' argument pairs (e.g., 'format=xyzrgb'). This class takes a string and splits it into these components */ struct PseudoURL { /*! 
constructor - parse the given string into its components */ PseudoURL(const std::string &inputString); /*! return the parsed type. may we empty string if none was specified */ std::string getType() const; /*! return the parsed file name specifier. cannot be empty string */ std::string getFileName() const; /*! return value for given parameters name, or throw an exception if not specified */ std::string getValue(const std::string &name) const; /*! check if the given parameter was specified */ bool hasParam(const std::string &name); private: /*! the type of the psueod-url, eg, for 'points://file.raw' this would be 'points'. If no "://" is specified, this gets set to "" */ std::string type; /*! the filename - the thing after the ://, and before the ":" that starts parameters */ std::string fileName; /*! the name-value pairs specified as parameters */ std::vector> params; }; } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/SaveImage.h000066400000000000000000000063551467524601100231040ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // Quiet `fopen` MSVC warning #ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS #endif #include #include #include #include "../math/vec.h" #include "../memory/malloc.h" namespace rkcommon { namespace utility { template inline void writeImage(const std::string &fileName, const char *const header, const int sizeX, const int sizeY, const PIXEL_T *const pixel) { FILE *file = fopen(fileName.c_str(), "wb"); if (file == nullptr) throw std::runtime_error("Can't open file for writeP[FP]M!"); fprintf(file, header, sizeX, sizeY); auto out = STACK_BUFFER(COMP_T, N_COMP * sizeX); for (int y = 0; y < sizeY; y++) { auto *in = (const COMP_T *)&pixel[(FLIP ? sizeY - 1 - y : y) * sizeX]; for (int x = 0; x < sizeX; x++) for (int c = 0; c < N_COMP; c++) out[N_COMP * x + c] = in[PIXEL_COMP * x + (N_COMP == 1 ? 
3 : c)]; fwrite(out, N_COMP * sizeX, sizeof(COMP_T), file); } fprintf(file, "\n"); fclose(file); } inline void writePPM(const std::string &fileName, const int sizeX, const int sizeY, const uint32_t *pixel) { writeImage( fileName, "P6\n%i %i\n255\n", sizeX, sizeY, pixel); } inline void writePGM(const std::string &fileName, const int sizeX, const int sizeY, const uint32_t *pixel) { writeImage( fileName, "P5\n%i %i\n255\n", sizeX, sizeY, pixel); } template inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const T *p) = delete; using namespace rkcommon::math; template <> inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const float *p) { writeImage( fName, "Pf\n%i %i\n-1.0\n", sizeX, sizeY, p); } template <> inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const vec3f *p) { writeImage( fName, "PF\n%i %i\n-1.0\n", sizeX, sizeY, p); } template <> inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const vec3fa *p) { writeImage( fName, "PF\n%i %i\n-1.0\n", sizeX, sizeY, p); } template <> inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const vec4f *p) { writeImage( fName, "PF4\n%i %i\n-1.0\n", sizeX, sizeY, p); } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/StringManip.h000066400000000000000000000055001467524601100234650ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include namespace rkcommon { namespace utility { /* return a string which is the two inputs match from the beginning of each */ inline std::string longestBeginningMatch(const std::string &first, const std::string &second) { // NOTE(jda) - If length of the second string is shorter than the first, // then we can only iterate through the first string the // number of characters of the second string. auto maxMatchLength = std::min(first.size(), second.size()); auto start1 = first.begin(); auto start2 = second.begin(); auto end = first.begin() + maxMatchLength; return std::string(start1, std::mismatch(start1, end, start2).first); } inline bool beginsWith(const std::string &inputString, const std::string &startsWithString) { auto startingMatch = longestBeginningMatch(inputString, startsWithString); return startingMatch.size() == startsWithString.size(); } /* split a string on a single character delimiter */ inline std::vector split(const std::string &input, char delim) { std::stringstream ss(input); std::string item; std::vector elems; while (std::getline(ss, item, delim)) elems.push_back(std::move(item)); return elems; } /* split a string on a set of delimiters */ inline std::vector split(const std::string &input, const std::string &delim, const bool keepDelim = false) { std::vector tokens; size_t pos = 0; while (1) { size_t begin = input.find_first_not_of(delim, pos); if (begin == input.npos) return tokens; size_t end = input.find_first_of(delim, begin); if (keepDelim && begin != 0) begin--; tokens.push_back(input.substr( begin, (end == input.npos) ? 
input.npos : (end - begin))); pos = end; } } /* return lower case version of the input string */ inline std::string lowerCase(const std::string &str) { std::string retval = str; std::transform(retval.begin(), retval.end(), retval.begin(), ::tolower); return retval; } /* return upper case version of the input string */ inline std::string upperCase(const std::string &str) { std::string retval = str; std::transform(retval.begin(), retval.end(), retval.begin(), ::toupper); return retval; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/TimeStamp.cpp000066400000000000000000000016201467524601100234670ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "TimeStamp.h" namespace rkcommon { namespace utility { std::atomic TimeStamp::global{0}; TimeStamp::TimeStamp(const TimeStamp &other) { this->value = other.value.load(); } TimeStamp::TimeStamp(TimeStamp &&other) { this->value = other.value.load(); } TimeStamp &TimeStamp::operator=(const TimeStamp &other) { this->value = other.value.load(); return *this; } TimeStamp &TimeStamp::operator=(TimeStamp &&other) { this->value = other.value.load(); return *this; } TimeStamp::operator size_t() const { return value; } void TimeStamp::renew() { value = nextValue(); } size_t TimeStamp::nextValue() { return global++; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/TimeStamp.h000066400000000000000000000013451467524601100231400ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include namespace rkcommon { namespace utility { struct RKCOMMON_INTERFACE TimeStamp { TimeStamp() = default; TimeStamp(const TimeStamp &); TimeStamp(TimeStamp &&); TimeStamp &operator=(const TimeStamp &); TimeStamp &operator=(TimeStamp &&); operator size_t() const; void renew(); private: static size_t nextValue(); // Data members // std::atomic value{nextValue()}; //! \brief the uint64_t that stores the time value static std::atomic global; }; } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/TransactionalValue.h000066400000000000000000000047251467524601100250410ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include namespace rkcommon { namespace utility { /* This implements a 1-to-1 value fence. One thread can set (or "queue") a * value for another thread to later get. This is conceptually similar to * "doublebuffering" a single value. Note that all values from the producer * thread overwrite the "queued" value, where the consumer thread will * always get the last value set by the producer thread. 
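 *
 * Usage sketch (illustrative; consume() stands in for the reader's own code):
 *
 *   TransactionalValue<int> tv;
 *   tv = 42;              // producer thread: queue a new value under the lock
 *   if (tv.update())      // consumer thread: commit the queued value, if any
 *     consume(tv.get());  // get()/ref() return the last committed value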
*/ template class TransactionalValue { public: TransactionalValue() = default; ~TransactionalValue() = default; template TransactionalValue(const OtherType &ot); template TransactionalValue &operator=(const OtherType &ot); TransactionalValue &operator=(const TransactionalValue &fp); T &ref(); T get(); bool update(); private: bool newValue{false}; T queuedValue; T currentValue; std::mutex mutex; }; // Inlined TransactionalValue Members ///////////////////////////////////// template template inline TransactionalValue::TransactionalValue(const OtherType &ot) { currentValue = ot; } template template inline TransactionalValue &TransactionalValue::operator=( const OtherType &ot) { std::lock_guard lock{mutex}; queuedValue = ot; newValue = true; return *this; } template inline TransactionalValue &TransactionalValue::operator=( const TransactionalValue &fp) { std::lock_guard lock{mutex}; queuedValue = fp.ref(); newValue = true; return *this; } template inline T &TransactionalValue::ref() { return currentValue; } template inline T TransactionalValue::get() { return currentValue; } template inline bool TransactionalValue::update() { bool didUpdate = false; if (newValue) { std::lock_guard lock{mutex}; currentValue = std::move(queuedValue); newValue = false; didUpdate = true; } return didUpdate; } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/demangle.cpp000066400000000000000000000011611467524601100233400ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "demangle.h" #ifdef __GNUG__ #include #include #include #endif namespace rkcommon { namespace utility { #ifdef __GNUG__ std::string demangle(const char *name) { int status = 0; std::unique_ptr res{ abi::__cxa_demangle(name, NULL, NULL, &status), std::free}; return (status == 0) ? res.get() : name; } #else std::string demangle(const char *name) { return name; } #endif } // namespace utility } // namespace rkcommonRenderKit-rkcommon-988718e/rkcommon/utility/demangle.h000066400000000000000000000006441467524601100230120ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include "../common.h" namespace rkcommon { namespace utility { RKCOMMON_INTERFACE std::string demangle(const char *name); template inline std::string nameOf() { return demangle(typeid(T).name()); } } // namespace utility } // namespace rkcommonRenderKit-rkcommon-988718e/rkcommon/utility/detail/000077500000000000000000000000001467524601100223235ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/utility/detail/pcg_extras.hpp000066400000000000000000000474261467524601100252100ustar00rootroot00000000000000/* * PCG Random Number Generation for C++ * * Copyright 2014-2017 Melissa O'Neill , * and the PCG Project contributors. * * SPDX-License-Identifier: (Apache-2.0 OR MIT) * * Licensed under the Apache License, Version 2.0 (provided in * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0) * or under the MIT license (provided in LICENSE-MIT.txt and at * http://opensource.org/licenses/MIT), at your option. This file may not * be copied, modified, or distributed except according to those terms. * * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See your chosen license for details. * * For additional information about the PCG random number generation scheme, * visit http://www.pcg-random.org/. 
*/ /* * This file provides support code that is useful for random-number generation * but not specific to the PCG generation scheme, including: * - 128-bit int support for platforms where it isn't available natively * - bit twiddling operations * - I/O of 128-bit and 8-bit integers * - Handling the evilness of SeedSeq * - Support for efficiently producing random numbers less than a given * bound */ #ifndef PCG_EXTRAS_HPP_INCLUDED #define PCG_EXTRAS_HPP_INCLUDED 1 #include #include #include #include #include #include #include #include #include #include #include #ifdef __GNUC__ #include #endif /* * Abstractions for compiler-specific directives */ #ifdef __GNUC__ #define PCG_NOINLINE __attribute__((noinline)) #else #define PCG_NOINLINE #endif /* * Some members of the PCG library use 128-bit math. When compiling on 64-bit * platforms, both GCC and Clang provide 128-bit integer types that are ideal * for the job. * * On 32-bit platforms (or with other compilers), we fall back to a C++ * class that provides 128-bit unsigned integers instead. It may seem * like we're reinventing the wheel here, because libraries already exist * that support large integers, but most existing libraries provide a very * generic multiprecision code, but here we're operating at a fixed size. * Also, most other libraries are fairly heavyweight. So we use a direct * implementation. Sadly, it's much slower than hand-coded assembly or * direct CPU support. * */ #if __SIZEOF_INT128__ namespace pcg_extras { typedef __uint128_t pcg128_t; } #define PCG_128BIT_CONSTANT(high,low) \ ((pcg_extras::pcg128_t(high) << 64) + low) #else #include "pcg_uint128.hpp" namespace pcg_extras { typedef pcg_extras::uint_x4 pcg128_t; } #define PCG_128BIT_CONSTANT(high,low) \ pcg_extras::pcg128_t(high,low) #define PCG_EMULATED_128BIT_MATH 1 #endif namespace pcg_extras { /* * We often need to represent a "number of bits". When used normally, these * numbers are never greater than 128, so an unsigned char is plenty. * If you're using a nonstandard generator of a larger size, you can set * PCG_BITCOUNT_T to have it define it as a larger size. (Some compilers * might produce faster code if you set it to an unsigned int.) */ #ifndef PCG_BITCOUNT_T typedef uint8_t bitcount_t; #else typedef PCG_BITCOUNT_T bitcount_t; #endif /* * C++ requires us to be able to serialize RNG state by printing or reading * it from a stream. Because we use 128-bit ints, we also need to be able * ot print them, so here is code to do so. * * This code provides enough functionality to print 128-bit ints in decimal * and zero-padded in hex. It's not a full-featured implementation. 
*/ template std::basic_ostream& operator<<(std::basic_ostream& out, pcg128_t value) { auto desired_base = out.flags() & out.basefield; bool want_hex = desired_base == out.hex; if (want_hex) { uint64_t highpart = uint64_t(value >> 64); uint64_t lowpart = uint64_t(value); auto desired_width = out.width(); if (desired_width > 16) { out.width(desired_width - 16); } if (highpart != 0 || desired_width > 16) out << highpart; CharT oldfill = '\0'; if (highpart != 0) { out.width(16); oldfill = out.fill('0'); } auto oldflags = out.setf(decltype(desired_base){}, out.showbase); out << lowpart; out.setf(oldflags); if (highpart != 0) { out.fill(oldfill); } return out; } constexpr size_t MAX_CHARS_128BIT = 40; char buffer[MAX_CHARS_128BIT]; char* pos = buffer+sizeof(buffer); *(--pos) = '\0'; constexpr auto BASE = pcg128_t(10ULL); do { auto div = value / BASE; auto mod = uint32_t(value - (div * BASE)); *(--pos) = '0' + char(mod); value = div; } while(value != pcg128_t(0ULL)); return out << pos; } template std::basic_istream& operator>>(std::basic_istream& in, pcg128_t& value) { typename std::basic_istream::sentry s(in); if (!s) return in; constexpr auto BASE = pcg128_t(10ULL); pcg128_t current(0ULL); bool did_nothing = true; bool overflow = false; for(;;) { CharT wide_ch = in.get(); if (!in.good()) break; auto ch = in.narrow(wide_ch, '\0'); if (ch < '0' || ch > '9') { in.unget(); break; } did_nothing = false; pcg128_t digit(uint32_t(ch - '0')); pcg128_t timesbase = current*BASE; overflow = overflow || timesbase < current; current = timesbase + digit; overflow = overflow || current < digit; } if (did_nothing || overflow) { in.setstate(std::ios::failbit); if (overflow) current = ~pcg128_t(0ULL); } value = current; return in; } /* * Likewise, if people use tiny rngs, we'll be serializing uint8_t. * If we just used the provided IO operators, they'd read/write chars, * not ints, so we need to define our own. We *can* redefine this operator * here because we're in our own namespace. */ template std::basic_ostream& operator<<(std::basic_ostream&out, uint8_t value) { return out << uint32_t(value); } template std::basic_istream& operator>>(std::basic_istream& in, uint8_t& target) { uint32_t value = 0xdecea5edU; in >> value; if (!in && value == 0xdecea5edU) return in; if (value > uint8_t(~0)) { in.setstate(std::ios::failbit); value = ~0U; } target = uint8_t(value); return in; } /* Unfortunately, the above functions don't get found in preference to the * built in ones, so we create some more specific overloads that will. * Ugh. */ inline std::ostream& operator<<(std::ostream& out, uint8_t value) { return pcg_extras::operator<< (out, value); } inline std::istream& operator>>(std::istream& in, uint8_t& value) { return pcg_extras::operator>> (in, value); } /* * Useful bitwise operations. */ /* * XorShifts are invertable, but they are someting of a pain to invert. * This function backs them out. It's used by the whacky "inside out" * generator defined later. */ template inline itype unxorshift(itype x, bitcount_t bits, bitcount_t shift) { if (2*shift >= bits) { return x ^ (x >> shift); } itype lowmask1 = (itype(1U) << (bits - shift*2)) - 1; itype highmask1 = ~lowmask1; itype top1 = x; itype bottom1 = x & lowmask1; top1 ^= top1 >> shift; top1 &= highmask1; x = top1 | bottom1; itype lowmask2 = (itype(1U) << (bits - shift)) - 1; itype bottom2 = x & lowmask2; bottom2 = unxorshift(bottom2, bits - shift, shift); bottom2 &= lowmask1; return top1 | bottom2; } /* * Rotate left and right. 
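 *
 * For example, rotr(uint8_t(0xB1), 2) moves the low two bits to the top:
 * 1011'0001 becomes 0110'1100, i.e. 0x6C; rotl by the same amount inverts it.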
* * In ideal world, compilers would spot idiomatic rotate code and convert it * to a rotate instruction. Of course, opinions vary on what the correct * idiom is and how to spot it. For clang, sometimes it generates better * (but still crappy) code if you define PCG_USE_ZEROCHECK_ROTATE_IDIOM. */ template inline itype rotl(itype value, bitcount_t rot) { constexpr bitcount_t bits = sizeof(itype) * 8; constexpr bitcount_t mask = bits - 1; #if PCG_USE_ZEROCHECK_ROTATE_IDIOM return rot ? (value << rot) | (value >> (bits - rot)) : value; #else return (value << rot) | (value >> ((- rot) & mask)); #endif } template inline itype rotr(itype value, bitcount_t rot) { constexpr bitcount_t bits = sizeof(itype) * 8; constexpr bitcount_t mask = bits - 1; #if PCG_USE_ZEROCHECK_ROTATE_IDIOM return rot ? (value >> rot) | (value << (bits - rot)) : value; #else return (value >> rot) | (value << ((- rot) & mask)); #endif } /* Unfortunately, both Clang and GCC sometimes perform poorly when it comes * to properly recognizing idiomatic rotate code, so for we also provide * assembler directives (enabled with PCG_USE_INLINE_ASM). Boo, hiss. * (I hope that these compilers get better so that this code can die.) * * These overloads will be preferred over the general template code above. */ #if PCG_USE_INLINE_ASM && __GNUC__ && (__x86_64__ || __i386__) inline uint8_t rotr(uint8_t value, bitcount_t rot) { asm ("rorb %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); return value; } inline uint16_t rotr(uint16_t value, bitcount_t rot) { asm ("rorw %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); return value; } inline uint32_t rotr(uint32_t value, bitcount_t rot) { asm ("rorl %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); return value; } #if __x86_64__ inline uint64_t rotr(uint64_t value, bitcount_t rot) { asm ("rorq %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); return value; } #endif // __x86_64__ #elif defined(_MSC_VER) // Use MSVC++ bit rotation intrinsics #pragma intrinsic(_rotr, _rotr64, _rotr8, _rotr16) inline uint8_t rotr(uint8_t value, bitcount_t rot) { return _rotr8(value, rot); } inline uint16_t rotr(uint16_t value, bitcount_t rot) { return _rotr16(value, rot); } inline uint32_t rotr(uint32_t value, bitcount_t rot) { return _rotr(value, rot); } inline uint64_t rotr(uint64_t value, bitcount_t rot) { return _rotr64(value, rot); } #endif // PCG_USE_INLINE_ASM /* * The C++ SeedSeq concept (modelled by seed_seq) can fill an array of * 32-bit integers with seed data, but sometimes we want to produce * larger or smaller integers. * * The following code handles this annoyance. * * uneven_copy will copy an array of 32-bit ints to an array of larger or * smaller ints (actually, the code is general it only needing forward * iterators). The copy is identical to the one that would be performed if * we just did memcpy on a standard little-endian machine, but works * regardless of the endian of the machine (or the weirdness of the ints * involved). * * generate_to initializes an array of integers using a SeedSeq * object. It is given the size as a static constant at compile time and * tries to avoid memory allocation. If we're filling in 32-bit constants * we just do it directly. If we need a separate buffer and it's small, * we allocate it on the stack. Otherwise, we fall back to heap allocation. * Ugh. * * generate_one produces a single value of some integral type using a * SeedSeq object. */ /* uneven_copy helper, case where destination ints are less than 32 bit. 
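 * For example, copying one 32-bit seed word 0x11223344 into uint16_t
 * destinations yields 0x3344 then 0x1122 -- low half first, matching the
 * little-endian memcpy behaviour described above.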
*/ template SrcIter uneven_copy_impl( SrcIter src_first, DestIter dest_first, DestIter dest_last, std::true_type) { typedef typename std::iterator_traits::value_type src_t; typedef typename std::iterator_traits::value_type dest_t; constexpr bitcount_t SRC_SIZE = sizeof(src_t); constexpr bitcount_t DEST_SIZE = sizeof(dest_t); constexpr bitcount_t DEST_BITS = DEST_SIZE * 8; constexpr bitcount_t SCALE = SRC_SIZE / DEST_SIZE; size_t count = 0; src_t value = 0; while (dest_first != dest_last) { if ((count++ % SCALE) == 0) value = *src_first++; // Get more bits else value >>= DEST_BITS; // Move down bits *dest_first++ = dest_t(value); // Truncates, ignores high bits. } return src_first; } /* uneven_copy helper, case where destination ints are more than 32 bit. */ template SrcIter uneven_copy_impl( SrcIter src_first, DestIter dest_first, DestIter dest_last, std::false_type) { typedef typename std::iterator_traits::value_type src_t; typedef typename std::iterator_traits::value_type dest_t; constexpr auto SRC_SIZE = sizeof(src_t); constexpr auto SRC_BITS = SRC_SIZE * 8; constexpr auto DEST_SIZE = sizeof(dest_t); constexpr auto SCALE = (DEST_SIZE+SRC_SIZE-1) / SRC_SIZE; while (dest_first != dest_last) { dest_t value(0UL); unsigned int shift = 0; for (size_t i = 0; i < SCALE; ++i) { value |= dest_t(*src_first++) << shift; shift += SRC_BITS; } *dest_first++ = value; } return src_first; } /* uneven_copy, call the right code for larger vs. smaller */ template inline SrcIter uneven_copy(SrcIter src_first, DestIter dest_first, DestIter dest_last) { typedef typename std::iterator_traits::value_type src_t; typedef typename std::iterator_traits::value_type dest_t; constexpr bool DEST_IS_SMALLER = sizeof(dest_t) < sizeof(src_t); return uneven_copy_impl(src_first, dest_first, dest_last, std::integral_constant{}); } /* generate_to, fill in a fixed-size array of integral type using a SeedSeq * (actually works for any random-access iterator) */ template inline void generate_to_impl(SeedSeq&& generator, DestIter dest, std::true_type) { generator.generate(dest, dest+size); } template void generate_to_impl(SeedSeq&& generator, DestIter dest, std::false_type) { typedef typename std::iterator_traits::value_type dest_t; constexpr auto DEST_SIZE = sizeof(dest_t); constexpr auto GEN_SIZE = sizeof(uint32_t); constexpr bool GEN_IS_SMALLER = GEN_SIZE < DEST_SIZE; constexpr size_t FROM_ELEMS = GEN_IS_SMALLER ? 
size * ((DEST_SIZE+GEN_SIZE-1) / GEN_SIZE) : (size + (GEN_SIZE / DEST_SIZE) - 1) / ((GEN_SIZE / DEST_SIZE) + GEN_IS_SMALLER); // this odd code ^^^^^^^^^^^^^^^^^ is work-around for // a bug: http://llvm.org/bugs/show_bug.cgi?id=21287 if (FROM_ELEMS <= 1024) { uint32_t buffer[FROM_ELEMS]; generator.generate(buffer, buffer+FROM_ELEMS); uneven_copy(buffer, dest, dest+size); } else { uint32_t* buffer = static_cast(malloc(GEN_SIZE * FROM_ELEMS)); generator.generate(buffer, buffer+FROM_ELEMS); uneven_copy(buffer, dest, dest+size); free(static_cast(buffer)); } } template inline void generate_to(SeedSeq&& generator, DestIter dest) { typedef typename std::iterator_traits::value_type dest_t; constexpr bool IS_32BIT = sizeof(dest_t) == sizeof(uint32_t); generate_to_impl(std::forward(generator), dest, std::integral_constant{}); } /* generate_one, produce a value of integral type using a SeedSeq * (optionally, we can have it produce more than one and pick which one * we want) */ template inline UInt generate_one(SeedSeq&& generator) { UInt result[N]; generate_to(std::forward(generator), result); return result[i]; } template auto bounded_rand(RngType& rng, typename RngType::result_type upper_bound) -> typename RngType::result_type { typedef typename RngType::result_type rtype; rtype threshold = (RngType::max() - RngType::min() + rtype(1) - upper_bound) % upper_bound; for (;;) { rtype r = rng() - RngType::min(); if (r >= threshold) return r % upper_bound; } } template void shuffle(Iter from, Iter to, RandType&& rng) { typedef typename std::iterator_traits::difference_type delta_t; typedef typename std::remove_reference::type::result_type result_t; auto count = to - from; while (count > 1) { delta_t chosen = delta_t(bounded_rand(rng, result_t(count))); --count; --to; using std::swap; swap(*(from + chosen), *to); } } /* * Although std::seed_seq is useful, it isn't everything. Often we want to * initialize a random-number generator some other way, such as from a random * device. * * Technically, it does not meet the requirements of a SeedSequence because * it lacks some of the rarely-used member functions (some of which would * be impossible to provide). However the C++ standard is quite specific * that actual engines only called the generate method, so it ought not to be * a problem in practice. */ template class seed_seq_from { private: RngType rng_; typedef uint_least32_t result_type; public: template seed_seq_from(Args&&... args) : rng_(std::forward(args)...) { // Nothing (else) to do... } template void generate(Iter start, Iter finish) { for (auto i = start; i != finish; ++i) *i = result_type(rng_()); } constexpr size_t size() const { return (sizeof(typename RngType::result_type) > sizeof(result_type) && RngType::max() > ~size_t(0UL)) ? ~size_t(0UL) : size_t(RngType::max()); } }; /* * Sometimes you might want a distinct seed based on when the program * was compiled. That way, a particular instance of the program will * behave the same way, but when recompiled it'll produce a different * value. */ template struct static_arbitrary_seed { private: static constexpr IntType fnv(IntType hash, const char* pos) { return *pos == '\0' ? hash : fnv((hash * IntType(16777619U)) ^ *pos, (pos+1)); } public: static constexpr IntType value = fnv(IntType(2166136261U ^ sizeof(IntType)), __DATE__ __TIME__ __FILE__); }; // Sometimes, when debugging or testing, it's handy to be able print the name // of a (in human-readable form). 
This code allows the idiom: // // cout << printable_typename() // // to print out my_foo_type_t (or its concrete type if it is a synonym) #if __cpp_rtti || __GXX_RTTI template struct printable_typename {}; template std::ostream& operator<<(std::ostream& out, printable_typename) { const char *implementation_typename = typeid(T).name(); #ifdef __GNUC__ int status; char* pretty_name = abi::__cxa_demangle(implementation_typename, nullptr, nullptr, &status); if (status == 0) out << pretty_name; free(static_cast(pretty_name)); if (status == 0) return out; #endif out << implementation_typename; return out; } #endif // __cpp_rtti || __GXX_RTTI } // namespace pcg_extras #endif // PCG_EXTRAS_HPP_INCLUDED RenderKit-rkcommon-988718e/rkcommon/utility/detail/pcg_random.hpp000066400000000000000000002172431467524601100251560ustar00rootroot00000000000000/* * PCG Random Number Generation for C++ * * Copyright 2014-2019 Melissa O'Neill , * and the PCG Project contributors. * * SPDX-License-Identifier: (Apache-2.0 OR MIT) * * Licensed under the Apache License, Version 2.0 (provided in * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0) * or under the MIT license (provided in LICENSE-MIT.txt and at * http://opensource.org/licenses/MIT), at your option. This file may not * be copied, modified, or distributed except according to those terms. * * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See your chosen license for details. * * For additional information about the PCG random number generation scheme, * visit http://www.pcg-random.org/. */ /* * This code provides the reference implementation of the PCG family of * random number generators. The code is complex because it implements * * - several members of the PCG family, specifically members corresponding * to the output functions: * - XSH RR (good for 64-bit state, 32-bit output) * - XSH RS (good for 64-bit state, 32-bit output) * - XSL RR (good for 128-bit state, 64-bit output) * - RXS M XS (statistically most powerful generator) * - XSL RR RR (good for 128-bit state, 128-bit output) * - and RXS, RXS M, XSH, XSL (mostly for testing) * - at potentially *arbitrary* bit sizes * - with four different techniques for random streams (MCG, one-stream * LCG, settable-stream LCG, unique-stream LCG) * - and the extended generation schemes allowing arbitrary periods * - with all features of C++11 random number generation (and more), * some of which are somewhat painful, including * - initializing with a SeedSequence which writes 32-bit values * to memory, even though the state of the generator may not * use 32-bit values (it might use smaller or larger integers) * - I/O for RNGs and a prescribed format, which needs to handle * the issue that 8-bit and 128-bit integers don't have working * I/O routines (e.g., normally 8-bit = char, not integer) * - equality and inequality for RNGs * - and a number of convenience typedefs to mask all the complexity * * The code employes a fairly heavy level of abstraction, and has to deal * with various C++ minutia. If you're looking to learn about how the PCG * scheme works, you're probably best of starting with one of the other * codebases (see www.pcg-random.org). But if you're curious about the * constants for the various output functions used in those other, simpler, * codebases, this code shows how they are calculated. 
* * On the positive side, at least there are convenience typedefs so that you * can say * * pcg32 myRNG; * * rather than: * * pcg_detail::engine< * uint32_t, // Output Type * uint64_t, // State Type * pcg_detail::xsh_rr_mixin, true, // Output Func * pcg_detail::specific_stream, // Stream Kind * pcg_detail::default_multiplier // LCG Mult * > myRNG; * */ #ifndef PCG_RAND_HPP_INCLUDED #define PCG_RAND_HPP_INCLUDED 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _MSC_VER #pragma warning(disable:4146) #endif #ifdef _MSC_VER #define PCG_ALWAYS_INLINE __forceinline #elif __GNUC__ #define PCG_ALWAYS_INLINE __attribute__((always_inline)) #else #define PCG_ALWAYS_INLINE inline #endif /* * The pcg_extras namespace contains some support code that is likley to * be useful for a variety of RNGs, including: * - 128-bit int support for platforms where it isn't available natively * - bit twiddling operations * - I/O of 128-bit and 8-bit integers * - Handling the evilness of SeedSeq * - Support for efficiently producing random numbers less than a given * bound */ #include "pcg_extras.hpp" namespace pcg_detail { using namespace pcg_extras; /* * The LCG generators need some constants to function. This code lets you * look up the constant by *type*. For example * * default_multiplier::multiplier() * * gives you the default multipler for 32-bit integers. We use the name * of the constant and not a generic word like value to allow these classes * to be used as mixins. */ template struct default_multiplier { // Not defined for an arbitrary type }; template struct default_increment { // Not defined for an arbitrary type }; #define PCG_DEFINE_CONSTANT(type, what, kind, constant) \ template <> \ struct what ## _ ## kind { \ static constexpr type kind() { \ return constant; \ } \ }; PCG_DEFINE_CONSTANT(uint8_t, default, multiplier, 141U) PCG_DEFINE_CONSTANT(uint8_t, default, increment, 77U) PCG_DEFINE_CONSTANT(uint16_t, default, multiplier, 12829U) PCG_DEFINE_CONSTANT(uint16_t, default, increment, 47989U) PCG_DEFINE_CONSTANT(uint32_t, default, multiplier, 747796405U) PCG_DEFINE_CONSTANT(uint32_t, default, increment, 2891336453U) PCG_DEFINE_CONSTANT(uint64_t, default, multiplier, 6364136223846793005ULL) PCG_DEFINE_CONSTANT(uint64_t, default, increment, 1442695040888963407ULL) PCG_DEFINE_CONSTANT(pcg128_t, default, multiplier, PCG_128BIT_CONSTANT(2549297995355413924ULL,4865540595714422341ULL)) PCG_DEFINE_CONSTANT(pcg128_t, default, increment, PCG_128BIT_CONSTANT(6364136223846793005ULL,1442695040888963407ULL)) /* Alternative (cheaper) multipliers for 128-bit */ template struct cheap_multiplier : public default_multiplier { // For most types just use the default. 
}; template <> struct cheap_multiplier { static constexpr uint64_t multiplier() { return 0xda942042e4dd58b5ULL; } }; /* * Each PCG generator is available in four variants, based on how it applies * the additive constant for its underlying LCG; the variations are: * * single stream - all instances use the same fixed constant, thus * the RNG always somewhere in same sequence * mcg - adds zero, resulting in a single stream and reduced * period * specific stream - the constant can be changed at any time, selecting * a different random sequence * unique stream - the constant is based on the memory address of the * object, thus every RNG has its own unique sequence * * This variation is provided though mixin classes which define a function * value called increment() that returns the nesessary additive constant. */ /* * unique stream */ template class unique_stream { protected: static constexpr bool is_mcg = false; // Is never called, but is provided for symmetry with specific_stream void set_stream(...) { abort(); } public: typedef itype state_type; constexpr itype increment() const { return itype(reinterpret_cast(this) | 1); } constexpr itype stream() const { return increment() >> 1; } static constexpr bool can_specify_stream = false; static constexpr size_t streams_pow2() { return (sizeof(itype) < sizeof(size_t) ? sizeof(itype) : sizeof(size_t))*8 - 1u; } protected: constexpr unique_stream() = default; }; /* * no stream (mcg) */ template class no_stream { protected: static constexpr bool is_mcg = true; // Is never called, but is provided for symmetry with specific_stream void set_stream(...) { abort(); } public: typedef itype state_type; static constexpr itype increment() { return 0; } static constexpr bool can_specify_stream = false; static constexpr size_t streams_pow2() { return 0u; } protected: constexpr no_stream() = default; }; /* * single stream/sequence (oneseq) */ template class oneseq_stream : public default_increment { protected: static constexpr bool is_mcg = false; // Is never called, but is provided for symmetry with specific_stream void set_stream(...) { abort(); } public: typedef itype state_type; static constexpr itype stream() { return default_increment::increment() >> 1; } static constexpr bool can_specify_stream = false; static constexpr size_t streams_pow2() { return 0u; } protected: constexpr oneseq_stream() = default; }; /* * specific stream */ template class specific_stream { protected: static constexpr bool is_mcg = false; itype inc_ = default_increment::increment(); public: typedef itype state_type; typedef itype stream_state; constexpr itype increment() const { return inc_; } itype stream() { return inc_ >> 1; } void set_stream(itype specific_seq) { inc_ = (specific_seq << 1) | 1; } static constexpr bool can_specify_stream = true; static constexpr size_t streams_pow2() { return (sizeof(itype)*8) - 1u; } protected: specific_stream() = default; specific_stream(itype specific_seq) : inc_(itype(specific_seq << 1) | itype(1U)) { // Nothing (else) to do. } }; /* * This is where it all comes together. This function joins together three * mixin classes which define * - the LCG additive constant (the stream) * - the LCG multiplier * - the output function * in addition, we specify the type of the LCG state, and the result type, * and whether to use the pre-advance version of the state for the output * (increasing instruction-level parallelism) or the post-advance version * (reducing register pressure). 
* * Given the high level of parameterization, the code has to use some * template-metaprogramming tricks to handle some of the suble variations * involved. */ template , typename multiplier_mixin = default_multiplier > class engine : protected output_mixin, public stream_mixin, protected multiplier_mixin { protected: itype state_; struct can_specify_stream_tag {}; struct no_specifiable_stream_tag {}; using stream_mixin::increment; using multiplier_mixin::multiplier; public: typedef xtype result_type; typedef itype state_type; static constexpr size_t period_pow2() { return sizeof(state_type)*8 - 2*stream_mixin::is_mcg; } // It would be nice to use std::numeric_limits for these, but // we can't be sure that it'd be defined for the 128-bit types. static constexpr result_type min() { return result_type(0UL); } static constexpr result_type max() { return result_type(~result_type(0UL)); } protected: itype bump(itype state) { return state * multiplier() + increment(); } itype base_generate() { return state_ = bump(state_); } itype base_generate0() { itype old_state = state_; state_ = bump(state_); return old_state; } public: result_type operator()() { if (output_previous) return this->output(base_generate0()); else return this->output(base_generate()); } result_type operator()(result_type upper_bound) { return bounded_rand(*this, upper_bound); } protected: static itype advance(itype state, itype delta, itype cur_mult, itype cur_plus); static itype distance(itype cur_state, itype newstate, itype cur_mult, itype cur_plus, itype mask = ~itype(0U)); itype distance(itype newstate, itype mask = itype(~itype(0U))) const { return distance(state_, newstate, multiplier(), increment(), mask); } public: void advance(itype delta) { state_ = advance(state_, delta, this->multiplier(), this->increment()); } void backstep(itype delta) { advance(-delta); } void discard(itype delta) { advance(delta); } bool wrapped() { if (stream_mixin::is_mcg) { // For MCGs, the low order two bits never change. In this // implementation, we keep them fixed at 3 to make this test // easier. return state_ == 3; } else { return state_ == 0; } } engine(itype state = itype(0xcafef00dd15ea5e5ULL)) : state_(this->is_mcg ? state|state_type(3U) : bump(state + this->increment())) { // Nothing else to do. } // This function may or may not exist. It thus has to be a template // to use SFINAE; users don't have to worry about its template-ness. template engine(itype state, typename sm::stream_state stream_seed) : stream_mixin(stream_seed), state_(this->is_mcg ? state|state_type(3U) : bump(state + this->increment())) { // Nothing else to do. } template engine(SeedSeq&& seedSeq, typename std::enable_if< !stream_mixin::can_specify_stream && !std::is_convertible::value && !std::is_convertible::value, no_specifiable_stream_tag>::type = {}) : engine(generate_one(std::forward(seedSeq))) { // Nothing else to do. } template engine(SeedSeq&& seedSeq, typename std::enable_if< stream_mixin::can_specify_stream && !std::is_convertible::value && !std::is_convertible::value, can_specify_stream_tag>::type = {}) : engine(generate_one(seedSeq), generate_one(seedSeq)) { // Nothing else to do. } template void seed(Args&&... 
args) { new (this) engine(std::forward(args)...); } template friend bool operator==(const engine&, const engine&); template friend itype1 operator-(const engine&, const engine&); template friend std::basic_ostream& operator<<(std::basic_ostream& out, const engine&); template friend std::basic_istream& operator>>(std::basic_istream& in, engine& rng); }; template std::basic_ostream& operator<<(std::basic_ostream& out, const engine& rng) { using pcg_extras::operator<<; auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left); auto space = out.widen(' '); auto orig_fill = out.fill(); out << rng.multiplier() << space << rng.increment() << space << rng.state_; out.flags(orig_flags); out.fill(orig_fill); return out; } template std::basic_istream& operator>>(std::basic_istream& in, engine& rng) { using pcg_extras::operator>>; auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws); itype multiplier, increment, state; in >> multiplier >> increment >> state; if (!in.fail()) { bool good = true; if (multiplier != rng.multiplier()) { good = false; } else if (rng.can_specify_stream) { rng.set_stream(increment >> 1); } else if (increment != rng.increment()) { good = false; } if (good) { rng.state_ = state; } else { in.clear(std::ios::failbit); } } in.flags(orig_flags); return in; } template itype engine::advance( itype state, itype delta, itype cur_mult, itype cur_plus) { // The method used here is based on Brown, "Random Number Generation // with Arbitrary Stride,", Transactions of the American Nuclear // Society (Nov. 1994). The algorithm is very similar to fast // exponentiation. // // Even though delta is an unsigned integer, we can pass a // signed integer to go backwards, it just goes "the long way round". constexpr itype ZERO = 0u; // itype may be a non-trivial types, so constexpr itype ONE = 1u; // we define some ugly constants. itype acc_mult = 1; itype acc_plus = 0; while (delta > ZERO) { if (delta & ONE) { acc_mult *= cur_mult; acc_plus = acc_plus*cur_mult + cur_plus; } cur_plus = (cur_mult+ONE)*cur_plus; cur_mult *= cur_mult; delta >>= 1; } return acc_mult * state + acc_plus; } template itype engine::distance( itype cur_state, itype newstate, itype cur_mult, itype cur_plus, itype mask) { constexpr itype ONE = 1u; // itype could be weird, so use constant bool is_mcg = cur_plus == itype(0); itype the_bit = is_mcg ? itype(4u) : itype(1u); itype distance = 0u; while ((cur_state & mask) != (newstate & mask)) { if ((cur_state & the_bit) != (newstate & the_bit)) { cur_state = cur_state * cur_mult + cur_plus; distance |= the_bit; } assert((cur_state & the_bit) == (newstate & the_bit)); the_bit <<= 1; cur_plus = (cur_mult+ONE)*cur_plus; cur_mult *= cur_mult; } return is_mcg ? 
distance >> 2 : distance; } template itype operator-(const engine& lhs, const engine& rhs) { static_assert( std::is_same::value && std::is_same::value, "Incomparable generators"); if (lhs.increment() == rhs.increment()) { return rhs.distance(lhs.state_); } else { constexpr itype ONE = 1u; itype lhs_diff = lhs.increment() + (lhs.multiplier()-ONE) * lhs.state_; itype rhs_diff = rhs.increment() + (rhs.multiplier()-ONE) * rhs.state_; if ((lhs_diff & itype(3u)) != (rhs_diff & itype(3u))) { rhs_diff = -rhs_diff; } return rhs.distance(rhs_diff, lhs_diff, rhs.multiplier(), itype(0u)); } } template bool operator==(const engine& lhs, const engine& rhs) { return (lhs.multiplier() == rhs.multiplier()) && (lhs.increment() == rhs.increment()) && (lhs.state_ == rhs.state_); } template inline bool operator!=(const engine& lhs, const engine& rhs) { return !operator==(lhs,rhs); } template class output_mixin, bool output_previous = (sizeof(itype) <= 8), template class multiplier_mixin = default_multiplier> using oneseq_base = engine, output_previous, oneseq_stream, multiplier_mixin >; template class output_mixin, bool output_previous = (sizeof(itype) <= 8), template class multiplier_mixin = default_multiplier> using unique_base = engine, output_previous, unique_stream, multiplier_mixin >; template class output_mixin, bool output_previous = (sizeof(itype) <= 8), template class multiplier_mixin = default_multiplier> using setseq_base = engine, output_previous, specific_stream, multiplier_mixin >; template class output_mixin, bool output_previous = (sizeof(itype) <= 8), template class multiplier_mixin = default_multiplier> using mcg_base = engine, output_previous, no_stream, multiplier_mixin >; /* * OUTPUT FUNCTIONS. * * These are the core of the PCG generation scheme. They specify how to * turn the base LCG's internal state into the output value of the final * generator. * * They're implemented as mixin classes. * * All of the classes have code that is written to allow it to be applied * at *arbitrary* bit sizes, although in practice they'll only be used at * standard sizes supported by C++. */ /* * XSH RS -- high xorshift, followed by a random shift * * Fast. A good performer. */ template struct xsh_rs_mixin { static xtype output(itype internal) { constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t opbits = sparebits-5 >= 64 ? 5 : sparebits-4 >= 32 ? 4 : sparebits-3 >= 16 ? 3 : sparebits-2 >= 4 ? 2 : sparebits-1 >= 1 ? 1 : 0; constexpr bitcount_t mask = (1 << opbits) - 1; constexpr bitcount_t maxrandshift = mask; constexpr bitcount_t topspare = opbits; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = topspare + (xtypebits+maxrandshift)/2; bitcount_t rshift = opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0; internal ^= internal >> xshift; xtype result = xtype(internal >> (bottomspare - maxrandshift + rshift)); return result; } }; /* * XSH RR -- high xorshift, followed by a random rotate * * Fast. A good performer. Slightly better statistically than XSH RS. */ template struct xsh_rr_mixin { static xtype output(itype internal) { constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype)*8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t wantedopbits = xtypebits >= 128 ? 7 : xtypebits >= 64 ? 6 : xtypebits >= 32 ? 5 : xtypebits >= 16 ? 
4 : 3; constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits : sparebits; constexpr bitcount_t amplifier = wantedopbits - opbits; constexpr bitcount_t mask = (1 << opbits) - 1; constexpr bitcount_t topspare = opbits; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = (topspare + xtypebits)/2; bitcount_t rot = opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0; bitcount_t amprot = (rot << amplifier) & mask; internal ^= internal >> xshift; xtype result = xtype(internal >> bottomspare); result = rotr(result, amprot); return result; } }; /* * RXS -- random xorshift */ template struct rxs_mixin { static xtype output_rxs(itype internal) { constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype)*8); constexpr bitcount_t shift = bits - xtypebits; constexpr bitcount_t extrashift = (xtypebits - shift)/2; bitcount_t rshift = shift > 64+8 ? (internal >> (bits - 6)) & 63 : shift > 32+4 ? (internal >> (bits - 5)) & 31 : shift > 16+2 ? (internal >> (bits - 4)) & 15 : shift > 8+1 ? (internal >> (bits - 3)) & 7 : shift > 4+1 ? (internal >> (bits - 2)) & 3 : shift > 2+1 ? (internal >> (bits - 1)) & 1 : 0; internal ^= internal >> (shift + extrashift - rshift); xtype result = internal >> rshift; return result; } }; /* * RXS M XS -- random xorshift, mcg multiply, fixed xorshift * * The most statistically powerful generator, but all those steps * make it slower than some of the others. We give it the rottenest jobs. * * Because it's usually used in contexts where the state type and the * result type are the same, it is a permutation and is thus invertable. * We thus provide a function to invert it. This function is used to * for the "inside out" generator used by the extended generator. */ /* Defined type-based concepts for the multiplication step. They're actually * all derived by truncating the 128-bit, which was computed to be a good * "universal" constant. */ template struct mcg_multiplier { // Not defined for an arbitrary type }; template struct mcg_unmultiplier { // Not defined for an arbitrary type }; PCG_DEFINE_CONSTANT(uint8_t, mcg, multiplier, 217U) PCG_DEFINE_CONSTANT(uint8_t, mcg, unmultiplier, 105U) PCG_DEFINE_CONSTANT(uint16_t, mcg, multiplier, 62169U) PCG_DEFINE_CONSTANT(uint16_t, mcg, unmultiplier, 28009U) PCG_DEFINE_CONSTANT(uint32_t, mcg, multiplier, 277803737U) PCG_DEFINE_CONSTANT(uint32_t, mcg, unmultiplier, 2897767785U) PCG_DEFINE_CONSTANT(uint64_t, mcg, multiplier, 12605985483714917081ULL) PCG_DEFINE_CONSTANT(uint64_t, mcg, unmultiplier, 15009553638781119849ULL) PCG_DEFINE_CONSTANT(pcg128_t, mcg, multiplier, PCG_128BIT_CONSTANT(17766728186571221404ULL, 12605985483714917081ULL)) PCG_DEFINE_CONSTANT(pcg128_t, mcg, unmultiplier, PCG_128BIT_CONSTANT(14422606686972528997ULL, 15009553638781119849ULL)) template struct rxs_m_xs_mixin { static xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t opbits = xtypebits >= 128 ? 6 : xtypebits >= 64 ? 5 : xtypebits >= 32 ? 4 : xtypebits >= 16 ? 3 : 2; constexpr bitcount_t shift = bits - xtypebits; constexpr bitcount_t mask = (1 << opbits) - 1; bitcount_t rshift = opbits ? 
bitcount_t(internal >> (bits - opbits)) & mask : 0; internal ^= internal >> (opbits + rshift); internal *= mcg_multiplier::multiplier(); xtype result = internal >> shift; result ^= result >> ((2U*xtypebits+2U)/3U); return result; } static itype unoutput(itype internal) { constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t opbits = bits >= 128 ? 6 : bits >= 64 ? 5 : bits >= 32 ? 4 : bits >= 16 ? 3 : 2; constexpr bitcount_t mask = (1 << opbits) - 1; internal = unxorshift(internal, bits, (2U*bits+2U)/3U); internal *= mcg_unmultiplier::unmultiplier(); bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0; internal = unxorshift(internal, bits, opbits + rshift); return internal; } }; /* * RXS M -- random xorshift, mcg multiply */ template struct rxs_m_mixin { static xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t opbits = xtypebits >= 128 ? 6 : xtypebits >= 64 ? 5 : xtypebits >= 32 ? 4 : xtypebits >= 16 ? 3 : 2; constexpr bitcount_t shift = bits - xtypebits; constexpr bitcount_t mask = (1 << opbits) - 1; bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0; internal ^= internal >> (opbits + rshift); internal *= mcg_multiplier::multiplier(); xtype result = internal >> shift; return result; } }; /* * DXSM -- double xorshift multiply * * This is a new, more powerful output permutation (added in 2019). It's * a more comprehensive scrambling than RXS M, but runs faster on 128-bit * types. Although primarily intended for use at large sizes, also works * at smaller sizes as well. * * This permutation is similar to xorshift multiply hash functions, except * that one of the multipliers is the LCG multiplier (to avoid needing to * have a second constant) and the other is based on the low-order bits. * This latter aspect means that the scrambling applied to the high bits * depends on the low bits, and makes it (to my eye) impractical to back * out the permutation without having the low-order bits. */ template struct dxsm_mixin { inline xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t itypebits = bitcount_t(sizeof(itype) * 8); static_assert(xtypebits <= itypebits/2, "Output type must be half the size of the state type."); xtype hi = xtype(internal >> (itypebits - xtypebits)); xtype lo = xtype(internal); lo |= 1; hi ^= hi >> (xtypebits/2); hi *= xtype(cheap_multiplier::multiplier()); hi ^= hi >> (3*(xtypebits/4)); hi *= lo; return hi; } }; /* * XSL RR -- fixed xorshift (to low bits), random rotate * * Useful for 128-bit types that are split across two CPU registers. */ template struct xsl_rr_mixin { static xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t wantedopbits = xtypebits >= 128 ? 7 : xtypebits >= 64 ? 6 : xtypebits >= 32 ? 5 : xtypebits >= 16 ? 4 : 3; constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits : sparebits; constexpr bitcount_t amplifier = wantedopbits - opbits; constexpr bitcount_t mask = (1 << opbits) - 1; constexpr bitcount_t topspare = sparebits; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = (topspare + xtypebits) / 2; bitcount_t rot = opbits ? 
bitcount_t(internal >> (bits - opbits)) & mask : 0; bitcount_t amprot = (rot << amplifier) & mask; internal ^= internal >> xshift; xtype result = xtype(internal >> bottomspare); result = rotr(result, amprot); return result; } }; /* * XSL RR RR -- fixed xorshift (to low bits), random rotate (both parts) * * Useful for 128-bit types that are split across two CPU registers. * If you really want an invertable 128-bit RNG, I guess this is the one. */ template struct halfsize_trait {}; template <> struct halfsize_trait { typedef uint64_t type; }; template <> struct halfsize_trait { typedef uint32_t type; }; template <> struct halfsize_trait { typedef uint16_t type; }; template <> struct halfsize_trait { typedef uint8_t type; }; template struct xsl_rr_rr_mixin { typedef typename halfsize_trait::type htype; static itype output(itype internal) { constexpr bitcount_t htypebits = bitcount_t(sizeof(htype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t sparebits = bits - htypebits; constexpr bitcount_t wantedopbits = htypebits >= 128 ? 7 : htypebits >= 64 ? 6 : htypebits >= 32 ? 5 : htypebits >= 16 ? 4 : 3; constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits : sparebits; constexpr bitcount_t amplifier = wantedopbits - opbits; constexpr bitcount_t mask = (1 << opbits) - 1; constexpr bitcount_t topspare = sparebits; constexpr bitcount_t xshift = (topspare + htypebits) / 2; bitcount_t rot = opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0; bitcount_t amprot = (rot << amplifier) & mask; internal ^= internal >> xshift; htype lowbits = htype(internal); lowbits = rotr(lowbits, amprot); htype highbits = htype(internal >> topspare); bitcount_t rot2 = lowbits & mask; bitcount_t amprot2 = (rot2 << amplifier) & mask; highbits = rotr(highbits, amprot2); return (itype(highbits) << topspare) ^ itype(lowbits); } }; /* * XSH -- fixed xorshift (to high bits) * * You shouldn't use this at 64-bits or less. */ template struct xsh_mixin { static xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t topspare = 0; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = (topspare + xtypebits) / 2; internal ^= internal >> xshift; xtype result = internal >> bottomspare; return result; } }; /* * XSL -- fixed xorshift (to low bits) * * You shouldn't use this at 64-bits or less. 
*/ template struct xsl_mixin { inline xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t topspare = sparebits; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = (topspare + xtypebits) / 2; internal ^= internal >> xshift; xtype result = internal >> bottomspare; return result; } }; /* ---- End of Output Functions ---- */ template struct inside_out : private baseclass { inside_out() = delete; typedef typename baseclass::result_type result_type; typedef typename baseclass::state_type state_type; static_assert(sizeof(result_type) == sizeof(state_type), "Require a RNG whose output function is a permutation"); static bool external_step(result_type& randval, size_t i) { state_type state = baseclass::unoutput(randval); state = state * baseclass::multiplier() + baseclass::increment() + state_type(i*2); result_type result = baseclass::output(state); randval = result; state_type zero = baseclass::is_mcg ? state & state_type(3U) : state_type(0U); return result == zero; } static bool external_advance(result_type& randval, size_t i, result_type delta, bool forwards = true) { state_type state = baseclass::unoutput(randval); state_type mult = baseclass::multiplier(); state_type inc = baseclass::increment() + state_type(i*2); state_type zero = baseclass::is_mcg ? state & state_type(3U) : state_type(0U); state_type dist_to_zero = baseclass::distance(state, zero, mult, inc); bool crosses_zero = forwards ? dist_to_zero <= delta : (-dist_to_zero) <= delta; if (!forwards) delta = -delta; state = baseclass::advance(state, delta, mult, inc); randval = baseclass::output(state); return crosses_zero; } }; template class extended : public baseclass { public: typedef typename baseclass::state_type state_type; typedef typename baseclass::result_type result_type; typedef inside_out insideout; private: static constexpr bitcount_t rtypebits = sizeof(result_type)*8; static constexpr bitcount_t stypebits = sizeof(state_type)*8; static constexpr bitcount_t tick_limit_pow2 = 64U; static constexpr size_t table_size = 1UL << table_pow2; static constexpr size_t table_shift = stypebits - table_pow2; static constexpr state_type table_mask = (state_type(1U) << table_pow2) - state_type(1U); static constexpr bool may_tick = (advance_pow2 < stypebits) && (advance_pow2 < tick_limit_pow2); static constexpr size_t tick_shift = stypebits - advance_pow2; static constexpr state_type tick_mask = may_tick ? state_type( (uint64_t(1) << (advance_pow2*may_tick)) - 1) // ^-- stupidity to appease GCC warnings : ~state_type(0U); static constexpr bool may_tock = stypebits < tick_limit_pow2; result_type data_[table_size]; PCG_NOINLINE void advance_table(); PCG_NOINLINE void advance_table(state_type delta, bool isForwards = true); result_type& get_extended_value() { state_type state = this->state_; if (kdd && baseclass::is_mcg) { // The low order bits of an MCG are constant, so drop them. state >>= 2; } size_t index = kdd ? state & table_mask : state >> table_shift; if (may_tick) { bool tick = kdd ? 
(state & tick_mask) == state_type(0u) : (state >> tick_shift) == state_type(0u); if (tick) advance_table(); } if (may_tock) { bool tock = state == state_type(0u); if (tock) advance_table(); } return data_[index]; } public: static constexpr size_t period_pow2() { return baseclass::period_pow2() + table_size*extvalclass::period_pow2(); } PCG_ALWAYS_INLINE result_type operator()() { result_type rhs = get_extended_value(); result_type lhs = this->baseclass::operator()(); return lhs ^ rhs; } result_type operator()(result_type upper_bound) { return bounded_rand(*this, upper_bound); } void set(result_type wanted) { result_type& rhs = get_extended_value(); result_type lhs = this->baseclass::operator()(); rhs = lhs ^ wanted; } void advance(state_type distance, bool forwards = true); void backstep(state_type distance) { advance(distance, false); } extended(const result_type* data) : baseclass() { datainit(data); } extended(const result_type* data, state_type seed) : baseclass(seed) { datainit(data); } // This function may or may not exist. It thus has to be a template // to use SFINAE; users don't have to worry about its template-ness. template extended(const result_type* data, state_type seed, typename bc::stream_state stream_seed) : baseclass(seed, stream_seed) { datainit(data); } extended() : baseclass() { selfinit(); } extended(state_type seed) : baseclass(seed) { selfinit(); } // This function may or may not exist. It thus has to be a template // to use SFINAE; users don't have to worry about its template-ness. template extended(state_type seed, typename bc::stream_state stream_seed) : baseclass(seed, stream_seed) { selfinit(); } private: void selfinit(); void datainit(const result_type* data); public: template::value && !std::is_convertible::value>::type> extended(SeedSeq&& seedSeq) : baseclass(seedSeq) { generate_to(seedSeq, data_); } template void seed(Args&&... args) { new (this) extended(std::forward(args)...); } template friend bool operator==(const extended&, const extended&); template friend std::basic_ostream& operator<<(std::basic_ostream& out, const extended&); template friend std::basic_istream& operator>>(std::basic_istream& in, extended&); }; template void extended::datainit( const result_type* data) { for (size_t i = 0; i < table_size; ++i) data_[i] = data[i]; } template void extended::selfinit() { // We need to fill the extended table with something, and we have // very little provided data, so we use the base generator to // produce values. Although not ideal (use a seed sequence, folks!), // unexpected correlations are mitigated by // - using XOR differences rather than the number directly // - the way the table is accessed, its values *won't* be accessed // in the same order the were written. 
// - any strange correlations would only be apparent if we // were to backstep the generator so that the base generator // was generating the same values again result_type lhs = baseclass::operator()(); result_type rhs = baseclass::operator()(); result_type xdiff = lhs - rhs; for (size_t i = 0; i < table_size; ++i) { data_[i] = baseclass::operator()() ^ xdiff; } } template bool operator==(const extended& lhs, const extended& rhs) { auto& base_lhs = static_cast(lhs); auto& base_rhs = static_cast(rhs); return base_lhs == base_rhs && std::equal( std::begin(lhs.data_), std::end(lhs.data_), std::begin(rhs.data_) ); } template inline bool operator!=(const extended& lhs, const extended& rhs) { return !operator==(lhs, rhs); } template std::basic_ostream& operator<<(std::basic_ostream& out, const extended& rng) { auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left); auto space = out.widen(' '); auto orig_fill = out.fill(); out << rng.multiplier() << space << rng.increment() << space << rng.state_; for (const auto& datum : rng.data_) out << space << datum; out.flags(orig_flags); out.fill(orig_fill); return out; } template std::basic_istream& operator>>(std::basic_istream& in, extended& rng) { extended new_rng; auto& base_rng = static_cast(new_rng); in >> base_rng; if (in.fail()) return in; auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws); for (auto& datum : new_rng.data_) { in >> datum; if (in.fail()) goto bail; } rng = new_rng; bail: in.flags(orig_flags); return in; } template void extended::advance_table() { bool carry = false; for (size_t i = 0; i < table_size; ++i) { if (carry) { carry = insideout::external_step(data_[i],i+1); } bool carry2 = insideout::external_step(data_[i],i+1); carry = carry || carry2; } } template void extended::advance_table( state_type delta, bool isForwards) { typedef typename baseclass::state_type base_state_t; typedef typename extvalclass::state_type ext_state_t; constexpr bitcount_t basebits = sizeof(base_state_t)*8; constexpr bitcount_t extbits = sizeof(ext_state_t)*8; static_assert(basebits <= extbits || advance_pow2 > 0, "Current implementation might overflow its carry"); base_state_t carry = 0; for (size_t i = 0; i < table_size; ++i) { base_state_t total_delta = carry + delta; ext_state_t trunc_delta = ext_state_t(total_delta); if (basebits > extbits) { carry = total_delta >> extbits; } else { carry = 0; } carry += insideout::external_advance(data_[i],i+1, trunc_delta, isForwards); } } template void extended::advance( state_type distance, bool forwards) { static_assert(kdd, "Efficient advance is too hard for non-kdd extension. " "For a weak advance, cast to base class"); state_type zero = baseclass::is_mcg ? this->state_ & state_type(3U) : state_type(0U); if (may_tick) { state_type ticks = distance >> (advance_pow2*may_tick); // ^-- stupidity to appease GCC // warnings state_type adv_mask = baseclass::is_mcg ? 
tick_mask << 2 : tick_mask; state_type next_advance_distance = this->distance(zero, adv_mask); if (!forwards) next_advance_distance = (-next_advance_distance) & tick_mask; if (next_advance_distance < (distance & tick_mask)) { ++ticks; } if (ticks) advance_table(ticks, forwards); } if (forwards) { if (may_tock && this->distance(zero) <= distance) advance_table(); baseclass::advance(distance); } else { if (may_tock && -(this->distance(zero)) <= distance) advance_table(state_type(1U), false); baseclass::advance(-distance); } } } // namespace pcg_detail namespace pcg_engines { using namespace pcg_detail; /* Predefined types for XSH RS */ typedef oneseq_base oneseq_xsh_rs_16_8; typedef oneseq_base oneseq_xsh_rs_32_16; typedef oneseq_base oneseq_xsh_rs_64_32; typedef oneseq_base oneseq_xsh_rs_128_64; typedef oneseq_base cm_oneseq_xsh_rs_128_64; typedef unique_base unique_xsh_rs_16_8; typedef unique_base unique_xsh_rs_32_16; typedef unique_base unique_xsh_rs_64_32; typedef unique_base unique_xsh_rs_128_64; typedef unique_base cm_unique_xsh_rs_128_64; typedef setseq_base setseq_xsh_rs_16_8; typedef setseq_base setseq_xsh_rs_32_16; typedef setseq_base setseq_xsh_rs_64_32; typedef setseq_base setseq_xsh_rs_128_64; typedef setseq_base cm_setseq_xsh_rs_128_64; typedef mcg_base mcg_xsh_rs_16_8; typedef mcg_base mcg_xsh_rs_32_16; typedef mcg_base mcg_xsh_rs_64_32; typedef mcg_base mcg_xsh_rs_128_64; typedef mcg_base cm_mcg_xsh_rs_128_64; /* Predefined types for XSH RR */ typedef oneseq_base oneseq_xsh_rr_16_8; typedef oneseq_base oneseq_xsh_rr_32_16; typedef oneseq_base oneseq_xsh_rr_64_32; typedef oneseq_base oneseq_xsh_rr_128_64; typedef oneseq_base cm_oneseq_xsh_rr_128_64; typedef unique_base unique_xsh_rr_16_8; typedef unique_base unique_xsh_rr_32_16; typedef unique_base unique_xsh_rr_64_32; typedef unique_base unique_xsh_rr_128_64; typedef unique_base cm_unique_xsh_rr_128_64; typedef setseq_base setseq_xsh_rr_16_8; typedef setseq_base setseq_xsh_rr_32_16; typedef setseq_base setseq_xsh_rr_64_32; typedef setseq_base setseq_xsh_rr_128_64; typedef setseq_base cm_setseq_xsh_rr_128_64; typedef mcg_base mcg_xsh_rr_16_8; typedef mcg_base mcg_xsh_rr_32_16; typedef mcg_base mcg_xsh_rr_64_32; typedef mcg_base mcg_xsh_rr_128_64; typedef mcg_base cm_mcg_xsh_rr_128_64; /* Predefined types for RXS M XS */ typedef oneseq_base oneseq_rxs_m_xs_8_8; typedef oneseq_base oneseq_rxs_m_xs_16_16; typedef oneseq_base oneseq_rxs_m_xs_32_32; typedef oneseq_base oneseq_rxs_m_xs_64_64; typedef oneseq_base oneseq_rxs_m_xs_128_128; typedef oneseq_base cm_oneseq_rxs_m_xs_128_128; typedef unique_base unique_rxs_m_xs_8_8; typedef unique_base unique_rxs_m_xs_16_16; typedef unique_base unique_rxs_m_xs_32_32; typedef unique_base unique_rxs_m_xs_64_64; typedef unique_base unique_rxs_m_xs_128_128; typedef unique_base cm_unique_rxs_m_xs_128_128; typedef setseq_base setseq_rxs_m_xs_8_8; typedef setseq_base setseq_rxs_m_xs_16_16; typedef setseq_base setseq_rxs_m_xs_32_32; typedef setseq_base setseq_rxs_m_xs_64_64; typedef setseq_base setseq_rxs_m_xs_128_128; typedef setseq_base cm_setseq_rxs_m_xs_128_128; // MCG versions don't make sense here, so aren't defined. 
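// Usage note: the typedefs above are complete, ready-to-use engines. A
// minimal sketch, assuming this header's path inside rkcommon and using
// arbitrary illustrative seed/stream values:
//
//   #include "rkcommon/utility/detail/pcg_random.hpp"
//   #include <cstdint>
//
//   uint32_t sample()
//   {
//     // setseq engines are constructed from a seed and a stream selector.
//     pcg_engines::setseq_xsh_rr_64_32 rng(42u, 54u);
//     return rng(); // one 32-bit draw from 64 bits of state
//   }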
/* Predefined types for RXS M */ typedef oneseq_base oneseq_rxs_m_16_8; typedef oneseq_base oneseq_rxs_m_32_16; typedef oneseq_base oneseq_rxs_m_64_32; typedef oneseq_base oneseq_rxs_m_128_64; typedef oneseq_base cm_oneseq_rxs_m_128_64; typedef unique_base unique_rxs_m_16_8; typedef unique_base unique_rxs_m_32_16; typedef unique_base unique_rxs_m_64_32; typedef unique_base unique_rxs_m_128_64; typedef unique_base cm_unique_rxs_m_128_64; typedef setseq_base setseq_rxs_m_16_8; typedef setseq_base setseq_rxs_m_32_16; typedef setseq_base setseq_rxs_m_64_32; typedef setseq_base setseq_rxs_m_128_64; typedef setseq_base cm_setseq_rxs_m_128_64; typedef mcg_base mcg_rxs_m_16_8; typedef mcg_base mcg_rxs_m_32_16; typedef mcg_base mcg_rxs_m_64_32; typedef mcg_base mcg_rxs_m_128_64; typedef mcg_base cm_mcg_rxs_m_128_64; /* Predefined types for DXSM */ typedef oneseq_base oneseq_dxsm_16_8; typedef oneseq_base oneseq_dxsm_32_16; typedef oneseq_base oneseq_dxsm_64_32; typedef oneseq_base oneseq_dxsm_128_64; typedef oneseq_base cm_oneseq_dxsm_128_64; typedef unique_base unique_dxsm_16_8; typedef unique_base unique_dxsm_32_16; typedef unique_base unique_dxsm_64_32; typedef unique_base unique_dxsm_128_64; typedef unique_base cm_unique_dxsm_128_64; typedef setseq_base setseq_dxsm_16_8; typedef setseq_base setseq_dxsm_32_16; typedef setseq_base setseq_dxsm_64_32; typedef setseq_base setseq_dxsm_128_64; typedef setseq_base cm_setseq_dxsm_128_64; typedef mcg_base mcg_dxsm_16_8; typedef mcg_base mcg_dxsm_32_16; typedef mcg_base mcg_dxsm_64_32; typedef mcg_base mcg_dxsm_128_64; typedef mcg_base cm_mcg_dxsm_128_64; /* Predefined types for XSL RR (only defined for "large" types) */ typedef oneseq_base oneseq_xsl_rr_64_32; typedef oneseq_base oneseq_xsl_rr_128_64; typedef oneseq_base cm_oneseq_xsl_rr_128_64; typedef unique_base unique_xsl_rr_64_32; typedef unique_base unique_xsl_rr_128_64; typedef unique_base cm_unique_xsl_rr_128_64; typedef setseq_base setseq_xsl_rr_64_32; typedef setseq_base setseq_xsl_rr_128_64; typedef setseq_base cm_setseq_xsl_rr_128_64; typedef mcg_base mcg_xsl_rr_64_32; typedef mcg_base mcg_xsl_rr_128_64; typedef mcg_base cm_mcg_xsl_rr_128_64; /* Predefined types for XSL RR RR (only defined for "large" types) */ typedef oneseq_base oneseq_xsl_rr_rr_64_64; typedef oneseq_base oneseq_xsl_rr_rr_128_128; typedef oneseq_base cm_oneseq_xsl_rr_rr_128_128; typedef unique_base unique_xsl_rr_rr_64_64; typedef unique_base unique_xsl_rr_rr_128_128; typedef unique_base cm_unique_xsl_rr_rr_128_128; typedef setseq_base setseq_xsl_rr_rr_64_64; typedef setseq_base setseq_xsl_rr_rr_128_128; typedef setseq_base cm_setseq_xsl_rr_rr_128_128; // MCG versions don't make sense here, so aren't defined. 
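// Usage note: a sketch of how the stream models above differ at
// construction time (the seed values are purely illustrative):
//
//   pcg_engines::oneseq_xsl_rr_64_32 a(7u);      // single, fixed stream
//   pcg_engines::setseq_xsl_rr_64_32 b(7u, 11u); // caller-selected stream
//   uint32_t x = a(); // 32-bit output from 64-bit state
//   uint32_t y = b();
//
// unique_* engines instead derive their stream from the object's own
// address, and mcg_* engines drop the increment entirely, trading a little
// statistical quality for speed.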
/* Extended generators */ template using ext_std8 = extended; template using ext_std16 = extended; template using ext_std32 = extended; template using ext_std64 = extended; template using ext_oneseq_rxs_m_xs_32_32 = ext_std32; template using ext_mcg_xsh_rs_64_32 = ext_std32; template using ext_oneseq_xsh_rs_64_32 = ext_std32; template using ext_setseq_xsh_rr_64_32 = ext_std32; template using ext_mcg_xsl_rr_128_64 = ext_std64; template using ext_oneseq_xsl_rr_128_64 = ext_std64; template using ext_setseq_xsl_rr_128_64 = ext_std64; } // namespace pcg_engines typedef pcg_engines::setseq_xsh_rr_64_32 pcg32; typedef pcg_engines::oneseq_xsh_rr_64_32 pcg32_oneseq; typedef pcg_engines::unique_xsh_rr_64_32 pcg32_unique; typedef pcg_engines::mcg_xsh_rs_64_32 pcg32_fast; typedef pcg_engines::setseq_xsl_rr_128_64 pcg64; typedef pcg_engines::oneseq_xsl_rr_128_64 pcg64_oneseq; typedef pcg_engines::unique_xsl_rr_128_64 pcg64_unique; typedef pcg_engines::mcg_xsl_rr_128_64 pcg64_fast; typedef pcg_engines::setseq_rxs_m_xs_8_8 pcg8_once_insecure; typedef pcg_engines::setseq_rxs_m_xs_16_16 pcg16_once_insecure; typedef pcg_engines::setseq_rxs_m_xs_32_32 pcg32_once_insecure; typedef pcg_engines::setseq_rxs_m_xs_64_64 pcg64_once_insecure; typedef pcg_engines::setseq_xsl_rr_rr_128_128 pcg128_once_insecure; typedef pcg_engines::oneseq_rxs_m_xs_8_8 pcg8_oneseq_once_insecure; typedef pcg_engines::oneseq_rxs_m_xs_16_16 pcg16_oneseq_once_insecure; typedef pcg_engines::oneseq_rxs_m_xs_32_32 pcg32_oneseq_once_insecure; typedef pcg_engines::oneseq_rxs_m_xs_64_64 pcg64_oneseq_once_insecure; typedef pcg_engines::oneseq_xsl_rr_rr_128_128 pcg128_oneseq_once_insecure; // These two extended RNGs provide two-dimensionally equidistributed // 32-bit generators. pcg32_k2_fast occupies the same space as pcg64, // and can be called twice to generate 64 bits, but does not require // 128-bit math; on 32-bit systems, it's faster than pcg64 as well.
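// A sketch of the "call it twice for 64 bits" idea described above, using
// the pcg32_k2_fast typedef defined just below (the exact assembly of the
// two words is an illustrative choice, not mandated by the library):
//
//   #include <cstdint>
//
//   uint64_t next64(pcg32_k2_fast &rng)
//   {
//     uint64_t hi = rng(); // first 32-bit draw
//     uint64_t lo = rng(); // second 32-bit draw
//     return (hi << 32) | lo;
//   }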
typedef pcg_engines::ext_setseq_xsh_rr_64_32<1,16,true> pcg32_k2; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<1,32,true> pcg32_k2_fast; // These eight extended RNGs have about as much state as arc4random // // - the k variants are k-dimensionally equidistributed // - the c variants offer better crypographic security // // (just how good the cryptographic security is is an open question) typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,true> pcg32_k64; typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,true> pcg32_k64_oneseq; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,true> pcg32_k64_fast; typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,false> pcg32_c64; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,false> pcg32_c64_oneseq; typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,false> pcg32_c64_fast; typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,true> pcg64_k32; typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,true> pcg64_k32_oneseq; typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,true> pcg64_k32_fast; typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,false> pcg64_c32; typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,false> pcg64_c32_oneseq; typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,false> pcg64_c32_fast; // These eight extended RNGs have more state than the Mersenne twister // // - the k variants are k-dimensionally equidistributed // - the c variants offer better crypographic security // // (just how good the cryptographic security is is an open question) typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,true> pcg32_k1024; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,true> pcg32_k1024_fast; typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,false> pcg32_c1024; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,false> pcg32_c1024_fast; typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,true> pcg64_k1024; typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,true> pcg64_k1024_fast; typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,false> pcg64_c1024; typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,false> pcg64_c1024_fast; // These generators have an insanely huge period (2^524352), and is suitable // for silly party tricks, such as dumping out 64 KB ZIP files at an arbitrary // point in the future. [Actually, over the full period of the generator, it // will produce every 64 KB ZIP file 2^64 times!] typedef pcg_engines::ext_setseq_xsh_rr_64_32<14,16,true> pcg32_k16384; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<14,32,true> pcg32_k16384_fast; #ifdef _MSC_VER #pragma warning(default:4146) #endif #endif // PCG_RAND_HPP_INCLUDED RenderKit-rkcommon-988718e/rkcommon/utility/detail/pcg_uint128.hpp000066400000000000000000000603751467524601100251120ustar00rootroot00000000000000/* * PCG Random Number Generation for C++ * * Copyright 2014-2017 Melissa O'Neill , * and the PCG Project contributors. * * SPDX-License-Identifier: (Apache-2.0 OR MIT) * * Licensed under the Apache License, Version 2.0 (provided in * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0) * or under the MIT license (provided in LICENSE-MIT.txt and at * http://opensource.org/licenses/MIT), at your option. This file may not * be copied, modified, or distributed except according to those terms. * * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See your chosen license for details. * * For additional information about the PCG random number generation scheme, * visit http://www.pcg-random.org/. 
*/ /* * This code provides a C++ class that can provide 128-bit (or higher) * integers. To produce 2K-bit integers, it uses two K-bit integers, * placed in a union that allows the code to also see them as four K/2 bit * integers (and access them either directly by name, or by index). * * It may seem like we're reinventing the wheel here, because several * libraries already exist that support large integers, but most existing * libraries provide very generic multiprecision code, whereas here we're * operating at a fixed size. Also, most other libraries are fairly * heavyweight. So we use a direct implementation. Sadly, it's much slower * than hand-coded assembly or direct CPU support. */ #ifndef PCG_UINT128_HPP_INCLUDED #define PCG_UINT128_HPP_INCLUDED 1 #include #include #include #include #include #include #include #if defined(_MSC_VER) // Use MSVC++ intrinsics #include #endif /* * We want to lay the type out the same way that a native type would be laid * out, which means we must know the machine's endianness, at compile time. * This ugliness attempts to do so. */ #ifndef PCG_LITTLE_ENDIAN #if defined(__BYTE_ORDER__) #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define PCG_LITTLE_ENDIAN 1 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define PCG_LITTLE_ENDIAN 0 #else #error __BYTE_ORDER__ does not match a standard endian, pick a side #endif #elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN #define PCG_LITTLE_ENDIAN 1 #elif __BIG_ENDIAN__ || _BIG_ENDIAN #define PCG_LITTLE_ENDIAN 0 #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86 #define PCG_LITTLE_ENDIAN 1 #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \ || __m68k__ || __mc68000__ #define PCG_LITTLE_ENDIAN 0 #else #error Unable to determine target endianness #endif #endif namespace pcg_extras { // Recent versions of GCC have intrinsics we can use to quickly calculate // the number of leading and trailing zeros in a number. If possible, we // use them, otherwise we fall back to old-fashioned bit twiddling to figure // them out. #ifndef PCG_BITCOUNT_T typedef uint8_t bitcount_t; #else typedef PCG_BITCOUNT_T bitcount_t; #endif /* * Provide some useful helper functions * * flog2 floor(log2(x)) * * trailingzeros number of trailing zero bits */ #if defined(__GNUC__) // Any GNU-compatible compiler supporting C++11 has // some useful intrinsics we can use.
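// A quick worked example of the helpers described above: for v = 12
// (binary 1100), flog2(v) == 3 because floor(log2(12)) = 3, and
// trailingzeros(v) == 2 because the two lowest bits are zero.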
inline bitcount_t flog2(uint32_t v) { return 31 - __builtin_clz(v); } inline bitcount_t trailingzeros(uint32_t v) { return __builtin_ctz(v); } inline bitcount_t flog2(uint64_t v) { #if UINT64_MAX == ULONG_MAX return 63 - __builtin_clzl(v); #elif UINT64_MAX == ULLONG_MAX return 63 - __builtin_clzll(v); #else #error Cannot find a function for uint64_t #endif } inline bitcount_t trailingzeros(uint64_t v) { #if UINT64_MAX == ULONG_MAX return __builtin_ctzl(v); #elif UINT64_MAX == ULLONG_MAX return __builtin_ctzll(v); #else #error Cannot find a function for uint64_t #endif } #elif defined(_MSC_VER) // Use MSVC++ intrinsics #pragma intrinsic(_BitScanReverse, _BitScanForward) #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) #pragma intrinsic(_BitScanReverse64, _BitScanForward64) #endif inline bitcount_t flog2(uint32_t v) { unsigned long i; _BitScanReverse(&i, v); return i; } inline bitcount_t trailingzeros(uint32_t v) { unsigned long i; _BitScanForward(&i, v); return i; } inline bitcount_t flog2(uint64_t v) { #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) unsigned long i; _BitScanReverse64(&i, v); return i; #else // 32-bit x86 uint32_t high = v >> 32; uint32_t low = uint32_t(v); return high ? 32+flog2(high) : flog2(low); #endif } inline bitcount_t trailingzeros(uint64_t v) { #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) unsigned long i; _BitScanForward64(&i, v); return i; #else // 32-bit x86 uint32_t high = v >> 32; uint32_t low = uint32_t(v); return low ? trailingzeros(low) : trailingzeros(high)+32; #endif } #else // Otherwise, we fall back to bit twiddling // implementations inline bitcount_t flog2(uint32_t v) { // Based on code by Eric Cole and Mark Dickinson, which appears at // https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn static const uint8_t multiplyDeBruijnBitPos[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; v |= v >> 1; // first round down to one less than a power of 2 v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; return multiplyDeBruijnBitPos[(uint32_t)(v * 0x07C4ACDDU) >> 27]; } inline bitcount_t trailingzeros(uint32_t v) { static const uint8_t multiplyDeBruijnBitPos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; return multiplyDeBruijnBitPos[((uint32_t)((v & -v) * 0x077CB531U)) >> 27]; } inline bitcount_t flog2(uint64_t v) { uint32_t high = v >> 32; uint32_t low = uint32_t(v); return high ? 32+flog2(high) : flog2(low); } inline bitcount_t trailingzeros(uint64_t v) { uint32_t high = v >> 32; uint32_t low = uint32_t(v); return low ? trailingzeros(low) : trailingzeros(high)+32; } #endif inline bitcount_t flog2(uint8_t v) { return flog2(uint32_t(v)); } inline bitcount_t flog2(uint16_t v) { return flog2(uint32_t(v)); } #if __SIZEOF_INT128__ inline bitcount_t flog2(__uint128_t v) { uint64_t high = uint64_t(v >> 64); uint64_t low = uint64_t(v); return high ? 64+flog2(high) : flog2(low); } #endif inline bitcount_t trailingzeros(uint8_t v) { return trailingzeros(uint32_t(v)); } inline bitcount_t trailingzeros(uint16_t v) { return trailingzeros(uint32_t(v)); } #if __SIZEOF_INT128__ inline bitcount_t trailingzeros(__uint128_t v) { uint64_t high = uint64_t(v >> 64); uint64_t low = uint64_t(v); return low ? 
trailingzeros(low) : trailingzeros(high)+64; } #endif template inline bitcount_t clog2(UInt v) { return flog2(v) + ((v & (-v)) != v); } template inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout) { UInt half_result = y + carryin; UInt result = x + half_result; *carryout = (half_result < y) || (result < x); return result; } template inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout) { UInt half_result = y + carryin; UInt result = x - half_result; *carryout = (half_result < y) || (result > x); return result; } template class uint_x4 { // private: static constexpr unsigned int UINT_BITS = sizeof(UInt) * CHAR_BIT; public: union { #if PCG_LITTLE_ENDIAN struct { UInt v0, v1, v2, v3; } w; struct { UIntX2 v01, v23; } d; #else struct { UInt v3, v2, v1, v0; } w; struct { UIntX2 v23, v01; } d; #endif // For the array access versions, the code that uses the array // must handle endian itself. Yuck. UInt wa[4]; UIntX2 da[2]; }; public: uint_x4() = default; constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0) #if PCG_LITTLE_ENDIAN : w{v0, v1, v2, v3} #else : w{v3, v2, v1, v0} #endif { // Nothing (else) to do } constexpr uint_x4(UIntX2 v23, UIntX2 v01) #if PCG_LITTLE_ENDIAN : d{v01,v23} #else : d{v23,v01} #endif { // Nothing (else) to do } constexpr uint_x4(UIntX2 v01) #if PCG_LITTLE_ENDIAN : d{v01, UIntX2(0)} #else : d{UIntX2(0),v01} #endif { // Nothing (else) to do } template::value && sizeof(Integral) <= sizeof(UIntX2)) >::type* = nullptr> constexpr uint_x4(Integral v01) #if PCG_LITTLE_ENDIAN : d{UIntX2(v01), UIntX2(0)} #else : d{UIntX2(0), UIntX2(v01)} #endif { // Nothing (else) to do } explicit constexpr operator UIntX2() const { return d.v01; } template::value && sizeof(Integral) <= sizeof(UIntX2)) >::type* = nullptr> explicit constexpr operator Integral() const { return Integral(d.v01); } explicit constexpr operator bool() const { return d.v01 || d.v23; } template friend uint_x4 operator*(const uint_x4&, const uint_x4&); template friend uint_x4 operator*(const uint_x4&, V); template friend std::pair< uint_x4,uint_x4 > divmod(const uint_x4&, const uint_x4&); template friend uint_x4 operator+(const uint_x4&, const uint_x4&); template friend uint_x4 operator-(const uint_x4&, const uint_x4&); template friend uint_x4 operator<<(const uint_x4&, const bitcount_t shift); template friend uint_x4 operator>>(const uint_x4&, const bitcount_t shift); template friend uint_x4 operator&(const uint_x4&, const uint_x4&); template friend uint_x4 operator|(const uint_x4&, const uint_x4&); template friend uint_x4 operator^(const uint_x4&, const uint_x4&); template friend bool operator==(const uint_x4&, const uint_x4&); template friend bool operator!=(const uint_x4&, const uint_x4&); template friend bool operator<(const uint_x4&, const uint_x4&); template friend bool operator<=(const uint_x4&, const uint_x4&); template friend bool operator>(const uint_x4&, const uint_x4&); template friend bool operator>=(const uint_x4&, const uint_x4&); template friend uint_x4 operator~(const uint_x4&); template friend uint_x4 operator-(const uint_x4&); template friend bitcount_t flog2(const uint_x4&); template friend bitcount_t trailingzeros(const uint_x4&); uint_x4& operator*=(const uint_x4& rhs) { uint_x4 result = *this * rhs; return *this = result; } uint_x4& operator*=(UIntX2 rhs) { uint_x4 result = *this * rhs; return *this = result; } uint_x4& operator/=(const uint_x4& rhs) { uint_x4 result = *this / rhs; return *this = result; } uint_x4& operator%=(const uint_x4& rhs) { uint_x4 result = 
*this % rhs; return *this = result; } uint_x4& operator+=(const uint_x4& rhs) { uint_x4 result = *this + rhs; return *this = result; } uint_x4& operator-=(const uint_x4& rhs) { uint_x4 result = *this - rhs; return *this = result; } uint_x4& operator&=(const uint_x4& rhs) { uint_x4 result = *this & rhs; return *this = result; } uint_x4& operator|=(const uint_x4& rhs) { uint_x4 result = *this | rhs; return *this = result; } uint_x4& operator^=(const uint_x4& rhs) { uint_x4 result = *this ^ rhs; return *this = result; } uint_x4& operator>>=(bitcount_t shift) { uint_x4 result = *this >> shift; return *this = result; } uint_x4& operator<<=(bitcount_t shift) { uint_x4 result = *this << shift; return *this = result; } }; template bitcount_t flog2(const uint_x4& v) { #if PCG_LITTLE_ENDIAN for (uint8_t i = 4; i !=0; /* dec in loop */) { --i; #else for (uint8_t i = 0; i < 4; ++i) { #endif if (v.wa[i] == 0) continue; return flog2(v.wa[i]) + uint_x4::UINT_BITS*i; } abort(); } template bitcount_t trailingzeros(const uint_x4& v) { #if PCG_LITTLE_ENDIAN for (uint8_t i = 0; i < 4; ++i) { #else for (uint8_t i = 4; i !=0; /* dec in loop */) { --i; #endif if (v.wa[i] != 0) return trailingzeros(v.wa[i]) + uint_x4::UINT_BITS*i; } return uint_x4::UINT_BITS*4; } template std::pair< uint_x4, uint_x4 > divmod(const uint_x4& orig_dividend, const uint_x4& divisor) { // If the dividend is less than the divisor, the answer is always zero. // This takes care of boundary cases like 0/x (which would otherwise be // problematic because we can't take the log of zero. (The boundary case // of division by zero is undefined.) if (orig_dividend < divisor) return { uint_x4(UIntX2(0)), orig_dividend }; auto dividend = orig_dividend; auto log2_divisor = flog2(divisor); auto log2_dividend = flog2(dividend); // assert(log2_dividend >= log2_divisor); bitcount_t logdiff = log2_dividend - log2_divisor; constexpr uint_x4 ONE(UIntX2(1)); if (logdiff == 0) return { ONE, dividend - divisor }; // Now we change the log difference to // floor(log2(divisor)) - ceil(log2(dividend)) // to ensure that we *underestimate* the result. 
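// Worked example of the shift-and-subtract loop below (illustrative):
// for 100 / 7, flog2 gives 6 and 2, so logdiff starts at 4 and is reduced
// to 3; we begin with qfactor = 8 and factor = 56, and the successive
// subtractions add 8, 4, and 2 to the quotient, ending with quotient 14
// and remainder 2.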
logdiff -= 1; uint_x4 quotient(UIntX2(0)); auto qfactor = ONE << logdiff; auto factor = divisor << logdiff; do { dividend -= factor; quotient += qfactor; while (dividend < factor) { factor >>= 1; qfactor >>= 1; } } while (dividend >= divisor); return { quotient, dividend }; } template uint_x4 operator/(const uint_x4& dividend, const uint_x4& divisor) { return divmod(dividend, divisor).first; } template uint_x4 operator%(const uint_x4& dividend, const uint_x4& divisor) { return divmod(dividend, divisor).second; } template uint_x4 operator*(const uint_x4& a, const uint_x4& b) { constexpr auto UINT_BITS = uint_x4::UINT_BITS; uint_x4 r = {0U, 0U, 0U, 0U}; bool carryin = false; bool carryout; UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0); r.w.v0 = UInt(a0b0); r.w.v1 = UInt(a0b0 >> UINT_BITS); UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0); r.w.v2 = UInt(a1b0 >> UINT_BITS); r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1); carryin = false; r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); carryin = false; r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1); carryin = false; r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout); r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01; return r; } template uint_x4 operator*(const uint_x4& a, UIntX2 b01) { constexpr auto UINT_BITS = uint_x4::UINT_BITS; uint_x4 r = {0U, 0U, 0U, 0U}; bool carryin = false; bool carryout; UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(UInt(b01)); r.w.v0 = UInt(a0b0); r.w.v1 = UInt(a0b0 >> UINT_BITS); UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(UInt(b01)); r.w.v2 = UInt(a1b0 >> UINT_BITS); r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b01 >> UINT_BITS); carryin = false; r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); carryin = false; r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b01 >> UINT_BITS); carryin = false; r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout); r.d.v23 += a.d.v23 * b01; return r; } template uint_x4 operator+(const uint_x4& a, const uint_x4& b) { uint_x4 r = {0U, 0U, 0U, 0U}; bool carryin = false; bool carryout; r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout); carryin = carryout; r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout); carryin = carryout; 
r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout); return r; } template uint_x4 operator-(const uint_x4& a, const uint_x4& b) { uint_x4 r = {0U, 0U, 0U, 0U}; bool carryin = false; bool carryout; r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout); carryin = carryout; r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout); carryin = carryout; r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout); carryin = carryout; r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout); return r; } template uint_x4 operator&(const uint_x4& a, const uint_x4& b) { return uint_x4(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01); } template uint_x4 operator|(const uint_x4& a, const uint_x4& b) { return uint_x4(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01); } template uint_x4 operator^(const uint_x4& a, const uint_x4& b) { return uint_x4(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01); } template uint_x4 operator~(const uint_x4& v) { return uint_x4(~v.d.v23, ~v.d.v01); } template uint_x4 operator-(const uint_x4& v) { return uint_x4(0UL,0UL) - v; } template bool operator==(const uint_x4& a, const uint_x4& b) { return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23); } template bool operator!=(const uint_x4& a, const uint_x4& b) { return !operator==(a,b); } template bool operator<(const uint_x4& a, const uint_x4& b) { return (a.d.v23 < b.d.v23) || ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01)); } template bool operator>(const uint_x4& a, const uint_x4& b) { return operator<(b,a); } template bool operator<=(const uint_x4& a, const uint_x4& b) { return !(operator<(b,a)); } template bool operator>=(const uint_x4& a, const uint_x4& b) { return !(operator<(a,b)); } template uint_x4 operator<<(const uint_x4& v, const bitcount_t shift) { uint_x4 r = {0U, 0U, 0U, 0U}; const bitcount_t bits = uint_x4::UINT_BITS; const bitcount_t bitmask = bits - 1; const bitcount_t shiftdiv = shift / bits; const bitcount_t shiftmod = shift & bitmask; if (shiftmod) { UInt carryover = 0; #if PCG_LITTLE_ENDIAN for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { #else for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { --out, --in; #endif r.wa[out] = (v.wa[in] << shiftmod) | carryover; carryover = (v.wa[in] >> (bits - shiftmod)); } } else { #if PCG_LITTLE_ENDIAN for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { #else for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { --out, --in; #endif r.wa[out] = v.wa[in]; } } return r; } template uint_x4 operator>>(const uint_x4& v, const bitcount_t shift) { uint_x4 r = {0U, 0U, 0U, 0U}; const bitcount_t bits = uint_x4::UINT_BITS; const bitcount_t bitmask = bits - 1; const bitcount_t shiftdiv = shift / bits; const bitcount_t shiftmod = shift & bitmask; if (shiftmod) { UInt carryover = 0; #if PCG_LITTLE_ENDIAN for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { --out, --in; #else for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { #endif r.wa[out] = (v.wa[in] >> shiftmod) | carryover; carryover = (v.wa[in] << (bits - shiftmod)); } } else { #if PCG_LITTLE_ENDIAN for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { --out, --in; #else for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { #endif r.wa[out] = v.wa[in]; } } return r; } } // namespace pcg_extras #endif // PCG_UINT128_HPP_INCLUDED RenderKit-rkcommon-988718e/rkcommon/utility/getEnvVar.h000066400000000000000000000025751467524601100231440ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once 
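// Usage sketch (illustrative; the environment-variable name below is made
// up for the example): the helpers defined in this header return an empty
// utility::Optional when the variable is unset.
//
//   #include "rkcommon/utility/getEnvVar.h"
//
//   // an Optional<int> that is empty unless MY_APP_THREADS is set
//   auto threads = rkcommon::utility::getEnvVar<int>("MY_APP_THREADS");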
#include "Optional.h" namespace rkcommon { namespace utility { template inline Optional getEnvVar(const std::string & /*var*/) { static_assert(!std::is_same::value && !std::is_same::value && !std::is_same::value, "You can only get an int, float, or std::string " "when using ospray::getEnvVar()!"); return {}; } template <> inline Optional getEnvVar(const std::string &var) { auto *str = getenv(var.c_str()); bool found = (str != nullptr); return found ? Optional((float)atof(str)) : Optional(); } template <> inline Optional getEnvVar(const std::string &var) { auto *str = getenv(var.c_str()); bool found = (str != nullptr); return found ? Optional(atoi(str)) : Optional(); } template <> inline Optional getEnvVar(const std::string &var) { auto *str = getenv(var.c_str()); bool found = (str != nullptr); return found ? Optional(std::string(str)) : Optional(); } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/multidim_index_sequence.h000066400000000000000000000141151467524601100261370ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../math/vec.h" namespace rkcommon { using namespace math; template struct multidim_index_iterator; template struct multidim_index_sequence { static_assert(NDIMS == 2 || NDIMS == 3, "rkcommon::multidim_index_sequence is currently limited to" " only 2 or 3 dimensions. (NDIMS == 2 || NDIMS == 3)"); multidim_index_sequence(const vec_t &_dims); size_t flatten(const vec_t &coords) const; vec_t reshape(size_t i) const; vec_t dimensions() const; size_t total_indices() const; multidim_index_iterator begin() const; multidim_index_iterator end() const; private: vec_t dims{0}; }; using index_sequence_2D = multidim_index_sequence<2>; using index_sequence_3D = multidim_index_sequence<3>; template struct multidim_index_iterator { multidim_index_iterator(const vec_t &_dims) : dims(_dims) {} multidim_index_iterator(const vec_t &_dims, size_t start) : multidim_index_iterator(_dims) { current_index = start; } // Traditional iterator interface methods // vec_t operator*() const; multidim_index_iterator operator++(); multidim_index_iterator &operator++(int); multidim_index_iterator operator--(); multidim_index_iterator &operator--(int); multidim_index_iterator &operator+(const multidim_index_iterator &other); multidim_index_iterator &operator-(const multidim_index_iterator &other); multidim_index_iterator &operator+(size_t other); multidim_index_iterator &operator-(size_t other); bool operator==(const multidim_index_iterator &other) const; bool operator!=(const multidim_index_iterator &other) const; // Extra helper methods // void jump_to(size_t index); size_t current() const; private: multidim_index_sequence dims; size_t current_index{0}; }; // Inlined multidim_index_sequence definitions ////////////////////////////// template inline multidim_index_sequence::multidim_index_sequence( const vec_t &_dims) : dims(_dims) { } template <> inline size_t index_sequence_2D::flatten(const vec_t &coords) const { return coords.x + dims.x * coords.y; } template <> inline size_t index_sequence_3D::flatten(const vec_t &coords) const { return coords.x + dims.x * (coords.y + dims.y * coords.z); } template <> inline vec_t index_sequence_2D::reshape(size_t i) const { size_t y = i / dims.x; size_t x = i % dims.x; return vec_t(x, y); } template <> inline vec_t index_sequence_3D::reshape(size_t i) const { size_t z = i / (dims.x * dims.y); i -= (z * dims.x * dims.y); size_t y = i / dims.x; size_t x 
= i % dims.x; return vec_t(x, y, z); } template inline vec_t multidim_index_sequence::dimensions() const { return dims; } template inline size_t multidim_index_sequence::total_indices() const { return dims.long_product(); } template multidim_index_iterator multidim_index_sequence::begin() const { return multidim_index_iterator(dims, 0); } template multidim_index_iterator multidim_index_sequence::end() const { return multidim_index_iterator(dims, total_indices()); } // Inlined multidim_index_iterator definitions ////////////////////////////// template inline vec_t multidim_index_iterator::operator*() const { return dims.reshape(current_index); } template inline multidim_index_iterator multidim_index_iterator::operator++() { return multidim_index_iterator(dims.dimensions(), ++current_index); } template inline multidim_index_iterator &multidim_index_iterator::operator++(int) { current_index++; return *this; } template inline multidim_index_iterator multidim_index_iterator::operator--() { return multidim_index_iterator(dims.dimensions(), --current_index); } template inline multidim_index_iterator &multidim_index_iterator::operator--(int) { current_index--; return *this; } template inline multidim_index_iterator &multidim_index_iterator::operator+( const multidim_index_iterator &other) { current_index += other.current_index; return *this; } template inline multidim_index_iterator &multidim_index_iterator::operator-( const multidim_index_iterator &other) { current_index -= other.current_index; return *this; } template inline multidim_index_iterator &multidim_index_iterator::operator+(size_t offset) { current_index += offset; return *this; } template inline multidim_index_iterator &multidim_index_iterator::operator-(size_t offset) { current_index -= offset; return *this; } template inline bool multidim_index_iterator::operator==( const multidim_index_iterator &other) const { return dims.dimensions() == other.dims.dimensions() && current_index == other.current_index; } template inline bool multidim_index_iterator::operator!=( const multidim_index_iterator &other) const { return !(*this == other); } template inline void multidim_index_iterator::jump_to(size_t index) { current_index = index; } template inline size_t multidim_index_iterator::current() const { return current_index; } } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/random.h000066400000000000000000000042211467524601100225110ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include "../common.h" #include "../math/vec.h" #include "detail/pcg_random.hpp" namespace rkcommon { namespace utility { class pcg32_biased_float_distribution { public: pcg32_biased_float_distribution(int seed, int sequence, float lower, float upper); float operator()(); private: pcg32 rng; float lower, upper, diff; }; // Inlined pcg32_biased_float_distribution definitions /////////////////// inline pcg32_biased_float_distribution::pcg32_biased_float_distribution( int seed, int sequence, float lower, float upper) : lower(lower), upper(upper) { diff = upper - lower; rng.seed(seed, sequence); } inline float pcg32_biased_float_distribution::operator()() { const unsigned scaleBits = 0x2F800000; // 2^(-32) const float scale = *(float *)&scaleBits; return (scale * rng()) * diff + lower; } // The std::uniform_real_distribution from is not portable and may give // different results on different plaforms/compilers, we have to use our own // implementation for consistency template 
struct uniform_real_distribution { uniform_real_distribution(T lowerValue = 0, T upperValue = 1) : l(lowerValue), u(upperValue) {} template T operator()(G &g) { const T scale = (u - l) / T(g.max() - g.min()); return l + (g() - g.min()) * scale; } private: T l, u; }; inline math::vec3f makeRandomColor(const unsigned int i) { const unsigned int mx = 13 * 17 * 43; const unsigned int my = 11 * 29; const unsigned int mz = 7 * 23 * 63; const unsigned int g = (i * (3 * 5 * 127) + 12312314); return math::vec3f((g % mx) * (1.f / (mx - 1)), (g % my) * (1.f / (my - 1)), (g % mz) * (1.f / (mz - 1))); } } // namespace utility } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/utility/random.ih000066400000000000000000000010061467524601100226600ustar00rootroot00000000000000// Copyright 2022 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../math/vec.ih" #ifndef ISPC namespace ispc { #endif inline vec3f makeRandomColor(const uint32 i) { const uniform uint32 mx = 13 * 17 * 43; const uniform uint32 my = 11 * 29; const uniform uint32 mz = 7 * 23 * 63; const uint32 g = (i * (3 * 5 * 127) + 12312314); return make_vec3f((g % mx) * (1.f / (mx - 1)), (g % my) * (1.f / (my - 1)), (g % mz) * (1.f / (mz - 1))); } #ifndef ISPC } #endif RenderKit-rkcommon-988718e/rkcommon/version.h.in000066400000000000000000000004361467524601100216240ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #define RKCOMMON_VERSION_MAJOR @PROJECT_VERSION_MAJOR@ #define RKCOMMON_VERSION_MINOR @PROJECT_VERSION_MINOR@ #define RKCOMMON_VERSION_PATCH @PROJECT_VERSION_PATCH@ #define RKCOMMON_VERSION "@PROJECT_VERSION@" RenderKit-rkcommon-988718e/rkcommon/xml/000077500000000000000000000000001467524601100201565ustar00rootroot00000000000000RenderKit-rkcommon-988718e/rkcommon/xml/XML.cpp000066400000000000000000000225241467524601100213270ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "XML.h" #include namespace rkcommon { namespace xml { std::string toString(const float f) { std::stringstream ss; ss << f; return ss.str(); } std::string toString(const math::vec3f &v) { std::stringstream ss; ss << v.x << " " << v.y << " " << v.z; return ss.str(); } /*! checks if given node has given property */ bool Node::hasProp(const std::string &propName) const { return (properties.find(propName) != properties.end()); } /*! return value of property with given name if present, else return * 'fallbackValue' */ std::string Node::getProp(const std::string &propName, const std::string &fallbackValue) const { const auto it = properties.find(propName); return (it != properties.end()) ? it->second : fallbackValue; } /*! 
return value of property with given name if present; and throw an * exception if not */ std::string Node::getProp(const std::string &propName) const { return getProp(propName, std::string()); } static bool isWhite(char s) { return s == ' ' || s == '\t' || s == '\n' || s == '\r'; } static void expect(char *&s, const char w) { if (*s != w) { std::stringstream err; err << "error reading XML file: expecting '" << w << "', but found '" << *s << "'"; throw std::runtime_error(err.str()); } } static void expect(char *&s, const char w0, const char w1) { if (*s != w0 && *s != w1) { std::stringstream err; err << "error reading XML file: expecting '" << w0 << "' or '" << w1 << "', but found '" << *s << "'"; throw std::runtime_error(err.str()); } } static void consume(char *&s, const char w) { expect(s, w); ++s; } static void consumeComment(char *&s) { consume(s, '<'); consume(s, '!'); while (!((s[0] == 0) || (s[0] == '-' && s[1] == '-' && s[2] == '>'))) ++s; consume(s, '-'); consume(s, '-'); consume(s, '>'); } static void consume(char *&s, const char *word) { const char *in = word; while (*word) { try { consume(s, *word); ++word; } catch (...) { std::stringstream err; err << "error reading XML file: expecting '" << in << "', but could not find it"; throw std::runtime_error(err.str()); } } } static std::string makeString(const char *begin, const char *end) { if (!begin || !end || begin > end) throw std::runtime_error("invalid substring in osp::xml::makeString"); if (begin == end) return ""; char *mem = new char[end - begin + 1]; memcpy(mem, begin, end - begin); mem[end - begin] = 0; std::string s = mem; delete[] mem; return s; } static void parseString(char *&s, std::string &value) { if (*s == '"') { consume(s, '"'); char *begin = s; while (*s != '"') { if (*s == '\\') ++s; ++s; } char *end = s; value = makeString(begin, end); consume(s, '"'); } else { consume(s, '\''); char *begin = s; while (*s != '\'') { if (*s == '\\') ++s; ++s; } char *end = s; value = makeString(begin, end); consume(s, '\''); } } static bool parseIdentifier(char *&s, std::string &identifier) { if (isalpha(*s) || *s == '_') { char *begin = s; ++s; while (isalpha(*s) || isdigit(*s) || *s == '_' || *s == '.') { ++s; } char *end = s; identifier = makeString(begin, end); return true; } return false; } static void skipWhites(char *&s) { while (isWhite(*s)) ++s; } static bool parseProp(char *&s, std::string &name, std::string &value) { if (!parseIdentifier(s, name)) return false; skipWhites(s); consume(s, '='); skipWhites(s); expect(s, '"', '\''); parseString(s, value); return true; } static bool skipComment(char *&s) { if (*s == '<' && s[1] == '!') { consumeComment(s); return true; } return false; } static Node parseNode(char *&s) { consume(s, '<'); Node node; if (!parseIdentifier(s, node.name)) throw std::runtime_error("XML error: could not parse node name"); skipWhites(s); std::string name, value; while (parseProp(s, name, value)) { node.properties[name] = value; skipWhites(s); } if (*s == '/') { consume(s, "/>"); return node; } consume(s, ">"); while (1) { skipWhites(s); if (skipComment(s)) continue; if (*s == '<' && s[1] == '/') { consume(s, ", but ended with '"); } consume(s, ">"); break; // either end of current node } else if (*s == '<') { // child node node.child.push_back(parseNode(s)); } else if (*s == 0) { std::cout << "#osp:xml: warning: xml file ended with still-open" " nodes (this typically indicates a partial xml file)" << std::endl; return node; } else { if (node.content != "") { throw std::runtime_error( "invalid XML 
node - two different" " contents!?"); } // content char *begin = s; while (*s != '<' && *s != 0) ++s; char *end = s; while (isspace(end[-1])) --end; node.content = makeString(begin, end); } } return node; } static bool parseHeader(char *&s) { consume(s, "') { consume(s, "?>"); return true; } if (!isWhite(*s)) return false; ++s; skipWhites(s); std::string name, value; while (parseProp(s, name, value)) { // ignore header prop skipWhites(s); } consume(s, "?>"); return true; } void parseXML(XMLDoc &doc, char *s) { if (s[0] == '<' && s[1] == '?') { if (!parseHeader(s)) throw std::runtime_error("could not parse XML header"); } skipWhites(s); while (*s != 0) { if (skipComment(s)) { skipWhites(s); continue; } doc.child.push_back(parseNode(s)); skipWhites(s); } if (*s != 0) throw std::runtime_error("un-parsed junk at end of file"); } void Writer::spaces() { for (size_t i = 0; i < state.size(); i++) fprintf(xml, " "); } void Writer::writeProperty(const std::string &name, const std::string &value) { assert(xml); assert(!state.empty()); State *s = state.top(); (void)s; assert(s); assert(!s->hasContent); // content may not be written before properties fprintf(xml, " %s=\"%s\"", name.c_str(), value.c_str()); } void Writer::openNode(const std::string &type) { assert(xml); spaces(); fprintf(xml, "<%s", type.c_str()); State *s = new State; s->type = type; state.push(s); } void Writer::closeNode() { assert(xml); assert(!state.empty()); State *s = state.top(); assert(s); if (s->hasContent) fprintf(xml, "", s->type.c_str()); else fprintf(xml, "/>\n"); delete s; state.pop(); } XMLDoc readXML(const std::string &fn) { FILE *file = fopen(fn.c_str(), "r"); if (!file) { throw std::runtime_error("ospray::XML error: could not open file '" + fn + "'"); } fseek(file, 0, SEEK_END); ssize_t numBytes = #ifdef _WIN32 _ftelli64(file); #else ftell(file); #endif fseek(file, 0, SEEK_SET); std::vector mem(numBytes + 1, 0); try { auto rc = fread(mem.data(), 1, numBytes, file); (void)rc; XMLDoc doc; doc.fileName = fn; parseXML(doc, mem.data()); fclose(file); return doc; } catch (const std::runtime_error &e) { fclose(file); throw e; } } Writer::Writer(FILE *xml, FILE *bin) : xml(xml), bin(bin) {} /*! write document header, may only be called once */ void Writer::writeHeader(const std::string &version) { assert(xml); fprintf(xml, "\n", version.c_str()); } /*! write document footer. may only be called once, at end of write */ void Writer::writeFooter() { assert(xml); } } // namespace xml } // namespace rkcommon RenderKit-rkcommon-988718e/rkcommon/xml/XML.h000066400000000000000000000062151467524601100207730ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // ospcomon #include "../common.h" #include "../math/vec.h" #include "../os/FileName.h" // stl #include #include #include #include namespace rkcommon { namespace xml { struct Node; struct XMLDoc; /*! a XML node, consisting of a name, a list of properties, and a set of child nodes */ struct RKCOMMON_INTERFACE Node { Node() = default; ~Node() = default; /*! checks if given node has given property */ bool hasProp(const std::string &name) const; /*! return value of property with given name if present; and throw an * exception if not */ std::string getProp(const std::string &name) const; /*! return value of property with given name if present, else return * 'fallbackValue' */ std::string getProp(const std::string &name, const std::string &fallbackValue) const; /*! 
RenderKit-rkcommon-988718e/rkcommon/xml/XML.h000066400000000000000000000062151467524601100207730ustar00rootroot00000000000000// Copyright 2009 Intel Corporation
// SPDX-License-Identifier: Apache-2.0

#pragma once

// ospcommon
#include "../common.h"
#include "../math/vec.h"
#include "../os/FileName.h"
// stl
#include <map>
#include <stack>
#include <string>
#include <vector>

namespace rkcommon {
namespace xml {

struct Node;
struct XMLDoc;

/*! an XML node, consisting of a name, a list of properties, and a set
    of child nodes */
struct RKCOMMON_INTERFACE Node
{
  Node() = default;
  ~Node() = default;

  /*! checks if given node has given property */
  bool hasProp(const std::string &name) const;

  /*! return value of property with given name if present; and throw an
   * exception if not */
  std::string getProp(const std::string &name) const;

  /*! return value of property with given name if present, else return
   * 'fallbackValue' */
  std::string getProp(const std::string &name,
      const std::string &fallbackValue) const;

  /*! name of the xml node (i.e., the thing that's in "<name>....</name>") */
  std::string name;

  /*! the content string, i.e., the thing that's between "<name...>" and
      "</name>" */
  std::string content;

  /*! \brief list of xml node properties.
      \detailed properties in xml nodes are the 'name="value"' pairs
      inside the '<node name1="value1" ... >' description */
  std::map<std::string, std::string> properties;

  /*! list of child nodes */
  std::vector<Node> child;
};

/*! an entire xml document */
struct RKCOMMON_INTERFACE XMLDoc : public Node
{
  XMLDoc() = default;
  ~XMLDoc() = default;

  FileName fileName;
};

/*! parse an XML file with given file name, and return the parsed document.
    In case of any error, this function will free all already-allocated
    data, and throw a std::runtime_error exception */
RKCOMMON_INTERFACE XMLDoc readXML(const std::string &fn);

/*! helper class for writing sg nodes in XML format */
struct Writer
{
  Writer(FILE *xml, FILE *bin);

  /*! write document header, may only be called once */
  void writeHeader(const std::string &version);
  /*! write document footer. may only be called once, at end of write */
  void writeFooter();

  /*! open a new xml node with given node type */
  void openNode(const std::string &type);
  void writeProperty(const std::string &name, const std::string &value);
  void writeContent(const std::string &name, const std::string &value);
  /*! close last open node type */
  void closeNode();
  /*! align output pos on binary file to given alignment */
  void alignData(size_t alignment);
  /*! write given data into data file, and return offset value at which it
      was written */
  size_t writeData(const void *ptr, size_t size);

  FILE *xml, *bin;

 private:
  struct State
  {
    bool hasContent{false};
    std::string type;
  };

  void spaces();
  std::stack<State *> state;
};

} // namespace xml
} // namespace rkcommon
RenderKit-rkcommon-988718e/tests/000077500000000000000000000000001467524601100166735ustar00rootroot00000000000000RenderKit-rkcommon-988718e/tests/CMakeLists.txt000066400000000000000000000057251467524601100214340ustar00rootroot00000000000000## Copyright 2009 Intel Corporation
## SPDX-License-Identifier: Apache-2.0

add_executable(rkcommon_test_suite
  ${RKCOMMON_RESOURCE}

  catch_main.cpp

  array3D/test_Array3D.cpp
  array3D/test_for_each.cpp

  math/test_AffineSpace.cpp
  math/test_box.cpp
  math/test_constants.cpp
  math/test_LinearSpace.cpp
  math/test_rkmath.cpp
  math/test_Quaternion.cpp
  math/test_range.cpp
  math/test_vec.cpp

  memory/test_DeletedUniquePtr.cpp
  memory/test_malloc.cpp
  memory/test_RefCount.cpp

  os/test_FileName.cpp
  os/test_library.cpp

  containers/test_AlignedVector.cpp
  containers/test_FlatMap.cpp
  containers/test_TransactionalBuffer.cpp

  tasking/test_async.cpp
  tasking/test_AsyncLoop.cpp
  tasking/test_AsyncTask.cpp
  tasking/test_parallel_for.cpp
  tasking/test_parallel_foreach.cpp
  tasking/test_schedule.cpp

  traits/test_traits.cpp

  utility/test_AbstractArray.cpp
  utility/test_Any.cpp
  utility/test_ArgumentList.cpp
  utility/test_ArrayView.cpp
  utility/test_CodeTimer.cpp
  utility/test_DataView.cpp
  utility/test_demangle.cpp
  utility/test_DoubleBufferedValue.cpp
  utility/test_getEnvVar.cpp
  utility/test_multidim_index_sequence.cpp
  utility/test_Observers.cpp
  utility/test_OnScopeExit.cpp
  utility/test_Optional.cpp
  utility/test_OwnedArray.cpp
  utility/test_ParameterizedObject.cpp
  utility/test_PseudoURL.cpp
  utility/test_random.cpp
  utility/test_SaveImage.cpp
  utility/test_StringManip.cpp
  utility/test_TimeStamp.cpp
  utility/test_TransactionalValue.cpp
)

target_link_libraries(rkcommon_test_suite PRIVATE rkcommon)

add_test(NAME ArgumentList COMMAND
  rkcommon_test_suite "[ArgumentList]")
add_test(NAME ArrayView COMMAND
rkcommon_test_suite "[ArrayView]") add_test(NAME OnScopeExit COMMAND rkcommon_test_suite "[OnScopeExit]") add_test(NAME Optional COMMAND rkcommon_test_suite "[Optional]") add_test(NAME FlatMap COMMAND rkcommon_test_suite "[FlatMap]") add_test(NAME TransactionalBuffer COMMAND rkcommon_test_suite "[TransactionalBuffer]") add_test(NAME StringManip COMMAND rkcommon_test_suite "[StringManip]") add_test(NAME random COMMAND rkcommon_test_suite "[random]") if(NOT WIN32) # Tests which are broken on Windows with unknown fixes (for now) add_test(NAME Any COMMAND rkcommon_test_suite "[Any]") add_test(NAME AlignedVector COMMAND rkcommon_test_suite "[AlignedVector]") add_test(NAME Observers COMMAND rkcommon_test_suite "[Observers]") add_test(NAME ParameterizedObject COMMAND rkcommon_test_suite "[ParameterizedObject]") add_test(NAME async COMMAND rkcommon_test_suite "[async]") add_test(NAME parallel_for COMMAND rkcommon_test_suite "[parallel_for]") add_test(NAME parallel_foreach COMMAND rkcommon_test_suite "[parallel_foreach]") add_test(NAME schedule COMMAND rkcommon_test_suite "[schedule]") endif() install(TARGETS rkcommon_test_suite RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) RenderKit-rkcommon-988718e/tests/array3D/000077500000000000000000000000001467524601100202005ustar00rootroot00000000000000RenderKit-rkcommon-988718e/tests/array3D/test_Array3D.cpp000066400000000000000000000002461467524601100232120ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // only test compliation, no functional tests (yet) #include "rkcommon/array3D/Array3D.h" RenderKit-rkcommon-988718e/tests/array3D/test_for_each.cpp000066400000000000000000000002471467524601100235140ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // only test compliation, no functional tests (yet) #include "rkcommon/array3D/for_each.h" RenderKit-rkcommon-988718e/tests/catch.hpp000066400000000000000000024040031467524601100204710ustar00rootroot00000000000000/* * Catch v2.13.10 * Generated: 2022-10-16 11:01:23.452308 * ---------------------------------------------------------- * This file has been merged from multiple headers. Please don't edit it directly * Copyright (c) 2022 Two Blue Cubes Ltd. All rights reserved. * * Distributed under the Boost Software License, Version 1.0. (See accompanying * file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) */ #ifndef TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED #define TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED // start catch.hpp #define CATCH_VERSION_MAJOR 2 #define CATCH_VERSION_MINOR 13 #define CATCH_VERSION_PATCH 10 #ifdef __clang__ # pragma clang system_header #elif defined __GNUC__ # pragma GCC system_header #endif // start catch_suppress_warnings.h #ifdef __clang__ # ifdef __ICC // icpc defines the __clang__ macro # pragma warning(push) # pragma warning(disable: 161 1682) # else // __ICC # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wpadded" # pragma clang diagnostic ignored "-Wswitch-enum" # pragma clang diagnostic ignored "-Wcovered-switch-default" # endif #elif defined __GNUC__ // Because REQUIREs trigger GCC's -Wparentheses, and because still // supported version of g++ have only buggy support for _Pragmas, // Wparentheses have to be suppressed globally. 
# pragma GCC diagnostic ignored "-Wparentheses" // See #674 for details # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wunused-variable" # pragma GCC diagnostic ignored "-Wpadded" #endif // end catch_suppress_warnings.h #if defined(CATCH_CONFIG_MAIN) || defined(CATCH_CONFIG_RUNNER) # define CATCH_IMPL # define CATCH_CONFIG_ALL_PARTS #endif // In the impl file, we want to have access to all parts of the headers // Can also be used to sanely support PCHs #if defined(CATCH_CONFIG_ALL_PARTS) # define CATCH_CONFIG_EXTERNAL_INTERFACES # if defined(CATCH_CONFIG_DISABLE_MATCHERS) # undef CATCH_CONFIG_DISABLE_MATCHERS # endif # if !defined(CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER) # define CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER # endif #endif #if !defined(CATCH_CONFIG_IMPL_ONLY) // start catch_platform.h // See e.g.: // https://opensource.apple.com/source/CarbonHeaders/CarbonHeaders-18.1/TargetConditionals.h.auto.html #ifdef __APPLE__ # include # if (defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1) || \ (defined(TARGET_OS_MAC) && TARGET_OS_MAC == 1) # define CATCH_PLATFORM_MAC # elif (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE == 1) # define CATCH_PLATFORM_IPHONE # endif #elif defined(linux) || defined(__linux) || defined(__linux__) # define CATCH_PLATFORM_LINUX #elif defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER) || defined(__MINGW32__) # define CATCH_PLATFORM_WINDOWS #endif // end catch_platform.h #ifdef CATCH_IMPL # ifndef CLARA_CONFIG_MAIN # define CLARA_CONFIG_MAIN_NOT_DEFINED # define CLARA_CONFIG_MAIN # endif #endif // start catch_user_interfaces.h namespace Catch { unsigned int rngSeed(); } // end catch_user_interfaces.h // start catch_tag_alias_autoregistrar.h // start catch_common.h // start catch_compiler_capabilities.h // Detect a number of compiler features - by compiler // The following features are defined: // // CATCH_CONFIG_COUNTER : is the __COUNTER__ macro supported? // CATCH_CONFIG_WINDOWS_SEH : is Windows SEH supported? // CATCH_CONFIG_POSIX_SIGNALS : are POSIX signals supported? // CATCH_CONFIG_DISABLE_EXCEPTIONS : Are exceptions enabled? // **************** // Note to maintainers: if new toggles are added please document them // in configuration.md, too // **************** // In general each macro has a _NO_ form // (e.g. CATCH_CONFIG_NO_POSIX_SIGNALS) which disables the feature. // Many features, at point of detection, define an _INTERNAL_ macro, so they // can be combined, en-mass, with the _NO_ forms later. #ifdef __cplusplus # if (__cplusplus >= 201402L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) # define CATCH_CPP14_OR_GREATER # endif # if (__cplusplus >= 201703L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) # define CATCH_CPP17_OR_GREATER # endif #endif // Only GCC compiler should be used in this block, so other compilers trying to // mask themselves as GCC should be ignored. #if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && !defined(__CUDACC__) && !defined(__LCC__) # define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic push" ) # define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic pop" ) # define CATCH_INTERNAL_IGNORE_BUT_WARN(...) 
(void)__builtin_constant_p(__VA_ARGS__) #endif #if defined(__clang__) # define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic push" ) # define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic pop" ) // As of this writing, IBM XL's implementation of __builtin_constant_p has a bug // which results in calls to destructors being emitted for each temporary, // without a matching initialization. In practice, this can result in something // like `std::string::~string` being called on an uninitialized value. // // For example, this code will likely segfault under IBM XL: // ``` // REQUIRE(std::string("12") + "34" == "1234") // ``` // // Therefore, `CATCH_INTERNAL_IGNORE_BUT_WARN` is not implemented. # if !defined(__ibmxl__) && !defined(__CUDACC__) # define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__) /* NOLINT(cppcoreguidelines-pro-type-vararg, hicpp-vararg) */ # endif # define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wexit-time-destructors\"" ) \ _Pragma( "clang diagnostic ignored \"-Wglobal-constructors\"") # define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wparentheses\"" ) # define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wunused-variable\"" ) # define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wgnu-zero-variadic-macro-arguments\"" ) # define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wunused-template\"" ) #endif // __clang__ //////////////////////////////////////////////////////////////////////////////// // Assume that non-Windows platforms support posix signals by default #if !defined(CATCH_PLATFORM_WINDOWS) #define CATCH_INTERNAL_CONFIG_POSIX_SIGNALS #endif //////////////////////////////////////////////////////////////////////////////// // We know some environments not to support full POSIX signals #if defined(__CYGWIN__) || defined(__QNX__) || defined(__EMSCRIPTEN__) || defined(__DJGPP__) #define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS #endif #ifdef __OS400__ # define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS # define CATCH_CONFIG_COLOUR_NONE #endif //////////////////////////////////////////////////////////////////////////////// // Android somehow still does not support std::to_string #if defined(__ANDROID__) # define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING # define CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE #endif //////////////////////////////////////////////////////////////////////////////// // Not all Windows environments support SEH properly #if defined(__MINGW32__) # define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH #endif //////////////////////////////////////////////////////////////////////////////// // PS4 #if defined(__ORBIS__) # define CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE #endif //////////////////////////////////////////////////////////////////////////////// // Cygwin #ifdef __CYGWIN__ // Required for some versions of Cygwin to declare gettimeofday // see: http://stackoverflow.com/questions/36901803/gettimeofday-not-declared-in-this-scope-cygwin # define _BSD_SOURCE // some versions of cygwin (most) do not support std::to_string. Use the libstd check. 
// https://gcc.gnu.org/onlinedocs/gcc-4.8.2/libstdc++/api/a01053_source.html line 2812-2813 # if !((__cplusplus >= 201103L) && defined(_GLIBCXX_USE_C99) \ && !defined(_GLIBCXX_HAVE_BROKEN_VSWPRINTF)) # define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING # endif #endif // __CYGWIN__ //////////////////////////////////////////////////////////////////////////////// // Visual C++ #if defined(_MSC_VER) // Universal Windows platform does not support SEH // Or console colours (or console at all...) # if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_APP) # define CATCH_CONFIG_COLOUR_NONE # else # define CATCH_INTERNAL_CONFIG_WINDOWS_SEH # endif # if !defined(__clang__) // Handle Clang masquerading for msvc // MSVC traditional preprocessor needs some workaround for __VA_ARGS__ // _MSVC_TRADITIONAL == 0 means new conformant preprocessor // _MSVC_TRADITIONAL == 1 means old traditional non-conformant preprocessor # if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL) # define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR # endif // MSVC_TRADITIONAL // Only do this if we're not using clang on Windows, which uses `diagnostic push` & `diagnostic pop` # define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION __pragma( warning(push) ) # define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION __pragma( warning(pop) ) # endif // __clang__ #endif // _MSC_VER #if defined(_REENTRANT) || defined(_MSC_VER) // Enable async processing, as -pthread is specified or no additional linking is required # define CATCH_INTERNAL_CONFIG_USE_ASYNC #endif // _MSC_VER //////////////////////////////////////////////////////////////////////////////// // Check if we are compiled with -fno-exceptions or equivalent #if defined(__EXCEPTIONS) || defined(__cpp_exceptions) || defined(_CPPUNWIND) # define CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED #endif //////////////////////////////////////////////////////////////////////////////// // DJGPP #ifdef __DJGPP__ # define CATCH_INTERNAL_CONFIG_NO_WCHAR #endif // __DJGPP__ //////////////////////////////////////////////////////////////////////////////// // Embarcadero C++Build #if defined(__BORLANDC__) #define CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN #endif //////////////////////////////////////////////////////////////////////////////// // Use of __COUNTER__ is suppressed during code analysis in // CLion/AppCode 2017.2.x and former, because __COUNTER__ is not properly // handled by it. // Otherwise all supported compilers support COUNTER macro, // but user still might want to turn it off #if ( !defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L ) #define CATCH_INTERNAL_CONFIG_COUNTER #endif //////////////////////////////////////////////////////////////////////////////// // RTX is a special version of Windows that is real time. // This means that it is detected as Windows, but does not provide // the same set of capabilities as real Windows does. 
#if defined(UNDER_RTSS) || defined(RTX64_BUILD) #define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH #define CATCH_INTERNAL_CONFIG_NO_ASYNC #define CATCH_CONFIG_COLOUR_NONE #endif #if !defined(_GLIBCXX_USE_C99_MATH_TR1) #define CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER #endif // Various stdlib support checks that require __has_include #if defined(__has_include) // Check if string_view is available and usable #if __has_include() && defined(CATCH_CPP17_OR_GREATER) # define CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW #endif // Check if optional is available and usable # if __has_include() && defined(CATCH_CPP17_OR_GREATER) # define CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) // Check if byte is available and usable # if __has_include() && defined(CATCH_CPP17_OR_GREATER) # include # if defined(__cpp_lib_byte) && (__cpp_lib_byte > 0) # define CATCH_INTERNAL_CONFIG_CPP17_BYTE # endif # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) // Check if variant is available and usable # if __has_include() && defined(CATCH_CPP17_OR_GREATER) # if defined(__clang__) && (__clang_major__ < 8) // work around clang bug with libstdc++ https://bugs.llvm.org/show_bug.cgi?id=31852 // fix should be in clang 8, workaround in libstdc++ 8.2 # include # if defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE < 9) # define CATCH_CONFIG_NO_CPP17_VARIANT # else # define CATCH_INTERNAL_CONFIG_CPP17_VARIANT # endif // defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE < 9) # else # define CATCH_INTERNAL_CONFIG_CPP17_VARIANT # endif // defined(__clang__) && (__clang_major__ < 8) # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) #endif // defined(__has_include) #if defined(CATCH_INTERNAL_CONFIG_COUNTER) && !defined(CATCH_CONFIG_NO_COUNTER) && !defined(CATCH_CONFIG_COUNTER) # define CATCH_CONFIG_COUNTER #endif #if defined(CATCH_INTERNAL_CONFIG_WINDOWS_SEH) && !defined(CATCH_CONFIG_NO_WINDOWS_SEH) && !defined(CATCH_CONFIG_WINDOWS_SEH) && !defined(CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH) # define CATCH_CONFIG_WINDOWS_SEH #endif // This is set by default, because we assume that unix compilers are posix-signal-compatible by default. #if defined(CATCH_INTERNAL_CONFIG_POSIX_SIGNALS) && !defined(CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_POSIX_SIGNALS) # define CATCH_CONFIG_POSIX_SIGNALS #endif // This is set by default, because we assume that compilers with no wchar_t support are just rare exceptions. 
#if !defined(CATCH_INTERNAL_CONFIG_NO_WCHAR) && !defined(CATCH_CONFIG_NO_WCHAR) && !defined(CATCH_CONFIG_WCHAR) # define CATCH_CONFIG_WCHAR #endif #if !defined(CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING) && !defined(CATCH_CONFIG_NO_CPP11_TO_STRING) && !defined(CATCH_CONFIG_CPP11_TO_STRING) # define CATCH_CONFIG_CPP11_TO_STRING #endif #if defined(CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL) && !defined(CATCH_CONFIG_NO_CPP17_OPTIONAL) && !defined(CATCH_CONFIG_CPP17_OPTIONAL) # define CATCH_CONFIG_CPP17_OPTIONAL #endif #if defined(CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_NO_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_CPP17_STRING_VIEW) # define CATCH_CONFIG_CPP17_STRING_VIEW #endif #if defined(CATCH_INTERNAL_CONFIG_CPP17_VARIANT) && !defined(CATCH_CONFIG_NO_CPP17_VARIANT) && !defined(CATCH_CONFIG_CPP17_VARIANT) # define CATCH_CONFIG_CPP17_VARIANT #endif #if defined(CATCH_INTERNAL_CONFIG_CPP17_BYTE) && !defined(CATCH_CONFIG_NO_CPP17_BYTE) && !defined(CATCH_CONFIG_CPP17_BYTE) # define CATCH_CONFIG_CPP17_BYTE #endif #if defined(CATCH_CONFIG_EXPERIMENTAL_REDIRECT) # define CATCH_INTERNAL_CONFIG_NEW_CAPTURE #endif #if defined(CATCH_INTERNAL_CONFIG_NEW_CAPTURE) && !defined(CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE) && !defined(CATCH_CONFIG_NO_NEW_CAPTURE) && !defined(CATCH_CONFIG_NEW_CAPTURE) # define CATCH_CONFIG_NEW_CAPTURE #endif #if !defined(CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED) && !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) # define CATCH_CONFIG_DISABLE_EXCEPTIONS #endif #if defined(CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN) && !defined(CATCH_CONFIG_NO_POLYFILL_ISNAN) && !defined(CATCH_CONFIG_POLYFILL_ISNAN) # define CATCH_CONFIG_POLYFILL_ISNAN #endif #if defined(CATCH_INTERNAL_CONFIG_USE_ASYNC) && !defined(CATCH_INTERNAL_CONFIG_NO_ASYNC) && !defined(CATCH_CONFIG_NO_USE_ASYNC) && !defined(CATCH_CONFIG_USE_ASYNC) # define CATCH_CONFIG_USE_ASYNC #endif #if defined(CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE) && !defined(CATCH_CONFIG_NO_ANDROID_LOGWRITE) && !defined(CATCH_CONFIG_ANDROID_LOGWRITE) # define CATCH_CONFIG_ANDROID_LOGWRITE #endif #if defined(CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER) && !defined(CATCH_CONFIG_NO_GLOBAL_NEXTAFTER) && !defined(CATCH_CONFIG_GLOBAL_NEXTAFTER) # define CATCH_CONFIG_GLOBAL_NEXTAFTER #endif // Even if we do not think the compiler has that warning, we still have // to provide a macro that can be used by the code. #if !defined(CATCH_INTERNAL_START_WARNINGS_SUPPRESSION) # define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION #endif #if !defined(CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION) # define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION #endif #if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS #endif // The goal of this macro is to avoid evaluation of the arguments, but // still have the compiler warn on problems inside... #if !defined(CATCH_INTERNAL_IGNORE_BUT_WARN) # define CATCH_INTERNAL_IGNORE_BUT_WARN(...) 
#endif #if defined(__APPLE__) && defined(__apple_build_version__) && (__clang_major__ < 10) # undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS #elif defined(__clang__) && (__clang_major__ < 5) # undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif #if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) #define CATCH_TRY if ((true)) #define CATCH_CATCH_ALL if ((false)) #define CATCH_CATCH_ANON(type) if ((false)) #else #define CATCH_TRY try #define CATCH_CATCH_ALL catch (...) #define CATCH_CATCH_ANON(type) catch (type) #endif #if defined(CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR) && !defined(CATCH_CONFIG_NO_TRADITIONAL_MSVC_PREPROCESSOR) && !defined(CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR) #define CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #endif // end catch_compiler_capabilities.h #define INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) name##line #define INTERNAL_CATCH_UNIQUE_NAME_LINE( name, line ) INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) #ifdef CATCH_CONFIG_COUNTER # define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __COUNTER__ ) #else # define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __LINE__ ) #endif #include #include #include // We need a dummy global operator<< so we can bring it into Catch namespace later struct Catch_global_namespace_dummy {}; std::ostream& operator<<(std::ostream&, Catch_global_namespace_dummy); namespace Catch { struct CaseSensitive { enum Choice { Yes, No }; }; class NonCopyable { NonCopyable( NonCopyable const& ) = delete; NonCopyable( NonCopyable && ) = delete; NonCopyable& operator = ( NonCopyable const& ) = delete; NonCopyable& operator = ( NonCopyable && ) = delete; protected: NonCopyable(); virtual ~NonCopyable(); }; struct SourceLineInfo { SourceLineInfo() = delete; SourceLineInfo( char const* _file, std::size_t _line ) noexcept : file( _file ), line( _line ) {} SourceLineInfo( SourceLineInfo const& other ) = default; SourceLineInfo& operator = ( SourceLineInfo const& ) = default; SourceLineInfo( SourceLineInfo&& ) noexcept = default; SourceLineInfo& operator = ( SourceLineInfo&& ) noexcept = default; bool empty() const noexcept { return file[0] == '\0'; } bool operator == ( SourceLineInfo const& other ) const noexcept; bool operator < ( SourceLineInfo const& other ) const noexcept; char const* file; std::size_t line; }; std::ostream& operator << ( std::ostream& os, SourceLineInfo const& info ); // Bring in operator<< from global namespace into Catch namespace // This is necessary because the overload of operator<< above makes // lookup stop at namespace Catch using ::operator<<; // Use this in variadic streaming macros to allow // >> +StreamEndStop // as well as // >> stuff +StreamEndStop struct StreamEndStop { std::string operator+() const; }; template T const& operator + ( T const& value, StreamEndStop ) { return value; } } #define CATCH_INTERNAL_LINEINFO \ ::Catch::SourceLineInfo( __FILE__, static_cast( __LINE__ ) ) // end catch_common.h namespace Catch { struct RegistrarForTagAliases { RegistrarForTagAliases( char const* alias, char const* tag, SourceLineInfo const& lineInfo ); }; } // end namespace Catch #define CATCH_REGISTER_TAG_ALIAS( alias, spec ) \ CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::RegistrarForTagAliases INTERNAL_CATCH_UNIQUE_NAME( AutoRegisterTagAlias )( alias, 
spec, CATCH_INTERNAL_LINEINFO ); } \ CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION // end catch_tag_alias_autoregistrar.h // start catch_test_registry.h // start catch_interfaces_testcase.h #include namespace Catch { class TestSpec; struct ITestInvoker { virtual void invoke () const = 0; virtual ~ITestInvoker(); }; class TestCase; struct IConfig; struct ITestCaseRegistry { virtual ~ITestCaseRegistry(); virtual std::vector const& getAllTests() const = 0; virtual std::vector const& getAllTestsSorted( IConfig const& config ) const = 0; }; bool isThrowSafe( TestCase const& testCase, IConfig const& config ); bool matchTest( TestCase const& testCase, TestSpec const& testSpec, IConfig const& config ); std::vector filterTests( std::vector const& testCases, TestSpec const& testSpec, IConfig const& config ); std::vector const& getAllTestCasesSorted( IConfig const& config ); } // end catch_interfaces_testcase.h // start catch_stringref.h #include #include #include #include namespace Catch { /// A non-owning string class (similar to the forthcoming std::string_view) /// Note that, because a StringRef may be a substring of another string, /// it may not be null terminated. class StringRef { public: using size_type = std::size_t; using const_iterator = const char*; private: static constexpr char const* const s_empty = ""; char const* m_start = s_empty; size_type m_size = 0; public: // construction constexpr StringRef() noexcept = default; StringRef( char const* rawChars ) noexcept; constexpr StringRef( char const* rawChars, size_type size ) noexcept : m_start( rawChars ), m_size( size ) {} StringRef( std::string const& stdString ) noexcept : m_start( stdString.c_str() ), m_size( stdString.size() ) {} explicit operator std::string() const { return std::string(m_start, m_size); } public: // operators auto operator == ( StringRef const& other ) const noexcept -> bool; auto operator != (StringRef const& other) const noexcept -> bool { return !(*this == other); } auto operator[] ( size_type index ) const noexcept -> char { assert(index < m_size); return m_start[index]; } public: // named queries constexpr auto empty() const noexcept -> bool { return m_size == 0; } constexpr auto size() const noexcept -> size_type { return m_size; } // Returns the current start pointer. If the StringRef is not // null-terminated, throws std::domain_exception auto c_str() const -> char const*; public: // substrings and searches // Returns a substring of [start, start + length). // If start + length > size(), then the substring is [start, size()). // If start > size(), then the substring is empty. auto substr( size_type start, size_type length ) const noexcept -> StringRef; // Returns the current start pointer. May not be null-terminated. 
auto data() const noexcept -> char const*; constexpr auto isNullTerminated() const noexcept -> bool { return m_start[m_size] == '\0'; } public: // iterators constexpr const_iterator begin() const { return m_start; } constexpr const_iterator end() const { return m_start + m_size; } }; auto operator += ( std::string& lhs, StringRef const& sr ) -> std::string&; auto operator << ( std::ostream& os, StringRef const& sr ) -> std::ostream&; constexpr auto operator "" _sr( char const* rawChars, std::size_t size ) noexcept -> StringRef { return StringRef( rawChars, size ); } } // namespace Catch constexpr auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef { return Catch::StringRef( rawChars, size ); } // end catch_stringref.h // start catch_preprocessor.hpp #define CATCH_RECURSION_LEVEL0(...) __VA_ARGS__ #define CATCH_RECURSION_LEVEL1(...) CATCH_RECURSION_LEVEL0(CATCH_RECURSION_LEVEL0(CATCH_RECURSION_LEVEL0(__VA_ARGS__))) #define CATCH_RECURSION_LEVEL2(...) CATCH_RECURSION_LEVEL1(CATCH_RECURSION_LEVEL1(CATCH_RECURSION_LEVEL1(__VA_ARGS__))) #define CATCH_RECURSION_LEVEL3(...) CATCH_RECURSION_LEVEL2(CATCH_RECURSION_LEVEL2(CATCH_RECURSION_LEVEL2(__VA_ARGS__))) #define CATCH_RECURSION_LEVEL4(...) CATCH_RECURSION_LEVEL3(CATCH_RECURSION_LEVEL3(CATCH_RECURSION_LEVEL3(__VA_ARGS__))) #define CATCH_RECURSION_LEVEL5(...) CATCH_RECURSION_LEVEL4(CATCH_RECURSION_LEVEL4(CATCH_RECURSION_LEVEL4(__VA_ARGS__))) #ifdef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_EXPAND_VARGS(...) __VA_ARGS__ // MSVC needs more evaluations #define CATCH_RECURSION_LEVEL6(...) CATCH_RECURSION_LEVEL5(CATCH_RECURSION_LEVEL5(CATCH_RECURSION_LEVEL5(__VA_ARGS__))) #define CATCH_RECURSE(...) CATCH_RECURSION_LEVEL6(CATCH_RECURSION_LEVEL6(__VA_ARGS__)) #else #define CATCH_RECURSE(...) CATCH_RECURSION_LEVEL5(__VA_ARGS__) #endif #define CATCH_REC_END(...) #define CATCH_REC_OUT #define CATCH_EMPTY() #define CATCH_DEFER(id) id CATCH_EMPTY() #define CATCH_REC_GET_END2() 0, CATCH_REC_END #define CATCH_REC_GET_END1(...) CATCH_REC_GET_END2 #define CATCH_REC_GET_END(...) CATCH_REC_GET_END1 #define CATCH_REC_NEXT0(test, next, ...) next CATCH_REC_OUT #define CATCH_REC_NEXT1(test, next) CATCH_DEFER ( CATCH_REC_NEXT0 ) ( test, next, 0) #define CATCH_REC_NEXT(test, next) CATCH_REC_NEXT1(CATCH_REC_GET_END test, next) #define CATCH_REC_LIST0(f, x, peek, ...) , f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1) ) ( f, peek, __VA_ARGS__ ) #define CATCH_REC_LIST1(f, x, peek, ...) , f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST0) ) ( f, peek, __VA_ARGS__ ) #define CATCH_REC_LIST2(f, x, peek, ...) f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1) ) ( f, peek, __VA_ARGS__ ) #define CATCH_REC_LIST0_UD(f, userdata, x, peek, ...) , f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD) ) ( f, userdata, peek, __VA_ARGS__ ) #define CATCH_REC_LIST1_UD(f, userdata, x, peek, ...) , f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST0_UD) ) ( f, userdata, peek, __VA_ARGS__ ) #define CATCH_REC_LIST2_UD(f, userdata, x, peek, ...) f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD) ) ( f, userdata, peek, __VA_ARGS__ ) // Applies the function macro `f` to each of the remaining parameters, inserts commas between the results, // and passes userdata as the first parameter to each invocation, // e.g. CATCH_REC_LIST_UD(f, x, a, b, c) evaluates to f(x, a), f(x, b), f(x, c) #define CATCH_REC_LIST_UD(f, userdata, ...) 
CATCH_RECURSE(CATCH_REC_LIST2_UD(f, userdata, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) #define CATCH_REC_LIST(f, ...) CATCH_RECURSE(CATCH_REC_LIST2(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) #define INTERNAL_CATCH_EXPAND1(param) INTERNAL_CATCH_EXPAND2(param) #define INTERNAL_CATCH_EXPAND2(...) INTERNAL_CATCH_NO## __VA_ARGS__ #define INTERNAL_CATCH_DEF(...) INTERNAL_CATCH_DEF __VA_ARGS__ #define INTERNAL_CATCH_NOINTERNAL_CATCH_DEF #define INTERNAL_CATCH_STRINGIZE(...) INTERNAL_CATCH_STRINGIZE2(__VA_ARGS__) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_STRINGIZE2(...) #__VA_ARGS__ #define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param) INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param)) #else // MSVC is adding extra space and needs another indirection to expand INTERNAL_CATCH_NOINTERNAL_CATCH_DEF #define INTERNAL_CATCH_STRINGIZE2(...) INTERNAL_CATCH_STRINGIZE3(__VA_ARGS__) #define INTERNAL_CATCH_STRINGIZE3(...) #__VA_ARGS__ #define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param) (INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param)) + 1) #endif #define INTERNAL_CATCH_MAKE_NAMESPACE2(...) ns_##__VA_ARGS__ #define INTERNAL_CATCH_MAKE_NAMESPACE(name) INTERNAL_CATCH_MAKE_NAMESPACE2(name) #define INTERNAL_CATCH_REMOVE_PARENS(...) INTERNAL_CATCH_EXPAND1(INTERNAL_CATCH_DEF __VA_ARGS__) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_MAKE_TYPE_LIST2(...) decltype(get_wrapper()) #define INTERNAL_CATCH_MAKE_TYPE_LIST(...) INTERNAL_CATCH_MAKE_TYPE_LIST2(INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__)) #else #define INTERNAL_CATCH_MAKE_TYPE_LIST2(...) INTERNAL_CATCH_EXPAND_VARGS(decltype(get_wrapper())) #define INTERNAL_CATCH_MAKE_TYPE_LIST(...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_MAKE_TYPE_LIST2(INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__))) #endif #define INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(...)\ CATCH_REC_LIST(INTERNAL_CATCH_MAKE_TYPE_LIST,__VA_ARGS__) #define INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_0) INTERNAL_CATCH_REMOVE_PARENS(_0) #define INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_0, _1) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_1) #define INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_0, _1, _2) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_1, _2) #define INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_0, _1, _2, _3) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_1, _2, _3) #define INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_0, _1, _2, _3, _4) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_1, _2, _3, _4) #define INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_0, _1, _2, _3, _4, _5) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_1, _2, _3, _4, _5) #define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _3, _4, _5, _6) #define INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_0, _1, _2, _3, _4, _5, _6, _7) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_1, _2, _3, _4, _5, _6, _7) #define INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_1, _2, _3, _4, _5, _6, _7, _8) #define INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9) #define INTERNAL_CATCH_REMOVE_PARENS_11_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10) 
INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10) #define INTERNAL_CATCH_VA_NARGS_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N #define INTERNAL_CATCH_TYPE_GEN\ template struct TypeList {};\ template\ constexpr auto get_wrapper() noexcept -> TypeList { return {}; }\ template class...> struct TemplateTypeList{};\ template class...Cs>\ constexpr auto get_wrapper() noexcept -> TemplateTypeList { return {}; }\ template\ struct append;\ template\ struct rewrap;\ template class, typename...>\ struct create;\ template class, typename>\ struct convert;\ \ template \ struct append { using type = T; };\ template< template class L1, typename...E1, template class L2, typename...E2, typename...Rest>\ struct append, L2, Rest...> { using type = typename append, Rest...>::type; };\ template< template class L1, typename...E1, typename...Rest>\ struct append, TypeList, Rest...> { using type = L1; };\ \ template< template class Container, template class List, typename...elems>\ struct rewrap, List> { using type = TypeList>; };\ template< template class Container, template class List, class...Elems, typename...Elements>\ struct rewrap, List, Elements...> { using type = typename append>, typename rewrap, Elements...>::type>::type; };\ \ template