pax_global_header00006660000000000000000000000064145611737720014527gustar00rootroot0000000000000052 comment=7ebfa0765ea590767202b328e7da38102c2f5a15 ospray-rkcommon-538f8a2/000077500000000000000000000000001456117377200152315ustar00rootroot00000000000000ospray-rkcommon-538f8a2/.clang-format000066400000000000000000000055131456117377200176100ustar00rootroot00000000000000--- Language: Cpp # BasedOnStyle: Google AccessModifierOffset: -1 AlignAfterOpenBracket: DontAlign AlignConsecutiveAssignments: false AlignConsecutiveDeclarations: false #EscapedNewlineAlignmentStyle: Right AlignEscapedNewlines: Right AlignOperands: false AlignTrailingComments: false AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty #AllowShortLambdasOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: None AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: true AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false BraceWrapping: AfterClass: true AfterControlStatement: false AfterEnum: true AfterFunction: true AfterNamespace: false AfterStruct: true AfterUnion: true BeforeCatch: false BeforeElse: false IndentBraces: false SplitEmptyFunction: false BreakBeforeBinaryOperators: NonAssignment BreakBeforeBraces: Custom BreakBeforeTernaryOperators: true BreakConstructorInitializersBeforeComma: false #BreakConstructorInitializersStyle: BeforeComma BreakStringLiterals: false ColumnLimit: 80 CommentPragmas: '^ IWYU pragma:' ConstructorInitializerAllOnOneLineOrOnePerLine: true ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DerivePointerAlignment: false DisableFormat: false ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true ForEachMacros: [ foreach, foreach_active, foreach_tiled, foreach_unique, cdo, cfor, cif, cwhile ] IncludeCategories: - Regex: '^<.*\.i?h>' Priority: 1 - Regex: '^<.*' Priority: 2 - Regex: '.*' Priority: 3 IncludeIsMainRegex: '([-_](test|unittest))?$' IndentCaseLabels: false IndentWidth: 2 IndentWrappedFunctionNames: false KeepEmptyLinesAtTheStartOfBlocks: false MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PenaltyBreakBeforeFirstCallParameter: 1 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 200 #PPDirectiveIndentStyle: AfterHash PointerAlignment: Right ReflowComments: true SortIncludes: true SpaceAfterCStyleCast: false SpaceAfterTemplateKeyword: true SpaceBeforeAssignmentOperators: true SpaceBeforeParens: ControlStatements SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: false SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false Standard: Cpp11 TabWidth: 2 UseTab: Never ... 
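The .clang-format settings above are what the repository's C++ sources are expected to follow. A minimal sketch of applying them locally, assuming clang-format is installed and the command is run from the repository root (the directory list and extensions are illustrative):

```bash
# reformat C++ sources in place using the .clang-format found in the tree
find rkcommon tests \( -name '*.cpp' -o -name '*.h' \) -print0 \
  | xargs -0 clang-format -i -style=file
```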
ospray-rkcommon-538f8a2/.gitattributes000066400000000000000000000000661456117377200201260ustar00rootroot00000000000000*.rc text working-tree-encoding=UTF-16LE-BOM eol=CRLF ospray-rkcommon-538f8a2/.github/000077500000000000000000000000001456117377200165715ustar00rootroot00000000000000ospray-rkcommon-538f8a2/.github/workflows/000077500000000000000000000000001456117377200206265ustar00rootroot00000000000000ospray-rkcommon-538f8a2/.github/workflows/ci.linux.yml000066400000000000000000000073751456117377200231160ustar00rootroot00000000000000## Copyright 2022 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 name: CI Linux on: push: workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: ## Build Jobs ## build-centos7: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: centos:7 cmd: | gitlab/build.sh artifact-out: build-centos7 artifact-path: build build-ubuntu1804: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:18.04 cmd: | gitlab/build.sh -G Ninja artifact-out: build-ubuntu1804 artifact-path: build build-ubuntu2004: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:20.04 cmd: | gitlab/build.sh -G Ninja artifact-out: build-ubuntu2004 artifact-path: build build-ubuntu2204: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:22.04 cmd: | gitlab/build.sh -G Ninja artifact-out: build-ubuntu2204 artifact-path: build build-arch: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ospray/docker-images:arch cmd: | gitlab/build.sh -G Ninja artifact-out: build-arch artifact-path: build build-arch-clang: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ospray/docker-images:arch cmd: | export CC=clang export CXX=clang++ gitlab/build.sh -G Ninja artifact-out: build-arch-clang artifact-path: build ## Functional Test Jobs ## test-centos7: needs: build-centos7 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: centos:7 cmd: | ./build/rkcommon_test_suite artifact-in: build-centos7 test-ubuntu1804: needs: build-ubuntu1804 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:18.04 cmd: | ./build/rkcommon_test_suite artifact-in: build-ubuntu1804 test-ubuntu2004: needs: build-ubuntu2004 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:20.04 cmd: | ./build/rkcommon_test_suite artifact-in: build-ubuntu2004 test-ubuntu2204: needs: build-ubuntu2204 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ubuntu:22.04 cmd: | ./build/rkcommon_test_suite artifact-in: build-ubuntu2204 test-arch: needs: build-arch uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ospray/docker-images:arch cmd: | ./build/rkcommon_test_suite artifact-in: build-arch test-arch-clang: needs: build-arch-clang uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/docker.yml@main with: image: ospray/docker-images:arch cmd: | ./build/rkcommon_test_suite artifact-in: build-arch-clang ## Static Analysis ## 
static-analysis: secrets: inherit uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/static_analysis.yml@main with: project: RKCommon prebuild: cmake -S . -B build -DRKCOMMON_TASKING_SYSTEM=INTERNAL build: cmake --build buildospray-rkcommon-538f8a2/.github/workflows/ci.macos.yml000066400000000000000000000014341456117377200230470ustar00rootroot00000000000000## Copyright 2022 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 name: CI MacOS on: push: workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: ## Build Jobs ## build-macos: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/macos.yml@main with: cmd: | gitlab/build.sh artifact-out: build-macos artifact-path: build ## Functional Test Jobs ## test-macos: needs: build-macos uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/macos.yml@main with: cmd: | export DYLD_FALLBACK_LIBRARY_PATH=./build:$DYLD_FALLBACK_LIBRARY_PATH ./build/rkcommon_test_suite artifact-in: build-macosospray-rkcommon-538f8a2/.github/workflows/ci.windows.yml000066400000000000000000000024501456117377200234360ustar00rootroot00000000000000## Copyright 2022 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 name: CI Windows on: push: workflow_dispatch: concurrency: group: ${{ github.workflow }}-${{ github.ref }} cancel-in-progress: true jobs: ## Build Jobs ## build-windows-msvc15: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | gitlab\build.ps1 "Visual Studio 15 2017 Win64" "v141" artifact-out: build-windows-msvc15 artifact-path: build build-windows-msvc16: uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | gitlab\build.ps1 "Visual Studio 16 2019" "v142" artifact-out: build-windows-msvc16 artifact-path: build ## Functional Test Jobs ## test-windows-msvc15: needs: build-windows-msvc15 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | gitlab\run_tests.ps1 artifact-in: build-windows-msvc15 test-windows-msvc16: needs: build-windows-msvc16 uses: intel-innersource/libraries.devops.renderkit.workflows/.github/workflows/windows.yml@main with: cmd: | gitlab\run_tests.ps1 artifact-in: build-windows-msvc16ospray-rkcommon-538f8a2/.gitignore000066400000000000000000000002331456117377200172170ustar00rootroot00000000000000*~ *# bin *.user* build*/ *.sw? 
tags .ycm_extra_conf.pyc *.autosave *DS_Store* *.gz *.rpm *.zip *.bak *.patch .vscode .idea/ premake.local.* cmake-build*/ ospray-rkcommon-538f8a2/CMakeLists.txt000066400000000000000000000052241456117377200177740ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 ## Global CMake options ## if (RKCOMMON_TASKING_SYSTEM STREQUAL "OpenMP") cmake_minimum_required(VERSION 3.9) # NOTE(jda): rely on OpenMP targets else() cmake_minimum_required(VERSION 3.1) endif() set(CMAKE_CXX_STANDARD 11) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(CMAKE_DISABLE_SOURCE_CHANGES ON) set(CMAKE_DISABLE_IN_SOURCE_BUILD ON) ## Establish project ## project(rkcommon VERSION 1.13.0 LANGUAGES CXX) include(GNUInstallDirs) configure_file( ${PROJECT_SOURCE_DIR}/rkcommon/version.h.in ${PROJECT_BINARY_DIR}/rkcommon/version.h @ONLY ) set(RKCOMMON_RESOURCE ${PROJECT_SOURCE_DIR}/rkcommon/rkcommon.rc) ## Add rkcommon specific macros ## set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${PROJECT_SOURCE_DIR}/cmake) include(rkcommon_macros) rkcommon_configure_build_type() rkcommon_configure_compiler() rkcommon_configure_tasking_system() rkcommon_create_tasking_target(FALSE) ## Build options and specific configuration ## option(BUILD_SHARED_LIBS "Build rkcommon as a shared library" ON) option(RKCOMMON_ADDRSAN "Build rkcommon with dlclose disabled for addrsan" OFF) option(RKCOMMON_NO_SIMD "Build rkcommon not using SIMD instructions" OFF) set(CMAKE_SKIP_INSTALL_RPATH OFF) if (APPLE) set(CMAKE_MACOSX_RPATH ON) set(CMAKE_INSTALL_RPATH "@loader_path/") else() set(CMAKE_INSTALL_RPATH "\$ORIGIN") endif() include(CTest) if (BUILD_TESTING) enable_testing() endif() if (WIN32) option(INSTALL_DEPS "Install rkcommon DLL dependencies" ON) else() set(INSTALL_DEPS OFF) endif() if (INSTALL_DEPS) include(rkcommon_redist_deps) endif() set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${PROJECT_BINARY_DIR}) ## Build library and tests ## add_subdirectory(rkcommon) if (BUILD_TESTING) add_subdirectory(tests) endif() ## Configure CMake find_package() config files ## include(CMakePackageConfigHelpers) configure_package_config_file( "${PROJECT_SOURCE_DIR}/cmake/${PROJECT_NAME}Config.cmake.in" "${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake" INSTALL_DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rkcommon-${PROJECT_VERSION} ) write_basic_package_version_file( "${PROJECT_NAME}ConfigVersion.cmake" VERSION ${PROJECT_VERSION} COMPATIBILITY SameMajorVersion ) install(FILES ${PROJECT_BINARY_DIR}/${PROJECT_NAME}Config.cmake ${PROJECT_BINARY_DIR}/${PROJECT_NAME}ConfigVersion.cmake cmake/FindTBB.cmake cmake/rkcommon_macros.cmake DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rkcommon-${PROJECT_VERSION} ) # Must be last include(CPack) ospray-rkcommon-538f8a2/LICENSE.txt000066400000000000000000000261361456117377200170640ustar00rootroot00000000000000 Apache License Version 2.0, January 2004 http://www.apache.org/licenses/ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION 1. Definitions. "License" shall mean the terms and conditions for use, reproduction, and distribution as defined by Sections 1 through 9 of this document. "Licensor" shall mean the copyright owner or entity authorized by the copyright owner that is granting the License. "Legal Entity" shall mean the union of the acting entity and all other entities that control, are controlled by, or are under common control with that entity. 
For the purposes of this definition, "control" means (i) the power, direct or indirect, to cause the direction or management of such entity, whether by contract or otherwise, or (ii) ownership of fifty percent (50%) or more of the outstanding shares, or (iii) beneficial ownership of such entity. "You" (or "Your") shall mean an individual or Legal Entity exercising permissions granted by this License. "Source" form shall mean the preferred form for making modifications, including but not limited to software source code, documentation source, and configuration files. "Object" form shall mean any form resulting from mechanical transformation or translation of a Source form, including but not limited to compiled object code, generated documentation, and conversions to other media types. "Work" shall mean the work of authorship, whether in Source or Object form, made available under the License, as indicated by a copyright notice that is included in or attached to the work (an example is provided in the Appendix below). "Derivative Works" shall mean any work, whether in Source or Object form, that is based on (or derived from) the Work and for which the editorial revisions, annotations, elaborations, or other modifications represent, as a whole, an original work of authorship. For the purposes of this License, Derivative Works shall not include works that remain separable from, or merely link (or bind by name) to the interfaces of, the Work and Derivative Works thereof. "Contribution" shall mean any work of authorship, including the original version of the Work and any modifications or additions to that Work or Derivative Works thereof, that is intentionally submitted to Licensor for inclusion in the Work by the copyright owner or by an individual or Legal Entity authorized to submit on behalf of the copyright owner. For the purposes of this definition, "submitted" means any form of electronic, verbal, or written communication sent to the Licensor or its representatives, including but not limited to communication on electronic mailing lists, source code control systems, and issue tracking systems that are managed by, or on behalf of, the Licensor for the purpose of discussing and improving the Work, but excluding communication that is conspicuously marked or otherwise designated in writing by the copyright owner as "Not a Contribution." "Contributor" shall mean Licensor and any individual or Legal Entity on behalf of whom a Contribution has been received by Licensor and subsequently incorporated within the Work. 2. Grant of Copyright License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable copyright license to reproduce, prepare Derivative Works of, publicly display, publicly perform, sublicense, and distribute the Work and such Derivative Works in Source or Object form. 3. Grant of Patent License. Subject to the terms and conditions of this License, each Contributor hereby grants to You a perpetual, worldwide, non-exclusive, no-charge, royalty-free, irrevocable (except as stated in this section) patent license to make, have made, use, offer to sell, sell, import, and otherwise transfer the Work, where such license applies only to those patent claims licensable by such Contributor that are necessarily infringed by their Contribution(s) alone or by combination of their Contribution(s) with the Work to which such Contribution(s) was submitted. 
If You institute patent litigation against any entity (including a cross-claim or counterclaim in a lawsuit) alleging that the Work or a Contribution incorporated within the Work constitutes direct or contributory patent infringement, then any patent licenses granted to You under this License for that Work shall terminate as of the date such litigation is filed. 4. Redistribution. You may reproduce and distribute copies of the Work or Derivative Works thereof in any medium, with or without modifications, and in Source or Object form, provided that You meet the following conditions: (a) You must give any other recipients of the Work or Derivative Works a copy of this License; and (b) You must cause any modified files to carry prominent notices stating that You changed the files; and (c) You must retain, in the Source form of any Derivative Works that You distribute, all copyright, patent, trademark, and attribution notices from the Source form of the Work, excluding those notices that do not pertain to any part of the Derivative Works; and (d) If the Work includes a "NOTICE" text file as part of its distribution, then any Derivative Works that You distribute must include a readable copy of the attribution notices contained within such NOTICE file, excluding those notices that do not pertain to any part of the Derivative Works, in at least one of the following places: within a NOTICE text file distributed as part of the Derivative Works; within the Source form or documentation, if provided along with the Derivative Works; or, within a display generated by the Derivative Works, if and wherever such third-party notices normally appear. The contents of the NOTICE file are for informational purposes only and do not modify the License. You may add Your own attribution notices within Derivative Works that You distribute, alongside or as an addendum to the NOTICE text from the Work, provided that such additional attribution notices cannot be construed as modifying the License. You may add Your own copyright statement to Your modifications and may provide additional or different license terms and conditions for use, reproduction, or distribution of Your modifications, or for any such Derivative Works as a whole, provided Your use, reproduction, and distribution of the Work otherwise complies with the conditions stated in this License. 5. Submission of Contributions. Unless You explicitly state otherwise, any Contribution intentionally submitted for inclusion in the Work by You to the Licensor shall be under the terms and conditions of this License, without any additional terms or conditions. Notwithstanding the above, nothing herein shall supersede or modify the terms of any separate license agreement you may have executed with Licensor regarding such Contributions. 6. Trademarks. This License does not grant permission to use the trade names, trademarks, service marks, or product names of the Licensor, except as required for reasonable and customary use in describing the origin of the Work and reproducing the content of the NOTICE file. 7. Disclaimer of Warranty. Unless required by applicable law or agreed to in writing, Licensor provides the Work (and each Contributor provides its Contributions) on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied, including, without limitation, any warranties or conditions of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A PARTICULAR PURPOSE. 
You are solely responsible for determining the appropriateness of using or redistributing the Work and assume any risks associated with Your exercise of permissions under this License. 8. Limitation of Liability. In no event and under no legal theory, whether in tort (including negligence), contract, or otherwise, unless required by applicable law (such as deliberate and grossly negligent acts) or agreed to in writing, shall any Contributor be liable to You for damages, including any direct, indirect, special, incidental, or consequential damages of any character arising as a result of this License or out of the use or inability to use the Work (including but not limited to damages for loss of goodwill, work stoppage, computer failure or malfunction, or any and all other commercial damages or losses), even if such Contributor has been advised of the possibility of such damages. 9. Accepting Warranty or Additional Liability. While redistributing the Work or Derivative Works thereof, You may choose to offer, and charge a fee for, acceptance of support, warranty, indemnity, or other liability obligations and/or rights consistent with this License. However, in accepting such obligations, You may act only on Your own behalf and on Your sole responsibility, not on behalf of any other Contributor, and only if You agree to indemnify, defend, and hold each Contributor harmless for any liability incurred by, or claims asserted against, such Contributor by reason of your accepting any such warranty or additional liability. END OF TERMS AND CONDITIONS APPENDIX: How to apply the Apache License to your work. To apply the Apache License to your work, attach the following boilerplate notice, with the fields enclosed by brackets "[]" replaced with your own identifying information. (Don't include the brackets!) The text should be enclosed in the appropriate comment syntax for the file format. We also recommend that a file or class name and description of purpose be included on the same "printed page" as the copyright notice for easier identification within third-party archives. Copyright [yyyy] [name of copyright owner] Licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. You may obtain a copy of the License at http://www.apache.org/licenses/LICENSE-2.0 Unless required by applicable law or agreed to in writing, software distributed under the License is distributed on an "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. ospray-rkcommon-538f8a2/README.md000066400000000000000000000010621456117377200165070ustar00rootroot00000000000000# rkcommon - C++/CMake infrastructure This project represents a common set of C++ infrastructure and CMake utilities used by various components of IntelĀ® oneAPI Rendering Toolkit. ### Requirements - CMake - C++11 compiler - TBB 4.4.3 or higher (by default, other tasking system options available via the `RKCOMMON_TASKING_SYSTEM` CMake variable) ### Building Build with: ```bash git clone https://github.com/ospray/rkcommon.git cd rkcommon mkdir build cd build cmake .. cmake --build . ``` Run tests from the build directory with: ```bash ctest . 
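# ctest drives the unit tests registered by the tests/ subdirectory
# (the rkcommon_test_suite binary also used by the CI scripts)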
``` ospray-rkcommon-538f8a2/cmake/000077500000000000000000000000001456117377200163115ustar00rootroot00000000000000ospray-rkcommon-538f8a2/cmake/FindTBB.cmake000066400000000000000000000410461456117377200205300ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 #=============================================================================== # This script will attempt to find TBB and set up a TBB target. # # The user may specify a version and lists of required and optional components: # # find_package(TBB 2017.0 EXACT REQUIRED # tbb tbbmalloc # OPTIONAL_COMPONENTS tbbmalloc_proxy # QUIET) # # If this target exists already, the script will attempt to re-use it, but fail # if version or components do not match the user-specified requirements. # # If all the required component targets (e.g. TBB::tbb) exist, the script will # attempt to create a target TBB and link existing component targets to it. # It will fail if the component target version does not match the user-specified # requirements. # # The user may specify the following variables to help the search process: # - TBB_ROOT # - TBB_INCLUDE_DIR # # After the script has run successfully, there is a target TBB, as well as # component targets TBB::, e.g. TBB::tbbmalloc. # # The targets will attempt to link to release versions of TBB in release mode, # and debug versions in debug mode. # # In addition to the targets, the script defines: # # TBB_FOUND # TBB_INCLUDE_DIRS # #=============================================================================== # We use INTERFACE libraries, which are only supported in 3.x cmake_minimum_required(VERSION 3.1) # These two are used to automatically find the root and include directories. set(_TBB_INCLUDE_SUBDIR "include") set(_TBB_HEADER "tbb/tbb.h") # Initialize cache variable; but use existing non-cache variable as the default, # and fall back to the environment variable. if (NOT TBB_ROOT) set(TBB_ROOT "$ENV{TBB_ROOT}") endif() set(TBB_ROOT "${TBB_ROOT}" CACHE PATH "The root path of TBB.") #=============================================================================== # Error messages that respect the user's wishes about peace and quiet. #=============================================================================== function(rk_tbb_status) if (NOT TBB_FIND_QUIETLY) message(STATUS "${ARGV}") endif() endfunction() function(rk_tbb_warning) if (NOT TBB_FIND_QUIETLY) message(WARNING "${ARGV}") endif() endfunction() macro(rk_tbb_error) if (TBB_FIND_REQUIRED) message(FATAL_ERROR "${ARGV}") else() rk_tbb_warning("${ARGV}") endif() return() endmacro() #=============================================================================== # Extract a list of required and optional components. #=============================================================================== macro(rk_tbb_list_components) # cmake provides the TBB_FIND_COMPONENTS and # TBB_FIND_REQUIRED_ variables based on the invocation # of find_package. 
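  # For illustration only (hypothetical caller, not part of this module):
  #   find_package(TBB REQUIRED tbb OPTIONAL_COMPONENTS tbbmalloc)
  # makes CMake set TBB_FIND_COMPONENTS to "tbb;tbbmalloc" with
  # TBB_FIND_REQUIRED_tbb=1 and TBB_FIND_REQUIRED_tbbmalloc=0, which is
  # exactly what the branches below sort into required/optional lists.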
if (TBB_FIND_COMPONENTS STREQUAL "") set(_REQUIRED_COMPONENTS "tbb") set(_OPTIONAL_COMPONENTS "tbbmalloc" "tbbmalloc_proxy" "tbbbind" "tbbpreview") else() set(_REQUIRED_COMPONENTS "") set(_OPTIONAL_COMPONENTS "") foreach (C IN LISTS TBB_FIND_COMPONENTS) if (${TBB_FIND_REQUIRED_${C}}) list(APPEND _REQUIRED_COMPONENTS ${C}) else() list(APPEND _OPTIONAL_COMPONENTS ${C}) endif() endforeach() endif() rk_tbb_status("Looking for TBB components ${_REQUIRED_COMPONENTS}" " (${_OPTIONAL_COMPONENTS})") endmacro() #=============================================================================== # List components that are available, and check if any REQUIRED components # are missing. #=============================================================================== macro(rk_tbb_check_components) set(_TBB_MISSING_COMPONENTS "") set(_TBB_AVAILABLE_COMPONENTS "") foreach (C IN LISTS _REQUIRED_COMPONENTS) if (TARGET TBB::${C}) list(APPEND _TBB_AVAILABLE_COMPONENTS ${C}) else() list(APPEND _TBB_MISSING_COMPONENTS ${C}) endif() endforeach() foreach (C IN LISTS _OPTIONAL_COMPONENTS) if (TARGET TBB::${C}) list(APPEND _TBB_AVAILABLE_COMPONENTS ${C}) endif() endforeach() endmacro() #=============================================================================== # Check the version of the TBB root we found. #=============================================================================== macro(rk_tbb_check_version) # Extract the version we found in our root. if(EXISTS "${TBB_INCLUDE_DIR}/oneapi/tbb/version.h") set(_TBB_VERSION_HEADER "oneapi/tbb/version.h") elseif(EXISTS "${TBB_INCLUDE_DIR}/tbb/tbb_stddef.h") set(_TBB_VERSION_HEADER "tbb/tbb_stddef.h") elseif(EXISTS "${TBB_INCLUDE_DIR}/tbb/version.h") set(_TBB_VERSION_HEADER "tbb/version.h") else() rk_tbb_error("Missing TBB version information. Could not find" "tbb/tbb_stddef.h or tbb/version.h in ${TBB_INCLUDE_DIR}") endif() file(READ ${TBB_INCLUDE_DIR}/${_TBB_VERSION_HEADER} VERSION_HEADER_CONTENT) string(REGEX MATCH "#define TBB_VERSION_MAJOR ([0-9]+)" DUMMY "${VERSION_HEADER_CONTENT}") set(TBB_VERSION_MAJOR ${CMAKE_MATCH_1}) string(REGEX MATCH "#define TBB_VERSION_MINOR ([0-9]+)" DUMMY "${VERSION_HEADER_CONTENT}") set(TBB_VERSION_MINOR ${CMAKE_MATCH_1}) set(TBB_VERSION "${TBB_VERSION_MAJOR}.${TBB_VERSION_MINOR}") set(TBB_VERSION_STRING "${TBB_VERSION}") # If the user provided information about required versions, check them! if (TBB_FIND_VERSION) if (${TBB_FIND_VERSION_EXACT} AND NOT TBB_VERSION VERSION_EQUAL ${TBB_FIND_VERSION}) rk_tbb_error("Requested exact TBB version ${TBB_FIND_VERSION}," " but found ${TBB_VERSION}") elseif(TBB_VERSION VERSION_LESS ${TBB_FIND_VERSION}) rk_tbb_error("Requested minimum TBB version ${TBB_FIND_VERSION}," " but found ${TBB_VERSION}") endif() endif() rk_tbb_status("Found TBB version ${TBB_VERSION} at ${TBB_ROOT}") endmacro() #=============================================================================== # Reuse existing targets. # NOTE: This must be a macro, as we rely on return() to exit this script. 
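# (a return() inside a function would only leave that function; because this
#  is a macro, the return() stops processing of FindTBB.cmake itself)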
#=============================================================================== macro(rk_tbb_reuse_existing_target_components) rk_tbb_check_components() if (_TBB_MISSING_COMPONENTS STREQUAL "") rk_tbb_status("Found existing TBB component targets: ${_TBB_AVAILABLE_COMPONENTS}") # Get TBB_INCLUDE_DIR if not already set to check for the version of the # existing component targets (making the assumption that they all have # the same version) if (NOT TBB_INCLUDE_DIR) list(GET _TBB_AVAILABLE_COMPONENTS 0 first_target) get_target_property(TBB_INCLUDE_DIR TBB::${first_target} INTERFACE_INCLUDE_DIRECTORIES) foreach(TGT IN LISTS _TBB_AVAILABLE_COMPONENTS) get_target_property(_TGT_INCLUDE_DIR TBB::${TGT} INTERFACE_INCLUDE_DIRECTORIES) if (NOT _TGT_INCLUDE_DIR STREQUAL "${TBB_INCLUDE_DIR}") rk_tbb_error("Existing TBB component targets have inconsistent include directories.") endif() endforeach() endif() find_path(TBB_INCLUDE_DIR NAMES "${_TBB_HEADER}" PATHS "${TBB_INCLUDE_DIRS}") # Extract TBB_ROOT from the include path so that rk_tbb_check_version # prints the correct tbb location string(REPLACE "/${_TBB_INCLUDE_SUBDIR}" "" TBB_ROOT "${TBB_INCLUDE_DIR}") rk_tbb_check_version() # Add target TBB and link all available components if (NOT TARGET TBB) add_library(TBB INTERFACE) foreach(C IN LISTS _TBB_AVAILABLE_COMPONENTS) target_link_libraries(TBB INTERFACE TBB::${C}) endforeach() endif() set(TBB_FOUND TRUE) set(TBB_INCLUDE_DIRS "${TBB_INCLUDE_DIR}") return() elseif ((TARGET TBB) OR (NOT _TBB_AVAILABLE_COMPONENTS STREQUAL "")) rk_tbb_error("Ignoring existing TBB targets because required components are missing: ${_TBB_MISSING_COMPONENTS}") endif() endmacro() #=============================================================================== # Find the root directory if a manual override is not specified. # Sets TBB_ROOT in the parent scope, but does not check for failure. #=============================================================================== function(rk_tbb_find_root) if (NOT TBB_ROOT OR TBB_ROOT STREQUAL "") set(TBB_HINTS "") set(TBB_PATHS "") if (WIN32) # workaround for parentheses in variable name / CMP0053 set(PROGRAMFILESx86 "PROGRAMFILES(x86)") set(PROGRAMFILES32 "$ENV{${PROGRAMFILESx86}}") if(NOT PROGRAMFILES32) set(PROGRAMFILES32 "$ENV{PROGRAMFILES}") endif() if(NOT PROGRAMFILES32) set(PROGRAMFILES32 "C:/Program Files (x86)") endif() set(TBB_PATHS "${PROJECT_SOURCE_DIR}/../tbb" "${PROGRAMFILES32}/IntelSWTools/compilers_and_libraries/windows/tbb" "${PROGRAMFILES32}/Intel/Composer XE/tbb" "${PROGRAMFILES32}/Intel/compilers_and_libraries/windows/tbb") else() set(TBB_HINTS "/usr/local") set(TBB_PATHS "${PROJECT_SOURCE_DIR}/tbb" "/opt/intel/oneapi/tbb/latest" "/opt/intel/tbb" "/opt/intel/compilers_and_libraries/tbb" "/opt/intel/compilers_and_libraries/linux/tbb" "/opt/intel/composerxe/tbb") endif() set(TBB_ROOT "TBB_ROOT-NOTFOUND") find_path(TBB_ROOT NAMES "${_TBB_INCLUDE_SUBDIR}/${_TBB_HEADER}" HINTS ${TBB_HINTS} PATHS ${TBB_PATHS} NO_PACKAGE_ROOT_PATH) endif() endfunction() #=============================================================================== # Find the include directory if a manual override is not specified. # Assumes TBB_ROOT to be set. 
#=============================================================================== function(rk_tbb_find_include_directory) find_path(TBB_INCLUDE_DIR NAMES "${_TBB_HEADER}" HINTS "${TBB_ROOT}/${_TBB_INCLUDE_SUBDIR}" NO_PACKAGE_ROOT_PATH) endfunction() #=============================================================================== # Find a specific library and create a target for it. #=============================================================================== function(rk_tbb_find_library COMPONENT_NAME BUILD_CONFIG) set(LIB_VAR "${COMPONENT_NAME}_LIBRARY_${BUILD_CONFIG}") set(BIN_DIR_VAR "${COMPONENT_NAME}_BIN_DIR_${BUILD_CONFIG}") set(DLL_VAR "${COMPONENT_NAME}_DLL_${BUILD_CONFIG}") if (BUILD_CONFIG STREQUAL "DEBUG") set(LIB_NAME "${COMPONENT_NAME}_debug") else() set(LIB_NAME "${COMPONENT_NAME}") endif() unset(LIB_PATHS) if (WIN32) if(CMAKE_SIZEOF_VOID_P EQUAL 8) set(TBB_ARCH intel64) else() set(TBB_ARCH ia32) endif() if(MSVC10) set(TBB_VCVER vc10) elseif(MSVC11) set(TBB_VCVER vc11) elseif(MSVC12) set(TBB_VCVER vc12) else() set(TBB_VCVER vc14) endif() set(LIB_PATHS ${TBB_ROOT}/lib/${TBB_ARCH}/${TBB_VCVER} ${TBB_ROOT}/lib ) # On window, also search the DLL so that the client may install it. set(DLL_NAME "${LIB_NAME}.dll") # lib name with version suffix to handle oneTBB tbb12.dll set(LIB_NAME_VERSION "") if (${COMPONENT_NAME} STREQUAL "tbb") if (BUILD_CONFIG STREQUAL "DEBUG") set(LIB_NAME_VERSION "tbb12_debug") else() set(LIB_NAME_VERSION "tbb12") endif() endif() set(DLL_NAME_VERSION "${LIB_NAME_VERSION}.dll") set(BIN_FILE BIN_FILE-NOTFOUND) find_file(BIN_FILE NAMES ${DLL_NAME} ${DLL_NAME_VERSION} PATHS "${TBB_ROOT}/bin/${TBB_ARCH}/${TBB_VCVER}" "${TBB_ROOT}/bin" "${TBB_ROOT}/redist/${TBB_ARCH}/${TBB_VCVER}" "${TBB_ROOT}/../redist/${TBB_ARCH}/tbb/${TBB_VCVER}" "${TBB_ROOT}/../redist/${TBB_ARCH}_win/tbb/${TBB_VCVER}" NO_DEFAULT_PATH) get_filename_component(${BIN_DIR_VAR} ${BIN_FILE} DIRECTORY) set(${DLL_VAR} "${BIN_FILE}" CACHE PATH "${COMPONENT_NAME} ${BUILD_CONFIG} dll path") elseif(APPLE) set(LIB_PATHS ${TBB_ROOT}/lib) else() file(GLOB LIB_PATHS PATHS ${TBB_ROOT}/lib/intel64/gcc*) list(REVERSE LIB_PATHS) list(APPEND LIB_PATHS ${TBB_ROOT}/lib ${TBB_ROOT}/lib/x86_64-linux-gnu ${TBB_ROOT}/lib64 ${TBB_ROOT}/libx86_64-linux-gnu) endif() # We prefer finding the versioned file on Unix so that the library path # variable will not point to a symlink. This makes installing TBB as a # dependency easier. if (UNIX) set(LIB_NAME lib${LIB_NAME}.so.2 ${LIB_NAME}) endif() find_library(${LIB_VAR} NAMES ${LIB_NAME} PATHS ${LIB_PATHS} NO_DEFAULT_PATH) # Hide this variable if we found something, otherwise display it for # easy override. if(${LIB_VAR}) mark_as_advanced(${LIB_VAR}) endif() if(${BIN_DIR_VAR}) mark_as_advanced(${BIN_DIR_VAR}) endif() if(${DLL_VAR}) mark_as_advanced(${DLL_VAR}) endif() endfunction() #=============================================================================== # Find the given component. # This macro attempts to find both release and debug versions, and falls back # appropriately if only one can be found. # On success, it creates a target ${TARGET}::${COMPONENT_NAME} and links # it to the overall ${TARGET}. 
# # For more information on the variables set here, see # https://cmake.org/cmake/help/v3.17/manual/cmake-developer.7.html#a-sample-find-module #=============================================================================== function(rk_tbb_find_and_link_component COMPONENT_NAME) set(COMPONENT_TARGET "TBB::${COMPONENT_NAME}") rk_tbb_find_library("${COMPONENT_NAME}" RELEASE) rk_tbb_find_library("${COMPONENT_NAME}" DEBUG) if (${COMPONENT_NAME}_LIBRARY_RELEASE OR ${COMPONENT_NAME}_LIBRARY_DEBUG) # Note: We *must* use SHARED here rather than UNKNOWN as our # IMPORTED_NO_SONAME trick a few lines down does not work with # UNKNOWN. add_library(${COMPONENT_TARGET} SHARED IMPORTED) if (${COMPONENT_NAME}_LIBRARY_RELEASE) set_property(TARGET ${COMPONENT_TARGET} APPEND PROPERTY IMPORTED_CONFIGURATIONS RELEASE) if(WIN32) set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_LOCATION_RELEASE "${${COMPONENT_NAME}_DLL_RELEASE}") set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_IMPLIB_RELEASE "${${COMPONENT_NAME}_LIBRARY_RELEASE}") else() set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_LOCATION_RELEASE "${${COMPONENT_NAME}_LIBRARY_RELEASE}") endif() endif() if (${COMPONENT_NAME}_LIBRARY_DEBUG) set_property(TARGET ${COMPONENT_TARGET} APPEND PROPERTY IMPORTED_CONFIGURATIONS DEBUG) if(WIN32) set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_LOCATION_DEBUG "${${COMPONENT_NAME}_DLL_DEBUG}") set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_IMPLIB_DEBUG "${${COMPONENT_NAME}_LIBRARY_DEBUG}") else() set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_LOCATION_DEBUG "${${COMPONENT_NAME}_LIBRARY_DEBUG}") endif() endif() set_target_properties(${COMPONENT_TARGET} PROPERTIES INTERFACE_INCLUDE_DIRECTORIES "${TBB_INCLUDE_DIR}" INTERFACE_COMPILE_DEFINITIONS "__TBB_NO_IMPLICIT_LINKAGE=1" ) if(NOT WIN32) # Note: IMPORTED_NO_SONAME must be set or cmake will attempt # to link to the full path of libtbb.so. Instead, we # rely on the linker to find libtbb.so.2. set_target_properties(${COMPONENT_TARGET} PROPERTIES IMPORTED_NO_SONAME TRUE ) endif() target_link_libraries(TBB INTERFACE ${COMPONENT_TARGET}) endif() endfunction() #=============================================================================== # Note: The order of these is important. # Some of these macros create variables that are used in later calls. rk_tbb_list_components() rk_tbb_reuse_existing_target_components() rk_tbb_find_root() if (NOT EXISTS "${TBB_ROOT}") rk_tbb_error("Unable to find root directory ${TBB_ROOT}") endif() mark_as_advanced(TBB_ROOT) # Hide, we found something. rk_tbb_find_include_directory() if (NOT EXISTS "${TBB_INCLUDE_DIR}") rk_tbb_error("Unable to find include directory ${TBB_INCLUDE_DIR}") endif() mark_as_advanced(TBB_INCLUDE_DIR) # Hide, we found something. 
rk_tbb_check_version() add_library(TBB INTERFACE) foreach(C IN LISTS _REQUIRED_COMPONENTS _OPTIONAL_COMPONENTS) rk_tbb_find_and_link_component(${C}) endforeach() rk_tbb_check_components() if (_TBB_MISSING_COMPONENTS) rk_tbb_error("Cannot find required components: " "${_TBB_MISSING_COMPONENTS}") endif() set(TBB_FOUND TRUE) set(TBB_INCLUDE_DIRS "${TBB_INCLUDE_DIR}") ospray-rkcommon-538f8a2/cmake/rkcommonConfig.cmake.in000066400000000000000000000017251456117377200227000ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 @PACKAGE_INIT@ include("${CMAKE_CURRENT_LIST_DIR}/@PROJECT_NAME@_Exports.cmake") include("${CMAKE_CURRENT_LIST_DIR}/rkcommon_macros.cmake") check_required_components("@PROJECT_NAME@") ## Stash incoming CMAKE_MODULE_PATH ## set(RKCOMMON_CALLERS_CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH}) set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} ${CMAKE_CURRENT_LIST_DIR}) ## Create rkcommon tasking target ## set(RKCOMMON_TASKING_SYSTEM @RKCOMMON_TASKING_SYSTEM@) set(RKCOMMON_TASKING_TBB @RKCOMMON_TASKING_TBB@) set(RKCOMMON_TASKING_OPENMP @RKCOMMON_TASKING_OPENMP@) set(RKCOMMON_TASKING_INTERNAL @RKCOMMON_TASKING_INTERNAL@) set(RKCOMMON_TASKING_DEBUG @RKCOMMON_TASKING_DEBUG@) rkcommon_create_tasking_target(TRUE) ## Restore CMAKE_MODULE_PATH ## set(CMAKE_MODULE_PATH ${RKCOMMON_CALLERS_CMAKE_MODULE_PATH}) ## Standard signal that the package was found ## set(RKCOMMON_FOUND TRUE) ospray-rkcommon-538f8a2/cmake/rkcommon_macros.cmake000066400000000000000000000204171456117377200225100ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 # use a backported version of find_dependency(), renamed as # find_dependency_39(), from CMake 3.9.0, which correctly supports passing # components to find_package(). this allows us to maintain our current minimum # CMake version of 3.1. 
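# A representative call, as issued by rkcommon_create_tasking_target() further
# below when the package is consumed from an install tree:
#   find_dependency_39(TBB 4.4 REQUIRED tbb tbbmalloc)
# The macro forwards the QUIET/REQUIRED state of the enclosing find_package()
# call to the dependency lookup.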
macro(find_dependency_39 dep) if (NOT ${dep}_FOUND) set(cmake_fd_quiet_arg) if(${CMAKE_FIND_PACKAGE_NAME}_FIND_QUIETLY) set(cmake_fd_quiet_arg QUIET) endif() set(cmake_fd_required_arg) if(${CMAKE_FIND_PACKAGE_NAME}_FIND_REQUIRED) set(cmake_fd_required_arg REQUIRED) endif() get_property(cmake_fd_alreadyTransitive GLOBAL PROPERTY _CMAKE_${dep}_TRANSITIVE_DEPENDENCY ) find_package(${dep} ${ARGN} ${cmake_fd_quiet_arg} ${cmake_fd_required_arg} ) if(NOT DEFINED cmake_fd_alreadyTransitive OR cmake_fd_alreadyTransitive) set_property(GLOBAL PROPERTY _CMAKE_${dep}_TRANSITIVE_DEPENDENCY TRUE) endif() if (NOT ${dep}_FOUND) set(${CMAKE_FIND_PACKAGE_NAME}_NOT_FOUND_MESSAGE "${CMAKE_FIND_PACKAGE_NAME} could not be found because dependency ${dep} could not be found.") set(${CMAKE_FIND_PACKAGE_NAME}_FOUND False) return() endif() set(cmake_fd_required_arg) set(cmake_fd_quiet_arg) set(cmake_fd_exact_arg) endif() endmacro() ## Macro for printing CMake variables ## macro(print var) message("${var} = ${${var}}") endmacro() ## Macro to print a warning message that only appears once ## macro(rkcommon_warn_once IDENTIFIER MESSAGE) set(INTERNAL_WARNING "RKCOMMON_WARNED_${IDENTIFIER}") if(NOT ${INTERNAL_WARNING}) message(WARNING ${MESSAGE}) set(${INTERNAL_WARNING} ON CACHE INTERNAL "Warned about '${MESSAGE}'") endif() endmacro() ## Get a list of subdirectories (single level) under a given directory macro(get_subdirectories result curdir) file(GLOB children RELATIVE ${curdir} ${curdir}/*) set(dirlist "") foreach(child ${children}) if(IS_DIRECTORY ${curdir}/${child}) list(APPEND dirlist ${child}) endif() endforeach() set(${result} ${dirlist}) endmacro() ## Setup CMAKE_BUILD_TYPE to have a default + cycle between options in UI macro(rkcommon_configure_build_type) set(CONFIGURATION_TYPES "Debug;Release;RelWithDebInfo") if (WIN32) if (NOT RKCOMMON_DEFAULT_CMAKE_CONFIGURATION_TYPES_SET) set(CMAKE_CONFIGURATION_TYPES "${CONFIGURATION_TYPES}" CACHE STRING "List of generated configurations." FORCE) set(RKCOMMON_DEFAULT_CMAKE_CONFIGURATION_TYPES_SET ON CACHE INTERNAL "Default CMake configuration types set.") endif() else() if(NOT CMAKE_BUILD_TYPE) set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the build type." 
FORCE) endif() set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS ${CONFIGURATION_TYPES}) endif() endmacro() ## Compiler configuration macros ## macro(rkcommon_configure_compiler) if (WIN32) set(RKCOMMON_PLATFORM_WIN 1) set(RKCOMMON_PLATFORM_UNIX 0) else() set(RKCOMMON_PLATFORM_WIN 0) set(RKCOMMON_PLATFORM_UNIX 1) endif() # unhide compiler to make it easier for users to see what they are using mark_as_advanced(CLEAR CMAKE_CXX_COMPILER) option(RKCOMMON_STRICT_BUILD "Build with additional warning flags" ON) mark_as_advanced(RKCOMMON_STRICT_BUILD) option(RKCOMMON_WARN_AS_ERRORS "Treat warnings as errors" OFF) mark_as_advanced(RKCOMMON_WARN_AS_ERRORS) set(RKCOMMON_COMPILER_ICC FALSE) set(RKCOMMON_COMPILER_GCC FALSE) set(RKCOMMON_COMPILER_CLANG FALSE) set(RKCOMMON_COMPILER_MSVC FALSE) set(RKCOMMON_COMPILER_DPCPP FALSE) if ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Intel") set(RKCOMMON_COMPILER_ICC TRUE) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "GNU") set(RKCOMMON_COMPILER_GCC TRUE) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "Clang" OR "${CMAKE_CXX_COMPILER_ID}" STREQUAL "AppleClang") set(RKCOMMON_COMPILER_CLANG TRUE) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "MSVC") set(RKCOMMON_COMPILER_MSVC TRUE) elseif ("${CMAKE_CXX_COMPILER_ID}" STREQUAL "IntelLLVM") set(RKCOMMON_COMPILER_DPCPP TRUE) else() message(FATAL_ERROR "Unsupported compiler specified: '${CMAKE_CXX_COMPILER_ID}'") endif() if (WIN32 AND NOT RKCOMMON_COMPILER_MSVC) # workaround for https://gitlab.kitware.com/cmake/cmake/-/issues/18311 set(CMAKE_NINJA_CMCLDEPS_RC OFF) endif() # setting DEPENDENTLOADFLAG:LOAD_LIBRARY_SAFE_CURRENT_DIRS on rkcommon DLL if(WIN32) if(RKCOMMON_COMPILER_MSVC) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /DEPENDENTLOADFLAG:0x2000") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /DEPENDENTLOADFLAG:0x2000") elseif(RKCOMMON_COMPILER_DPCPP) set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} /Qoption,link,/DEPENDENTLOADFLAG:0x2000") set(CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} /Qoption,link,/DEPENDENTLOADFLAG:0x2000") else() message(WARNING "Unrecognized WIN32 compiler, DEPENDENTLOADFLAG can't be set") endif() endif() endmacro() ## Tasking System macros ## macro(rkcommon_configure_tasking_system) set(RKCOMMON_TASKING_SYSTEM TBB CACHE STRING "Per-node thread tasking system [TBB,OpenMP,Internal,Debug]") set_property(CACHE RKCOMMON_TASKING_SYSTEM PROPERTY STRINGS TBB OpenMP Internal Debug) # NOTE(jda) - Make the RKCOMMON_TASKING_SYSTEM build option case-insensitive string(TOUPPER ${RKCOMMON_TASKING_SYSTEM} RKCOMMON_TASKING_SYSTEM_ID) set(RKCOMMON_TASKING_TBB FALSE) set(RKCOMMON_TASKING_OPENMP FALSE) set(RKCOMMON_TASKING_INTERNAL FALSE) set(RKCOMMON_TASKING_DEBUG FALSE) if(${RKCOMMON_TASKING_SYSTEM_ID} STREQUAL "TBB") set(RKCOMMON_TASKING_TBB TRUE) else() unset(TBB_INCLUDE_DIR CACHE) unset(TBB_LIBRARY CACHE) unset(TBB_LIBRARY_DEBUG CACHE) unset(TBB_LIBRARY_MALLOC CACHE) unset(TBB_LIBRARY_MALLOC_DEBUG CACHE) if(${RKCOMMON_TASKING_SYSTEM_ID} STREQUAL "OPENMP") set(RKCOMMON_TASKING_OPENMP TRUE) elseif(${RKCOMMON_TASKING_SYSTEM_ID} STREQUAL "INTERNAL") set(RKCOMMON_TASKING_INTERNAL TRUE) else() set(RKCOMMON_TASKING_DEBUG TRUE) endif() endif() endmacro() macro(rkcommon_create_tasking_target FROM_INSTALL) set(CMAKE_THREAD_PREFER_PTHREAD TRUE) set(THREADS_PREFER_PTHREAD_FLAG TRUE) find_package(Threads REQUIRED) set(RKCOMMON_TASKING_LIBS ${CMAKE_THREAD_LIBS_INIT}) if(RKCOMMON_TASKING_TBB) if(POLICY CMP0074) # Our FindTBB script uses TBB_ROOT, which is the NEW behaviour for # CMP0074. 
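      # (with NEW behaviour, find_package() honours the TBB_ROOT CMake and
      #  environment variables as additional search prefixes instead of
      #  ignoring them)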
cmake_policy(SET CMP0074 NEW) endif() if (DEFINED RKCOMMON_TBB_ROOT AND NOT RKCOMMON_TBB_ROOT STREQUAL "") set(TBB_FIND_PACKAGE_OPTION "ONLY_CMAKE_FIND_ROOT_PATH") set(CMAKE_FIND_ROOT_PATH ${RKCOMMON_TBB_ROOT}) set(TBB_ROOT ${RKCOMMON_TBB_ROOT}) list(APPEND CMAKE_PREFIX_PATH ${RKCOMMON_TBB_ROOT}) endif() # Try getting TBB via config first find_package(TBB 2021.1 QUIET COMPONENTS tbb tbbmalloc CONFIG ${TBB_FIND_PACKAGE_OPTION}) if (TBB_FOUND) list(APPEND RKCOMMON_TASKING_LIBS TBB::tbb TBB::tbbmalloc) set(RKCOMMON_TASKING_DEFINITIONS RKCOMMON_TASKING_TBB) else() # If not found try getting older TBB via module (FindTBB.cmake) unset(TBB_DIR CACHE) if (${FROM_INSTALL}) find_dependency_39(TBB 4.4 REQUIRED tbb tbbmalloc) else() find_package(TBB 4.4 REQUIRED tbb tbbmalloc) endif() if (TBB_FOUND) list(APPEND RKCOMMON_TASKING_LIBS TBB) set(RKCOMMON_TASKING_DEFINITIONS RKCOMMON_TASKING_TBB) endif() endif() elseif(RKCOMMON_TASKING_OPENMP) find_dependency_39(OpenMP) if (OPENMP_FOUND) list(APPEND RKCOMMON_TASKING_LIBS OpenMP::OpenMP_CXX) set(RKCOMMON_TASKING_DEFINITIONS RKCOMMON_TASKING_OMP) endif() elseif(RKCOMMON_TASKING_INTERNAL) set(RKCOMMON_TASKING_DEFINITIONS RKCOMMON_TASKING_INTERNAL) else()#Debug # Do nothing, will fall back to scalar code (useful for debugging) endif() if (NOT TARGET rkcommon_tasking) add_library(rkcommon_tasking INTERFACE IMPORTED) set_target_properties(rkcommon_tasking PROPERTIES INTERFACE_LINK_LIBRARIES "${RKCOMMON_TASKING_LIBS}" INTERFACE_COMPILE_DEFINITIONS "${RKCOMMON_TASKING_DEFINITIONS}" ) endif() endmacro() ospray-rkcommon-538f8a2/cmake/rkcommon_redist_deps.cmake000066400000000000000000000025661456117377200235360ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 if (WIN32 AND RKCOMMON_TASKING_TBB) if (NOT TBB_ARCH) if (CMAKE_SIZEOF_VOID_P EQUAL 8) set(TBB_ARCH intel64) else() set(TBB_ARCH ia32) endif() endif() set(TBB_DLL_HINTS HINTS ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}_win/tbb/vc14 ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}_win/tbb/vc14 ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}/tbb/vc14 ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}/tbb/vc14 ${RKCOMMON_TBB_ROOT}/../redist/${TBB_ARCH}/vc14 ${RKCOMMON_TBB_ROOT}/redist/${TBB_ARCH}/vc14 ${RKCOMMON_TBB_ROOT}/bin/${TBB_ARCH}/vc14 ${RKCOMMON_TBB_ROOT}/bin ) find_file(TBB_DLL NAMES tbb12.dll tbb.dll ${TBB_DLL_HINTS}) find_file(TBB_DLL_DEBUG NAMES tbb12_debug.dll tbb_debug.dll ${TBB_DLL_HINTS}) find_file(TBB_DLL_MALLOC tbbmalloc.dll ${TBB_DLL_HINTS}) find_file(TBB_DLL_MALLOC_DEBUG tbbmalloc_debug.dll ${TBB_DLL_HINTS}) mark_as_advanced(TBB_DLL) mark_as_advanced(TBB_DLL_DEBUG) mark_as_advanced(TBB_DLL_MALLOC) mark_as_advanced(TBB_DLL_MALLOC_DEBUG) install(PROGRAMS ${TBB_DLL} ${TBB_DLL_MALLOC} DESTINATION ${CMAKE_INSTALL_BINDIR} CONFIGURATIONS Release RelWithDebInfo) install(PROGRAMS ${TBB_DLL_DEBUG} ${TBB_DLL_MALLOC_DEBUG} DESTINATION ${CMAKE_INSTALL_BINDIR} CONFIGURATIONS Debug) endif() ospray-rkcommon-538f8a2/gitlab/000077500000000000000000000000001456117377200164735ustar00rootroot00000000000000ospray-rkcommon-538f8a2/gitlab/.gitlab-ci.yml000066400000000000000000000067061456117377200211400ustar00rootroot00000000000000variables: GIT_DEPTH: "15" ErrorActionPreference: STOP stages: - build - test - build-kw - test-kw - test-external ## Job Templates ## .job_template: &base_build_job stage: build script: - gitlab/build.sh -G Ninja artifacts: paths: - build .job_template: &build_job_docker <<: *base_build_job tags: [docker] .job_template: &base_functional_test_job type: test 
script: - ./build/rkcommon_test_suite .job_template: &test_functional_job_docker <<: *base_functional_test_job tags: [docker] ## Build Jobs ## build-centos7: <<: *build_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:centos7 script: - gitlab/build.sh build-ubuntu18.04: <<: *build_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:ubuntu18.04 build-ubuntu20.04: <<: *build_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:ubuntu20.04 build-arch: <<: *build_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:arch build-arch-clang: <<: *build_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:arch script: - export CC=clang - export CXX=clang++ - gitlab/build.sh -G Ninja build-macOS: <<: *base_build_job stage: build script: - gitlab/build.sh tags: - osx - clang build-windows-msvc15: stage: build script: - gitlab\build.ps1 "Visual Studio 15 2017 Win64" "v141" tags: - msvc15 - win artifacts: paths: - build expire_in: 3 day ## Functional Test Jobs ## test-centos7: <<: *test_functional_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:centos7 dependencies: [build-centos7] test-ubuntu18.04: <<: *test_functional_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:ubuntu18.04 dependencies: [build-ubuntu18.04] test-ubuntu20.04: <<: *test_functional_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:ubuntu20.04 dependencies: [build-ubuntu20.04] test-arch: <<: *test_functional_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:arch dependencies: [build-arch] test-arch-clang: <<: *test_functional_job_docker image: $DOCKER_REGISTRY/ospray/docker-images:arch dependencies: [build-arch-clang] test-macOS: <<: *base_functional_test_job dependencies: [build-macOS] script: - export DYLD_FALLBACK_LIBRARY_PATH=./build - ./build/rkcommon_test_suite tags: [osx,clang] test-windows-msvc15: <<: *base_functional_test_job script: - gitlab\run_tests.ps1 dependencies: [build-windows-msvc15] tags: [msvc15,win] ## Klocwork Jobs ## .job_template: &base_kw_job image: $DOCKER_REGISTRY/ospray/docker-images:centos7 tags: [docker] build-kw: <<: *base_kw_job stage: build-kw script: - gitlab/build-kw.sh - test -s $CI_PROJECT_DIR/klocwork/build_name artifacts: paths: - $CI_PROJECT_DIR/klocwork/* needs: [] check-kw-issues: <<: *base_kw_job stage: test-kw script: - gitlab/test-kw.sh needs: [build-kw] allow_failure: true gen-kw-report: <<: *base_kw_job stage: test-kw needs: [build-kw] script: - gitlab/kw-gen-report.sh - gitlab/store-files.sh $CI_PROJECT_NAME $CI_PIPELINE_ID klocwork "klocwork/report.log" artifacts: paths: - ./klocwork/* ## External Jobs ## test-openvkl: variables: RKCOMMON_BRANCH_NAME: $CI_COMMIT_REF_NAME RKCOMMON_PROJECT_PATH: $CI_PROJECT_PATH RKCOMMON_GIT_SHA: $CI_COMMIT_SHA stage: test-external trigger: project: renderkit/OpenVKL branch: devel strategy: depend ospray-rkcommon-538f8a2/gitlab/build-kw.sh000077500000000000000000000016201456117377200205470ustar00rootroot00000000000000#!/bin/bash ## Copyright 2020 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 set -e KW_SERVER_PATH=$KW_PATH/server KW_CLIENT_PATH=$KW_PATH/client export KLOCWORK_LTOKEN=/tmp/ltoken echo "$KW_SERVER_IP;$KW_SERVER_PORT;$KW_USER;$KW_LTOKEN" > $KLOCWORK_LTOKEN mkdir -p $CI_PROJECT_DIR/klocwork log_file=$CI_PROJECT_DIR/klocwork/build.log mkdir build cd build cmake --version cmake -DRKCOMMON_TASKING_SYSTEM=INTERNAL .. 
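# kwinject traces the compiler invocations of the make build below and records
# them in kwinject.out, which kwbuildproject then consumes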
$KW_CLIENT_PATH/bin/kwinject make -j `nproc` | tee -a $log_file $KW_SERVER_PATH/bin/kwbuildproject --classic --url http://$KW_SERVER_IP:$KW_SERVER_PORT/$KW_PROJECT_NAME --tables-directory $CI_PROJECT_DIR/kw_tables kwinject.out | tee -a $log_file $KW_SERVER_PATH/bin/kwadmin --url http://$KW_SERVER_IP:$KW_SERVER_PORT/ load --force --name build-$CI_JOB_ID $KW_PROJECT_NAME $CI_PROJECT_DIR/kw_tables | tee -a $log_file echo "build-$CI_JOB_ID" > $CI_PROJECT_DIR/klocwork/build_name ospray-rkcommon-538f8a2/gitlab/build.ps1000077500000000000000000000004111456117377200202160ustar00rootroot00000000000000## Copyright 2020 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 md build cd build cmake --version cmake -L ` -G $args[0] ` -T $args[1] ` -D RKCOMMON_TASKING_SYSTEM=INTERNAL ` .. cmake --build . --config Release --target ALL_BUILD exit $LASTEXITCODE ospray-rkcommon-538f8a2/gitlab/build.sh000077500000000000000000000002771456117377200201370ustar00rootroot00000000000000#!/bin/bash -x ## Copyright 2020 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 mkdir build cd build cmake --version cmake -DRKCOMMON_TASKING_SYSTEM=INTERNAL .. cmake --build . ospray-rkcommon-538f8a2/gitlab/kw-gen-report.sh000077500000000000000000000033261456117377200215370ustar00rootroot00000000000000#!/bin/bash -xe ## Copyright 2020 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 KW_SERVER_API_URL=http://$KW_SERVER_IP:$KW_SERVER_PORT/review/api KW_BUILD_NAME=$(cat $CI_PROJECT_DIR/klocwork/build_name) KW_BUILD_LOG_FILE=$CI_PROJECT_DIR/klocwork/build.log export PATH="$SHARED_TOOLS_PATH:$PATH" [ -f $KW_BUILD_LOG_FILE ] || (echo "Build log file not found. Expected to be in: $KW_BUILD_LOG_FILE." ; exit 1;) mkdir -p $CI_PROJECT_DIR/klocwork report_file=$CI_PROJECT_DIR/klocwork/report.log echo "------------------" >> $report_file echo "Report generated at: "$(date '+%d/%m/%Y %H:%M:%S') >> $report_file echo "Project source code url: $CI_PROJECT_URL" >> $report_file echo "Project source code sha: $CI_COMMIT_SHA" >> $report_file echo "Klocwork server: http://$KW_SERVER_IP:$KW_SERVER_PORT" >> $report_file echo "------------------" >> $report_file echo -e "\n\n\n" >> $report_file # Get all issues list and put to report file column_list=".id, .code, .severity, .state, .status, .taxonomyName, .owner, .url, .file, .line" echo "------------------" >> $report_file echo "Issues list:" >> $report_file echo "------------------" >> $report_file echo $column_list | sed 's/\\t/ ,/g' | column -t -s, >> $report_file echo "------------------" >> $report_file curl -f --data "action=search&project=$KW_PROJECT_NAME&query=build:'$KW_BUILD_NAME'&user=$KW_USER<oken=$KW_LTOKEN" $KW_SERVER_API_URL | jq-linux64 "[${column_list}] | @tsv" | sed 's/\\t/|/g' | column -t -s'|' | cut -d'"' -f2 >> $report_file echo -e "\n\n\n" >> $report_file # Attach build log to report file echo "------------------" >> $report_file echo "Build & scan log:" >> $report_file echo "------------------" >> $report_file cat $KW_BUILD_LOG_FILE >> $report_file ospray-rkcommon-538f8a2/gitlab/run_tests.ps1000066400000000000000000000002771456117377200211540ustar00rootroot00000000000000## Copyright 2020 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 echo "Running tests" $env:Path += ";.\build\Release" .\build\Release\rkcommon_test_suite.exe exit $LASTEXITCODE ospray-rkcommon-538f8a2/gitlab/store-files.sh000077500000000000000000000003741456117377200212720ustar00rootroot00000000000000#!/bin/bash -xe ## Copyright 2019 Intel Corporation ## SPDX-License-Identifier: 
Apache-2.0 project_name=$1 build_id=$2 group_name=$3 files=$4 STORAGE_DIR=$STORAGE_PATH/$project_name/$build_id/$group_name/ mkdir -p $STORAGE_DIR cp $files $STORAGE_DIR/ ospray-rkcommon-538f8a2/gitlab/test-kw.sh000077500000000000000000000015101456117377200204250ustar00rootroot00000000000000#!/bin/bash -xe ## Copyright 2020 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 KW_ISSUES_FILE=/tmp/issues KW_SERVER_API_URL=http://$KW_SERVER_IP:$KW_SERVER_PORT/review/api KW_BUILD_NAME=$(cat $CI_PROJECT_DIR/klocwork/build_name) echo "Checking for issues in $KW_BUILD_NAME ..." curl -f --data "action=search&project=$KW_PROJECT_NAME&query=build:'$KW_BUILD_NAME'%20status:Analyze,Fix,Fix%20in%20Next%20Release,Fix%20in%20Later%20Release,Defer,Filter&user=$KW_USER<oken=$KW_LTOKEN" $KW_SERVER_API_URL -o $KW_ISSUES_FILE getCriticalCount() { cat $KW_ISSUES_FILE | wc -l } if [ -f $KW_ISSUES_FILE ]; then echo "Issues found - $(getCriticalCount) in $KW_BUILD_NAME"; while IFS= read -r line; do echo $line | python -m json.tool; done < $KW_ISSUES_FILE exit 1; else echo "Found no issues in $KW_BUILD_NAME" fi ospray-rkcommon-538f8a2/rkcommon/000077500000000000000000000000001456117377200170565ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/CMakeLists.txt000066400000000000000000000044361456117377200216250ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 if (RKCOMMON_TASKING_INTERNAL) set(EXTRA_TASKING_SOURCES tasking/detail/enkiTS/TaskScheduler.cpp tasking/detail/TaskSys.cpp ) endif() add_library(${PROJECT_NAME} ${RKCOMMON_RESOURCE} common.cpp memory/malloc.cpp networking/DataStreaming.cpp networking/Fabric.cpp os/FileName.cpp os/library.cpp tasking/detail/tasking_system_init.cpp ${EXTRA_TASKING_SOURCES} utility/demangle.cpp utility/ParameterizedObject.cpp utility/PseudoURL.cpp utility/TimeStamp.cpp xml/XML.cpp tracing/Tracing.cpp ) target_link_libraries(${PROJECT_NAME} PUBLIC rkcommon_tasking ${CMAKE_DL_LIBS} $<${RKCOMMON_PLATFORM_WIN}:ws2_32> ) target_include_directories(${PROJECT_NAME} PUBLIC $ $ $ PRIVATE ${CMAKE_CURRENT_LIST_DIR} ) if (RKCOMMON_ADDRSAN) target_compile_definitions(${PROJECT_NAME} PUBLIC -DRKCOMMON_ADDRSAN) endif() if (RKCOMMON_NO_SIMD) target_compile_definitions(${PROJECT_NAME} PUBLIC -DRKCOMMON_NO_SIMD) endif() set_property(TARGET rkcommon PROPERTY POSITION_INDEPENDENT_CODE ON) ## Install library + targets ################################################## set_target_properties(${PROJECT_NAME} PROPERTIES VERSION ${PROJECT_VERSION} SOVERSION ${PROJECT_VERSION_MAJOR}) install(TARGETS ${PROJECT_NAME} EXPORT rkcommon_Exports LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} NAMELINK_SKIP # on Windows put the dlls into bin RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} # ... 
and the import lib into the devel package ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) install(EXPORT rkcommon_Exports DESTINATION ${CMAKE_INSTALL_LIBDIR}/cmake/rkcommon-${PROJECT_VERSION} NAMESPACE rkcommon:: ) install(TARGETS ${PROJECT_NAME} LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} NAMELINK_ONLY RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} ) ## Install headers ############################################################ install(DIRECTORY ${CMAKE_CURRENT_LIST_DIR} DESTINATION ${CMAKE_INSTALL_INCLUDEDIR} FILES_MATCHING PATTERN *.h PATTERN *.inl PATTERN *.hpp PATTERN *.ih ) ospray-rkcommon-538f8a2/rkcommon/array3D/000077500000000000000000000000001456117377200203635ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/array3D/Array3D.h000066400000000000000000000300361456117377200220030ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // ospray #include #include #include "../common.h" #include "../math/range.h" #include "for_each.h" namespace rkcommon { namespace array3D { /*! ABSTRACTION for a 3D array of data */ template struct Array3D { virtual ~Array3D() = default; /*! return size (ie, "dimensions") of volume */ virtual vec3i size() const = 0; /*! get cell value at given location, but ensure that location is actually a valid cell ID inside the volume (clamps to nearest cell in volume if 'where' is outside) */ virtual value_t get(const vec3i &where) const = 0; /*! get the range/interval of all cell values in the given begin/end region of the volume */ range_t getValueRange(const vec3i &begin, const vec3i &end) const { range_t v = get(begin); for_each(begin, end, [&](const vec3i &idx) { v.extend(get(idx)); }); return v; } /*! get value range over entire volume */ range_t getValueRange() const { return getValueRange(vec3i(0), size()); } /*! returns number of elements (as 64-bit int) across all dimensions */ virtual size_t numElements() const = 0; }; /*! implementation for an actual array3d that stores a 3D array of values */ template struct ActualArray3D : public Array3D { ActualArray3D(const vec3i &dims, void *externalMem = nullptr); ~ActualArray3D() override { if (valuesAreMine) delete[] value; } /*! return size (ie, "dimensions") of volume */ vec3i size() const override; /*! get cell value at location \warning 'where' MUST be a valid cell location */ value_t get(const vec3i &where) const override; /*! set cell value at location to given value \warning 'where' MUST be a valid cell location */ void set(const vec3i &where, const value_t &t); void clear(const value_t &t); /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override; /* compute the (1D) linear array index for a (3D) grid coordinate */ size_t indexOf(const vec3i &pos) const { return pos.x + size_t(dims.x) * (pos.y + size_t(dims.y) * pos.z); } const vec3i dims; value_t *value; // bool that specified whether it was us that alloc'ed this mem, // and thus, whether we should free it upon termination. bool valuesAreMine; }; /*! shifts another array3d by a given amount */ template struct IndexShiftedArray3D : public Array3D { IndexShiftedArray3D(std::shared_ptr> _actual, const vec3i &_shift) : actual(_actual), shift(_shift) { } /*! return size (ie, "dimensions") of volume */ vec3i size() const override { return actual->size(); } /*! 
get cell value at location \warning 'where' MUST be a valid cell location */ value_t get(const vec3i &where) const override { return actual->get((where + size() + shift) % size()); } /*! set cell value at location to given value \warning 'where' MUST be a valid cell location */ void set(const vec3i &, const value_t &) { throw std::runtime_error("cannot 'set' in a IndexShiftArray3D"); } /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override { return actual->numElements(); } const vec3i shift; const std::shared_ptr> actual; }; /*! implemnetaiton of a wrapper class that makes an actual array3d of one type look like that of another type */ template struct Array3DAccessor : public Array3D { Array3DAccessor(std::shared_ptr> actual); /*! return size (ie, "dimensions") of volume */ vec3i size() const override; /*! get cell value at location \warning 'where' MUST be a valid cell location */ out_t get(const vec3i &where) const override; /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override; private: //! the actual 3D array we're wrapping around const std::shared_ptr> actual; }; /*! wrapper class that generates an artificially larger data set by simply repeating the given input */ template struct Array3DRepeater : public Array3D { Array3DRepeater(const std::shared_ptr> &actual, const vec3i &repeatedSize); /*! return size (ie, "dimensions") of volume */ vec3i size() const override; /*! get cell value at location \warning 'where' MUST be a valid cell location */ T get(const vec3i &where) const override; /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override; const vec3i repeatedSize; const std::shared_ptr> actual; }; /*! implements a sub-set of another array3d */ template struct SubBoxArray3D : public Array3D { SubBoxArray3D(const std::shared_ptr> &actual, const box3i &clipBox) : clipBox(clipBox), actual(actual) { assert(actual); assert(clipBox.upper.x <= actual->size().x); assert(clipBox.upper.y <= actual->size().y); assert(clipBox.upper.z <= actual->size().z); } /*! return size (ie, "dimensions") of volume */ vec3i size() const override { return clipBox.size(); } /*! get cell value at location \warning 'where' MUST be a valid cell location */ value_t get(const vec3i &where) const override { return actual->get(where + clipBox.lower); } /*! set cell value at location to given value \warning 'where' MUST be a valid cell location */ void set(const vec3i &, const value_t &) { throw std::runtime_error("cannot 'set' in a SubBoxArray3D"); } /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override { vec3i dims = clipBox.size(); return size_t(dims.x) * size_t(dims.y) * size_t(dims.z); } const box3i clipBox; const std::shared_ptr> actual; }; /*! implements a array3d that's composed of multiple individual slices */ template struct MultiSliceArray3D : public Array3D { MultiSliceArray3D( const std::vector>> &slice) : slice(slice) { } /*! return size (ie, "dimensions") of volume */ vec3i size() const override { return vec3i(slice[0]->size().x, slice[0]->size().y, slice.size()); } /*! get cell value at location \warning 'where' MUST be a valid cell location */ value_t get(const vec3i &where) const override { return slice[clamp(where.z, 0, (int)slice.size() - 1)]->get( vec3i(where.x, where.y, 0)); } /*! 
set cell value at location to given value \warning 'where' MUST be a valid cell location */ void set(const vec3i &, const value_t &) { throw std::runtime_error("cannot 'set' in a MultiSliceArray3D"); } /*! returns number of elements (as 64-bit int) across all dimensions */ size_t numElements() const override { return slice[0]->numElements() * slice.size(); } const std::vector>> slice; }; #ifndef _WIN32 /*! load raw file with given dimensions. the 'type' of the raw file (uint8,float,...) is given through the function's template parameter */ template std::shared_ptr> RKCOMMON_INTERFACE loadRAW(const std::string &fileName, const vec3i &dims); /*! load raw file with given dimensions. the 'type' of the raw file (uint8,float,...) is given through the function's template parameter */ template std::shared_ptr> RKCOMMON_INTERFACE mmapRAW(const std::string &fileName, const vec3i &dims); #endif // Inlined definitions //////////////////////////////////////////////////// // ActualArray3D // template inline vec3i ActualArray3D::size() const { return dims; } template inline T ActualArray3D::get(const vec3i &_where) const { assert(value != nullptr); const vec3i where = max(vec3i(0), min(_where, dims - vec3i(1))); size_t index = where.x + size_t(dims.x) * (where.y + size_t(dims.y) * (where.z)); assert(value); assert(index < numElements()); const T v = value[index]; return v; } template inline size_t ActualArray3D::numElements() const { return size_t(dims.x) * size_t(dims.y) * size_t(dims.z); } template inline ActualArray3D::ActualArray3D(const vec3i &dims, void *externalMem) : dims(dims), value((T *)externalMem), valuesAreMine(externalMem == nullptr) { try { if (!value) { const size_t numVoxels = longProduct(dims); value = new T[numVoxels]; } } catch (const std::bad_alloc &) { std::stringstream ss; ss << "could not allocate memory for Array3D of dimensions " << dims << " (in Array3D::Array3D())"; throw std::runtime_error(ss.str()); } } template inline void ActualArray3D::set(const vec3i &where, const T &t) { value[longIndex(where, size())] = t; } template inline void ActualArray3D::clear(const T &t) { for_each(size(), [&](const vec3i &idx) { set(idx, t); }); } // Array3DAccessor // template inline Array3DAccessor::Array3DAccessor( std::shared_ptr> actual) : actual(actual) { } template inline vec3i Array3DAccessor::size() const { return actual->size(); } template inline out_t Array3DAccessor::get(const vec3i &where) const { return (out_t)actual->get(where); } template inline size_t Array3DAccessor::numElements() const { assert(actual); return actual->numElements(); } // Array3DRepeater // template inline Array3DRepeater::Array3DRepeater( const std::shared_ptr> &actual, const vec3i &repeatedSize) : repeatedSize(repeatedSize), actual(actual) { } template inline vec3i Array3DRepeater::size() const { return repeatedSize; } template inline T Array3DRepeater::get(const vec3i &_where) const { vec3i where(_where.x % repeatedSize.x, _where.y % repeatedSize.y, _where.z % repeatedSize.z); if ((_where.x / repeatedSize.x) % 2) where.x = repeatedSize.x - 1 - where.x; if ((_where.y / repeatedSize.y) % 2) where.y = repeatedSize.y - 1 - where.y; if ((_where.z / repeatedSize.z) % 2) where.z = repeatedSize.z - 1 - where.z; return actual->get(where); } template inline size_t Array3DRepeater::numElements() const { return size_t(repeatedSize.x) * size_t(repeatedSize.y) * size_t(repeatedSize.z); } } // namespace array3D } // namespace rkcommon 
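// ---------------------------------------------------------------------------
// Hedged usage sketch (illustration only, based on the interfaces declared
// above): shows how an ActualArray3D<float> is filled and queried together
// with the array3D::for_each() helper from "for_each.h". The function name
// exampleGradientRange() is a hypothetical addition for documentation
// purposes and is not part of the rkcommon API.
// ---------------------------------------------------------------------------
namespace rkcommon {
  namespace array3D {

    inline range_t<float> exampleGradientRange(const vec3i &dims)
    {
      // allocates dims.x * dims.y * dims.z floats, owned by the array
      ActualArray3D<float> grid(dims);

      // fill with a simple gradient: value = x + y + z
      for_each(dims, [&](const vec3i &idx) {
        grid.set(idx, float(idx.x + idx.y + idx.z));
      });

      // Array3D::get() clamps out-of-volume coordinates to the nearest cell,
      // so reading at 'dims' yields the same value as at dims - vec3i(1)
      const float corner = grid.get(dims);
      (void)corner;

      // min/max over all cell values in the volume
      return grid.getValueRange();
    }

  } // namespace array3D
} // namespace rkcommon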
ospray-rkcommon-538f8a2/rkcommon/array3D/for_each.h000066400000000000000000000043131456117377200223030ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../math/box.h" /*! \file array3D/for_each Helper templates to do 3D iterations via lambda functions */ namespace rkcommon { namespace array3D { using namespace rkcommon::math; /*! compute - in 64 bit - the number of voxels in a vec3i */ inline size_t longProduct(const vec3i &dims) { return dims.x * size_t(dims.y) * dims.z; } /*! compute - in 64 bit - the linear array index of vec3i index in a vec3i sized array */ inline size_t longIndex(const vec3i &idx, const vec3i &dims) { return idx.x + size_t(dims.x) * (idx.y + size_t(dims.y) * idx.z); } /*! reverse mapping form a cell index to the cell coordinates, for a given volume size 'dims' */ inline vec3i coordsOf(const size_t idx, const vec3i &dims) { return vec3i( idx % dims.x, (idx / dims.x) % dims.y, (idx / dims.x) / dims.y); } /*! iterate through all indices in [lower,upper), EXCLUSING the 'upper' value */ template inline void for_each(const vec3i &lower, const vec3i &upper, Functor &&functor) { for (int iz = lower.z; iz < upper.z; iz++) for (int iy = lower.y; iy < upper.y; iy++) for (int ix = lower.x; ix < upper.x; ix++) functor(vec3i(ix, iy, iz)); } /*! a template that calls the given functor (typically a lambda) for every vec3i(ix,iy,iz) with 0<=ixsize(),[&](const vec3i &idx){ doSomeThing(volume,index); }); */ template inline void for_each(const vec3i &size, Functor &&functor) { for_each({0, 0, 0}, size, std::forward(functor)); } /*! iterate through all indices in [lower,upper), EXCLUSING the 'upper' value */ template inline void for_each(const box3i &coords, Functor &&functor) { for_each(coords.lower, coords.upper, std::forward(functor)); } } // namespace array3D } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/common.cpp000066400000000000000000000053531456117377200210600ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "common.h" #include "os/library.h" #include namespace rkcommon { void removeArgs(int &ac, const char **&av, int where, int howMany) { for (int i = where + howMany; i < ac; i++) av[i - howMany] = av[i]; ac -= howMany; } void loadLibrary(const void *anchorAddress, const std::string &name, const std::vector &version) { LibraryRepository::getInstance()->add(anchorAddress, name, version); } void unloadLibrary(const std::string &name) { LibraryRepository::getInstance()->remove(name); } void *getSymbol(const std::string &name) { return LibraryRepository::getInstance()->getSymbol(name); } #ifdef _WIN32 #define osp_snprintf sprintf_s #else #define osp_snprintf snprintf #endif std::string prettyDouble(double val) { const double absVal = std::abs(val); char result[1000]; if (absVal >= 1e+15f) osp_snprintf(result, 1000, "%.1f%c", val / 1e18f, 'E'); else if (absVal >= 1e+15f) osp_snprintf(result, 1000, "%.1f%c", val / 1e15f, 'P'); else if (absVal >= 1e+12f) osp_snprintf(result, 1000, "%.1f%c", val / 1e12f, 'T'); else if (absVal >= 1e+09f) osp_snprintf(result, 1000, "%.1f%c", val / 1e09f, 'G'); else if (absVal >= 1e+06f) osp_snprintf(result, 1000, "%.1f%c", val / 1e06f, 'M'); else if (absVal >= 1e+03f) osp_snprintf(result, 1000, "%.1f%c", val / 1e03f, 'k'); else if (absVal <= 1e-12f) osp_snprintf(result, 1000, "%.1f%c", val * 1e15f, 'f'); else if (absVal <= 1e-09f) osp_snprintf(result, 1000, "%.1f%c", val * 1e12f, 'p'); else if (absVal <= 
1e-06f) osp_snprintf(result, 1000, "%.1f%c", val * 1e09f, 'n'); else if (absVal <= 1e-03f) osp_snprintf(result, 1000, "%.1f%c", val * 1e06f, 'u'); else if (absVal <= 1e-00f) osp_snprintf(result, 1000, "%.1f%c", val * 1e03f, 'm'); else osp_snprintf(result, 1000, "%f", (float)val); return result; } std::string prettyNumber(size_t s) { const double val = s; char result[1000]; if (val >= 1e+15f) osp_snprintf(result, 1000, "%.1f%c", val / 1e18f, 'E'); else if (val >= 1e+15f) osp_snprintf(result, 1000, "%.1f%c", val / 1e15f, 'P'); else if (val >= 1e+12f) osp_snprintf(result, 1000, "%.1f%c", val / 1e12f, 'T'); else if (val >= 1e+09f) osp_snprintf(result, 1000, "%.1f%c", val / 1e09f, 'G'); else if (val >= 1e+06f) osp_snprintf(result, 1000, "%.1f%c", val / 1e06f, 'M'); else if (val >= 1e+03f) osp_snprintf(result, 1000, "%.1f%c", val / 1e03f, 'k'); else osp_snprintf(result, 1000, "%zu", s); return result; } #undef osp_snprintf } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/common.h000066400000000000000000000035571456117377200205310ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // rkcommon #include "platform.h" // std #include #include #ifdef _WIN32 // ----------- windows only ----------- typedef unsigned long long id_t; #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif #include #ifdef _M_X64 typedef long long ssize_t; #else typedef int ssize_t; #endif #else // ----------- NOT windows ----------- #include "unistd.h" #endif #ifdef _WIN32 #ifdef rkcommon_EXPORTS #define RKCOMMON_INTERFACE __declspec(dllexport) #else #define RKCOMMON_INTERFACE __declspec(dllimport) #endif #else #define RKCOMMON_INTERFACE #endif #ifdef _WIN32 #define __PRETTY_FUNCTION__ __FUNCSIG__ #endif namespace rkcommon { using byte_t = unsigned char; /*! remove specified num arguments from an ac/av arglist */ RKCOMMON_INTERFACE void removeArgs(int &ac, const char **&av, int where, int howMany); // anchorAddress = nullptr will disable anchored loads RKCOMMON_INTERFACE void loadLibrary( const void *anchorAddress, const std::string &name, const std::vector &version = {}); RKCOMMON_INTERFACE void unloadLibrary(const std::string &name); RKCOMMON_INTERFACE void *getSymbol(const std::string &name); RKCOMMON_INTERFACE std::string prettyDouble(double x); RKCOMMON_INTERFACE std::string prettyNumber(size_t x); // NOTE(jda) - Implement make_unique() as it didn't show up until C++14... template inline std::unique_ptr make_unique(Args &&... args) { return std::unique_ptr(new T(std::forward(args)...)); } template T *getDataSafe(std::vector &v) { return v.empty() ? 
nullptr : v.data(); } } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/containers/000077500000000000000000000000001456117377200212235ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/containers/AlignedVector.h000066400000000000000000000005131456117377200241210ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include "aligned_allocator.h" namespace rkcommon { namespace containers { template using AlignedVector = std::vector>; } // namespace containers } // namespace rkcommonospray-rkcommon-538f8a2/rkcommon/containers/FlatMap.h000066400000000000000000000170661456117377200227320ustar00rootroot00000000000000// Copyright 2018 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include namespace rkcommon { namespace containers { // A small map data structure with a similar interface to std::map<>, but // uses an underlying std::vector<> to store the key/value pairs instead of // a tree. This makes lookups O(n), but inserts are O(1) and it is sortable // like an array to enable things like std::binary_search() on either the // keys or values. template struct FlatMap { using item_t = std::pair; using storage_t = std::vector; using iterator_t = decltype(std::declval().begin()); using citerator_t = decltype(std::declval().cbegin()); using riterator_t = decltype(std::declval().rbegin()); using criterator_t = decltype(std::declval().crbegin()); FlatMap() = default; ~FlatMap() = default; // Key-based lookups // VALUE &at(const KEY &key); const VALUE &at(const KEY &key) const; VALUE &operator[](const KEY &key); const VALUE &operator[](const KEY &key) const; // Index-based lookups // item_t &at_index(size_t index); const item_t &at_index(size_t index) const; // Property queries // size_t size() const; size_t empty() const; bool contains(const KEY &key) const; // Storage mutation // void erase(const KEY &key); void clear(); void reserve(size_t size); // Iterators // iterator_t begin(); citerator_t begin() const; citerator_t cbegin() const; iterator_t end(); citerator_t end() const; citerator_t cend() const; riterator_t rbegin(); criterator_t rbegin() const; criterator_t crbegin() const; riterator_t rend(); criterator_t rend() const; criterator_t crend() const; private: // Helpers // iterator_t lookup(const KEY &key); citerator_t lookup(const KEY &key) const; // Data // storage_t values; }; // Inlined definitions //////////////////////////////////////////////////// template inline VALUE &FlatMap::at(const KEY &key) { auto itr = lookup(key); if (itr == values.end()) throw std::out_of_range("key wasn't found in FlatMap<>"); return itr->second; } template inline const VALUE &FlatMap::at(const KEY &key) const { auto itr = lookup(key); if (itr == values.end()) throw std::out_of_range("key wasn't found in FlatMap<>"); return itr->second; } template inline VALUE &FlatMap::operator[](const KEY &key) { auto itr = lookup(key); if (itr == values.end()) { values.push_back(std::make_pair(key, VALUE())); return values.back().second; } else { return itr->second; } } template inline const VALUE &FlatMap::operator[](const KEY &key) const { auto itr = lookup(key); if (itr == values.end()) { values.push_back(std::make_pair(key, VALUE())); return values.back().second; } else { return itr->second; } } template inline typename FlatMap::item_t &FlatMap::at_index( size_t index) { return values.at(index); } template inline const typename FlatMap::item_t & FlatMap::at_index(size_t index) 
const { return values.at(index); } template inline size_t FlatMap::size() const { return values.size(); } template inline size_t FlatMap::empty() const { return values.empty(); } template inline bool FlatMap::contains(const KEY &key) const { return lookup(key) != values.cend(); } template inline void FlatMap::erase(const KEY &key) { auto itr = std::stable_partition( values.begin(), values.end(), [&](const item_t &i) { return i.first != key; }); values.resize(std::distance(values.begin(), itr)); } template inline void FlatMap::clear() { values.clear(); } template inline void FlatMap::reserve(size_t size) { return values.reserve(size); } // Iterators // template inline typename FlatMap::iterator_t FlatMap::begin() { return values.begin(); } template inline typename FlatMap::citerator_t FlatMap::begin() const { return cbegin(); } template inline typename FlatMap::citerator_t FlatMap::cbegin() const { return values.cbegin(); } template inline typename FlatMap::iterator_t FlatMap::end() { return values.end(); } template inline typename FlatMap::citerator_t FlatMap::end() const { return cend(); } template inline typename FlatMap::citerator_t FlatMap::cend() const { return values.cend(); } template inline typename FlatMap::riterator_t FlatMap::rbegin() { return values.rbegin(); } template inline typename FlatMap::criterator_t FlatMap::rbegin() const { return crbegin(); } template inline typename FlatMap::criterator_t FlatMap::crbegin() const { return values.crbegin(); } template inline typename FlatMap::riterator_t FlatMap::rend() { return values.rend(); } template inline typename FlatMap::criterator_t FlatMap::rend() const { return crend(); } template inline typename FlatMap::criterator_t FlatMap::crend() const { return values.crend(); } // Helper functions // template inline typename FlatMap::iterator_t FlatMap::lookup( const KEY &key) { return std::find_if(values.begin(), values.end(), [&](item_t &item) { return item.first == key; }); } template inline typename FlatMap::citerator_t FlatMap::lookup(const KEY &key) const { return std::find_if( values.cbegin(), values.cend(), [&](const item_t &item) { return item.first == key; }); } } // namespace containers } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/containers/TransactionalBuffer.h000066400000000000000000000034321456117377200253320ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include namespace rkcommon { namespace containers { template struct TransactionalBuffer { TransactionalBuffer() = default; // Insert into the buffer (producer) void push_back(const T &); void push_back(T &&); // Take all contents of the buffer (consumer) std::vector consume(); size_t size() const; bool empty() const; private: // Data members // std::vector buffer; mutable std::mutex bufferMutex; // NOTE(jda) - Marked mutable so 'const' // methods can take the lock... 
}; // Inlined members //////////////////////////////////////////////////////// template inline void TransactionalBuffer::push_back(const T &v) { std::lock_guard lock(bufferMutex); buffer.push_back(v); } template inline void TransactionalBuffer::push_back(T &&v) { std::lock_guard lock(bufferMutex); buffer.push_back(std::forward(v)); } template inline std::vector TransactionalBuffer::consume() { std::lock_guard lock(bufferMutex); return std::move(buffer); } template inline size_t TransactionalBuffer::size() const { std::lock_guard lock(bufferMutex); return buffer.size(); } template inline bool TransactionalBuffer::empty() const { std::lock_guard lock(bufferMutex); return buffer.empty(); } } // namespace containers } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/containers/aligned_allocator.h000066400000000000000000000116141456117377200250420ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include // Required for size_t and ptrdiff_t and nullptr #include // Required for placement new and std::bad_alloc #include // Required for std::length_error #include "../memory/malloc.h" namespace rkcommon { namespace containers { // NOTE(jda) - aligned_allocator implementation loosely based off of Stephen // T. Lavavej's "Mallocator" example: // // https://blogs.msdn.microsoft.com/vcblog/2008/08/28/the-mallocator/ #define OSPRAY_DEFAULT_ALIGNMENT 64 template struct aligned_allocator { // Compile-time info // using pointer = T *; using const_pointer = const T *; using reference = T &; using const_reference = const T &; using value_type = T; using size_type = size_t; using difference_type = ptrdiff_t; template struct rebind { using other = aligned_allocator; }; // Implementation // aligned_allocator() = default; aligned_allocator(const aligned_allocator &) = default; ~aligned_allocator() = default; aligned_allocator &operator=(const aligned_allocator &) = delete; template aligned_allocator(const aligned_allocator &); template aligned_allocator &operator=(const aligned_allocator &); T *address(T &r) const; const T *address(const T &s) const; size_t max_size() const; bool operator!=(const aligned_allocator &other) const; void construct(T *const p, const T &t) const; void destroy(T *const p) const; // Returns true if and only if storage allocated from *this // can be deallocated from other, and vice versa. // Always returns true for stateless allocators. bool operator==(const aligned_allocator &) const; // The following will be different for each allocator. T *allocate(const size_t n) const; void deallocate(T *const p, const size_t n) const; template T *allocate(const size_t n, const U * /* const hint */) const; }; // Inlined member definitions ///////////////////////////////////////////// template template aligned_allocator::aligned_allocator(const aligned_allocator &) { } template template aligned_allocator &aligned_allocator::operator=( const aligned_allocator &) { } template inline T *aligned_allocator::address(T &r) const { return &r; } template inline const T *aligned_allocator::address(const T &s) const { return &s; } template inline size_t aligned_allocator::max_size() const { // The following has been carefully written to be independent of // the definition of size_t and to avoid signed/unsigned warnings. 
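      // In plain terms: this evaluates to the largest size_t value (SIZE_MAX)
      // divided by sizeof(T), i.e. the maximum element count that could ever
      // be requested from this allocator.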
return (static_cast(0) - static_cast(1)) / sizeof(T); } template inline bool aligned_allocator::operator!=( const aligned_allocator &other) const { return !(*this == other); } template inline void aligned_allocator::construct(T *const p, const T &t) const { void *const pv = static_cast(p); new (pv) T(t); } template inline bool aligned_allocator::operator==( const aligned_allocator &) const { return true; } template inline T *aligned_allocator::allocate(const size_t n) const { if (n == 0) return nullptr; if (n > max_size()) { throw std::length_error( "aligned_allocator::allocate() – Integer overflow."); } void *const pv = memory::alignedMalloc(n * sizeof(T), A); if (pv == nullptr) throw std::bad_alloc(); return static_cast(pv); } template inline void aligned_allocator::deallocate(T *const p, const size_t) const { memory::alignedFree(p); } template template inline T *aligned_allocator::allocate(const size_t n, const U *) const { return allocate(n); } template inline void aligned_allocator::destroy(T *const p) const { p->~T(); } } // namespace containers } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/math/000077500000000000000000000000001456117377200200075ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/math/AffineSpace.h000066400000000000000000000237011456117377200223270ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "LinearSpace.h" #include "box.h" namespace rkcommon { namespace math { #define VectorT typename L::Vector #define ScalarT typename L::Vector::Scalar /////////////////////////////////////////////////////////////////////////// // Affine Space /////////////////////////////////////////////////////////////////////////// template struct AffineSpaceT { L l; /*< linear part of affine space */ VectorT p; /*< affine part of affine space */ ///////////////////////////////////////////////////////////////////////// // Constructors, Assignment, Cast, Copy Operations ///////////////////////////////////////////////////////////////////////// inline AffineSpaceT() = default; inline AffineSpaceT(const AffineSpaceT &other) { l = other.l; p = other.p; } inline AffineSpaceT(const L &other) { l = other; p = VectorT(zero); } inline AffineSpaceT &operator=(const AffineSpaceT &other) { l = other.l; p = other.p; return *this; } inline AffineSpaceT(const VectorT &vx, const VectorT &vy, const VectorT &vz, const VectorT &p) : l(vx, vy, vz), p(p) { } inline AffineSpaceT(const L &l, const VectorT &p) : l(l), p(p) {} template inline AffineSpaceT(const AffineSpaceT &s) : l(s.l), p(s.p) { } inline operator L*() { return static_cast(&l); } inline operator const L*() const { return static_cast(&l); } ///////////////////////////////////////////////////////////////////////// // Constants ///////////////////////////////////////////////////////////////////////// inline AffineSpaceT(ZeroTy) : l(zero), p(zero) {} inline AffineSpaceT(OneTy) : l(one), p(zero) {} /*! return matrix for scaling */ static inline AffineSpaceT scale(const VectorT &s) { return L::scale(s); } /*! return matrix for translation */ static inline AffineSpaceT translate(const VectorT &p) { return AffineSpaceT(one, p); } /*! return matrix for rotation, only in 2D */ static inline AffineSpaceT rotate(const ScalarT &r) { return L::rotate(r); } /*! return matrix for rotation around arbitrary point (2D) or axis (3D) */ static inline AffineSpaceT rotate(const VectorT &u, const ScalarT &r) { return L::rotate(u, r); } /*! 
return matrix for rotation quaternion, only in 3D */ static inline AffineSpaceT rotate(const QuaternionT &q) { return L(q); } /*! return matrix for rotation around arbitrary axis and point, only in 3D */ static inline AffineSpaceT rotate(const VectorT &p, const VectorT &u, const ScalarT &r) { return translate(+p) * rotate(u, r) * translate(-p); } /*! return matrix for rotation with quaternion around point, only in 3D */ static inline AffineSpaceT rotate(const VectorT &p, const QuaternionT &q) { return translate(+p) * L(q) * translate(-p); } /*! return matrix for looking at given point, only in 3D; right-handed * coordinate system */ static inline AffineSpaceT lookat(const VectorT &eye, const VectorT &point, const VectorT &up) { VectorT Z = normalize(point - eye); VectorT U = normalize(cross(Z, up)); VectorT V = cross(U, Z); return AffineSpaceT(L(U, V, Z), eye); } }; /////////////////////////////////////////////////////////////////////////// // Unary Operators /////////////////////////////////////////////////////////////////////////// template inline AffineSpaceT operator-(const AffineSpaceT &a) { return AffineSpaceT(-a.l, -a.p); } template inline AffineSpaceT operator+(const AffineSpaceT &a) { return AffineSpaceT(+a.l, +a.p); } template inline AffineSpaceT rcp(const AffineSpaceT &a) { L il = rcp(a.l); return AffineSpaceT(il, -(il * a.p)); } /////////////////////////////////////////////////////////////////////////// // Binary Operators /////////////////////////////////////////////////////////////////////////// template inline AffineSpaceT operator+(const AffineSpaceT &a, const AffineSpaceT &b) { return AffineSpaceT(a.l + b.l, a.p + b.p); } template inline AffineSpaceT operator-(const AffineSpaceT &a, const AffineSpaceT &b) { return AffineSpaceT(a.l - b.l, a.p - b.p); } template inline AffineSpaceT operator*(const ScalarT &a, const AffineSpaceT &b) { return AffineSpaceT(a * b.l, a * b.p); } template inline AffineSpaceT operator*(const AffineSpaceT &a, const AffineSpaceT &b) { return AffineSpaceT(a.l * b.l, a.l * b.p + a.p); } template inline AffineSpaceT operator/(const AffineSpaceT &a, const AffineSpaceT &b) { return a * rcp(b); } template inline AffineSpaceT operator/(const AffineSpaceT &a, const ScalarT &b) { return a * rcp(b); } template inline AffineSpaceT &operator*=(AffineSpaceT &a, const AffineSpaceT &b) { return a = a * b; } template inline AffineSpaceT &operator*=(AffineSpaceT &a, const ScalarT &b) { return a = a * b; } template inline AffineSpaceT &operator/=(AffineSpaceT &a, const AffineSpaceT &b) { return a = a / b; } template inline AffineSpaceT &operator/=(AffineSpaceT &a, const ScalarT &b) { return a = a / b; } template inline const VectorT xfmPoint(const AffineSpaceT &m, const VectorT &p) { return madd(VectorT(p.x), m.l.vx, madd(VectorT(p.y), m.l.vy, madd(VectorT(p.z), m.l.vz, m.p))); } template inline const VectorT xfmVector(const AffineSpaceT &m, const VectorT &v) { return xfmVector(m.l, v); } template inline const VectorT xfmNormal(const AffineSpaceT &m, const VectorT &n) { return xfmNormal(m.l, n); } template inline const box_t xfmBounds( const AffineSpaceT>> &m, const box_t &b) { box_t dst = empty; const vec_t p0(b.lower.x, b.lower.y, b.lower.z); dst.extend(xfmPoint(m, p0)); const vec_t p1(b.lower.x, b.lower.y, b.upper.z); dst.extend(xfmPoint(m, p1)); const vec_t p2(b.lower.x, b.upper.y, b.lower.z); dst.extend(xfmPoint(m, p2)); const vec_t p3(b.lower.x, b.upper.y, b.upper.z); dst.extend(xfmPoint(m, p3)); const vec_t p4(b.upper.x, b.lower.y, b.lower.z); 
dst.extend(xfmPoint(m, p4)); const vec_t p5(b.upper.x, b.lower.y, b.upper.z); dst.extend(xfmPoint(m, p5)); const vec_t p6(b.upper.x, b.upper.y, b.lower.z); dst.extend(xfmPoint(m, p6)); const vec_t p7(b.upper.x, b.upper.y, b.upper.z); dst.extend(xfmPoint(m, p7)); return dst; } /////////////////////////////////////////////////////////////////////////// /// Comparison Operators /////////////////////////////////////////////////////////////////////////// template inline bool operator==(const AffineSpaceT &a, const AffineSpaceT &b) { return a.l == b.l && a.p == b.p; } template inline bool operator!=(const AffineSpaceT &a, const AffineSpaceT &b) { return a.l != b.l || a.p != b.p; } /////////////////////////////////////////////////////////////////////////// // Output Operators /////////////////////////////////////////////////////////////////////////// template inline std::ostream &operator<<(std::ostream &cout, const AffineSpaceT &m) { return cout << "{ l = " << m.l << ", p = " << m.p << " }"; } /////////////////////////////////////////////////////////////////////////// // Type Aliases /////////////////////////////////////////////////////////////////////////// using AffineSpace2f = AffineSpaceT; using AffineSpace3f = AffineSpaceT; using AffineSpace3fa = AffineSpaceT; using OrthonormalSpace3f = AffineSpaceT; using affine2f = AffineSpace2f; using affine3f = AffineSpace3f; /////////////////////////////////////////////////////////////////////////// /*! Template Specialization for 2D: return matrix for rotation around point * (rotation around arbitrarty vector is not meaningful in 2D) */ template <> inline AffineSpace2f AffineSpace2f::rotate(const vec2f &p, const float &r) { return translate(+p) * AffineSpace2f(LinearSpace2f::rotate(r)) * translate(-p); } #undef VectorT #undef ScalarT } // namespace math } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/math/AffineSpace.ih000066400000000000000000000213611456117377200225000ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "LinearSpace.ih" #ifndef ISPC namespace ispc { #endif // A Affine vector space; i.e., a Linear Space with a translation struct AffineSpace3f { LinearSpace3f l; vec3f p; #ifndef ISPC AffineSpace3f() = default; AffineSpace3f(const float v) : l(v), p(v) {} #endif }; // short-hand name for AffineSpace3f typedef AffineSpace3f affine3f; // create a new affine space from given basis vectors v{x,y,z} and translation p inline ISPC_UNIFORM AffineSpace3f make_AffineSpace3f( const ISPC_UNIFORM vec3f vx, const ISPC_UNIFORM vec3f vy, const ISPC_UNIFORM vec3f vz, const ISPC_UNIFORM vec3f p) { ISPC_UNIFORM AffineSpace3f xfm; xfm.l.vx = vx; xfm.l.vy = vy; xfm.l.vz = vz; xfm.p = p; return xfm; } inline ISPC_UNIFORM AffineSpace3f make_AffineSpace3f_identity() { return make_AffineSpace3f(make_vec3f(1.f, 0.f, 0.f), make_vec3f(0.f, 1.f, 0.f), make_vec3f(0.f, 0.f, 1.f), make_vec3f(0.f)); } inline ISPC_UNIFORM AffineSpace3f make_AffineSpace3f( const ISPC_UNIFORM LinearSpace3f l) { ISPC_UNIFORM AffineSpace3f xfm; xfm.l = l; xfm.p = make_vec3f(0, 0, 0); return xfm; } #define __define_transform(univary_r, univary_a, univary_v) \ /* apply given affine transformation to given _point_ v */ \ inline univary_r vec3f xfmPoint( \ const univary_a AffineSpace3f a, const univary_v vec3f v) \ { \ return a.p + xfmVector(a.l, v); \ } \ /* apply affine transform to given _vector_ v, i.e., _without_ the \ * translation */ \ inline univary_r vec3f xfmVector( \ const univary_a AffineSpace3f a, 
const univary_v vec3f v) \ { \ return xfmVector(a.l, v); \ } #ifdef ISPC __define_transform(uniform, uniform, uniform); __define_transform(varying, uniform, varying); __define_transform(varying, varying, varying); #else __define_transform(, , ); #endif #undef __define_transform #define __define_other(univary) \ inline univary AffineSpace3f make_AffineSpace3f( \ const univary LinearSpace3f l, const univary vec3f p) \ { \ univary AffineSpace3f xfm; \ xfm.l = l; \ xfm.p = p; \ return xfm; \ } \ inline univary AffineSpace3f operator+( \ const univary AffineSpace3f a, const univary AffineSpace3f b) \ { \ return make_AffineSpace3f(a.l + b.l, a.p + b.p); \ } \ inline univary AffineSpace3f operator-( \ const univary AffineSpace3f a, const univary AffineSpace3f b) \ { \ return make_AffineSpace3f(a.l - b.l, a.p - b.p); \ } \ inline univary AffineSpace3f operator*( \ const univary float a, const univary AffineSpace3f b) \ { \ return make_AffineSpace3f(a * b.l, a * b.p); \ } \ inline univary AffineSpace3f operator*( \ const univary AffineSpace3f a, const univary float b) \ { \ return make_AffineSpace3f(a.l * b, a.p * b); \ } \ inline univary AffineSpace3f operator*( \ const univary AffineSpace3f a, const univary AffineSpace3f b) \ { \ return make_AffineSpace3f(a.l * b.l, a.l * b.p + a.p); \ } \ inline univary AffineSpace3f neg(const univary AffineSpace3f a) \ { \ return make_AffineSpace3f(neg(a.l), neg(a.p)); \ } \ inline univary AffineSpace3f rcp(const univary AffineSpace3f a) \ { \ univary LinearSpace3f il = rcp(a.l); \ return make_AffineSpace3f(il, neg(il * a.p)); \ } #ifdef ISPC __define_other(uniform); __define_other(varying); #else __define_other(); #endif #undef __define_other //////////////////////////////////////////////////////////////////////////////// // Rudimentary 2D affine space, used for texture coordinate transformations //////////////////////////////////////////////////////////////////////////////// // A 2D Affine vector space; i.e., a Linear Space with a translation struct AffineSpace2f { LinearSpace2f l; vec2f p; #ifndef ISPC AffineSpace2f() = default; AffineSpace2f(const float v) : l(v), p(v) {} #endif }; // short-hand name for AffineSpace2f typedef AffineSpace2f affine2f; // create a new affine space from given basis vectors v{x,y,z} and translation p inline ISPC_UNIFORM AffineSpace2f make_AffineSpace2f( const ISPC_UNIFORM LinearSpace2f l, const ISPC_UNIFORM vec2f p) { ISPC_UNIFORM AffineSpace2f xfm; xfm.l = l; xfm.p = p; return xfm; } inline ISPC_UNIFORM AffineSpace2f make_AffineSpace2f( const ISPC_UNIFORM vec2f vx, const ISPC_UNIFORM vec2f vy, const ISPC_UNIFORM vec2f p) { return make_AffineSpace2f(make_LinearSpace2f(vx, vy), p); } inline ISPC_UNIFORM AffineSpace2f make_AffineSpace2f_identity() { return make_AffineSpace2f( make_vec2f(1.f, 0.f), make_vec2f(0.f, 1.f), make_vec2f(0.f)); } #define __define_transform2f(univary_r, univary_a, univary_v) \ inline univary_r vec2f operator*( \ const univary_a AffineSpace2f a, const univary_v vec2f v) \ { \ return a.p + xfmVector(a.l, v); \ } \ /* apply given affine transformation to given _point_ v */ \ inline univary_r vec2f xfmPoint( \ const univary_a AffineSpace2f a, const univary_v vec2f v) \ { \ return a.p + xfmVector(a.l, v); \ } \ /* apply affine transform to given _vector_ v, i.e., _without_ the \ * translation */ \ inline univary_r vec2f xfmVector( \ const univary_a AffineSpace2f a, const univary_v vec2f v) \ { \ return xfmVector(a.l, v); \ } #ifdef ISPC __define_transform2f(uniform, uniform, uniform); 
__define_transform2f(varying, uniform, varying); __define_transform2f(varying, varying, varying); #else __define_transform2f(, , ); #endif #undef __define_transform2f #ifndef ISPC } #endif ospray-rkcommon-538f8a2/rkcommon/math/LinearSpace.h000066400000000000000000000431661456117377200223600ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "Quaternion.h" #include "vec.h" namespace rkcommon { namespace math { /////////////////////////////////////////////////////////////////////////// // 2D Linear Transform (2x2 Matrix) /////////////////////////////////////////////////////////////////////////// template struct LinearSpace2 { using Vector = T; using Scalar = typename T::scalar_t; /*! default matrix constructor */ inline LinearSpace2() = default; inline LinearSpace2(const LinearSpace2 &other) { vx = other.vx; vy = other.vy; } inline LinearSpace2 &operator=(const LinearSpace2 &other) { vx = other.vx; vy = other.vy; return *this; } template inline LinearSpace2(const LinearSpace2 &s) : vx(s.vx), vy(s.vy) { } /*! matrix construction from column vectors */ inline LinearSpace2(const Vector &vx, const Vector &vy) : vx(vx), vy(vy) { } /*! matrix construction from row mayor data */ inline LinearSpace2(const Scalar &m00, const Scalar &m01, const Scalar &m10, const Scalar &m11) : vx(m00, m10), vy(m01, m11) { } /*! compute the determinant of the matrix */ inline const Scalar det() const { return vx.x * vy.y - vx.y * vy.x; } /*! compute adjoint matrix */ inline const LinearSpace2 adjoint() const { return LinearSpace2(vy.y, -vy.x, -vx.y, vx.x); } /*! compute inverse matrix */ inline const LinearSpace2 inverse() const { return adjoint() / det(); } /*! compute transposed matrix */ inline const LinearSpace2 transposed() const { return LinearSpace2(vx.x, vx.y, vy.x, vy.y); } /*! returns first row of matrix */ inline const Vector row0() const { return Vector(vx.x, vy.x); } /*! returns second row of matrix */ inline const Vector row1() const { return Vector(vx.y, vy.y); } ///////////////////////////////////////////////////////////////////////// /// Constants ///////////////////////////////////////////////////////////////////////// inline LinearSpace2(ZeroTy) : vx(zero), vy(zero) {} inline LinearSpace2(OneTy) : vx(one, zero), vy(zero, one) {} /*! return matrix for scaling */ static inline LinearSpace2 scale(const Vector &s) { return LinearSpace2(s.x, 0, 0, s.y); } /*! return matrix for rotation */ static inline LinearSpace2 rotate(const Scalar &r) { Scalar s = sin(r), c = cos(r); return LinearSpace2(c, -s, s, c); } /*! return closest orthogonal matrix (i.e. a general rotation including * reflection) */ LinearSpace2 orthogonal() const { LinearSpace2 m = *this; // mirrored? Scalar mirror(one); if (m.det() < Scalar(zero)) { m.vx = -m.vx; mirror = -mirror; } // rotation for (int i = 0; i < 99; i++) { const LinearSpace2 m_next = 0.5 * (m + m.transposed().inverse()); const LinearSpace2 d = m_next - m; m = m_next; // norm^2 of difference small enough? if (max(dot(d.vx, d.vx), dot(d.vy, d.vy)) < 1e-8) break; } // rotation * mirror_x return LinearSpace2(mirror * m.vx, m.vy); } inline operator Scalar*() { return static_cast(&vx); } inline operator const Scalar*() const { return static_cast(&vx); } public: /*! 
the column vectors of the matrix */ Vector vx, vy; }; /////////////////////////////////////////////////////////////////////////// // Unary Operators /////////////////////////////////////////////////////////////////////////// template inline LinearSpace2 operator-(const LinearSpace2 &a) { return LinearSpace2(-a.vx, -a.vy); } template inline LinearSpace2 operator+(const LinearSpace2 &a) { return LinearSpace2(+a.vx, +a.vy); } template inline LinearSpace2 rcp(const LinearSpace2 &a) { return a.inverse(); } /////////////////////////////////////////////////////////////////////////// // Binary Operators /////////////////////////////////////////////////////////////////////////// template inline LinearSpace2 operator+(const LinearSpace2 &a, const LinearSpace2 &b) { return LinearSpace2(a.vx + b.vx, a.vy + b.vy); } template inline LinearSpace2 operator-(const LinearSpace2 &a, const LinearSpace2 &b) { return LinearSpace2(a.vx - b.vx, a.vy - b.vy); } template inline LinearSpace2 operator*(const typename T::Scalar &a, const LinearSpace2 &b) { return LinearSpace2(a * b.vx, a * b.vy); } template inline T operator*(const LinearSpace2 &a, const T &b) { return b.x * a.vx + b.y * a.vy; } template inline LinearSpace2 operator*(const LinearSpace2 &a, const LinearSpace2 &b) { return LinearSpace2(a * b.vx, a * b.vy); } template inline LinearSpace2 operator/(const LinearSpace2 &a, const typename T::Scalar &b) { return LinearSpace2(a.vx / b, a.vy / b); } template inline LinearSpace2 operator/(const LinearSpace2 &a, const LinearSpace2 &b) { return a * rcp(b); } template inline LinearSpace2 &operator*=(LinearSpace2 &a, const LinearSpace2 &b) { return a = a * b; } template inline LinearSpace2 &operator/=(LinearSpace2 &a, const LinearSpace2 &b) { return a = a / b; } /////////////////////////////////////////////////////////////////////////// /// Comparison Operators /////////////////////////////////////////////////////////////////////////// template inline bool operator==(const LinearSpace2 &a, const LinearSpace2 &b) { return a.vx == b.vx && a.vy == b.vy; } template inline bool operator!=(const LinearSpace2 &a, const LinearSpace2 &b) { return a.vx != b.vx || a.vy != b.vy; } /////////////////////////////////////////////////////////////////////////// /// Output Operators /////////////////////////////////////////////////////////////////////////// template static std::ostream &operator<<(std::ostream &cout, const LinearSpace2 &m) { return cout << "{ vx = " << m.vx << ", vy = " << m.vy << "}"; } /////////////////////////////////////////////////////////////////////////// /// 3D Linear Transform (3x3 Matrix) /////////////////////////////////////////////////////////////////////////// template struct LinearSpace3 { using Vector = T; using Scalar = typename T::scalar_t; /*! default matrix constructor */ inline LinearSpace3() = default; inline LinearSpace3(const LinearSpace3 &other) { vx = other.vx; vy = other.vy; vz = other.vz; } inline LinearSpace3 &operator=(const LinearSpace3 &other) { vx = other.vx; vy = other.vy; vz = other.vz; return *this; } template inline LinearSpace3(const LinearSpace3 &s) : vx(s.vx), vy(s.vy), vz(s.vz) { } /*! matrix construction from column vectors */ inline LinearSpace3(const Vector &vx, const Vector &vy, const Vector &vz) : vx(vx), vy(vy), vz(vz) { } /*! 
construction from quaternion */ inline LinearSpace3(const QuaternionT &q) : vx((q.r * q.r + q.i * q.i - q.j * q.j - q.k * q.k), Scalar(2.0) * (q.i * q.j + q.r * q.k), Scalar(2.0) * (q.i * q.k - q.r * q.j)), vy(Scalar(2.0) * (q.i * q.j - q.r * q.k), (q.r * q.r - q.i * q.i + q.j * q.j - q.k * q.k), Scalar(2.0) * (q.j * q.k + q.r * q.i)), vz(Scalar(2.0) * (q.i * q.k + q.r * q.j), Scalar(2.0) * (q.j * q.k - q.r * q.i), (q.r * q.r - q.i * q.i - q.j * q.j + q.k * q.k)) { } /*! matrix construction from row mayor data */ inline LinearSpace3(const Scalar &m00, const Scalar &m01, const Scalar &m02, const Scalar &m10, const Scalar &m11, const Scalar &m12, const Scalar &m20, const Scalar &m21, const Scalar &m22) : vx(m00, m10, m20), vy(m01, m11, m21), vz(m02, m12, m22) { } /*! compute the determinant of the matrix */ inline const Scalar det() const { return dot(vx, cross(vy, vz)); } /*! compute adjoint matrix */ inline const LinearSpace3 adjoint() const { return LinearSpace3(cross(vy, vz), cross(vz, vx), cross(vx, vy)) .transposed(); } /*! compute inverse matrix */ inline const LinearSpace3 inverse() const { return adjoint() / det(); } /*! compute transposed matrix */ inline const LinearSpace3 transposed() const { return LinearSpace3( vx.x, vx.y, vx.z, vy.x, vy.y, vy.z, vz.x, vz.y, vz.z); } /*! returns first row of matrix */ inline const Vector row0() const { return Vector(vx.x, vy.x, vz.x); } /*! returns second row of matrix */ inline const Vector row1() const { return Vector(vx.y, vy.y, vz.y); } /*! returns third row of matrix */ inline const Vector row2() const { return Vector(vx.z, vy.z, vz.z); } ///////////////////////////////////////////////////////////////////////// // Constants ///////////////////////////////////////////////////////////////////////// inline LinearSpace3(ZeroTy) : vx(zero), vy(zero), vz(zero) {} inline LinearSpace3(OneTy) : vx(one, zero, zero), vy(zero, one, zero), vz(zero, zero, one) { } /*! return matrix for scaling */ static inline LinearSpace3 scale(const Vector &s) { return LinearSpace3(s.x, 0, 0, 0, s.y, 0, 0, 0, s.z); } /*! return matrix for rotation around arbitrary axis */ static inline LinearSpace3 rotate(const Vector &_u, const Scalar &r) { Vector u = normalize(_u); Scalar s = sin(r), c = cos(r); return LinearSpace3(u.x * u.x + (1 - u.x * u.x) * c, u.x * u.y * (1 - c) - u.z * s, u.x * u.z * (1 - c) + u.y * s, u.x * u.y * (1 - c) + u.z * s, u.y * u.y + (1 - u.y * u.y) * c, u.y * u.z * (1 - c) - u.x * s, u.x * u.z * (1 - c) - u.y * s, u.y * u.z * (1 - c) + u.x * s, u.z * u.z + (1 - u.z * u.z) * c); } inline operator Scalar*() { return static_cast(&vx); } inline operator const Scalar*() const { return static_cast(&vx); } public: /*! the column vectors of the matrix */ Vector vx, vy, vz; }; /////////////////////////////////////////////////////////////////////////// // Unary Operators /////////////////////////////////////////////////////////////////////////// template inline LinearSpace3 operator-(const LinearSpace3 &a) { return LinearSpace3(-a.vx, -a.vy, -a.vz); } template inline LinearSpace3 operator+(const LinearSpace3 &a) { return LinearSpace3(+a.vx, +a.vy, +a.vz); } template inline LinearSpace3 rcp(const LinearSpace3 &a) { return a.inverse(); } /* constructs a coordinate frame from a normalized normal */ template inline LinearSpace3 frame(const T &N) { const T dx0 = cross(T(one, zero, zero), N); const T dx1 = cross(T(zero, one, zero), N); const T dx = normalize(dot(dx0, dx0) > dot(dx1, dx1) ? 
dx0 : dx1); const T dy = normalize(cross(N, dx)); return LinearSpace3(dx, dy, N); } /* constructs a coordinate frame from a normal and approximate up direction */ template inline LinearSpace3 frame(const T &N, const T &up) { if (abs(dot(up, N)) > 0.99f) return frame(N); // fallback in case N and up are very parallel const T dx = normalize(cross(up, N)); const T dy = normalize(cross(N, dx)); return LinearSpace3(dx, dy, N); } /* clamps linear space to range -1 to +1 */ template inline LinearSpace3 clamp(const LinearSpace3 &space) { return LinearSpace3(clamp(space.vx, T(-1.0f), T(1.0f)), clamp(space.vy, T(-1.0f), T(1.0f)), clamp(space.vz, T(-1.0f), T(1.0f))); } /////////////////////////////////////////////////////////////////////////// // Binary Operators /////////////////////////////////////////////////////////////////////////// template inline LinearSpace3 operator+(const LinearSpace3 &a, const LinearSpace3 &b) { return LinearSpace3(a.vx + b.vx, a.vy + b.vy, a.vz + b.vz); } template inline LinearSpace3 operator-(const LinearSpace3 &a, const LinearSpace3 &b) { return LinearSpace3(a.vx - b.vx, a.vy - b.vy, a.vz - b.vz); } template inline LinearSpace3 operator*(const typename T::Scalar &a, const LinearSpace3 &b) { return LinearSpace3(a * b.vx, a * b.vy, a * b.vz); } template inline T operator*(const LinearSpace3 &a, const T &b) { return b.x * a.vx + b.y * a.vy + b.z * a.vz; } template inline LinearSpace3 operator*(const LinearSpace3 &a, const LinearSpace3 &b) { return LinearSpace3(a * b.vx, a * b.vy, a * b.vz); } template inline LinearSpace3 operator/(const LinearSpace3 &a, const typename T::Scalar &b) { return LinearSpace3(a.vx / b, a.vy / b, a.vz / b); } template inline LinearSpace3 operator/(const LinearSpace3 &a, const LinearSpace3 &b) { return a * rcp(b); } template inline LinearSpace3 &operator*=(LinearSpace3 &a, const LinearSpace3 &b) { return a = a * b; } template inline LinearSpace3 &operator/=(LinearSpace3 &a, const LinearSpace3 &b) { return a = a / b; } template inline T xfmPoint(const LinearSpace3 &s, const T &a) { return madd(T(a.x), s.vx, madd(T(a.y), s.vy, T(a.z * s.vz))); } template inline T xfmVector(const LinearSpace3 &s, const T &a) { return madd(T(a.x), s.vx, madd(T(a.y), s.vy, T(a.z * s.vz))); } template inline T xfmNormal(const LinearSpace3 &s, const T &a) { return xfmVector(s.inverse().transposed(), a); } /////////////////////////////////////////////////////////////////////////// /// Comparison Operators /////////////////////////////////////////////////////////////////////////// template inline bool operator==(const LinearSpace3 &a, const LinearSpace3 &b) { return a.vx == b.vx && a.vy == b.vy && a.vz == b.vz; } template inline bool operator!=(const LinearSpace3 &a, const LinearSpace3 &b) { return a.vx != b.vx || a.vy != b.vy || a.vz != b.vz; } /////////////////////////////////////////////////////////////////////////// /// Output Operators /////////////////////////////////////////////////////////////////////////// template inline std::ostream &operator<<(std::ostream &cout, const LinearSpace3 &m) { return cout << "{ vx = " << m.vx << ", vy = " << m.vy << ", vz = " << m.vz << "}"; } /*! Shortcuts for common linear spaces. 
*/ using LinearSpace2f = LinearSpace2; using LinearSpace3f = LinearSpace3; using LinearSpace3fa = LinearSpace3; using linear2f = LinearSpace2f; using linear3f = LinearSpace3f; } // namespace math } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/math/LinearSpace.ih000066400000000000000000000303321456117377200225200ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "vec.ih" #ifndef ISPC namespace ispc { #endif // Linear vector space, or a linear transformation struct LinearSpace3f { vec3f vx; vec3f vy; vec3f vz; #ifndef ISPC LinearSpace3f() = default; LinearSpace3f(const float v) : vx(v), vy(v), vz(v) {} LinearSpace3f(const vec3f &vx, const vec3f &vy, const vec3f &vz) : vx(vx), vy(vy), vz(vz) {} #endif }; // short-hand name for LinearSpace3f typedef LinearSpace3f linear3f; //////////////////////////////////////////////////////////////////////////////// /// Constructors //////////////////////////////////////////////////////////////////////////////// #define __define_make_LinearSpace3f(univary) \ inline univary LinearSpace3f make_LinearSpace3f( \ const univary vec3f x, const univary vec3f y, const univary vec3f z) \ { \ univary LinearSpace3f l; \ l.vx = x; \ l.vy = y; \ l.vz = z; \ return l; \ } #ifdef ISPC __define_make_LinearSpace3f(uniform); __define_make_LinearSpace3f(varying); #else __define_make_LinearSpace3f(); #endif #undef __define_make_LinearSpace3f inline ISPC_UNIFORM LinearSpace3f make_LinearSpace3f_identity() { return make_LinearSpace3f(make_vec3f(1.f, 0.f, 0.f), make_vec3f(0.f, 1.f, 0.f), make_vec3f(0.f, 0.f, 1.f)); } inline ISPC_VARYING LinearSpace3f make_LinearSpace3f_varying_identity() { return make_LinearSpace3f(make_vec3f(1.f, 0.f, 0.f), make_vec3f(0.f, 1.f, 0.f), make_vec3f(0.f, 0.f, 1.f)); } //////////////////////////////////////////////////////////////////////////////// // Transformation //////////////////////////////////////////////////////////////////////////////// #define __define_transform(univary_r, univary_l, univary_v) \ inline univary_r vec3f operator*( \ const univary_l LinearSpace3f l, const univary_v vec3f v) \ { \ return v.x * l.vx + v.y * l.vy + v.z * l.vz; \ } \ inline univary_r vec3f xfmVector( \ const univary_l LinearSpace3f l, const univary_v vec3f v) \ { \ return v.x * l.vx + v.y * l.vy + v.z * l.vz; \ } #ifdef ISPC __define_transform(uniform, uniform, uniform); __define_transform(varying, uniform, varying); __define_transform(varying, varying, varying); #else __define_transform(, , ); #endif #undef __define_transform //////////////////////////////////////////////////////////////////////////////// /// Binary Operators //////////////////////////////////////////////////////////////////////////////// #define __define_binary_ops(univary) \ inline univary LinearSpace3f operator+( \ const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return make_LinearSpace3f(a.vx + b.vx, a.vy + b.vy, a.vz + b.vz); \ } \ inline univary LinearSpace3f operator-( \ const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return make_LinearSpace3f(a.vx - b.vx, a.vy - b.vy, a.vz - b.vz); \ } \ inline univary LinearSpace3f operator*( \ const univary float a, const univary LinearSpace3f b) \ { \ return make_LinearSpace3f((a * b.vx), (a * b.vy), (a * b.vz)); \ } \ inline univary LinearSpace3f operator*( \ const univary LinearSpace3f a, const univary float b) \ { \ return make_LinearSpace3f((a.vx * b), (a.vy * b), (a.vz * b)); \ } \ inline univary LinearSpace3f operator*( \ 
const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return make_LinearSpace3f((a * b.vx), (a * b.vy), (a * b.vz)); \ } #ifdef ISPC __define_binary_ops(uniform); __define_binary_ops(varying); #else __define_binary_ops(); #endif #undef __define_binary_ops //////////////////////////////////////////////////////////////////////////////// /// Comparison Operators //////////////////////////////////////////////////////////////////////////////// #define __define_comp_ops(univary) \ inline univary bool eq( \ const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return eq(a.vx, b.vx) & eq(a.vy, b.vy) & eq(a.vz, b.vz); \ } \ inline univary bool ne( \ const univary LinearSpace3f a, const univary LinearSpace3f b) \ { \ return ne(a.vx, b.vx) | ne(a.vy, b.vy) | ne(a.vz, b.vz); \ } #ifdef ISPC __define_comp_ops(uniform); __define_comp_ops(varying); #else __define_comp_ops(); #endif #undef __define_comp_ops //////////////////////////////////////////////////////////////////////////////// // Unary Operators //////////////////////////////////////////////////////////////////////////////// #define __define_unary_fct(univary) \ inline univary LinearSpace3f neg(const univary LinearSpace3f l) \ { \ return make_LinearSpace3f(neg(l.vx), neg(l.vy), neg(l.vz)); \ } \ /* compute the determinant of the matrix */ \ inline univary float det(const univary LinearSpace3f l) \ { \ return dot(l.vx, cross(l.vy, l.vz)); \ } \ /* compute transposed matrix */ \ inline univary LinearSpace3f transposed(const univary LinearSpace3f l) \ { \ return make_LinearSpace3f(make_vec3f(l.vx.x, l.vy.x, l.vz.x), \ make_vec3f(l.vx.y, l.vy.y, l.vz.y), \ make_vec3f(l.vx.z, l.vy.z, l.vz.z)); \ } \ /* compute adjoint matrix */ \ inline univary LinearSpace3f adjoint(const univary LinearSpace3f l) \ { \ return transposed(make_LinearSpace3f( \ cross(l.vy, l.vz), cross(l.vz, l.vx), cross(l.vx, l.vy))); \ } \ /* calculates orthogonal coordinate frame with z-vector pointing towards N \ */ \ inline univary LinearSpace3f frame(const univary vec3f N) \ { \ const univary vec3f dx0 = make_vec3f(0.0f, N.z, -N.y); \ const univary vec3f dx1 = make_vec3f(-N.z, 0.0f, N.x); \ const univary vec3f dx = normalize(abs(N.x) < abs(N.y) ? 
dx0 : dx1); \ const univary vec3f dy = cross(N, dx); \ return make_LinearSpace3f(dx, dy, N); \ } \ inline univary LinearSpace3f rcp(const univary LinearSpace3f l) \ { \ return adjoint(l) * rcpf(det(l)); \ } #ifdef ISPC __define_unary_fct(uniform); __define_unary_fct(varying); #else __define_unary_fct(); #endif #undef __define_unary_fct //////////////////////////////////////////////////////////////////////////////// // Rudimentary 2D linear space, used for texture coordinate transformations //////////////////////////////////////////////////////////////////////////////// struct LinearSpace2f { vec2f vx; vec2f vy; #ifndef ISPC LinearSpace2f() = default; LinearSpace2f(const float v) : vx(v), vy(v) {} LinearSpace2f(const vec2f &vx, const vec2f &vy) : vx(vx), vy(vy) {} #endif }; // short-hand name for LinearSpace2f typedef LinearSpace2f linear2f; #define __define_make_LinearSpace2f(univary) \ inline univary LinearSpace2f make_LinearSpace2f( \ const univary vec2f x, const univary vec2f y) \ { \ univary LinearSpace2f l; \ l.vx = x; \ l.vy = y; \ return l; \ } #ifdef ISPC __define_make_LinearSpace2f(uniform); __define_make_LinearSpace2f(varying); #else __define_make_LinearSpace2f(); #endif #undef __define_make_LinearSpace2f inline ISPC_UNIFORM LinearSpace2f make_LinearSpace2f_identity() { return make_LinearSpace2f(make_vec2f(1.f, 0.f), make_vec2f(0.f, 1.f)); } #define __define_transform2f(univary_r, univary_l, univary_v) \ inline univary_r vec2f operator*( \ const univary_l LinearSpace2f l, const univary_v vec2f v) \ { \ return v.x * l.vx + v.y * l.vy; \ } \ inline univary_r vec2f xfmVector( \ const univary_l LinearSpace2f l, const univary_v vec2f v) \ { \ return v.x * l.vx + v.y * l.vy; \ } #ifdef ISPC __define_transform2f(uniform, uniform, uniform); __define_transform2f(varying, uniform, varying); __define_transform2f(varying, varying, varying); #else __define_transform2f(, , ); #endif #undef __define_transform2f #ifndef ISPC } #endif ospray-rkcommon-538f8a2/rkcommon/math/Quaternion.h000066400000000000000000000261421456117377200223120ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "vec.h" #include "../traits/rktraits.h" namespace rkcommon { namespace math { template > struct QuaternionT { using Vector = vec_t; using Scalar = T; QuaternionT() {} QuaternionT(const QuaternionT &other) { r = other.r; i = other.i; j = other.j; k = other.k; } QuaternionT &operator=(const QuaternionT &other) { r = other.r; i = other.i; j = other.j; k = other.k; return *this; } QuaternionT(const T &r) : r(r), i(zero), j(zero), k(zero) {} explicit QuaternionT(const Vector &v) : r(zero), i(v.x), j(v.y), k(v.z) {} QuaternionT(const T &r, const T &i, const T &j, const T &k) : r(r), i(i), j(j), k(k) { } QuaternionT(const T &r, const Vector &v) : r(r), i(v.x), j(v.y), k(v.z) {} QuaternionT(const Vector &vx, const Vector &vy, const Vector &vz); QuaternionT(const T &yaw, const T &pitch, const T &roll); QuaternionT(ZeroTy) : r(zero), i(zero), j(zero), k(zero) {} QuaternionT(OneTy) : r(one), i(zero), j(zero), k(zero) {} static inline QuaternionT rotate(const Vector &u, const T &r) { return QuaternionT(cos(T(0.5) * r), sin(T(0.5) * r) * normalize(u)); } /*! 
returns the rotation axis of the quaternion as a vector */ const Vector v() const { return Vector(i, j, k); } T i, j, k, r; }; template inline QuaternionT operator*(const T &a, const QuaternionT &b) { return QuaternionT(a * b.r, a * b.i, a * b.j, a * b.k); } template inline QuaternionT operator*(const QuaternionT &a, const T &b) { return QuaternionT(a.r * b, a.i * b, a.j * b, a.k * b); } template > inline auto operator*(const T &a, const QuaternionT &b) -> QuaternionT { using scalar_t = decltype(T() * U()); using quaternion_t = QuaternionT; return quaternion_t(scalar_t(a) * quaternion_t(b)); } template > inline auto operator*(const QuaternionT &a, const U &b) -> QuaternionT { using scalar_t = decltype(T() * U()); using quaternion_t = QuaternionT; return quaternion_t(quaternion_t(a) * scalar_t(b)); } template inline QuaternionT operator+(const QuaternionT &a) { return QuaternionT(+a.r, +a.i, +a.j, +a.k); } template inline QuaternionT operator-(const QuaternionT &a) { return QuaternionT(-a.r, -a.i, -a.j, -a.k); } template inline QuaternionT conj(const QuaternionT &a) { return QuaternionT(a.r, -a.i, -a.j, -a.k); } template inline T abs(const QuaternionT &a) { return sqrt(a.r * a.r + a.i * a.i + a.j * a.j + a.k * a.k); } template inline QuaternionT rcp(const QuaternionT &a) { return conj(a) * rcp(a.r * a.r + a.i * a.i + a.j * a.j + a.k * a.k); } template inline T dot(const QuaternionT &a, const QuaternionT &b) { return a.r * b.r + a.i * b.i + a.j * b.j + a.k * b.k; } template inline QuaternionT normalize(const QuaternionT &a) { return a * rsqrt(dot(a, a)); } template inline QuaternionT operator+(const T &a, const QuaternionT &b) { return QuaternionT(a + b.r, b.i, b.j, b.k); } template inline QuaternionT operator+(const QuaternionT &a, const T &b) { return QuaternionT(a.r + b, a.i, a.j, a.k); } template inline QuaternionT operator+(const QuaternionT &a, const QuaternionT &b) { return QuaternionT(a.r + b.r, a.i + b.i, a.j + b.j, a.k + b.k); } template inline QuaternionT operator-(const T &a, const QuaternionT &b) { return QuaternionT(a - b.r, -b.i, -b.j, -b.k); } template inline QuaternionT operator-(const QuaternionT &a, const T &b) { return QuaternionT(a.r - b, a.i, a.j, a.k); } template inline QuaternionT operator-(const QuaternionT &a, const QuaternionT &b) { return QuaternionT(a.r - b.r, a.i - b.i, a.j - b.j, a.k - b.k); } template inline typename QuaternionT::Vector operator*( const QuaternionT &a, const typename QuaternionT::Vector &b) { return (a * QuaternionT(b) * conj(a)).v(); } template inline QuaternionT operator*(const QuaternionT &a, const QuaternionT &b) { return QuaternionT(a.r * b.r - a.i * b.i - a.j * b.j - a.k * b.k, a.r * b.i + a.i * b.r + a.j * b.k - a.k * b.j, a.r * b.j - a.i * b.k + a.j * b.r + a.k * b.i, a.r * b.k + a.i * b.j - a.j * b.i + a.k * b.r); } template inline QuaternionT operator/(const T &a, const QuaternionT &b) { return a * rcp(b); } template inline QuaternionT operator/(const QuaternionT &a, const T &b) { return a * rcp(b); } template inline QuaternionT operator/(const QuaternionT &a, const QuaternionT &b) { return a * rcp(b); } template inline QuaternionT &operator+=(QuaternionT &a, const T &b) { return a = a + b; } template inline QuaternionT &operator+=(QuaternionT &a, const QuaternionT &b) { return a = a + b; } template inline QuaternionT &operator-=(QuaternionT &a, const T &b) { return a = a - b; } template inline QuaternionT &operator-=(QuaternionT &a, const QuaternionT &b) { return a = a - b; } template inline QuaternionT &operator*=(QuaternionT &a, 
const T &b) { return a = a * b; } template inline QuaternionT &operator*=(QuaternionT &a, const QuaternionT &b) { return a = a * b; } template inline QuaternionT &operator/=(QuaternionT &a, const T &b) { return a = a * rcp(b); } template inline QuaternionT &operator/=(QuaternionT &a, const QuaternionT &b) { return a = a * rcp(b); } template inline typename QuaternionT::Vector xfmPoint( const QuaternionT &a, const typename QuaternionT::Vector &b) { return a * b; } template inline QuaternionT xfmQuaternion( const QuaternionT &a, const QuaternionT &b) { return a * b; } template inline typename QuaternionT::Vector xfmNormal( const QuaternionT &a, const typename QuaternionT::Vector &b) { return a * b; } template inline bool operator==(const QuaternionT &a, const QuaternionT &b) { return a.r == b.r && a.i == b.i && a.j == b.j && a.k == b.k; } template inline bool operator!=(const QuaternionT &a, const QuaternionT &b) { return a.r != b.r || a.i != b.i || a.j != b.j || a.k != b.k; } template QuaternionT::QuaternionT(const typename QuaternionT::Vector &vx, const typename QuaternionT::Vector &vy, const typename QuaternionT::Vector &vz) { if (vx.x + vy.y + vz.z >= T(zero)) { const T t = T(one) + (vx.x + vy.y + vz.z); const T s = rsqrt(t) * T(.5); r = t * s; i = (vy.z - vz.y) * s; j = (vz.x - vx.z) * s; k = (vx.y - vy.x) * s; } else if (vx.x >= max(vy.y, vz.z)) { const T t = (T(one) + vx.x) - (vy.y + vz.z); const T s = rsqrt(t) * T(.5); r = (vy.z - vz.y) * s; i = t * s; j = (vx.y + vy.x) * s; k = (vz.x + vx.z) * s; } else if (vy.y >= vz.z) // if ( vy.y >= max(vz.z, vx.x) ) { const T t = (T(one) + vy.y) - (vz.z + vx.x); const T s = rsqrt(t) * T(.5); r = (vz.x - vx.z) * s; i = (vx.y + vy.x) * s; j = t * s; k = (vy.z + vz.y) * s; } else // if ( vz.z >= max(vy.y, vx.x) ) { const T t = (T(one) + vz.z) - (vx.x + vy.y); const T s = rsqrt(t) * T(.5); r = (vx.y - vy.x) * s; i = (vz.x + vx.z) * s; j = (vy.z + vz.y) * s; k = t * s; } } template QuaternionT::QuaternionT(const T &yaw, const T &pitch, const T &roll) { const T cya = cos(yaw * T(.5)); const T cpi = cos(pitch * T(.5)); const T cro = cos(roll * T(.5)); const T sya = sin(yaw * T(.5)); const T spi = sin(pitch * T(.5)); const T sro = sin(roll * T(.5)); r = cro * cya * cpi + sro * sya * spi; i = cro * cya * spi + sro * sya * cpi; j = cro * sya * cpi - sro * cya * spi; k = sro * cya * cpi - cro * sya * spi; } template static std::ostream &operator<<(std::ostream &cout, const QuaternionT &q) { return cout << "{ r = " << q.r << ", i = " << q.i << ", j = " << q.j << ", k = " << q.k << " }"; } template // a, b must be normalized inline QuaternionT slerp(const float factor, const QuaternionT &_a, const QuaternionT &b) { QuaternionT a(_a); T d = dot(a, b); if (d < 0.) 
{ // prevent "long way around" a = -a; d = -d; } if (d > 0.9995) { // angles too small, fallback to linear interpolation return normalize(rkcommon::math::lerp(factor, a, b)); } const T theta0 = std::acos(d); const T theta = theta0 * factor; const T fb = std::sin(theta) / std::sin(theta0); const T fa = std::cos(theta) - d * fb; return fa * a + fb * b; } using quaternionf = QuaternionT; using quatf = QuaternionT; using quaterniond = QuaternionT; using quatd = QuaternionT; } // namespace math } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/math/arm/000077500000000000000000000000001456117377200205665ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/math/arm/emulation.h000066400000000000000000000024731456117377200227420ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once /* Make precision match SSE, at the cost of some performance */ #if !defined(__aarch64__) # define SSE2NEON_PRECISE_DIV 1 # define SSE2NEON_PRECISE_SQRT 1 #endif #include "sse2neon.h" __forceinline __m128 _mm_fmsub_ps(__m128 a, __m128 b, __m128 c) { __m128 neg_c = vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(c))); return _mm_fmadd_ps(a, b, neg_c); } __forceinline __m128 _mm_fnmadd_ps(__m128 a, __m128 b, __m128 c) { #if defined(__aarch64__) return vreinterpretq_m128_f32(vfmsq_f32(vreinterpretq_f32_m128(c), vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(a))); #else return _mm_sub_ps(c, _mm_mul_ps(a, b)); #endif } __forceinline __m128 _mm_fnmsub_ps(__m128 a, __m128 b, __m128 c) { return vreinterpretq_m128_f32(vnegq_f32(vreinterpretq_f32_m128(_mm_fmadd_ps(a,b,c)))); } /* Dummy defines for floating point control */ #define _MM_MASK_MASK 0x1f80 #define _MM_MASK_DIV_ZERO 0x200 #define _MM_FLUSH_ZERO_ON 0x8000 #define _MM_MASK_DENORM 0x100 #define _MM_SET_EXCEPTION_MASK(x) #define _MM_SET_FLUSH_ZERO_MODE(x) __forceinline int _mm_getcsr() { return 0; } __forceinline void _mm_mfence() { __sync_synchronize(); } ospray-rkcommon-538f8a2/rkcommon/math/arm/sse2neon.h000066400000000000000000010170331456117377200225000ustar00rootroot00000000000000#ifndef SSE2NEON_H #define SSE2NEON_H // This header file provides a simple API translation layer // between SSE intrinsics to their corresponding Arm/Aarch64 NEON versions // // This header file does not yet translate all of the SSE intrinsics. // // Contributors to this work are: // John W. Ratcliff // Brandon Rowlett // Ken Fast // Eric van Beurden // Alexander Potylitsin // Hasindu Gamaarachchi // Jim Huang // Mark Cheng // Malcolm James MacLeod // Devin Hussey (easyaspi314) // Sebastian Pop // Developer Ecosystem Engineering // Danila Kutenin // FranƧois Turban (JishinMaster) // Pei-Hsuan Hung // Yang-Hao Yuan // Syoyo Fujita // Brecht Van Lommel /* * sse2neon is freely redistributable under the MIT License. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE * SOFTWARE. */ /* Tunable configurations */ /* Enable precise implementation of math operations * This would slow down the computation a bit, but gives consistent result with * x86 SSE2. (e.g. would solve a hole or NaN pixel in the rendering result) */ /* _mm_min_ps and _mm_max_ps */ #ifndef SSE2NEON_PRECISE_MINMAX #define SSE2NEON_PRECISE_MINMAX (0) #endif /* _mm_rcp_ps and _mm_div_ps */ #ifndef SSE2NEON_PRECISE_DIV #define SSE2NEON_PRECISE_DIV (0) #endif /* _mm_sqrt_ps and _mm_rsqrt_ps */ #ifndef SSE2NEON_PRECISE_SQRT #define SSE2NEON_PRECISE_SQRT (0) #endif #ifndef SSE2NEON_PRECISE_RSQRT #define SSE2NEON_PRECISE_RSQRT (0) #endif #if defined(__GNUC__) || defined(__clang__) #pragma push_macro("FORCE_INLINE") #pragma push_macro("ALIGN_STRUCT") #define FORCE_INLINE static inline __attribute__((always_inline)) #define ALIGN_STRUCT(x) __attribute__((aligned(x))) #ifndef likely #define likely(x) __builtin_expect(!!(x), 1) #endif #ifndef unlikely #define unlikely(x) __builtin_expect(!!(x), 0) #endif #else #error "Macro name collisions may happen with unsupported compiler." #ifdef FORCE_INLINE #undef FORCE_INLINE #endif #define FORCE_INLINE static inline #ifndef ALIGN_STRUCT #define ALIGN_STRUCT(x) __declspec(align(x)) #endif #endif #ifndef likely #define likely(x) (x) #endif #ifndef unlikely #define unlikely(x) (x) #endif #include #include /* Architecture-specific build options */ /* FIXME: #pragma GCC push_options is only available on GCC */ #if defined(__GNUC__) #if defined(__arm__) && __ARM_ARCH == 7 /* According to ARM C Language Extensions Architecture specification, * __ARM_NEON is defined to a value indicating the Advanced SIMD (NEON) * architecture supported. */ #if !defined(__ARM_NEON) || !defined(__ARM_NEON__) #error "You must enable NEON instructions (e.g. -mfpu=neon) to use SSE2NEON." #endif #if !defined(__clang__) #pragma GCC push_options #pragma GCC target("fpu=neon") #endif #elif defined(__aarch64__) #if !defined(__clang__) #pragma GCC push_options #pragma GCC target("+simd") #endif #else #error "Unsupported target. Must be either ARMv7-A+NEON or ARMv8-A." #endif #endif #include /* Rounding functions require either Aarch64 instructions or libm failback */ #if !defined(__aarch64__) #include #endif /* "__has_builtin" can be used to query support for built-in functions * provided by gcc/clang and other compilers that support it. */ #ifndef __has_builtin /* GCC prior to 10 or non-clang compilers */ /* Compatibility with gcc <= 9 */ #if __GNUC__ <= 9 #define __has_builtin(x) HAS##x #define HAS__builtin_popcount 1 #define HAS__builtin_popcountll 1 #else #define __has_builtin(x) 0 #endif #endif /** * MACRO for shuffle parameter for _mm_shuffle_ps(). * Argument fp3 is a digit[0123] that represents the fp from argument "b" * of mm_shuffle_ps that will be placed in fp3 of result. fp2 is the same * for fp2 in result. fp1 is a digit[0123] that represents the fp from * argument "a" of mm_shuffle_ps that will be places in fp1 of result. * fp0 is the same for fp0 of result. 
*/ #define _MM_SHUFFLE(fp3, fp2, fp1, fp0) \ (((fp3) << 6) | ((fp2) << 4) | ((fp1) << 2) | ((fp0))) /* Rounding mode macros. */ #define _MM_FROUND_TO_NEAREST_INT 0x00 #define _MM_FROUND_TO_NEG_INF 0x01 #define _MM_FROUND_TO_POS_INF 0x02 #define _MM_FROUND_TO_ZERO 0x03 #define _MM_FROUND_CUR_DIRECTION 0x04 #define _MM_FROUND_NO_EXC 0x08 #define _MM_ROUND_NEAREST 0x0000 #define _MM_ROUND_DOWN 0x2000 #define _MM_ROUND_UP 0x4000 #define _MM_ROUND_TOWARD_ZERO 0x6000 /* indicate immediate constant argument in a given range */ #define __constrange(a, b) const /* A few intrinsics accept traditional data types like ints or floats, but * most operate on data types that are specific to SSE. * If a vector type ends in d, it contains doubles, and if it does not have * a suffix, it contains floats. An integer vector type can contain any type * of integer, from chars to shorts to unsigned long longs. */ typedef int64x1_t __m64; typedef float32x4_t __m128; /* 128-bit vector containing 4 floats */ // On ARM 32-bit architecture, the float64x2_t is not supported. // The data type __m128d should be represented in a different way for related // intrinsic conversion. #if defined(__aarch64__) typedef float64x2_t __m128d; /* 128-bit vector containing 2 doubles */ #else typedef float32x4_t __m128d; #endif typedef int64x2_t __m128i; /* 128-bit vector containing integers */ /* type-safe casting between types */ #define vreinterpretq_m128_f16(x) vreinterpretq_f32_f16(x) #define vreinterpretq_m128_f32(x) (x) #define vreinterpretq_m128_f64(x) vreinterpretq_f32_f64(x) #define vreinterpretq_m128_u8(x) vreinterpretq_f32_u8(x) #define vreinterpretq_m128_u16(x) vreinterpretq_f32_u16(x) #define vreinterpretq_m128_u32(x) vreinterpretq_f32_u32(x) #define vreinterpretq_m128_u64(x) vreinterpretq_f32_u64(x) #define vreinterpretq_m128_s8(x) vreinterpretq_f32_s8(x) #define vreinterpretq_m128_s16(x) vreinterpretq_f32_s16(x) #define vreinterpretq_m128_s32(x) vreinterpretq_f32_s32(x) #define vreinterpretq_m128_s64(x) vreinterpretq_f32_s64(x) #define vreinterpretq_f16_m128(x) vreinterpretq_f16_f32(x) #define vreinterpretq_f32_m128(x) (x) #define vreinterpretq_f64_m128(x) vreinterpretq_f64_f32(x) #define vreinterpretq_u8_m128(x) vreinterpretq_u8_f32(x) #define vreinterpretq_u16_m128(x) vreinterpretq_u16_f32(x) #define vreinterpretq_u32_m128(x) vreinterpretq_u32_f32(x) #define vreinterpretq_u64_m128(x) vreinterpretq_u64_f32(x) #define vreinterpretq_s8_m128(x) vreinterpretq_s8_f32(x) #define vreinterpretq_s16_m128(x) vreinterpretq_s16_f32(x) #define vreinterpretq_s32_m128(x) vreinterpretq_s32_f32(x) #define vreinterpretq_s64_m128(x) vreinterpretq_s64_f32(x) #define vreinterpretq_m128i_s8(x) vreinterpretq_s64_s8(x) #define vreinterpretq_m128i_s16(x) vreinterpretq_s64_s16(x) #define vreinterpretq_m128i_s32(x) vreinterpretq_s64_s32(x) #define vreinterpretq_m128i_s64(x) (x) #define vreinterpretq_m128i_u8(x) vreinterpretq_s64_u8(x) #define vreinterpretq_m128i_u16(x) vreinterpretq_s64_u16(x) #define vreinterpretq_m128i_u32(x) vreinterpretq_s64_u32(x) #define vreinterpretq_m128i_u64(x) vreinterpretq_s64_u64(x) #define vreinterpretq_f32_m128i(x) vreinterpretq_f32_s64(x) #define vreinterpretq_f64_m128i(x) vreinterpretq_f64_s64(x) #define vreinterpretq_s8_m128i(x) vreinterpretq_s8_s64(x) #define vreinterpretq_s16_m128i(x) vreinterpretq_s16_s64(x) #define vreinterpretq_s32_m128i(x) vreinterpretq_s32_s64(x) #define vreinterpretq_s64_m128i(x) (x) #define vreinterpretq_u8_m128i(x) vreinterpretq_u8_s64(x) #define vreinterpretq_u16_m128i(x) 
vreinterpretq_u16_s64(x) #define vreinterpretq_u32_m128i(x) vreinterpretq_u32_s64(x) #define vreinterpretq_u64_m128i(x) vreinterpretq_u64_s64(x) #define vreinterpret_m64_s8(x) vreinterpret_s64_s8(x) #define vreinterpret_m64_s16(x) vreinterpret_s64_s16(x) #define vreinterpret_m64_s32(x) vreinterpret_s64_s32(x) #define vreinterpret_m64_s64(x) (x) #define vreinterpret_m64_u8(x) vreinterpret_s64_u8(x) #define vreinterpret_m64_u16(x) vreinterpret_s64_u16(x) #define vreinterpret_m64_u32(x) vreinterpret_s64_u32(x) #define vreinterpret_m64_u64(x) vreinterpret_s64_u64(x) #define vreinterpret_m64_f16(x) vreinterpret_s64_f16(x) #define vreinterpret_m64_f32(x) vreinterpret_s64_f32(x) #define vreinterpret_m64_f64(x) vreinterpret_s64_f64(x) #define vreinterpret_u8_m64(x) vreinterpret_u8_s64(x) #define vreinterpret_u16_m64(x) vreinterpret_u16_s64(x) #define vreinterpret_u32_m64(x) vreinterpret_u32_s64(x) #define vreinterpret_u64_m64(x) vreinterpret_u64_s64(x) #define vreinterpret_s8_m64(x) vreinterpret_s8_s64(x) #define vreinterpret_s16_m64(x) vreinterpret_s16_s64(x) #define vreinterpret_s32_m64(x) vreinterpret_s32_s64(x) #define vreinterpret_s64_m64(x) (x) #define vreinterpret_f32_m64(x) vreinterpret_f32_s64(x) #if defined(__aarch64__) #define vreinterpretq_m128d_s32(x) vreinterpretq_f64_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f64_s64(x) #define vreinterpretq_m128d_u64(x) vreinterpretq_f64_u64(x) #define vreinterpretq_m128d_f32(x) vreinterpretq_f64_f32(x) #define vreinterpretq_m128d_f64(x) (x) #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f64(x) #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f64(x) #define vreinterpretq_f64_m128d(x) (x) #define vreinterpretq_f32_m128d(x) vreinterpretq_f32_f64(x) #else #define vreinterpretq_m128d_s32(x) vreinterpretq_f32_s32(x) #define vreinterpretq_m128d_s64(x) vreinterpretq_f32_s64(x) #define vreinterpretq_m128d_u32(x) vreinterpretq_f32_u32(x) #define vreinterpretq_m128d_u64(x) vreinterpretq_f32_u64(x) #define vreinterpretq_m128d_f32(x) (x) #define vreinterpretq_s64_m128d(x) vreinterpretq_s64_f32(x) #define vreinterpretq_u32_m128d(x) vreinterpretq_u32_f32(x) #define vreinterpretq_u64_m128d(x) vreinterpretq_u64_f32(x) #define vreinterpretq_f32_m128d(x) (x) #endif // A struct is defined in this header file called 'SIMDVec' which can be used // by applications which attempt to access the contents of an _m128 struct // directly. It is important to note that accessing the __m128 struct directly // is bad coding practice by Microsoft: @see: // https://msdn.microsoft.com/en-us/library/ayeb3ayc.aspx // // However, some legacy source code may try to access the contents of an __m128 // struct directly so the developer can use the SIMDVec as an alias for it. Any // casting must be done manually by the developer, as you cannot cast or // otherwise alias the base NEON data type for intrinsic operations. // // union intended to allow direct access to an __m128 variable using the names // that the MSVC compiler provides. This union should really only be used when // trying to access the members of the vector as integer values. GCC/clang // allow native access to the float members through a simple array access // operator (in C since 4.6, in C++ since 4.8). // // Ideally direct accesses to SIMD vectors should not be used since it can cause // a performance hit. If it really is needed however, the original __m128 // variable can be aliased with a pointer to this union and used to access // individual components. 
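// (Illustrative sketch, not part of the upstream sse2neon comment; 'v' and
// 'first_lane' are hypothetical names for such an alias.)
//
//   __m128 v = _mm_set1_ps(1.0f);
//   const SIMDVec *sv = (const SIMDVec *) &v;
//   int32_t first_lane = sv->m128_i32[0]; // bit pattern of 1.0f (0x3F800000)
//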
The use of this union should be hidden behind a macro // that is used throughout the codebase to access the members instead of always // declaring this type of variable. typedef union ALIGN_STRUCT(16) SIMDVec { float m128_f32[4]; // as floats - DON'T USE. Added for convenience. int8_t m128_i8[16]; // as signed 8-bit integers. int16_t m128_i16[8]; // as signed 16-bit integers. int32_t m128_i32[4]; // as signed 32-bit integers. int64_t m128_i64[2]; // as signed 64-bit integers. uint8_t m128_u8[16]; // as unsigned 8-bit integers. uint16_t m128_u16[8]; // as unsigned 16-bit integers. uint32_t m128_u32[4]; // as unsigned 32-bit integers. uint64_t m128_u64[2]; // as unsigned 64-bit integers. } SIMDVec; // casting using SIMDVec #define vreinterpretq_nth_u64_m128i(x, n) (((SIMDVec *) &x)->m128_u64[n]) #define vreinterpretq_nth_u32_m128i(x, n) (((SIMDVec *) &x)->m128_u32[n]) #define vreinterpretq_nth_u8_m128i(x, n) (((SIMDVec *) &x)->m128_u8[n]) /* Backwards compatibility for compilers with lack of specific type support */ // Older gcc does not define vld1q_u8_x4 type #if defined(__GNUC__) && !defined(__clang__) && \ ((__GNUC__ == 10 && (__GNUC_MINOR__ <= 1)) || \ (__GNUC__ == 9 && (__GNUC_MINOR__ <= 3)) || \ (__GNUC__ == 8 && (__GNUC_MINOR__ <= 4)) || __GNUC__ <= 7) FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { uint8x16x4_t ret; ret.val[0] = vld1q_u8(p + 0); ret.val[1] = vld1q_u8(p + 16); ret.val[2] = vld1q_u8(p + 32); ret.val[3] = vld1q_u8(p + 48); return ret; } #else // Wraps vld1q_u8_x4 FORCE_INLINE uint8x16x4_t _sse2neon_vld1q_u8_x4(const uint8_t *p) { return vld1q_u8_x4(p); } #endif /* Function Naming Conventions * The naming convention of SSE intrinsics is straightforward. A generic SSE * intrinsic function is given as follows: * _mm__ * * The parts of this format are given as follows: * 1. describes the operation performed by the intrinsic * 2. identifies the data type of the function's primary arguments * * This last part, , is a little complicated. It identifies the * content of the input values, and can be set to any of the following values: * + ps - vectors contain floats (ps stands for packed single-precision) * + pd - vectors cantain doubles (pd stands for packed double-precision) * + epi8/epi16/epi32/epi64 - vectors contain 8-bit/16-bit/32-bit/64-bit * signed integers * + epu8/epu16/epu32/epu64 - vectors contain 8-bit/16-bit/32-bit/64-bit * unsigned integers * + si128 - unspecified 128-bit vector or 256-bit vector * + m128/m128i/m128d - identifies input vector types when they are different * than the type of the returned vector * * For example, _mm_setzero_ps. The _mm implies that the function returns * a 128-bit vector. The _ps at the end implies that the argument vectors * contain floats. * * A complete example: Byte Shuffle - pshufb (_mm_shuffle_epi8) * // Set packed 16-bit integers. 
128 bits, 8 short, per 16 bits * __m128i v_in = _mm_setr_epi16(1, 2, 3, 4, 5, 6, 7, 8); * // Set packed 8-bit integers * // 128 bits, 16 chars, per 8 bits * __m128i v_perm = _mm_setr_epi8(1, 0, 2, 3, 8, 9, 10, 11, * 4, 5, 12, 13, 6, 7, 14, 15); * // Shuffle packed 8-bit integers * __m128i v_out = _mm_shuffle_epi8(v_in, v_perm); // pshufb * * Data (Number, Binary, Byte Index): +------+------+-------------+------+------+-------------+ | 1 | 2 | 3 | 4 | Number +------+------+------+------+------+------+------+------+ | 0000 | 0001 | 0000 | 0010 | 0000 | 0011 | 0000 | 0100 | Binary +------+------+------+------+------+------+------+------+ | 0 | 1 | 2 | 3 | 4 | 5 | 6 | 7 | Index +------+------+------+------+------+------+------+------+ +------+------+------+------+------+------+------+------+ | 5 | 6 | 7 | 8 | Number +------+------+------+------+------+------+------+------+ | 0000 | 0101 | 0000 | 0110 | 0000 | 0111 | 0000 | 1000 | Binary +------+------+------+------+------+------+------+------+ | 8 | 9 | 10 | 11 | 12 | 13 | 14 | 15 | Index +------+------+------+------+------+------+------+------+ * Index (Byte Index): +------+------+------+------+------+------+------+------+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | +------+------+------+------+------+------+------+------+ +------+------+------+------+------+------+------+------+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | +------+------+------+------+------+------+------+------+ * Result: +------+------+------+------+------+------+------+------+ | 1 | 0 | 2 | 3 | 8 | 9 | 10 | 11 | Index +------+------+------+------+------+------+------+------+ | 0001 | 0000 | 0000 | 0010 | 0000 | 0101 | 0000 | 0110 | Binary +------+------+------+------+------+------+------+------+ | 256 | 2 | 5 | 6 | Number +------+------+------+------+------+------+------+------+ +------+------+------+------+------+------+------+------+ | 4 | 5 | 12 | 13 | 6 | 7 | 14 | 15 | Index +------+------+------+------+------+------+------+------+ | 0000 | 0011 | 0000 | 0111 | 0000 | 0100 | 0000 | 1000 | Binary +------+------+------+------+------+------+------+------+ | 3 | 7 | 4 | 8 | Number +------+------+------+------+------+------+-------------+ */ /* Set/get methods */ /* Constants for use with _mm_prefetch. */ enum _mm_hint { _MM_HINT_NTA = 0, /* load data to L1 and L2 cache, mark it as NTA */ _MM_HINT_T0 = 1, /* load data to L1 and L2 cache */ _MM_HINT_T1 = 2, /* load data to L2 cache only */ _MM_HINT_T2 = 3, /* load data to L2 cache only, mark it as NTA */ _MM_HINT_ENTA = 4, /* exclusive version of _MM_HINT_NTA */ _MM_HINT_ET0 = 5, /* exclusive version of _MM_HINT_T0 */ _MM_HINT_ET1 = 6, /* exclusive version of _MM_HINT_T1 */ _MM_HINT_ET2 = 7 /* exclusive version of _MM_HINT_T2 */ }; // Loads one cache line of data from address p to a location closer to the // processor. https://msdn.microsoft.com/en-us/library/84szxsww(v=vs.100).aspx FORCE_INLINE void _mm_prefetch(const void *p, int i) { (void) i; __builtin_prefetch(p); } // Pause the processor. This is typically used in spin-wait loops and depending // on the x86 processor typical values are in the 40-100 cycle range. The // 'yield' instruction isn't a good fit beacuse it's effectively a nop on most // Arm cores. Experience with several databases has shown has shown an 'isb' is // a reasonable approximation. FORCE_INLINE void _mm_pause() { __asm__ __volatile__("isb\n"); } // Copy the lower single-precision (32-bit) floating-point element of a to dst. 
// // dst[31:0] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_f32 FORCE_INLINE float _mm_cvtss_f32(__m128 a) { return vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); } // Convert the lower single-precision (32-bit) floating-point element in b to a // double-precision (64-bit) floating-point element, store the result in the // lower element of dst, and copy the upper element from a to the upper element // of dst. // // dst[63:0] := Convert_FP32_To_FP64(b[31:0]) // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_sd FORCE_INLINE __m128d _mm_cvtss_sd(__m128d a, __m128 b) { double d = (double) vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsetq_lane_f64(d, vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_s64( vsetq_lane_s64(*(int64_t *) &d, vreinterpretq_s64_m128d(a), 0)); #endif } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // // dst[31:0] := Convert_FP32_To_Int32(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si32 #define _mm_cvtss_si32(a) _mm_cvt_ss2si(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 64-bit integer, and store the result in dst. // // dst[63:0] := Convert_FP32_To_Int64(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtss_si64 FORCE_INLINE int _mm_cvtss_si64(__m128 a) { #if defined(__aarch64__) return vgetq_lane_s64( vreinterpretq_s64_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a))), 0); #else float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32_t diff = data - floor(data); if (diff > 0.5) return (int64_t) ceil(data); if (unlikely(diff == 0.5)) { int64_t f = (int64_t) floor(data); int64_t c = (int64_t) ceil(data); return c & 1 ? f : c; } return (int64_t) floor(data); #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ps2pi FORCE_INLINE __m64 _mm_cvtt_ps2pi(__m128 a) { return vreinterpret_m64_s32( vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. // // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtt_ss2si FORCE_INLINE int _mm_cvtt_ss2si(__m128 a) { return vgetq_lane_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)), 0); } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers with truncation, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32_Truncate(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttps_pi32 #define _mm_cvttps_pi32(a) _mm_cvtt_ps2pi(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer with truncation, and store the result in dst. 
// // dst[31:0] := Convert_FP32_To_Int32_Truncate(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si32 #define _mm_cvttss_si32(a) _mm_cvtt_ss2si(a) // Convert the lower single-precision (32-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // // dst[63:0] := Convert_FP32_To_Int64_Truncate(a[31:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttss_si64 FORCE_INLINE int64_t _mm_cvttss_si64(__m128 a) { return vgetq_lane_s64( vmovl_s32(vget_low_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a)))), 0); } // Sets the 128-bit value to zero // https://msdn.microsoft.com/en-us/library/vstudio/ys7dw0kh(v=vs.100).aspx FORCE_INLINE __m128i _mm_setzero_si128(void) { return vreinterpretq_m128i_s32(vdupq_n_s32(0)); } // Clears the four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/tk1t2tbz(v=vs.100).aspx FORCE_INLINE __m128 _mm_setzero_ps(void) { return vreinterpretq_m128_f32(vdupq_n_f32(0)); } // Return vector of type __m128d with all elements set to zero. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setzero_pd FORCE_INLINE __m128d _mm_setzero_pd(void) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vdupq_n_f64(0)); #else return vreinterpretq_m128d_f32(vdupq_n_f32(0)); #endif } // Sets the four single-precision, floating-point values to w. // // r0 := r1 := r2 := r3 := w // // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx FORCE_INLINE __m128 _mm_set1_ps(float _w) { return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } // Sets the four single-precision, floating-point values to w. // https://msdn.microsoft.com/en-us/library/vstudio/2x1se8ha(v=vs.100).aspx FORCE_INLINE __m128 _mm_set_ps1(float _w) { return vreinterpretq_m128_f32(vdupq_n_f32(_w)); } // Sets the four single-precision, floating-point values to the four inputs. // https://msdn.microsoft.com/en-us/library/vstudio/afh0zf75(v=vs.100).aspx FORCE_INLINE __m128 _mm_set_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {x, y, z, w}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Copy single-precision (32-bit) floating-point element a to the lower element // of dst, and zero the upper 3 elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_ss FORCE_INLINE __m128 _mm_set_ss(float a) { float ALIGN_STRUCT(16) data[4] = {a, 0, 0, 0}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Sets the four single-precision, floating-point values to the four inputs in // reverse order. // https://msdn.microsoft.com/en-us/library/vstudio/d2172ct3(v=vs.100).aspx FORCE_INLINE __m128 _mm_setr_ps(float w, float z, float y, float x) { float ALIGN_STRUCT(16) data[4] = {w, z, y, x}; return vreinterpretq_m128_f32(vld1q_f32(data)); } // Sets the 8 signed 16-bit integer values in reverse order. // // Return Value // r0 := w0 // r1 := w1 // ... 
// r7 := w7 FORCE_INLINE __m128i _mm_setr_epi16(short w0, short w1, short w2, short w3, short w4, short w5, short w6, short w7) { int16_t ALIGN_STRUCT(16) data[8] = {w0, w1, w2, w3, w4, w5, w6, w7}; return vreinterpretq_m128i_s16(vld1q_s16((int16_t *) data)); } // Sets the 4 signed 32-bit integer values in reverse order // https://technet.microsoft.com/en-us/library/security/27yb3ee5(v=vs.90).aspx FORCE_INLINE __m128i _mm_setr_epi32(int i3, int i2, int i1, int i0) { int32_t ALIGN_STRUCT(16) data[4] = {i3, i2, i1, i0}; return vreinterpretq_m128i_s32(vld1q_s32(data)); } // Set packed 64-bit integers in dst with the supplied values in reverse order. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_epi64 FORCE_INLINE __m128i _mm_setr_epi64(__m64 e1, __m64 e0) { return vreinterpretq_m128i_s64(vcombine_s64(e1, e0)); } // Sets the 16 signed 8-bit integer values to b. // // r0 := b // r1 := b // ... // r15 := b // // https://msdn.microsoft.com/en-us/library/6e14xhyf(v=vs.100).aspx FORCE_INLINE __m128i _mm_set1_epi8(signed char w) { return vreinterpretq_m128i_s8(vdupq_n_s8(w)); } // Broadcast double-precision (64-bit) floating-point value a to all elements of // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_pd FORCE_INLINE __m128d _mm_set1_pd(double d) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vdupq_n_f64(d)); #else return vreinterpretq_m128d_s64(vdupq_n_s64(*(int64_t *) &d)); #endif } // Sets the 8 signed 16-bit integer values to w. // // r0 := w // r1 := w // ... // r7 := w // // https://msdn.microsoft.com/en-us/library/k0ya3x0e(v=vs.90).aspx FORCE_INLINE __m128i _mm_set1_epi16(short w) { return vreinterpretq_m128i_s16(vdupq_n_s16(w)); } // Sets the 16 signed 8-bit integer values. // https://msdn.microsoft.com/en-us/library/x0cx8zd3(v=vs.90).aspx FORCE_INLINE __m128i _mm_set_epi8(signed char b15, signed char b14, signed char b13, signed char b12, signed char b11, signed char b10, signed char b9, signed char b8, signed char b7, signed char b6, signed char b5, signed char b4, signed char b3, signed char b2, signed char b1, signed char b0) { int8_t ALIGN_STRUCT(16) data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } // Sets the 8 signed 16-bit integer values. // https://msdn.microsoft.com/en-au/library/3e0fek84(v=vs.90).aspx FORCE_INLINE __m128i _mm_set_epi16(short i7, short i6, short i5, short i4, short i3, short i2, short i1, short i0) { int16_t ALIGN_STRUCT(16) data[8] = {i0, i1, i2, i3, i4, i5, i6, i7}; return vreinterpretq_m128i_s16(vld1q_s16(data)); } // Sets the 16 signed 8-bit integer values in reverse order. // https://msdn.microsoft.com/en-us/library/2khb9c7k(v=vs.90).aspx FORCE_INLINE __m128i _mm_setr_epi8(signed char b0, signed char b1, signed char b2, signed char b3, signed char b4, signed char b5, signed char b6, signed char b7, signed char b8, signed char b9, signed char b10, signed char b11, signed char b12, signed char b13, signed char b14, signed char b15) { int8_t ALIGN_STRUCT(16) data[16] = {(int8_t) b0, (int8_t) b1, (int8_t) b2, (int8_t) b3, (int8_t) b4, (int8_t) b5, (int8_t) b6, (int8_t) b7, (int8_t) b8, (int8_t) b9, (int8_t) b10, (int8_t) b11, (int8_t) b12, (int8_t) b13, (int8_t) b14, (int8_t) b15}; return (__m128i) vld1q_s8(data); } // Sets the 4 signed 32-bit integer values to i. 
// // r0 := i // r1 := i // r2 := i // r3 := I // // https://msdn.microsoft.com/en-us/library/vstudio/h4xscxat(v=vs.100).aspx FORCE_INLINE __m128i _mm_set1_epi32(int _i) { return vreinterpretq_m128i_s32(vdupq_n_s32(_i)); } // Sets the 2 signed 64-bit integer values to i. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/whtfzhzk(v=vs.100) FORCE_INLINE __m128i _mm_set1_epi64(__m64 _i) { return vreinterpretq_m128i_s64(vdupq_n_s64((int64_t) _i)); } // Sets the 2 signed 64-bit integer values to i. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set1_epi64x FORCE_INLINE __m128i _mm_set1_epi64x(int64_t _i) { return vreinterpretq_m128i_s64(vdupq_n_s64(_i)); } // Sets the 4 signed 32-bit integer values. // https://msdn.microsoft.com/en-us/library/vstudio/019beekt(v=vs.100).aspx FORCE_INLINE __m128i _mm_set_epi32(int i3, int i2, int i1, int i0) { int32_t ALIGN_STRUCT(16) data[4] = {i0, i1, i2, i3}; return vreinterpretq_m128i_s32(vld1q_s32(data)); } // Returns the __m128i structure with its two 64-bit integer values // initialized to the values of the two 64-bit integers passed in. // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx FORCE_INLINE __m128i _mm_set_epi64x(int64_t i1, int64_t i2) { return vreinterpretq_m128i_s64( vcombine_s64(vcreate_s64(i2), vcreate_s64(i1))); } // Returns the __m128i structure with its two 64-bit integer values // initialized to the values of the two 64-bit integers passed in. // https://msdn.microsoft.com/en-us/library/dk2sdw0h(v=vs.120).aspx FORCE_INLINE __m128i _mm_set_epi64(__m64 i1, __m64 i2) { return _mm_set_epi64x((int64_t) i1, (int64_t) i2); } // Set packed double-precision (64-bit) floating-point elements in dst with the // supplied values. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd FORCE_INLINE __m128d _mm_set_pd(double e1, double e0) { double ALIGN_STRUCT(16) data[2] = {e0, e1}; #if defined(__aarch64__) return vreinterpretq_m128d_f64(vld1q_f64((float64_t *) data)); #else return vreinterpretq_m128d_f32(vld1q_f32((float32_t *) data)); #endif } // Set packed double-precision (64-bit) floating-point elements in dst with the // supplied values in reverse order. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_setr_pd FORCE_INLINE __m128d _mm_setr_pd(double e1, double e0) { return _mm_set_pd(e0, e1); } // Copy double-precision (64-bit) floating-point element a to the lower element // of dst, and zero the upper element. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_sd FORCE_INLINE __m128d _mm_set_sd(double a) { return _mm_set_pd(0, a); } // Broadcast double-precision (64-bit) floating-point value a to all elements of // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_set_pd1 #define _mm_set_pd1 _mm_set1_pd // Stores four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/s3h4ay6y(v=vs.100).aspx FORCE_INLINE void _mm_store_ps(float *p, __m128 a) { vst1q_f32(p, vreinterpretq_f32_m128(a)); } // Store the lower single-precision (32-bit) floating-point element from a into // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. 
// // MEM[mem_addr+31:mem_addr] := a[31:0] // MEM[mem_addr+63:mem_addr+32] := a[31:0] // MEM[mem_addr+95:mem_addr+64] := a[31:0] // MEM[mem_addr+127:mem_addr+96] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_ps1 FORCE_INLINE void _mm_store_ps1(float *p, __m128 a) { float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); vst1q_f32(p, vdupq_n_f32(a0)); } // Store the lower single-precision (32-bit) floating-point element from a into // 4 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // // MEM[mem_addr+31:mem_addr] := a[31:0] // MEM[mem_addr+63:mem_addr+32] := a[31:0] // MEM[mem_addr+95:mem_addr+64] := a[31:0] // MEM[mem_addr+127:mem_addr+96] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store1_ps #define _mm_store1_ps _mm_store_ps1 // Store 4 single-precision (32-bit) floating-point elements from a into memory // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // MEM[mem_addr+31:mem_addr] := a[127:96] // MEM[mem_addr+63:mem_addr+32] := a[95:64] // MEM[mem_addr+95:mem_addr+64] := a[63:32] // MEM[mem_addr+127:mem_addr+96] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_ps FORCE_INLINE void _mm_storer_ps(float *p, __m128 a) { float32x4_t tmp = vrev64q_f32(vreinterpretq_f32_m128(a)); float32x4_t rev = vextq_f32(tmp, tmp, 2); vst1q_f32(p, rev); } // Stores four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/44e30x22(v=vs.100).aspx FORCE_INLINE void _mm_storeu_ps(float *p, __m128 a) { vst1q_f32(p, vreinterpretq_f32_m128(a)); } // Stores four 32-bit integer values as (as a __m128i value) at the address p. // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx FORCE_INLINE void _mm_store_si128(__m128i *p, __m128i a) { vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } // Stores four 32-bit integer values as (as a __m128i value) at the address p. // https://msdn.microsoft.com/en-us/library/vstudio/edk11s13(v=vs.100).aspx FORCE_INLINE void _mm_storeu_si128(__m128i *p, __m128i a) { vst1q_s32((int32_t *) p, vreinterpretq_s32_m128i(a)); } // Stores the lower single - precision, floating - point value. // https://msdn.microsoft.com/en-us/library/tzz10fbx(v=vs.100).aspx FORCE_INLINE void _mm_store_ss(float *p, __m128 a) { vst1q_lane_f32(p, vreinterpretq_f32_m128(a), 0); } // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory. mem_addr must be aligned on a 16-byte boundary // or a general-protection exception may be generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd FORCE_INLINE void _mm_store_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(a)); #else vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(a)); #endif } // Store the upper double-precision (64-bit) floating-point element from a into // memory. 
// // MEM[mem_addr+63:mem_addr] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeh_pd FORCE_INLINE void _mm_storeh_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1_f64((float64_t *) mem_addr, vget_high_f64(vreinterpretq_f64_m128d(a))); #else vst1_f32((float32_t *) mem_addr, vget_high_f32(vreinterpretq_f32_m128d(a))); #endif } // Store the lower double-precision (64-bit) floating-point element from a into // memory. // // MEM[mem_addr+63:mem_addr] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storel_pd FORCE_INLINE void _mm_storel_pd(double *mem_addr, __m128d a) { #if defined(__aarch64__) vst1_f64((float64_t *) mem_addr, vget_low_f64(vreinterpretq_f64_m128d(a))); #else vst1_f32((float32_t *) mem_addr, vget_low_f32(vreinterpretq_f32_m128d(a))); #endif } // Store 2 double-precision (64-bit) floating-point elements from a into memory // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // MEM[mem_addr+63:mem_addr] := a[127:64] // MEM[mem_addr+127:mem_addr+64] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storer_pd FORCE_INLINE void _mm_storer_pd(double *mem_addr, __m128d a) { float32x4_t f = vreinterpretq_f32_m128d(a); _mm_store_pd(mem_addr, vreinterpretq_m128d_f32(vextq_f32(f, f, 2))); } // Store the lower double-precision (64-bit) floating-point element from a into // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_store_pd1 FORCE_INLINE void _mm_store_pd1(double *mem_addr, __m128d a) { #if defined(__aarch64__) float64x1_t a_low = vget_low_f64(vreinterpretq_f64_m128d(a)); vst1q_f64((float64_t *) mem_addr, vreinterpretq_f64_m128d(vcombine_f64(a_low, a_low))); #else float32x2_t a_low = vget_low_f32(vreinterpretq_f32_m128d(a)); vst1q_f32((float32_t *) mem_addr, vreinterpretq_f32_m128d(vcombine_f32(a_low, a_low))); #endif } // Store the lower double-precision (64-bit) floating-point element from a into // 2 contiguous elements in memory. mem_addr must be aligned on a 16-byte // boundary or a general-protection exception may be generated. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#expand=9,526,5601&text=_mm_store1_pd #define _mm_store1_pd _mm_store_pd1 // Store 128-bits (composed of 2 packed double-precision (64-bit) floating-point // elements) from a into memory. mem_addr does not need to be aligned on any // particular boundary. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_storeu_pd FORCE_INLINE void _mm_storeu_pd(double *mem_addr, __m128d a) { _mm_store_pd(mem_addr, a); } // Reads the lower 64 bits of b and stores them into the lower 64 bits of a. // https://msdn.microsoft.com/en-us/library/hhwf428f%28v=vs.90%29.aspx FORCE_INLINE void _mm_storel_epi64(__m128i *a, __m128i b) { uint64x1_t hi = vget_high_u64(vreinterpretq_u64_m128i(*a)); uint64x1_t lo = vget_low_u64(vreinterpretq_u64_m128i(b)); *a = vreinterpretq_m128i_u64(vcombine_u64(lo, hi)); } // Stores the lower two single-precision floating point values of a to the // address p. 
// // *p0 := a0 // *p1 := a1 // // https://msdn.microsoft.com/en-us/library/h54t98ks(v=vs.90).aspx FORCE_INLINE void _mm_storel_pi(__m64 *p, __m128 a) { *p = vreinterpret_m64_f32(vget_low_f32(a)); } // Stores the upper two single-precision, floating-point values of a to the // address p. // // *p0 := a2 // *p1 := a3 // // https://msdn.microsoft.com/en-us/library/a7525fs8(v%3dvs.90).aspx FORCE_INLINE void _mm_storeh_pi(__m64 *p, __m128 a) { *p = vreinterpret_m64_f32(vget_high_f32(a)); } // Loads a single single-precision, floating-point value, copying it into all // four words // https://msdn.microsoft.com/en-us/library/vstudio/5cdkf716(v=vs.100).aspx FORCE_INLINE __m128 _mm_load1_ps(const float *p) { return vreinterpretq_m128_f32(vld1q_dup_f32(p)); } // Load a single-precision (32-bit) floating-point element from memory into all // elements of dst. // // dst[31:0] := MEM[mem_addr+31:mem_addr] // dst[63:32] := MEM[mem_addr+31:mem_addr] // dst[95:64] := MEM[mem_addr+31:mem_addr] // dst[127:96] := MEM[mem_addr+31:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_ps1 #define _mm_load_ps1 _mm_load1_ps // Sets the lower two single-precision, floating-point values with 64 // bits of data loaded from the address p; the upper two values are passed // through from a. // // Return Value // r0 := *p0 // r1 := *p1 // r2 := a2 // r3 := a3 // // https://msdn.microsoft.com/en-us/library/s57cyak2(v=vs.100).aspx FORCE_INLINE __m128 _mm_loadl_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( vcombine_f32(vld1_f32((const float32_t *) p), vget_high_f32(a))); } // Load 4 single-precision (32-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // dst[31:0] := MEM[mem_addr+127:mem_addr+96] // dst[63:32] := MEM[mem_addr+95:mem_addr+64] // dst[95:64] := MEM[mem_addr+63:mem_addr+32] // dst[127:96] := MEM[mem_addr+31:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_ps FORCE_INLINE __m128 _mm_loadr_ps(const float *p) { float32x4_t v = vrev64q_f32(vld1q_f32(p)); return vreinterpretq_m128_f32(vextq_f32(v, v, 2)); } // Sets the upper two single-precision, floating-point values with 64 // bits of data loaded from the address p; the lower two values are passed // through from a. // // r0 := a0 // r1 := a1 // r2 := *p0 // r3 := *p1 // // https://msdn.microsoft.com/en-us/library/w92wta0x(v%3dvs.100).aspx FORCE_INLINE __m128 _mm_loadh_pi(__m128 a, __m64 const *p) { return vreinterpretq_m128_f32( vcombine_f32(vget_low_f32(a), vld1_f32((const float32_t *) p))); } // Loads four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/zzd50xxt(v=vs.100).aspx FORCE_INLINE __m128 _mm_load_ps(const float *p) { return vreinterpretq_m128_f32(vld1q_f32(p)); } // Loads four single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/x1b16s7z%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_loadu_ps(const float *p) { // for neon, alignment doesn't matter, so _mm_load_ps and _mm_loadu_ps are // equivalent for neon return vreinterpretq_m128_f32(vld1q_f32(p)); } // Load unaligned 16-bit integer from memory into the first element of dst. 
// // dst[15:0] := MEM[mem_addr+15:mem_addr] // dst[MAX:16] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si16 FORCE_INLINE __m128i _mm_loadu_si16(const void *p) { return vreinterpretq_m128i_s16( vsetq_lane_s16(*(const int16_t *) p, vdupq_n_s16(0), 0)); } // Load unaligned 64-bit integer from memory into the first element of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[MAX:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si64 FORCE_INLINE __m128i _mm_loadu_si64(const void *p) { return vreinterpretq_m128i_s64( vcombine_s64(vld1_s64((const int64_t *) p), vdup_n_s64(0))); } // Load a double-precision (64-bit) floating-point element from memory into the // lower of dst, and zero the upper element. mem_addr does not need to be // aligned on any particular boundary. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_sd FORCE_INLINE __m128d _mm_load_sd(const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vsetq_lane_f64(*p, vdupq_n_f64(0), 0)); #else const float *fp = (const float *) p; float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], 0, 0}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif } // Loads two double-precision from 16-byte aligned memory, floating-point // values. // // dst[127:0] := MEM[mem_addr+127:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd FORCE_INLINE __m128d _mm_load_pd(const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vld1q_f64(p)); #else const float *fp = (const float *) p; float ALIGN_STRUCT(16) data[4] = {fp[0], fp[1], fp[2], fp[3]}; return vreinterpretq_m128d_f32(vld1q_f32(data)); #endif } // Loads two double-precision from unaligned memory, floating-point values. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_pd FORCE_INLINE __m128d _mm_loadu_pd(const double *p) { return _mm_load_pd(p); } // Loads an single - precision, floating - point value into the low word and // clears the upper three words. // https://msdn.microsoft.com/en-us/library/548bb9h4%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_load_ss(const float *p) { return vreinterpretq_m128_f32(vsetq_lane_f32(*p, vdupq_n_f32(0), 0)); } // Load 64-bit integer from memory into the first element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_epi64 FORCE_INLINE __m128i _mm_loadl_epi64(__m128i const *p) { /* Load the lower 64 bits of the value pointed to by p into the * lower 64 bits of the result, zeroing the upper 64 bits of the result. */ return vreinterpretq_m128i_s32( vcombine_s32(vld1_s32((int32_t const *) p), vcreate_s32(0))); } // Load a double-precision (64-bit) floating-point element from memory into the // lower element of dst, and copy the upper element from a to dst. mem_addr does // not need to be aligned on any particular boundary. 
// // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadl_pd FORCE_INLINE __m128d _mm_loadl_pd(__m128d a, const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcombine_f64(vld1_f64(p), vget_high_f64(vreinterpretq_f64_m128d(a)))); #else return vreinterpretq_m128d_f32( vcombine_f32(vld1_f32((const float *) p), vget_high_f32(vreinterpretq_f32_m128d(a)))); #endif } // Load 2 double-precision (64-bit) floating-point elements from memory into dst // in reverse order. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // dst[63:0] := MEM[mem_addr+127:mem_addr+64] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadr_pd FORCE_INLINE __m128d _mm_loadr_pd(const double *p) { #if defined(__aarch64__) float64x2_t v = vld1q_f64(p); return vreinterpretq_m128d_f64(vextq_f64(v, v, 1)); #else int64x2_t v = vld1q_s64((const int64_t *) p); return vreinterpretq_m128d_s64(vextq_s64(v, v, 1)); #endif } // Sets the low word to the single-precision, floating-point value of b // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/35hdzazd(v=vs.100) FORCE_INLINE __m128 _mm_move_ss(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vsetq_lane_f32(vgetq_lane_f32(vreinterpretq_f32_m128(b), 0), vreinterpretq_f32_m128(a), 0)); } // Move the lower double-precision (64-bit) floating-point element from b to the // lower element of dst, and copy the upper element from a to the upper element // of dst. // // dst[63:0] := b[63:0] // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_sd FORCE_INLINE __m128d _mm_move_sd(__m128d a, __m128d b) { return vreinterpretq_m128d_f32( vcombine_f32(vget_low_f32(vreinterpretq_f32_m128d(b)), vget_high_f32(vreinterpretq_f32_m128d(a)))); } // Copy the lower 64-bit integer in a to the lower element of dst, and zero the // upper element. // // dst[63:0] := a[63:0] // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_move_epi64 FORCE_INLINE __m128i _mm_move_epi64(__m128i a) { return vreinterpretq_m128i_s64( vsetq_lane_s64(0, vreinterpretq_s64_m128i(a), 1)); } // Return vector of type __m128 with undefined elements. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_undefined_ps FORCE_INLINE __m128 _mm_undefined_ps(void) { #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic push #pragma GCC diagnostic ignored "-Wuninitialized" #endif __m128 a; return a; #if defined(__GNUC__) || defined(__clang__) #pragma GCC diagnostic pop #endif } /* Logic/Binary operations */ // Computes the bitwise AND-NOT of the four single-precision, floating-point // values of a and b. // // r0 := ~a0 & b0 // r1 := ~a1 & b1 // r2 := ~a2 & b2 // r3 := ~a3 & b3 // // https://msdn.microsoft.com/en-us/library/vstudio/68h7wd02(v=vs.100).aspx FORCE_INLINE __m128 _mm_andnot_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vbicq_s32(vreinterpretq_s32_m128(b), vreinterpretq_s32_m128(a))); // *NOTE* argument swap } // Compute the bitwise NOT of packed double-precision (64-bit) floating-point // elements in a and then AND with b, and store the results in dst. 
// // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := ((NOT a[i+63:i]) AND b[i+63:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_andnot_pd FORCE_INLINE __m128d _mm_andnot_pd(__m128d a, __m128d b) { // *NOTE* argument swap return vreinterpretq_m128d_s64( vbicq_s64(vreinterpretq_s64_m128d(b), vreinterpretq_s64_m128d(a))); } // Computes the bitwise AND of the 128-bit value in b and the bitwise NOT of the // 128-bit value in a. // // r := (~a) & b // // https://msdn.microsoft.com/en-us/library/vstudio/1beaceh8(v=vs.100).aspx FORCE_INLINE __m128i _mm_andnot_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vbicq_s32(vreinterpretq_s32_m128i(b), vreinterpretq_s32_m128i(a))); // *NOTE* argument swap } // Computes the bitwise AND of the 128-bit value in a and the 128-bit value in // b. // // r := a & b // // https://msdn.microsoft.com/en-us/library/vstudio/6d1txsa8(v=vs.100).aspx FORCE_INLINE __m128i _mm_and_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vandq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Computes the bitwise AND of the four single-precision, floating-point values // of a and b. // // r0 := a0 & b0 // r1 := a1 & b1 // r2 := a2 & b2 // r3 := a3 & b3 // // https://msdn.microsoft.com/en-us/library/vstudio/73ck1xc5(v=vs.100).aspx FORCE_INLINE __m128 _mm_and_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vandq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } // Compute the bitwise AND of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := a[i+63:i] AND b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_and_pd FORCE_INLINE __m128d _mm_and_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( vandq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Computes the bitwise OR of the four single-precision, floating-point values // of a and b. // https://msdn.microsoft.com/en-us/library/vstudio/7ctdsyy0(v=vs.100).aspx FORCE_INLINE __m128 _mm_or_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( vorrq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } // Computes bitwise EXOR (exclusive-or) of the four single-precision, // floating-point values of a and b. // https://msdn.microsoft.com/en-us/library/ss6k3wk8(v=vs.100).aspx FORCE_INLINE __m128 _mm_xor_ps(__m128 a, __m128 b) { return vreinterpretq_m128_s32( veorq_s32(vreinterpretq_s32_m128(a), vreinterpretq_s32_m128(b))); } // Compute the bitwise XOR of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := a[i+63:i] XOR b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_xor_pd FORCE_INLINE __m128d _mm_xor_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( veorq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Compute the bitwise OR of packed double-precision (64-bit) floating-point // elements in a and b, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_or_pd FORCE_INLINE __m128d _mm_or_pd(__m128d a, __m128d b) { return vreinterpretq_m128d_s64( vorrq_s64(vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b))); } // Computes the bitwise OR of the 128-bit value in a and the 128-bit value in b. 
// // r := a | b // // https://msdn.microsoft.com/en-us/library/vstudio/ew8ty0db(v=vs.100).aspx FORCE_INLINE __m128i _mm_or_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vorrq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Computes the bitwise XOR of the 128-bit value in a and the 128-bit value in // b. https://msdn.microsoft.com/en-us/library/fzt08www(v=vs.100).aspx FORCE_INLINE __m128i _mm_xor_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( veorq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Duplicate the low double-precision (64-bit) floating-point element from a, // and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movedup_pd FORCE_INLINE __m128d _mm_movedup_pd(__m128d a) { #if (__aarch64__) return vreinterpretq_m128d_f64( vdupq_laneq_f64(vreinterpretq_f64_m128d(a), 0)); #else return vreinterpretq_m128d_u64( vdupq_n_u64(vgetq_lane_u64(vreinterpretq_u64_m128d(a), 0))); #endif } // Duplicate odd-indexed single-precision (32-bit) floating-point elements // from a, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movehdup_ps FORCE_INLINE __m128 _mm_movehdup_ps(__m128 a) { #if __has_builtin(__builtin_shufflevector) return vreinterpretq_m128_f32(__builtin_shufflevector( vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 1, 1, 3, 3)); #else float32_t a1 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); float32_t a3 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 3); float ALIGN_STRUCT(16) data[4] = {a1, a1, a3, a3}; return vreinterpretq_m128_f32(vld1q_f32(data)); #endif } // Duplicate even-indexed single-precision (32-bit) floating-point elements // from a, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_moveldup_ps FORCE_INLINE __m128 _mm_moveldup_ps(__m128 a) { #if __has_builtin(__builtin_shufflevector) return vreinterpretq_m128_f32(__builtin_shufflevector( vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 0, 0, 2, 2)); #else float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32_t a2 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 2); float ALIGN_STRUCT(16) data[4] = {a0, a0, a2, a2}; return vreinterpretq_m128_f32(vld1q_f32(data)); #endif } // Moves the upper two values of B into the lower two values of A. // // r3 := a3 // r2 := a2 // r1 := b3 // r0 := b2 FORCE_INLINE __m128 _mm_movehl_ps(__m128 __A, __m128 __B) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(__A)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(__B)); return vreinterpretq_m128_f32(vcombine_f32(b32, a32)); } // Moves the lower two values of B into the upper two values of A. // // r3 := b1 // r2 := b0 // r1 := a1 // r0 := a0 FORCE_INLINE __m128 _mm_movelh_ps(__m128 __A, __m128 __B) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(__A)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(__B)); return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); } // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 3 // i := j*32 // dst[i+31:i] := ABS(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi32 FORCE_INLINE __m128i _mm_abs_epi32(__m128i a) { return vreinterpretq_m128i_s32(vabsq_s32(vreinterpretq_s32_m128i(a))); } // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. 
// // FOR j := 0 to 7 // i := j*16 // dst[i+15:i] := ABS(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi16 FORCE_INLINE __m128i _mm_abs_epi16(__m128i a) { return vreinterpretq_m128i_s16(vabsq_s16(vreinterpretq_s16_m128i(a))); } // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 15 // i := j*8 // dst[i+7:i] := ABS(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_epi8 FORCE_INLINE __m128i _mm_abs_epi8(__m128i a) { return vreinterpretq_m128i_s8(vabsq_s8(vreinterpretq_s8_m128i(a))); } // Compute the absolute value of packed signed 32-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 1 // i := j*32 // dst[i+31:i] := ABS(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi32 FORCE_INLINE __m64 _mm_abs_pi32(__m64 a) { return vreinterpret_m64_s32(vabs_s32(vreinterpret_s32_m64(a))); } // Compute the absolute value of packed signed 16-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := ABS(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi16 FORCE_INLINE __m64 _mm_abs_pi16(__m64 a) { return vreinterpret_m64_s16(vabs_s16(vreinterpret_s16_m64(a))); } // Compute the absolute value of packed signed 8-bit integers in a, and store // the unsigned results in dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := ABS(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_abs_pi8 FORCE_INLINE __m64 _mm_abs_pi8(__m64 a) { return vreinterpret_m64_s8(vabs_s8(vreinterpret_s8_m64(a))); } // Concatenate 16-byte blocks in a and b into a 32-byte temporary result, shift // the result right by imm8 bytes, and store the low 16 bytes in dst. // // tmp[255:0] := ((a[127:0] << 128)[255:0] OR b[127:0]) >> (imm8*8) // dst[127:0] := tmp[127:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_epi8 #define _mm_alignr_epi8(a, b, imm) \ __extension__({ \ __m128i ret; \ if (unlikely((imm) >= 32)) { \ ret = _mm_setzero_si128(); \ } else { \ uint8x16_t tmp_low, tmp_high; \ if (imm >= 16) { \ const int idx = imm - 16; \ tmp_low = vreinterpretq_u8_m128i(a); \ tmp_high = vdupq_n_u8(0); \ ret = \ vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ } else { \ const int idx = imm; \ tmp_low = vreinterpretq_u8_m128i(b); \ tmp_high = vreinterpretq_u8_m128i(a); \ ret = \ vreinterpretq_m128i_u8(vextq_u8(tmp_low, tmp_high, idx)); \ } \ } \ ret; \ }) // Concatenate 8-byte blocks in a and b into a 16-byte temporary result, shift // the result right by imm8 bytes, and store the low 8 bytes in dst. 
// // tmp[127:0] := ((a[63:0] << 64)[127:0] OR b[63:0]) >> (imm8*8) // dst[63:0] := tmp[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_alignr_pi8 #define _mm_alignr_pi8(a, b, imm) \ __extension__({ \ __m64 ret; \ if (unlikely((imm) >= 16)) { \ ret = vreinterpret_m64_s8(vdup_n_s8(0)); \ } else { \ uint8x8_t tmp_low, tmp_high; \ if (imm >= 8) { \ const int idx = imm - 8; \ tmp_low = vreinterpret_u8_m64(a); \ tmp_high = vdup_n_u8(0); \ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ } else { \ const int idx = imm; \ tmp_low = vreinterpret_u8_m64(b); \ tmp_high = vreinterpret_u8_m64(a); \ ret = vreinterpret_m64_u8(vext_u8(tmp_low, tmp_high, idx)); \ } \ } \ ret; \ }) // Takes the upper 64 bits of a and places it in the low end of the result // Takes the lower 64 bits of b and places it into the high end of the result. FORCE_INLINE __m128 _mm_shuffle_ps_1032(__m128 a, __m128 b) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a32, b10)); } // takes the lower two 32-bit values from a and swaps them and places in high // end of result takes the higher two 32 bit values from b and swaps them and // places in low end of result. FORCE_INLINE __m128 _mm_shuffle_ps_2301(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b23 = vrev64_f32(vget_high_f32(vreinterpretq_f32_m128(b))); return vreinterpretq_m128_f32(vcombine_f32(a01, b23)); } FORCE_INLINE __m128 _mm_shuffle_ps_0321(__m128 a, __m128 b) { float32x2_t a21 = vget_high_f32( vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); float32x2_t b03 = vget_low_f32( vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); return vreinterpretq_m128_f32(vcombine_f32(a21, b03)); } FORCE_INLINE __m128 _mm_shuffle_ps_2103(__m128 a, __m128 b) { float32x2_t a03 = vget_low_f32( vextq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a), 3)); float32x2_t b21 = vget_high_f32( vextq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b), 3)); return vreinterpretq_m128_f32(vcombine_f32(a03, b21)); } FORCE_INLINE __m128 _mm_shuffle_ps_1010(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a10, b10)); } FORCE_INLINE __m128 _mm_shuffle_ps_1001(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a01, b10)); } FORCE_INLINE __m128 _mm_shuffle_ps_0101(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32x2_t b01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(b))); return vreinterpretq_m128_f32(vcombine_f32(a01, b01)); } // keeps the low 64 bits of b in the low and puts the high 64 bits of a in the // high FORCE_INLINE __m128 _mm_shuffle_ps_3210(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a10, b32)); } FORCE_INLINE __m128 _mm_shuffle_ps_0011(__m128 a, __m128 b) { float32x2_t a11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 1); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a11, b00)); } 
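// Illustrative sketch (not part of sse2neon or the SSE API): each of the
// fixed-pattern helpers in this block realizes one _MM_SHUFFLE mask by
// recombining 64-bit NEON halves. For a = {a0,a1,a2,a3} and b = {b0,b1,b2,b3},
// _mm_shuffle_ps_1032(a, b) yields {a2, a3, b0, b1}, the same lanes that
// _mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)) selects. The function name
// below is made up for this example; it relies only on NEON intrinsics and the
// _mm_shuffle_ps_1032 helper defined above, and returns 1 on the expected
// result.
FORCE_INLINE int sse2neon_example_shuffle_1032(void)
{
    const float a_in[4] = {0.0f, 1.0f, 2.0f, 3.0f}; // a0..a3
    const float b_in[4] = {4.0f, 5.0f, 6.0f, 7.0f}; // b0..b3
    __m128 a = vreinterpretq_m128_f32(vld1q_f32(a_in));
    __m128 b = vreinterpretq_m128_f32(vld1q_f32(b_in));
    __m128 r = _mm_shuffle_ps_1032(a, b); // expect {2, 3, 4, 5}
    return vgetq_lane_f32(vreinterpretq_f32_m128(r), 0) == 2.0f
        && vgetq_lane_f32(vreinterpretq_f32_m128(r), 3) == 5.0f;
}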
FORCE_INLINE __m128 _mm_shuffle_ps_0022(__m128 a, __m128 b) { float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a22, b00)); } FORCE_INLINE __m128 _mm_shuffle_ps_2200(__m128 a, __m128 b) { float32x2_t a00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t b22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(b)), 0); return vreinterpretq_m128_f32(vcombine_f32(a00, b22)); } FORCE_INLINE __m128 _mm_shuffle_ps_3202(__m128 a, __m128 b) { float32_t a0 = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32x2_t a22 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 0); float32x2_t a02 = vset_lane_f32(a0, a22, 1); /* TODO: use vzip ?*/ float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32(vcombine_f32(a02, b32)); } FORCE_INLINE __m128 _mm_shuffle_ps_1133(__m128 a, __m128 b) { float32x2_t a33 = vdup_lane_f32(vget_high_f32(vreinterpretq_f32_m128(a)), 1); float32x2_t b11 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 1); return vreinterpretq_m128_f32(vcombine_f32(a33, b11)); } FORCE_INLINE __m128 _mm_shuffle_ps_2010(__m128 a, __m128 b) { float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32_t b2 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a10, b20)); } FORCE_INLINE __m128 _mm_shuffle_ps_2001(__m128 a, __m128 b) { float32x2_t a01 = vrev64_f32(vget_low_f32(vreinterpretq_f32_m128(a))); float32_t b2 = vgetq_lane_f32(b, 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a01, b20)); } FORCE_INLINE __m128 _mm_shuffle_ps_2032(__m128 a, __m128 b) { float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32_t b2 = vgetq_lane_f32(b, 2); float32x2_t b00 = vdup_lane_f32(vget_low_f32(vreinterpretq_f32_m128(b)), 0); float32x2_t b20 = vset_lane_f32(b2, b00, 1); return vreinterpretq_m128_f32(vcombine_f32(a32, b20)); } // NEON does not support a general purpose permute intrinsic // Selects four specific single-precision, floating-point values from a and b, // based on the mask i. 
// // C equivalent: // __m128 _mm_shuffle_ps_default(__m128 a, __m128 b, // __constrange(0, 255) int imm) { // __m128 ret; // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; // ret[2] = b[(imm >> 4) & 0x03]; ret[3] = b[(imm >> 6) & 0x03]; // return ret; // } // // https://msdn.microsoft.com/en-us/library/vstudio/5f0858x0(v=vs.100).aspx #define _mm_shuffle_ps_default(a, b, imm) \ __extension__({ \ float32x4_t ret; \ ret = vmovq_n_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(a), (imm) & (0x3))); \ ret = vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(a), ((imm) >> 2) & 0x3), \ ret, 1); \ ret = vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 4) & 0x3), \ ret, 2); \ ret = vsetq_lane_f32( \ vgetq_lane_f32(vreinterpretq_f32_m128(b), ((imm) >> 6) & 0x3), \ ret, 3); \ vreinterpretq_m128_f32(ret); \ }) // FORCE_INLINE __m128 _mm_shuffle_ps(__m128 a, __m128 b, __constrange(0,255) // int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_ps(a, b, imm) \ __extension__({ \ float32x4_t _input1 = vreinterpretq_f32_m128(a); \ float32x4_t _input2 = vreinterpretq_f32_m128(b); \ float32x4_t _shuf = __builtin_shufflevector( \ _input1, _input2, (imm) & (0x3), ((imm) >> 2) & 0x3, \ (((imm) >> 4) & 0x3) + 4, (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128_f32(_shuf); \ }) #else // generic #define _mm_shuffle_ps(a, b, imm) \ __extension__({ \ __m128 ret; \ switch (imm) { \ case _MM_SHUFFLE(1, 0, 3, 2): \ ret = _mm_shuffle_ps_1032((a), (b)); \ break; \ case _MM_SHUFFLE(2, 3, 0, 1): \ ret = _mm_shuffle_ps_2301((a), (b)); \ break; \ case _MM_SHUFFLE(0, 3, 2, 1): \ ret = _mm_shuffle_ps_0321((a), (b)); \ break; \ case _MM_SHUFFLE(2, 1, 0, 3): \ ret = _mm_shuffle_ps_2103((a), (b)); \ break; \ case _MM_SHUFFLE(1, 0, 1, 0): \ ret = _mm_movelh_ps((a), (b)); \ break; \ case _MM_SHUFFLE(1, 0, 0, 1): \ ret = _mm_shuffle_ps_1001((a), (b)); \ break; \ case _MM_SHUFFLE(0, 1, 0, 1): \ ret = _mm_shuffle_ps_0101((a), (b)); \ break; \ case _MM_SHUFFLE(3, 2, 1, 0): \ ret = _mm_shuffle_ps_3210((a), (b)); \ break; \ case _MM_SHUFFLE(0, 0, 1, 1): \ ret = _mm_shuffle_ps_0011((a), (b)); \ break; \ case _MM_SHUFFLE(0, 0, 2, 2): \ ret = _mm_shuffle_ps_0022((a), (b)); \ break; \ case _MM_SHUFFLE(2, 2, 0, 0): \ ret = _mm_shuffle_ps_2200((a), (b)); \ break; \ case _MM_SHUFFLE(3, 2, 0, 2): \ ret = _mm_shuffle_ps_3202((a), (b)); \ break; \ case _MM_SHUFFLE(3, 2, 3, 2): \ ret = _mm_movehl_ps((b), (a)); \ break; \ case _MM_SHUFFLE(1, 1, 3, 3): \ ret = _mm_shuffle_ps_1133((a), (b)); \ break; \ case _MM_SHUFFLE(2, 0, 1, 0): \ ret = _mm_shuffle_ps_2010((a), (b)); \ break; \ case _MM_SHUFFLE(2, 0, 0, 1): \ ret = _mm_shuffle_ps_2001((a), (b)); \ break; \ case _MM_SHUFFLE(2, 0, 3, 2): \ ret = _mm_shuffle_ps_2032((a), (b)); \ break; \ default: \ ret = _mm_shuffle_ps_default((a), (b), (imm)); \ break; \ } \ ret; \ }) #endif // Takes the upper 64 bits of a and places it in the low end of the result // Takes the lower 64 bits of a and places it into the high end of the result. FORCE_INLINE __m128i _mm_shuffle_epi_1032(__m128i a) { int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a32, a10)); } // takes the lower two 32-bit values from a and swaps them and places in low end // of result takes the higher two 32 bit values from a and swaps them and places // in high end of result. 
FORCE_INLINE __m128i _mm_shuffle_epi_2301(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); int32x2_t a23 = vrev64_s32(vget_high_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a01, a23)); } // rotates the least significant 32 bits into the most signficant 32 bits, and // shifts the rest down FORCE_INLINE __m128i _mm_shuffle_epi_0321(__m128i a) { return vreinterpretq_m128i_s32( vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 1)); } // rotates the most significant 32 bits into the least signficant 32 bits, and // shifts the rest up FORCE_INLINE __m128i _mm_shuffle_epi_2103(__m128i a) { return vreinterpretq_m128i_s32( vextq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(a), 3)); } // gets the lower 64 bits of a, and places it in the upper 64 bits // gets the lower 64 bits of a and places it in the lower 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_1010(__m128i a) { int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a10, a10)); } // gets the lower 64 bits of a, swaps the 0 and 1 elements, and places it in the // lower 64 bits gets the lower 64 bits of a, and places it in the upper 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_1001(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); int32x2_t a10 = vget_low_s32(vreinterpretq_s32_m128i(a)); return vreinterpretq_m128i_s32(vcombine_s32(a01, a10)); } // gets the lower 64 bits of a, swaps the 0 and 1 elements and places it in the // upper 64 bits gets the lower 64 bits of a, swaps the 0 and 1 elements, and // places it in the lower 64 bits FORCE_INLINE __m128i _mm_shuffle_epi_0101(__m128i a) { int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a01, a01)); } FORCE_INLINE __m128i _mm_shuffle_epi_2211(__m128i a) { int32x2_t a11 = vdup_lane_s32(vget_low_s32(vreinterpretq_s32_m128i(a)), 1); int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); return vreinterpretq_m128i_s32(vcombine_s32(a11, a22)); } FORCE_INLINE __m128i _mm_shuffle_epi_0122(__m128i a) { int32x2_t a22 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 0); int32x2_t a01 = vrev64_s32(vget_low_s32(vreinterpretq_s32_m128i(a))); return vreinterpretq_m128i_s32(vcombine_s32(a22, a01)); } FORCE_INLINE __m128i _mm_shuffle_epi_3332(__m128i a) { int32x2_t a32 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t a33 = vdup_lane_s32(vget_high_s32(vreinterpretq_s32_m128i(a)), 1); return vreinterpretq_m128i_s32(vcombine_s32(a32, a33)); } // Shuffle packed 8-bit integers in a according to shuffle control mask in the // corresponding 8-bit element of b, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_epi8 FORCE_INLINE __m128i _mm_shuffle_epi8(__m128i a, __m128i b) { int8x16_t tbl = vreinterpretq_s8_m128i(a); // input a uint8x16_t idx = vreinterpretq_u8_m128i(b); // input b uint8x16_t idx_masked = vandq_u8(idx, vdupq_n_u8(0x8F)); // avoid using meaningless bits #if defined(__aarch64__) return vreinterpretq_m128i_s8(vqtbl1q_s8(tbl, idx_masked)); #elif defined(__GNUC__) int8x16_t ret; // %e and %f represent the even and odd D registers // respectively. 
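    // A single vtbl.8 writes only one D register (8 bytes), so the 16-byte
    // PSHUFB-style lookup is assembled from two vtbl.8 instructions indexing
    // the same two-register (16-byte) table; index bytes that keep bit 7 after
    // the 0x8F mask fall outside the table and read back as zero, matching
    // PSHUFB's zeroing behaviour.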
__asm__ __volatile__( "vtbl.8 %e[ret], {%e[tbl], %f[tbl]}, %e[idx]\n" "vtbl.8 %f[ret], {%e[tbl], %f[tbl]}, %f[idx]\n" : [ret] "=&w"(ret) : [tbl] "w"(tbl), [idx] "w"(idx_masked)); return vreinterpretq_m128i_s8(ret); #else // use this line if testing on aarch64 int8x8x2_t a_split = {vget_low_s8(tbl), vget_high_s8(tbl)}; return vreinterpretq_m128i_s8( vcombine_s8(vtbl2_s8(a_split, vget_low_u8(idx_masked)), vtbl2_s8(a_split, vget_high_u8(idx_masked)))); #endif } // C equivalent: // __m128i _mm_shuffle_epi32_default(__m128i a, // __constrange(0, 255) int imm) { // __m128i ret; // ret[0] = a[imm & 0x3]; ret[1] = a[(imm >> 2) & 0x3]; // ret[2] = a[(imm >> 4) & 0x03]; ret[3] = a[(imm >> 6) & 0x03]; // return ret; // } #define _mm_shuffle_epi32_default(a, imm) \ __extension__({ \ int32x4_t ret; \ ret = vmovq_n_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm) & (0x3))); \ ret = vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 2) & 0x3), \ ret, 1); \ ret = vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 4) & 0x3), \ ret, 2); \ ret = vsetq_lane_s32( \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), ((imm) >> 6) & 0x3), \ ret, 3); \ vreinterpretq_m128i_s32(ret); \ }) // FORCE_INLINE __m128i _mm_shuffle_epi32_splat(__m128i a, __constrange(0,255) // int imm) #if defined(__aarch64__) #define _mm_shuffle_epi32_splat(a, imm) \ __extension__({ \ vreinterpretq_m128i_s32( \ vdupq_laneq_s32(vreinterpretq_s32_m128i(a), (imm))); \ }) #else #define _mm_shuffle_epi32_splat(a, imm) \ __extension__({ \ vreinterpretq_m128i_s32( \ vdupq_n_s32(vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)))); \ }) #endif // Shuffles the 4 signed or unsigned 32-bit integers in a as specified by imm. // https://msdn.microsoft.com/en-us/library/56f67xbk%28v=vs.90%29.aspx // FORCE_INLINE __m128i _mm_shuffle_epi32(__m128i a, // __constrange(0,255) int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_epi32(a, imm) \ __extension__({ \ int32x4_t _input = vreinterpretq_s32_m128i(a); \ int32x4_t _shuf = __builtin_shufflevector( \ _input, _input, (imm) & (0x3), ((imm) >> 2) & 0x3, \ ((imm) >> 4) & 0x3, ((imm) >> 6) & 0x3); \ vreinterpretq_m128i_s32(_shuf); \ }) #else // generic #define _mm_shuffle_epi32(a, imm) \ __extension__({ \ __m128i ret; \ switch (imm) { \ case _MM_SHUFFLE(1, 0, 3, 2): \ ret = _mm_shuffle_epi_1032((a)); \ break; \ case _MM_SHUFFLE(2, 3, 0, 1): \ ret = _mm_shuffle_epi_2301((a)); \ break; \ case _MM_SHUFFLE(0, 3, 2, 1): \ ret = _mm_shuffle_epi_0321((a)); \ break; \ case _MM_SHUFFLE(2, 1, 0, 3): \ ret = _mm_shuffle_epi_2103((a)); \ break; \ case _MM_SHUFFLE(1, 0, 1, 0): \ ret = _mm_shuffle_epi_1010((a)); \ break; \ case _MM_SHUFFLE(1, 0, 0, 1): \ ret = _mm_shuffle_epi_1001((a)); \ break; \ case _MM_SHUFFLE(0, 1, 0, 1): \ ret = _mm_shuffle_epi_0101((a)); \ break; \ case _MM_SHUFFLE(2, 2, 1, 1): \ ret = _mm_shuffle_epi_2211((a)); \ break; \ case _MM_SHUFFLE(0, 1, 2, 2): \ ret = _mm_shuffle_epi_0122((a)); \ break; \ case _MM_SHUFFLE(3, 3, 3, 2): \ ret = _mm_shuffle_epi_3332((a)); \ break; \ case _MM_SHUFFLE(0, 0, 0, 0): \ ret = _mm_shuffle_epi32_splat((a), 0); \ break; \ case _MM_SHUFFLE(1, 1, 1, 1): \ ret = _mm_shuffle_epi32_splat((a), 1); \ break; \ case _MM_SHUFFLE(2, 2, 2, 2): \ ret = _mm_shuffle_epi32_splat((a), 2); \ break; \ case _MM_SHUFFLE(3, 3, 3, 3): \ ret = _mm_shuffle_epi32_splat((a), 3); \ break; \ default: \ ret = _mm_shuffle_epi32_default((a), (imm)); \ break; \ } \ ret; \ }) #endif // Shuffles the lower 4 signed or unsigned 16-bit integers in a 
as specified // by imm. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/y41dkk37(v=vs.100) // FORCE_INLINE __m128i _mm_shufflelo_epi16_function(__m128i a, // __constrange(0,255) int // imm) #define _mm_shufflelo_epi16_function(a, imm) \ __extension__({ \ int16x8_t ret = vreinterpretq_s16_m128i(a); \ int16x4_t lowBits = vget_low_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, (imm) & (0x3)), ret, 0); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 2) & 0x3), ret, \ 1); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 4) & 0x3), ret, \ 2); \ ret = vsetq_lane_s16(vget_lane_s16(lowBits, ((imm) >> 6) & 0x3), ret, \ 3); \ vreinterpretq_m128i_s16(ret); \ }) // FORCE_INLINE __m128i _mm_shufflelo_epi16(__m128i a, // __constrange(0,255) int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shufflelo_epi16(a, imm) \ __extension__({ \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = __builtin_shufflevector( \ _input, _input, ((imm) & (0x3)), (((imm) >> 2) & 0x3), \ (((imm) >> 4) & 0x3), (((imm) >> 6) & 0x3), 4, 5, 6, 7); \ vreinterpretq_m128i_s16(_shuf); \ }) #else // generic #define _mm_shufflelo_epi16(a, imm) _mm_shufflelo_epi16_function((a), (imm)) #endif // Shuffles the upper 4 signed or unsigned 16-bit integers in a as specified // by imm. // https://msdn.microsoft.com/en-us/library/13ywktbs(v=vs.100).aspx // FORCE_INLINE __m128i _mm_shufflehi_epi16_function(__m128i a, // __constrange(0,255) int // imm) #define _mm_shufflehi_epi16_function(a, imm) \ __extension__({ \ int16x8_t ret = vreinterpretq_s16_m128i(a); \ int16x4_t highBits = vget_high_s16(ret); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, (imm) & (0x3)), ret, 4); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 2) & 0x3), ret, \ 5); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 4) & 0x3), ret, \ 6); \ ret = vsetq_lane_s16(vget_lane_s16(highBits, ((imm) >> 6) & 0x3), ret, \ 7); \ vreinterpretq_m128i_s16(ret); \ }) // FORCE_INLINE __m128i _mm_shufflehi_epi16(__m128i a, // __constrange(0,255) int imm) #if __has_builtin(__builtin_shufflevector) #define _mm_shufflehi_epi16(a, imm) \ __extension__({ \ int16x8_t _input = vreinterpretq_s16_m128i(a); \ int16x8_t _shuf = __builtin_shufflevector( \ _input, _input, 0, 1, 2, 3, ((imm) & (0x3)) + 4, \ (((imm) >> 2) & 0x3) + 4, (((imm) >> 4) & 0x3) + 4, \ (((imm) >> 6) & 0x3) + 4); \ vreinterpretq_m128i_s16(_shuf); \ }) #else // generic #define _mm_shufflehi_epi16(a, imm) _mm_shufflehi_epi16_function((a), (imm)) #endif // Shuffle double-precision (64-bit) floating-point elements using the control // in imm8, and store the results in dst. // // dst[63:0] := (imm8[0] == 0) ? a[63:0] : a[127:64] // dst[127:64] := (imm8[1] == 0) ? b[63:0] : b[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_shuffle_pd #if __has_builtin(__builtin_shufflevector) #define _mm_shuffle_pd(a, b, imm8) \ vreinterpretq_m128d_s64(__builtin_shufflevector( \ vreinterpretq_s64_m128d(a), vreinterpretq_s64_m128d(b), imm8 & 0x1, \ ((imm8 & 0x2) >> 1) + 2)) #else #define _mm_shuffle_pd(a, b, imm8) \ _mm_castsi128_pd(_mm_set_epi64x( \ vgetq_lane_s64(vreinterpretq_s64_m128d(b), (imm8 & 0x2) >> 1), \ vgetq_lane_s64(vreinterpretq_s64_m128d(a), imm8 & 0x1))) #endif // Blend packed 16-bit integers from a and b using control mask imm8, and store // the results in dst. 
// // FOR j := 0 to 7 // i := j*16 // IF imm8[j] // dst[i+15:i] := b[i+15:i] // ELSE // dst[i+15:i] := a[i+15:i] // FI // ENDFOR // FORCE_INLINE __m128i _mm_blend_epi16(__m128i a, __m128i b, // __constrange(0,255) int imm) #define _mm_blend_epi16(a, b, imm) \ __extension__({ \ const uint16_t _mask[8] = {((imm) & (1 << 0)) ? 0xFFFF : 0x0000, \ ((imm) & (1 << 1)) ? 0xFFFF : 0x0000, \ ((imm) & (1 << 2)) ? 0xFFFF : 0x0000, \ ((imm) & (1 << 3)) ? 0xFFFF : 0x0000, \ ((imm) & (1 << 4)) ? 0xFFFF : 0x0000, \ ((imm) & (1 << 5)) ? 0xFFFF : 0x0000, \ ((imm) & (1 << 6)) ? 0xFFFF : 0x0000, \ ((imm) & (1 << 7)) ? 0xFFFF : 0x0000}; \ uint16x8_t _mask_vec = vld1q_u16(_mask); \ uint16x8_t _a = vreinterpretq_u16_m128i(a); \ uint16x8_t _b = vreinterpretq_u16_m128i(b); \ vreinterpretq_m128i_u16(vbslq_u16(_mask_vec, _b, _a)); \ }) // Blend packed 8-bit integers from a and b using mask, and store the results in // dst. // // FOR j := 0 to 15 // i := j*8 // IF mask[i+7] // dst[i+7:i] := b[i+7:i] // ELSE // dst[i+7:i] := a[i+7:i] // FI // ENDFOR FORCE_INLINE __m128i _mm_blendv_epi8(__m128i _a, __m128i _b, __m128i _mask) { // Use a signed shift right to create a mask with the sign bit uint8x16_t mask = vreinterpretq_u8_s8(vshrq_n_s8(vreinterpretq_s8_m128i(_mask), 7)); uint8x16_t a = vreinterpretq_u8_m128i(_a); uint8x16_t b = vreinterpretq_u8_m128i(_b); return vreinterpretq_m128i_u8(vbslq_u8(mask, b, a)); } /* Shifts */ // Shift packed 16-bit integers in a right by imm while shifting in sign // bits, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi16 FORCE_INLINE __m128i _mm_srai_epi16(__m128i a, int imm) { const int count = (imm & ~15) ? 15 : imm; return (__m128i) vshlq_s16((int16x8_t) a, vdupq_n_s16(-count)); } // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while // shifting in zeros. // // r0 := a0 << count // r1 := a1 << count // ... // r7 := a7 << count // // https://msdn.microsoft.com/en-us/library/es73bcsy(v=vs.90).aspx #define _mm_slli_epi16(a, imm) \ __extension__({ \ __m128i ret; \ if (unlikely((imm)) <= 0) { \ ret = a; \ } \ if (unlikely((imm) > 15)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_s16( \ vshlq_n_s16(vreinterpretq_s16_m128i(a), (imm))); \ } \ ret; \ }) // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while // shifting in zeros. : // https://msdn.microsoft.com/en-us/library/z2k3bbtb%28v=vs.90%29.aspx // FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, __constrange(0,255) int imm) FORCE_INLINE __m128i _mm_slli_epi32(__m128i a, int imm) { if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ return a; if (unlikely(imm > 31)) return _mm_setzero_si128(); return vreinterpretq_m128i_s32( vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(imm))); } // Shift packed 64-bit integers in a left by imm8 while shifting in zeros, and // store the results in dst. FORCE_INLINE __m128i _mm_slli_epi64(__m128i a, int imm) { if (unlikely(imm <= 0)) /* TODO: add constant range macro: [0, 255] */ return a; if (unlikely(imm > 63)) return _mm_setzero_si128(); return vreinterpretq_m128i_s64( vshlq_s64(vreinterpretq_s64_m128i(a), vdupq_n_s64(imm))); } // Shift packed 16-bit integers in a right by imm8 while shifting in zeros, and // store the results in dst. 
//
//   FOR j := 0 to 7
//     i := j*16
//     IF imm8[7:0] > 15
//       dst[i+15:i] := 0
//     ELSE
//       dst[i+15:i] := ZeroExtend16(a[i+15:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi16
#define _mm_srli_epi16(a, imm)                                               \
    __extension__({                                                          \
        __m128i ret;                                                         \
        if (unlikely((imm) == 0)) {                                          \
            ret = a;                                                         \
        } else if (likely(0 < (imm) && (imm) < 16)) {                        \
            ret = vreinterpretq_m128i_u16(                                   \
                vshlq_u16(vreinterpretq_u16_m128i(a), vdupq_n_s16(-imm)));   \
        } else {                                                             \
            ret = _mm_setzero_si128();                                       \
        }                                                                    \
        ret;                                                                 \
    })

// Shift packed 32-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 3
//     i := j*32
//     IF imm8[7:0] > 31
//       dst[i+31:i] := 0
//     ELSE
//       dst[i+31:i] := ZeroExtend32(a[i+31:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi32
// FORCE_INLINE __m128i _mm_srli_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_epi32(a, imm)                                               \
    __extension__({                                                          \
        __m128i ret;                                                         \
        if (unlikely((imm) == 0)) {                                          \
            ret = a;                                                         \
        } else if (likely(0 < (imm) && (imm) < 32)) {                        \
            ret = vreinterpretq_m128i_u32(                                   \
                vshlq_u32(vreinterpretq_u32_m128i(a), vdupq_n_s32(-imm)));   \
        } else {                                                             \
            ret = _mm_setzero_si128();                                       \
        }                                                                    \
        ret;                                                                 \
    })

// Shift packed 64-bit integers in a right by imm8 while shifting in zeros, and
// store the results in dst.
//
//   FOR j := 0 to 1
//     i := j*64
//     IF imm8[7:0] > 63
//       dst[i+63:i] := 0
//     ELSE
//       dst[i+63:i] := ZeroExtend64(a[i+63:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srli_epi64
#define _mm_srli_epi64(a, imm)                                               \
    __extension__({                                                          \
        __m128i ret;                                                         \
        if (unlikely((imm) == 0)) {                                          \
            ret = a;                                                         \
        } else if (likely(0 < (imm) && (imm) < 64)) {                        \
            ret = vreinterpretq_m128i_u64(                                   \
                vshlq_u64(vreinterpretq_u64_m128i(a), vdupq_n_s64(-imm)));   \
        } else {                                                             \
            ret = _mm_setzero_si128();                                       \
        }                                                                    \
        ret;                                                                 \
    })

// Shift packed 32-bit integers in a right by imm8 while shifting in sign bits,
// and store the results in dst.
//
//   FOR j := 0 to 3
//     i := j*32
//     IF imm8[7:0] > 31
//       dst[i+31:i] := (a[i+31] ? 0xFFFFFFFF : 0x0)
//     ELSE
//       dst[i+31:i] := SignExtend32(a[i+31:i] >> imm8[7:0])
//     FI
//   ENDFOR
//
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_srai_epi32
// FORCE_INLINE __m128i _mm_srai_epi32(__m128i a, __constrange(0,255) int imm)
#define _mm_srai_epi32(a, imm)                                               \
    __extension__({                                                          \
        __m128i ret;                                                         \
        if (unlikely((imm) == 0)) {                                          \
            ret = a;                                                         \
        } else if (likely(0 < (imm) && (imm) < 32)) {                        \
            ret = vreinterpretq_m128i_s32(                                   \
                vshlq_s32(vreinterpretq_s32_m128i(a), vdupq_n_s32(-imm)));   \
        } else {                                                             \
            ret = vreinterpretq_m128i_s32(                                   \
                vshrq_n_s32(vreinterpretq_s32_m128i(a), 31));                \
        }                                                                    \
        ret;                                                                 \
    })

// Shifts the 128-bit value in a right by imm bytes while shifting in zeros.
// imm must be an immediate.
//
//   r := srl(a, imm*8)
//
// https://msdn.microsoft.com/en-us/library/305w28yz(v=vs.100).aspx
// FORCE_INLINE _mm_srli_si128(__m128i a, __constrange(0,255) int imm)
#define _mm_srli_si128(a, imm)                                               \
    __extension__({                                                          \
        __m128i ret;                                                         \
        if (unlikely((imm) <= 0)) {                                          \
            ret = a;                                                         \
        } else if (unlikely((imm) > 15)) {                                   \
            ret = _mm_setzero_si128();                                       \
        } else {                                                             \
            ret = vreinterpretq_m128i_s8(                                    \
                vextq_s8(vreinterpretq_s8_m128i(a), vdupq_n_s8(0), (imm)));  \
        }                                                                    \
        ret;                                                                 \
    })

// Shifts the 128-bit value in a left by imm bytes while shifting in zeros. imm
// must be an immediate.
// // r := a << (imm * 8) // // https://msdn.microsoft.com/en-us/library/34d3k2kt(v=vs.100).aspx // FORCE_INLINE __m128i _mm_slli_si128(__m128i a, __constrange(0,255) int imm) #define _mm_slli_si128(a, imm) \ __extension__({ \ __m128i ret; \ if (unlikely((imm) <= 0)) { \ ret = a; \ } \ if (unlikely((imm) > 15)) { \ ret = _mm_setzero_si128(); \ } else { \ ret = vreinterpretq_m128i_s8(vextq_s8( \ vdupq_n_s8(0), vreinterpretq_s8_m128i(a), 16 - (imm))); \ } \ ret; \ }) // Shifts the 8 signed or unsigned 16-bit integers in a left by count bits while // shifting in zeros. // // r0 := a0 << count // r1 := a1 << count // ... // r7 := a7 << count // // https://msdn.microsoft.com/en-us/library/c79w388h(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_sll_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (unlikely(c > 15)) return _mm_setzero_si128(); int16x8_t vc = vdupq_n_s16((int16_t) c); return vreinterpretq_m128i_s16(vshlq_s16(vreinterpretq_s16_m128i(a), vc)); } // Shifts the 4 signed or unsigned 32-bit integers in a left by count bits while // shifting in zeros. // // r0 := a0 << count // r1 := a1 << count // r2 := a2 << count // r3 := a3 << count // // https://msdn.microsoft.com/en-us/library/6fe5a6s9(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_sll_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (unlikely(c > 31)) return _mm_setzero_si128(); int32x4_t vc = vdupq_n_s32((int32_t) c); return vreinterpretq_m128i_s32(vshlq_s32(vreinterpretq_s32_m128i(a), vc)); } // Shifts the 2 signed or unsigned 64-bit integers in a left by count bits while // shifting in zeros. // // r0 := a0 << count // r1 := a1 << count // // https://msdn.microsoft.com/en-us/library/6ta9dffd(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_sll_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (unlikely(c > 63)) return _mm_setzero_si128(); int64x2_t vc = vdupq_n_s64((int64_t) c); return vreinterpretq_m128i_s64(vshlq_s64(vreinterpretq_s64_m128i(a), vc)); } // Shifts the 8 signed or unsigned 16-bit integers in a right by count bits // while shifting in zeros. // // r0 := srl(a0, count) // r1 := srl(a1, count) // ... // r7 := srl(a7, count) // // https://msdn.microsoft.com/en-us/library/wd5ax830(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_srl_epi16(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (unlikely(c > 15)) return _mm_setzero_si128(); int16x8_t vc = vdupq_n_s16(-(int16_t) c); return vreinterpretq_m128i_u16(vshlq_u16(vreinterpretq_u16_m128i(a), vc)); } // Shifts the 4 signed or unsigned 32-bit integers in a right by count bits // while shifting in zeros. // // r0 := srl(a0, count) // r1 := srl(a1, count) // r2 := srl(a2, count) // r3 := srl(a3, count) // // https://msdn.microsoft.com/en-us/library/a9cbttf4(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_srl_epi32(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (unlikely(c > 31)) return _mm_setzero_si128(); int32x4_t vc = vdupq_n_s32(-(int32_t) c); return vreinterpretq_m128i_u32(vshlq_u32(vreinterpretq_u32_m128i(a), vc)); } // Shifts the 2 signed or unsigned 64-bit integers in a right by count bits // while shifting in zeros. 
// // r0 := srl(a0, count) // r1 := srl(a1, count) // // https://msdn.microsoft.com/en-us/library/yf6cf9k8(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_srl_epi64(__m128i a, __m128i count) { uint64_t c = vreinterpretq_nth_u64_m128i(count, 0); if (unlikely(c > 63)) return _mm_setzero_si128(); int64x2_t vc = vdupq_n_s64(-(int64_t) c); return vreinterpretq_m128i_u64(vshlq_u64(vreinterpretq_u64_m128i(a), vc)); } // NEON does not provide a version of this function. // Creates a 16-bit mask from the most significant bits of the 16 signed or // unsigned 8-bit integers in a and zero extends the upper bits. // https://msdn.microsoft.com/en-us/library/vstudio/s090c8fk(v=vs.100).aspx FORCE_INLINE int _mm_movemask_epi8(__m128i a) { // Use increasingly wide shifts+adds to collect the sign bits // together. // Since the widening shifts would be rather confusing to follow in little // endian, everything will be illustrated in big endian order instead. This // has a different result - the bits would actually be reversed on a big // endian machine. // Starting input (only half the elements are shown): // 89 ff 1d c0 00 10 99 33 uint8x16_t input = vreinterpretq_u8_m128i(a); // Shift out everything but the sign bits with an unsigned shift right. // // Bytes of the vector:: // 89 ff 1d c0 00 10 99 33 // \ \ \ \ \ \ \ \ high_bits = (uint16x4_t)(input >> 7) // | | | | | | | | // 01 01 00 01 00 00 01 00 // // Bits of first important lane(s): // 10001001 (89) // \______ // | // 00000001 (01) uint16x8_t high_bits = vreinterpretq_u16_u8(vshrq_n_u8(input, 7)); // Merge the even lanes together with a 16-bit unsigned shift right + add. // 'xx' represents garbage data which will be ignored in the final result. // In the important bytes, the add functions like a binary OR. // // 01 01 00 01 00 00 01 00 // \_ | \_ | \_ | \_ | paired16 = (uint32x4_t)(input + (input >> 7)) // \| \| \| \| // xx 03 xx 01 xx 00 xx 02 // // 00000001 00000001 (01 01) // \_______ | // \| // xxxxxxxx xxxxxx11 (xx 03) uint32x4_t paired16 = vreinterpretq_u32_u16(vsraq_n_u16(high_bits, high_bits, 7)); // Repeat with a wider 32-bit shift + add. // xx 03 xx 01 xx 00 xx 02 // \____ | \____ | paired32 = (uint64x1_t)(paired16 + (paired16 >> // 14)) // \| \| // xx xx xx 0d xx xx xx 02 // // 00000011 00000001 (03 01) // \\_____ || // '----.\|| // xxxxxxxx xxxx1101 (xx 0d) uint64x2_t paired32 = vreinterpretq_u64_u32(vsraq_n_u32(paired16, paired16, 14)); // Last, an even wider 64-bit shift + add to get our result in the low 8 bit // lanes. xx xx xx 0d xx xx xx 02 // \_________ | paired64 = (uint8x8_t)(paired32 + (paired32 >> // 28)) // \| // xx xx xx xx xx xx xx d2 // // 00001101 00000010 (0d 02) // \ \___ | | // '---. \| | // xxxxxxxx 11010010 (xx d2) uint8x16_t paired64 = vreinterpretq_u8_u64(vsraq_n_u64(paired32, paired32, 28)); // Extract the low 8 bits from each 64-bit lane with 2 8-bit extracts. // xx xx xx xx xx xx xx d2 // || return paired64[0] // d2 // Note: Little endian would return the correct value 4b (01001011) instead. return vgetq_lane_u8(paired64, 0) | ((int) vgetq_lane_u8(paired64, 8) << 8); } // Copy the lower 64-bit integer in a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movepi64_pi64 FORCE_INLINE __m64 _mm_movepi64_pi64(__m128i a) { return vreinterpret_m64_s64(vget_low_s64(vreinterpretq_s64_m128i(a))); } // Copy the 64-bit integer a to the lower element of dst, and zero the upper // element. 
// // dst[63:0] := a[63:0] // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_movpi64_epi64 FORCE_INLINE __m128i _mm_movpi64_epi64(__m64 a) { return vreinterpretq_m128i_s64( vcombine_s64(vreinterpret_s64_m64(a), vdup_n_s64(0))); } // NEON does not provide this method // Creates a 4-bit mask from the most significant bits of the four // single-precision, floating-point values. // https://msdn.microsoft.com/en-us/library/vstudio/4490ys29(v=vs.100).aspx FORCE_INLINE int _mm_movemask_ps(__m128 a) { uint32x4_t input = vreinterpretq_u32_m128(a); #if defined(__aarch64__) static const int32x4_t shift = {0, 1, 2, 3}; uint32x4_t tmp = vshrq_n_u32(input, 31); return vaddvq_u32(vshlq_u32(tmp, shift)); #else // Uses the exact same method as _mm_movemask_epi8, see that for details. // Shift out everything but the sign bits with a 32-bit unsigned shift // right. uint64x2_t high_bits = vreinterpretq_u64_u32(vshrq_n_u32(input, 31)); // Merge the two pairs together with a 64-bit unsigned shift right + add. uint8x16_t paired = vreinterpretq_u8_u64(vsraq_n_u64(high_bits, high_bits, 31)); // Extract the result. return vgetq_lane_u8(paired, 0) | (vgetq_lane_u8(paired, 8) << 2); #endif } // Compute the bitwise NOT of a and then AND with a 128-bit vector containing // all 1's, and return 1 if the result is zero, otherwise return 0. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_ones FORCE_INLINE int _mm_test_all_ones(__m128i a) { return (uint64_t)(vgetq_lane_s64(a, 0) & vgetq_lane_s64(a, 1)) == ~(uint64_t) 0; } // Compute the bitwise AND of 128 bits (representing integer data) in a and // mask, and return 1 if the result is zero, otherwise return 0. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_test_all_zeros FORCE_INLINE int _mm_test_all_zeros(__m128i a, __m128i mask) { int64x2_t a_and_mask = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(mask)); return (vgetq_lane_s64(a_and_mask, 0) | vgetq_lane_s64(a_and_mask, 1)) ? 0 : 1; } /* Math operations */ // Subtracts the four single-precision, floating-point values of a and b. // // r0 := a0 - b0 // r1 := a1 - b1 // r2 := a2 - b2 // r3 := a3 - b3 // // https://msdn.microsoft.com/en-us/library/vstudio/1zad2k61(v=vs.100).aspx FORCE_INLINE __m128 _mm_sub_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vsubq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Subtract the lower single-precision (32-bit) floating-point element in b from // the lower single-precision (32-bit) floating-point element in a, store the // result in the lower element of dst, and copy the upper 3 packed elements from // a to the upper elements of dst. // // dst[31:0] := a[31:0] - b[31:0] // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_ss FORCE_INLINE __m128 _mm_sub_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_sub_ps(a, b)); } // Subtract 2 packed 64-bit integers in b from 2 packed 64-bit integers in a, // and store the results in dst. // r0 := a0 - b0 // r1 := a1 - b1 FORCE_INLINE __m128i _mm_sub_epi64(__m128i a, __m128i b) { return vreinterpretq_m128i_s64( vsubq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); } // Subtracts the 4 signed or unsigned 32-bit integers of b from the 4 signed or // unsigned 32-bit integers of a. 
// // r0 := a0 - b0 // r1 := a1 - b1 // r2 := a2 - b2 // r3 := a3 - b3 // // https://msdn.microsoft.com/en-us/library/vstudio/fhh866h0(v=vs.100).aspx FORCE_INLINE __m128i _mm_sub_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vsubq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Subtract packed 16-bit integers in b from packed 16-bit integers in a, and // store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi16 FORCE_INLINE __m128i _mm_sub_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Subtract packed 8-bit integers in b from packed 8-bit integers in a, and // store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_epi8 FORCE_INLINE __m128i _mm_sub_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Subtract 64-bit integer b from 64-bit integer a, and store the result in dst. // // dst[63:0] := a[63:0] - b[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_si64 FORCE_INLINE __m64 _mm_sub_si64(__m64 a, __m64 b) { return vreinterpret_m64_s64( vsub_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); } // Subtracts the 8 unsigned 16-bit integers of bfrom the 8 unsigned 16-bit // integers of a and saturates.. // https://technet.microsoft.com/en-us/subscriptions/index/f44y0s19(v=vs.90).aspx FORCE_INLINE __m128i _mm_subs_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vqsubq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Subtracts the 16 unsigned 8-bit integers of b from the 16 unsigned 8-bit // integers of a and saturates. // // r0 := UnsignedSaturate(a0 - b0) // r1 := UnsignedSaturate(a1 - b1) // ... // r15 := UnsignedSaturate(a15 - b15) // // https://technet.microsoft.com/en-us/subscriptions/yadkxc18(v=vs.90) FORCE_INLINE __m128i _mm_subs_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vqsubq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Subtracts the 16 signed 8-bit integers of b from the 16 signed 8-bit integers // of a and saturates. // // r0 := SignedSaturate(a0 - b0) // r1 := SignedSaturate(a1 - b1) // ... // r15 := SignedSaturate(a15 - b15) // // https://technet.microsoft.com/en-us/subscriptions/by7kzks1(v=vs.90) FORCE_INLINE __m128i _mm_subs_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vqsubq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Subtracts the 8 signed 16-bit integers of b from the 8 signed 16-bit integers // of a and saturates. // // r0 := SignedSaturate(a0 - b0) // r1 := SignedSaturate(a1 - b1) // ... // r7 := SignedSaturate(a7 - b7) // // https://technet.microsoft.com/en-us/subscriptions/3247z5b8(v=vs.90) FORCE_INLINE __m128i _mm_subs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vqsubq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Subtract packed double-precision (64-bit) floating-point elements in b from // packed double-precision (64-bit) floating-point elements in a, and store the // results in dst. 
// // FOR j := 0 to 1 // i := j*64 // dst[i+63:i] := a[i+63:i] - b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_sub_pd FORCE_INLINE __m128d _mm_sub_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vsubq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] - db[0]; c[1] = da[1] - db[1]; return vld1q_f32((float32_t *) c); #endif } // Subtract the lower double-precision (64-bit) floating-point element in b from // the lower double-precision (64-bit) floating-point element in a, store the // result in the lower element of dst, and copy the upper element from a to the // upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sub_sd FORCE_INLINE __m128d _mm_sub_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_sub_pd(a, b)); } // Add packed unsigned 16-bit integers in a and b using saturation, and store // the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epu16 FORCE_INLINE __m128i _mm_adds_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vqaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Negate packed 8-bit integers in a when the corresponding signed // 8-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // // for i in 0..15 // if b[i] < 0 // r[i] := -a[i] // else if b[i] == 0 // r[i] := 0 // else // r[i] := a[i] // fi // done FORCE_INLINE __m128i _mm_sign_epi8(__m128i _a, __m128i _b) { int8x16_t a = vreinterpretq_s8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFF : 0 uint8x16_t ltMask = vreinterpretq_u8_s8(vshrq_n_s8(b, 7)); // (b == 0) ? 0xFF : 0 #if defined(__aarch64__) int8x16_t zeroMask = vreinterpretq_s8_u8(vceqzq_s8(b)); #else int8x16_t zeroMask = vreinterpretq_s8_u8(vceqq_s8(b, vdupq_n_s8(0))); #endif // bitwise select either a or nagative 'a' (vnegq_s8(a) return nagative 'a') // based on ltMask int8x16_t masked = vbslq_s8(ltMask, vnegq_s8(a), a); // res = masked & (~zeroMask) int8x16_t res = vbicq_s8(masked, zeroMask); return vreinterpretq_m128i_s8(res); } // Negate packed 16-bit integers in a when the corresponding signed // 16-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // // for i in 0..7 // if b[i] < 0 // r[i] := -a[i] // else if b[i] == 0 // r[i] := 0 // else // r[i] := a[i] // fi // done FORCE_INLINE __m128i _mm_sign_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFF : 0 uint16x8_t ltMask = vreinterpretq_u16_s16(vshrq_n_s16(b, 15)); // (b == 0) ? 
0xFFFF : 0 #if defined(__aarch64__) int16x8_t zeroMask = vreinterpretq_s16_u16(vceqzq_s16(b)); #else int16x8_t zeroMask = vreinterpretq_s16_u16(vceqq_s16(b, vdupq_n_s16(0))); #endif // bitwise select either a or negative 'a' (vnegq_s16(a) equals to negative // 'a') based on ltMask int16x8_t masked = vbslq_s16(ltMask, vnegq_s16(a), a); // res = masked & (~zeroMask) int16x8_t res = vbicq_s16(masked, zeroMask); return vreinterpretq_m128i_s16(res); } // Negate packed 32-bit integers in a when the corresponding signed // 32-bit integer in b is negative, and store the results in dst. // Element in dst are zeroed out when the corresponding element // in b is zero. // // for i in 0..3 // if b[i] < 0 // r[i] := -a[i] // else if b[i] == 0 // r[i] := 0 // else // r[i] := a[i] // fi // done FORCE_INLINE __m128i _mm_sign_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFFFFFF : 0 uint32x4_t ltMask = vreinterpretq_u32_s32(vshrq_n_s32(b, 31)); // (b == 0) ? 0xFFFFFFFF : 0 #if defined(__aarch64__) int32x4_t zeroMask = vreinterpretq_s32_u32(vceqzq_s32(b)); #else int32x4_t zeroMask = vreinterpretq_s32_u32(vceqq_s32(b, vdupq_n_s32(0))); #endif // bitwise select either a or negative 'a' (vnegq_s32(a) equals to negative // 'a') based on ltMask int32x4_t masked = vbslq_s32(ltMask, vnegq_s32(a), a); // res = masked & (~zeroMask) int32x4_t res = vbicq_s32(masked, zeroMask); return vreinterpretq_m128i_s32(res); } // Negate packed 16-bit integers in a when the corresponding signed 16-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. // // FOR j := 0 to 3 // i := j*16 // IF b[i+15:i] < 0 // dst[i+15:i] := -(a[i+15:i]) // ELSE IF b[i+15:i] == 0 // dst[i+15:i] := 0 // ELSE // dst[i+15:i] := a[i+15:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi16 FORCE_INLINE __m64 _mm_sign_pi16(__m64 _a, __m64 _b) { int16x4_t a = vreinterpret_s16_m64(_a); int16x4_t b = vreinterpret_s16_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFF : 0 uint16x4_t ltMask = vreinterpret_u16_s16(vshr_n_s16(b, 15)); // (b == 0) ? 0xFFFF : 0 #if defined(__aarch64__) int16x4_t zeroMask = vreinterpret_s16_u16(vceqz_s16(b)); #else int16x4_t zeroMask = vreinterpret_s16_u16(vceq_s16(b, vdup_n_s16(0))); #endif // bitwise select either a or nagative 'a' (vneg_s16(a) return nagative 'a') // based on ltMask int16x4_t masked = vbsl_s16(ltMask, vneg_s16(a), a); // res = masked & (~zeroMask) int16x4_t res = vbic_s16(masked, zeroMask); return vreinterpret_m64_s16(res); } // Negate packed 32-bit integers in a when the corresponding signed 32-bit // integer in b is negative, and store the results in dst. Element in dst are // zeroed out when the corresponding element in b is zero. // // FOR j := 0 to 1 // i := j*32 // IF b[i+31:i] < 0 // dst[i+31:i] := -(a[i+31:i]) // ELSE IF b[i+31:i] == 0 // dst[i+31:i] := 0 // ELSE // dst[i+31:i] := a[i+31:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi32 FORCE_INLINE __m64 _mm_sign_pi32(__m64 _a, __m64 _b) { int32x2_t a = vreinterpret_s32_m64(_a); int32x2_t b = vreinterpret_s32_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFFFFFFFF : 0 uint32x2_t ltMask = vreinterpret_u32_s32(vshr_n_s32(b, 31)); // (b == 0) ? 
0xFFFFFFFF : 0 #if defined(__aarch64__) int32x2_t zeroMask = vreinterpret_s32_u32(vceqz_s32(b)); #else int32x2_t zeroMask = vreinterpret_s32_u32(vceq_s32(b, vdup_n_s32(0))); #endif // bitwise select either a or nagative 'a' (vneg_s32(a) return nagative 'a') // based on ltMask int32x2_t masked = vbsl_s32(ltMask, vneg_s32(a), a); // res = masked & (~zeroMask) int32x2_t res = vbic_s32(masked, zeroMask); return vreinterpret_m64_s32(res); } // Negate packed 8-bit integers in a when the corresponding signed 8-bit integer // in b is negative, and store the results in dst. Element in dst are zeroed out // when the corresponding element in b is zero. // // FOR j := 0 to 7 // i := j*8 // IF b[i+7:i] < 0 // dst[i+7:i] := -(a[i+7:i]) // ELSE IF b[i+7:i] == 0 // dst[i+7:i] := 0 // ELSE // dst[i+7:i] := a[i+7:i] // FI // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sign_pi8 FORCE_INLINE __m64 _mm_sign_pi8(__m64 _a, __m64 _b) { int8x8_t a = vreinterpret_s8_m64(_a); int8x8_t b = vreinterpret_s8_m64(_b); // signed shift right: faster than vclt // (b < 0) ? 0xFF : 0 uint8x8_t ltMask = vreinterpret_u8_s8(vshr_n_s8(b, 7)); // (b == 0) ? 0xFF : 0 #if defined(__aarch64__) int8x8_t zeroMask = vreinterpret_s8_u8(vceqz_s8(b)); #else int8x8_t zeroMask = vreinterpret_s8_u8(vceq_s8(b, vdup_n_s8(0))); #endif // bitwise select either a or nagative 'a' (vneg_s8(a) return nagative 'a') // based on ltMask int8x8_t masked = vbsl_s8(ltMask, vneg_s8(a), a); // res = masked & (~zeroMask) int8x8_t res = vbic_s8(masked, zeroMask); return vreinterpret_m64_s8(res); } // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu16 FORCE_INLINE __m64 _mm_avg_pu16(__m64 a, __m64 b) { return vreinterpret_m64_u16( vrhadd_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b))); } // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_avg_pu8 FORCE_INLINE __m64 _mm_avg_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vrhadd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Average packed unsigned 8-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := (a[i+7:i] + b[i+7:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgb #define _m_pavgb(a, b) _mm_avg_pu8(a, b) // Average packed unsigned 16-bit integers in a and b, and store the results in // dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := (a[i+15:i] + b[i+15:i] + 1) >> 1 // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pavgw #define _m_pavgw(a, b) _mm_avg_pu16(a, b) // Extract a 16-bit integer from a, selected with imm8, and store the result in // the lower element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pextrw #define _m_pextrw(a, imm) _mm_extract_pi16(a, imm) // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. 
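// Illustrative usage sketch for the _mm_sign_* family above (not part of the
// upstream header; lane values chosen purely for illustration):
//
//   __m128i a = _mm_setr_epi32(-7, 0, 3, 9);
//   __m128i b = _mm_setr_epi32(-1, 5, 0, -2);
//   __m128i r = _mm_sign_epi32(a, b);
//   // r = { 7, 0, 0, -9 }: negated where b < 0, zeroed where b == 0,
//   // passed through unchanged where b > 0.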
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=m_pinsrw #define _m_pinsrw(a, i, imm) _mm_insert_pi16(a, i, imm) // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxsw #define _m_pmaxsw(a, b) _mm_max_pi16(a, b) // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmaxub #define _m_pmaxub(a, b) _mm_max_pu8(a, b) // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminsw #define _m_pminsw(a, b) _mm_min_pi16(a, b) // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pminub #define _m_pminub(a, b) _mm_min_pu8(a, b) // Computes the average of the 16 unsigned 8-bit integers in a and the 16 // unsigned 8-bit integers in b and rounds. // // r0 := (a0 + b0) / 2 // r1 := (a1 + b1) / 2 // ... // r15 := (a15 + b15) / 2 // // https://msdn.microsoft.com/en-us/library/vstudio/8zwh554a(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_avg_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vrhaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Computes the average of the 8 unsigned 16-bit integers in a and the 8 // unsigned 16-bit integers in b and rounds. // // r0 := (a0 + b0) / 2 // r1 := (a1 + b1) / 2 // ... // r7 := (a7 + b7) / 2 // // https://msdn.microsoft.com/en-us/library/vstudio/y13ca3c8(v=vs.90).aspx FORCE_INLINE __m128i _mm_avg_epu16(__m128i a, __m128i b) { return (__m128i) vrhaddq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); } // Adds the four single-precision, floating-point values of a and b. // // r0 := a0 + b0 // r1 := a1 + b1 // r2 := a2 + b2 // r3 := a3 + b3 // // https://msdn.microsoft.com/en-us/library/vstudio/c9848chc(v=vs.100).aspx FORCE_INLINE __m128 _mm_add_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Add packed double-precision (64-bit) floating-point elements in a and b, and // store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_pd FORCE_INLINE __m128d _mm_add_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] + db[0]; c[1] = da[1] + db[1]; return vld1q_f32((float32_t *) c); #endif } // Add the lower double-precision (64-bit) floating-point element in a and b, // store the result in the lower element of dst, and copy the upper element from // a to the upper element of dst. // // dst[63:0] := a[63:0] + b[63:0] // dst[127:64] := a[127:64] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_sd FORCE_INLINE __m128d _mm_add_sd(__m128d a, __m128d b) { #if defined(__aarch64__) return _mm_move_sd(a, _mm_add_pd(a, b)); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] + db[0]; c[1] = da[1]; return vld1q_f32((float32_t *) c); #endif } // Add 64-bit integers a and b, and store the result in dst. 
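// Note on the averaging intrinsics above (an illustrative aside, not part of
// the upstream header): like x86 PAVGB/PAVGW they round up, i.e.
// dst = (a + b + 1) >> 1, which NEON's vrhadd* instructions provide directly.
// A truncating average would give one less in the odd-sum case:
//
//   __m128i x = _mm_set1_epi16(200);
//   __m128i y = _mm_set1_epi16(101);
//   __m128i r = _mm_avg_epu16(x, y);   // every lane is (200 + 101 + 1) >> 1 = 151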
// // dst[63:0] := a[63:0] + b[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_add_si64 FORCE_INLINE __m64 _mm_add_si64(__m64 a, __m64 b) { return vreinterpret_m64_s64( vadd_s64(vreinterpret_s64_m64(a), vreinterpret_s64_m64(b))); } // adds the scalar single-precision floating point values of a and b. // https://msdn.microsoft.com/en-us/library/be94x2y6(v=vs.100).aspx FORCE_INLINE __m128 _mm_add_ss(__m128 a, __m128 b) { float32_t b0 = vgetq_lane_f32(vreinterpretq_f32_m128(b), 0); float32x4_t value = vsetq_lane_f32(b0, vdupq_n_f32(0), 0); // the upper values in the result must be the remnants of . return vreinterpretq_m128_f32(vaddq_f32(a, value)); } // Adds the 4 signed or unsigned 64-bit integers in a to the 4 signed or // unsigned 32-bit integers in b. // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx FORCE_INLINE __m128i _mm_add_epi64(__m128i a, __m128i b) { return vreinterpretq_m128i_s64( vaddq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); } // Adds the 4 signed or unsigned 32-bit integers in a to the 4 signed or // unsigned 32-bit integers in b. // // r0 := a0 + b0 // r1 := a1 + b1 // r2 := a2 + b2 // r3 := a3 + b3 // // https://msdn.microsoft.com/en-us/library/vstudio/09xs4fkk(v=vs.100).aspx FORCE_INLINE __m128i _mm_add_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vaddq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Adds the 8 signed or unsigned 16-bit integers in a to the 8 signed or // unsigned 16-bit integers in b. // https://msdn.microsoft.com/en-us/library/fceha5k4(v=vs.100).aspx FORCE_INLINE __m128i _mm_add_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Adds the 16 signed or unsigned 8-bit integers in a to the 16 signed or // unsigned 8-bit integers in b. // https://technet.microsoft.com/en-us/subscriptions/yc7tcyzs(v=vs.90) FORCE_INLINE __m128i _mm_add_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Adds the 8 signed 16-bit integers in a to the 8 signed 16-bit integers in b // and saturates. // // r0 := SignedSaturate(a0 + b0) // r1 := SignedSaturate(a1 + b1) // ... // r7 := SignedSaturate(a7 + b7) // // https://msdn.microsoft.com/en-us/library/1a306ef8(v=vs.100).aspx FORCE_INLINE __m128i _mm_adds_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vqaddq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Add packed signed 8-bit integers in a and b using saturation, and store the // results in dst. // // FOR j := 0 to 15 // i := j*8 // dst[i+7:i] := Saturate8( a[i+7:i] + b[i+7:i] ) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_adds_epi8 FORCE_INLINE __m128i _mm_adds_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vqaddq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Adds the 16 unsigned 8-bit integers in a to the 16 unsigned 8-bit integers in // b and saturates.. // https://msdn.microsoft.com/en-us/library/9hahyddy(v=vs.100).aspx FORCE_INLINE __m128i _mm_adds_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vqaddq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Multiplies the 8 signed or unsigned 16-bit integers from a by the 8 signed or // unsigned 16-bit integers from b. // // r0 := (a0 * b0)[15:0] // r1 := (a1 * b1)[15:0] // ... 
// r7 := (a7 * b7)[15:0] // // https://msdn.microsoft.com/en-us/library/vstudio/9ks1472s(v=vs.100).aspx FORCE_INLINE __m128i _mm_mullo_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vmulq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Multiplies the 4 signed or unsigned 32-bit integers from a by the 4 signed or // unsigned 32-bit integers from b. // https://msdn.microsoft.com/en-us/library/vstudio/bb531409(v=vs.100).aspx FORCE_INLINE __m128i _mm_mullo_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vmulq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. // // FOR j := 0 to 3 // i := j*16 // tmp[31:0] := a[i+15:i] * b[i+15:i] // dst[i+15:i] := tmp[31:16] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_pmulhuw #define _m_pmulhuw(a, b) _mm_mulhi_pu16(a, b) // Multiplies the four single-precision, floating-point values of a and b. // // r0 := a0 * b0 // r1 := a1 * b1 // r2 := a2 * b2 // r3 := a3 * b3 // // https://msdn.microsoft.com/en-us/library/vstudio/22kbk6t9(v=vs.100).aspx FORCE_INLINE __m128 _mm_mul_ps(__m128 a, __m128 b) { return vreinterpretq_m128_f32( vmulq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Multiply packed double-precision (64-bit) floating-point elements in a and b, // and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_pd FORCE_INLINE __m128d _mm_mul_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vmulq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] * db[0]; c[1] = da[1] * db[1]; return vld1q_f32((float32_t *) c); #endif } // Multiply the lower double-precision (64-bit) floating-point element in a and // b, store the result in the lower element of dst, and copy the upper element // from a to the upper element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=mm_mul_sd FORCE_INLINE __m128d _mm_mul_sd(__m128d a, __m128d b) { return _mm_move_sd(a, _mm_mul_pd(a, b)); } // Multiply the lower single-precision (32-bit) floating-point element in a and // b, store the result in the lower element of dst, and copy the upper 3 packed // elements from a to the upper elements of dst. // // dst[31:0] := a[31:0] * b[31:0] // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_ss FORCE_INLINE __m128 _mm_mul_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_mul_ps(a, b)); } // Multiply the low unsigned 32-bit integers from each packed 64-bit element in // a and b, and store the unsigned 64-bit results in dst. // // r0 := (a0 & 0xFFFFFFFF) * (b0 & 0xFFFFFFFF) // r1 := (a2 & 0xFFFFFFFF) * (b2 & 0xFFFFFFFF) FORCE_INLINE __m128i _mm_mul_epu32(__m128i a, __m128i b) { // vmull_u32 upcasts instead of masking, so we downcast. uint32x2_t a_lo = vmovn_u64(vreinterpretq_u64_m128i(a)); uint32x2_t b_lo = vmovn_u64(vreinterpretq_u64_m128i(b)); return vreinterpretq_m128i_u64(vmull_u32(a_lo, b_lo)); } // Multiply the low unsigned 32-bit integers from a and b, and store the // unsigned 64-bit result in dst. 
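// Illustrative sketch for _mm_mul_epu32 above (not part of the upstream
// header): only the even 32-bit lanes (0 and 2) take part, each producing a
// full 64-bit product. Values chosen for illustration:
//
//   __m128i a = _mm_setr_epi32(10, 999, 20, 777);   // lanes 1 and 3 are ignored
//   __m128i b = _mm_setr_epi32( 3,   0,  7,   0);
//   __m128i r = _mm_mul_epu32(a, b);                // two uint64 lanes: { 30, 140 }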
// // dst[63:0] := a[31:0] * b[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mul_su32 FORCE_INLINE __m64 _mm_mul_su32(__m64 a, __m64 b) { return vreinterpret_m64_u64(vget_low_u64( vmull_u32(vreinterpret_u32_m64(a), vreinterpret_u32_m64(b)))); } // Multiply the low signed 32-bit integers from each packed 64-bit element in // a and b, and store the signed 64-bit results in dst. // // r0 := (int64_t)(int32_t)a0 * (int64_t)(int32_t)b0 // r1 := (int64_t)(int32_t)a2 * (int64_t)(int32_t)b2 FORCE_INLINE __m128i _mm_mul_epi32(__m128i a, __m128i b) { // vmull_s32 upcasts instead of masking, so we downcast. int32x2_t a_lo = vmovn_s64(vreinterpretq_s64_m128i(a)); int32x2_t b_lo = vmovn_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vmull_s32(a_lo, b_lo)); } // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit // integers from b. // // r0 := (a0 * b0) + (a1 * b1) // r1 := (a2 * b2) + (a3 * b3) // r2 := (a4 * b4) + (a5 * b5) // r3 := (a6 * b6) + (a7 * b7) // https://msdn.microsoft.com/en-us/library/yht36sa6(v=vs.90).aspx FORCE_INLINE __m128i _mm_madd_epi16(__m128i a, __m128i b) { int32x4_t low = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); int32x4_t high = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), vget_high_s16(vreinterpretq_s16_m128i(b))); int32x2_t low_sum = vpadd_s32(vget_low_s32(low), vget_high_s32(low)); int32x2_t high_sum = vpadd_s32(vget_low_s32(high), vget_high_s32(high)); return vreinterpretq_m128i_s32(vcombine_s32(low_sum, high_sum)); } // Multiply packed signed 16-bit integers in a and b, producing intermediate // signed 32-bit integers. Shift right by 15 bits while rounding up, and store // the packed 16-bit integers in dst. // // r0 := Round(((int32_t)a0 * (int32_t)b0) >> 15) // r1 := Round(((int32_t)a1 * (int32_t)b1) >> 15) // r2 := Round(((int32_t)a2 * (int32_t)b2) >> 15) // ... // r7 := Round(((int32_t)a7 * (int32_t)b7) >> 15) FORCE_INLINE __m128i _mm_mulhrs_epi16(__m128i a, __m128i b) { // Has issues due to saturation // return vreinterpretq_m128i_s16(vqrdmulhq_s16(a, b)); // Multiply int32x4_t mul_lo = vmull_s16(vget_low_s16(vreinterpretq_s16_m128i(a)), vget_low_s16(vreinterpretq_s16_m128i(b))); int32x4_t mul_hi = vmull_s16(vget_high_s16(vreinterpretq_s16_m128i(a)), vget_high_s16(vreinterpretq_s16_m128i(b))); // Rounding narrowing shift right // narrow = (int16_t)((mul + 16384) >> 15); int16x4_t narrow_lo = vrshrn_n_s32(mul_lo, 15); int16x4_t narrow_hi = vrshrn_n_s32(mul_hi, 15); // Join together return vreinterpretq_m128i_s16(vcombine_s16(narrow_lo, narrow_hi)); } // Vertically multiply each unsigned 8-bit integer from a with the corresponding // signed 8-bit integer from b, producing intermediate signed 16-bit integers. // Horizontally add adjacent pairs of intermediate signed 16-bit integers, // and pack the saturated results in dst. 
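// Illustrative sketch for _mm_madd_epi16 above (not part of the upstream
// header): adjacent 16-bit products are summed into 32-bit lanes, the usual
// building block for integer dot products. Values chosen for illustration:
//
//   __m128i a = _mm_set1_epi16(2);
//   __m128i b = _mm_set1_epi16(3);
//   __m128i r = _mm_madd_epi16(a, b);   // every 32-bit lane is 2*3 + 2*3 = 12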
// // FOR j := 0 to 7 // i := j*16 // dst[i+15:i] := Saturate_To_Int16( a[i+15:i+8]*b[i+15:i+8] + // a[i+7:i]*b[i+7:i] ) // ENDFOR FORCE_INLINE __m128i _mm_maddubs_epi16(__m128i _a, __m128i _b) { #if defined(__aarch64__) uint8x16_t a = vreinterpretq_u8_m128i(_a); int8x16_t b = vreinterpretq_s8_m128i(_b); int16x8_t tl = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), vmovl_s8(vget_low_s8(b))); int16x8_t th = vmulq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))), vmovl_s8(vget_high_s8(b))); return vreinterpretq_m128i_s16( vqaddq_s16(vuzp1q_s16(tl, th), vuzp2q_s16(tl, th))); #else // This would be much simpler if x86 would choose to zero extend OR sign // extend, not both. This could probably be optimized better. uint16x8_t a = vreinterpretq_u16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); // Zero extend a int16x8_t a_odd = vreinterpretq_s16_u16(vshrq_n_u16(a, 8)); int16x8_t a_even = vreinterpretq_s16_u16(vbicq_u16(a, vdupq_n_u16(0xff00))); // Sign extend by shifting left then shifting right. int16x8_t b_even = vshrq_n_s16(vshlq_n_s16(b, 8), 8); int16x8_t b_odd = vshrq_n_s16(b, 8); // multiply int16x8_t prod1 = vmulq_s16(a_even, b_even); int16x8_t prod2 = vmulq_s16(a_odd, b_odd); // saturated add return vreinterpretq_m128i_s16(vqaddq_s16(prod1, prod2)); #endif } // Computes the fused multiple add product of 32-bit floating point numbers. // // Return Value // Multiplies A and B, and adds C to the temporary result before returning it. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_fmadd FORCE_INLINE __m128 _mm_fmadd_ps(__m128 a, __m128 b, __m128 c) { #if defined(__aarch64__) return vreinterpretq_m128_f32(vfmaq_f32(vreinterpretq_f32_m128(c), vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(a))); #else return _mm_add_ps(_mm_mul_ps(a, b), c); #endif } // Alternatively add and subtract packed single-precision (32-bit) // floating-point elements in a to/from packed elements in b, and store the // results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=addsub_ps FORCE_INLINE __m128 _mm_addsub_ps(__m128 a, __m128 b) { __m128 mask = {-1.0f, 1.0f, -1.0f, 1.0f}; return _mm_fmadd_ps(b, mask, a); } // Horizontally add adjacent pairs of double-precision (64-bit) floating-point // elements in a and b, and pack the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pd FORCE_INLINE __m128d _mm_hadd_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vpaddq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[] = {da[0] + da[1], db[0] + db[1]}; return vreinterpretq_m128d_u64(vld1q_u64((uint64_t *) c)); #endif } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce two // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of 64-bit elements in dst. 
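// Illustrative sketch for _mm_addsub_ps above (not part of the upstream
// header): the {-1, 1, -1, 1} mask folds the alternating subtract/add pattern
// into a single fused multiply-add. Values chosen for illustration:
//
//   __m128 a = _mm_setr_ps(1.0f, 2.0f, 3.0f, 4.0f);
//   __m128 b = _mm_setr_ps(10.0f, 20.0f, 30.0f, 40.0f);
//   __m128 r = _mm_addsub_ps(a, b);   // { 1-10, 2+20, 3-30, 4+40 } = { -9, 22, -27, 44 }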
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_epu8 FORCE_INLINE __m128i _mm_sad_epu8(__m128i a, __m128i b) { uint16x8_t t = vpaddlq_u8(vabdq_u8((uint8x16_t) a, (uint8x16_t) b)); uint16_t r0 = t[0] + t[1] + t[2] + t[3]; uint16_t r4 = t[4] + t[5] + t[6] + t[7]; uint16x8_t r = vsetq_lane_u16(r0, vdupq_n_u16(0), 0); return (__m128i) vsetq_lane_u16(r4, r, 4); } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce four // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_sad_pu8 FORCE_INLINE __m64 _mm_sad_pu8(__m64 a, __m64 b) { uint16x4_t t = vpaddl_u8(vabd_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); uint16_t r0 = t[0] + t[1] + t[2] + t[3]; return vreinterpret_m64_u16(vset_lane_u16(r0, vdup_n_u16(0), 0)); } // Compute the absolute differences of packed unsigned 8-bit integers in a and // b, then horizontally sum each consecutive 8 differences to produce four // unsigned 16-bit integers, and pack these unsigned 16-bit integers in the low // 16 bits of dst. // // FOR j := 0 to 7 // i := j*8 // tmp[i+7:i] := ABS(a[i+7:i] - b[i+7:i]) // ENDFOR // dst[15:0] := tmp[7:0] + tmp[15:8] + tmp[23:16] + tmp[31:24] + tmp[39:32] + // tmp[47:40] + tmp[55:48] + tmp[63:56] dst[63:16] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_m_psadbw #define _m_psadbw(a, b) _mm_sad_pu8(a, b) // Divides the four single-precision, floating-point values of a and b. // // r0 := a0 / b0 // r1 := a1 / b1 // r2 := a2 / b2 // r3 := a3 / b3 // // https://msdn.microsoft.com/en-us/library/edaw8147(v=vs.100).aspx FORCE_INLINE __m128 _mm_div_ps(__m128 a, __m128 b) { #if defined(__aarch64__) && !SSE2NEON_PRECISE_DIV return vreinterpretq_m128_f32( vdivq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(b)); recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); #if SSE2NEON_PRECISE_DIV // Additional Netwon-Raphson iteration for accuracy recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(b))); #endif return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(a), recip)); #endif } // Divides the scalar single-precision floating point value of a by b. // https://msdn.microsoft.com/en-us/library/4y73xa49(v=vs.100).aspx FORCE_INLINE __m128 _mm_div_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_div_ps(a, b)), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Divide packed double-precision (64-bit) floating-point elements in a by // packed elements in b, and store the results in dst. 
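// Note on the non-AArch64 path of _mm_div_ps above (an explanatory aside, not
// part of the upstream header): vrecpeq_f32 only yields a coarse reciprocal
// estimate, and each vrecpsq_f32 step is one Newton-Raphson refinement.
// vrecpsq_f32(x, d) computes (2 - d*x), so x' = x * (2 - d*x); if x carries a
// relative error e, i.e. x = (1/d)*(1 + e), then x' = (1/d)*(1 - e*e), roughly
// doubling the number of correct bits per step. SSE2NEON_PRECISE_DIV adds one
// more such step before the final multiply by a.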
// // FOR j := 0 to 1 // i := 64*j // dst[i+63:i] := a[i+63:i] / b[i+63:i] // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_pd FORCE_INLINE __m128d _mm_div_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else double *da = (double *) &a; double *db = (double *) &b; double c[2]; c[0] = da[0] / db[0]; c[1] = da[1] / db[1]; return vld1q_f32((float32_t *) c); #endif } // Divide the lower double-precision (64-bit) floating-point element in a by the // lower double-precision (64-bit) floating-point element in b, store the result // in the lower element of dst, and copy the upper element from a to the upper // element of dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_div_sd FORCE_INLINE __m128d _mm_div_sd(__m128d a, __m128d b) { #if defined(__aarch64__) float64x2_t tmp = vdivq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b)); return vreinterpretq_m128d_f64( vsetq_lane_f64(vgetq_lane_f64(vreinterpretq_f64_m128d(a), 1), tmp, 1)); #else return _mm_move_sd(a, _mm_div_pd(a, b)); #endif } // Compute the approximate reciprocal of packed single-precision (32-bit) // floating-point elements in a, and store the results in dst. The maximum // relative error for this approximation is less than 1.5*2^-12. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ps FORCE_INLINE __m128 _mm_rcp_ps(__m128 in) { float32x4_t recip = vrecpeq_f32(vreinterpretq_f32_m128(in)); recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); #if SSE2NEON_PRECISE_DIV // Additional Netwon-Raphson iteration for accuracy recip = vmulq_f32(recip, vrecpsq_f32(recip, vreinterpretq_f32_m128(in))); #endif return vreinterpretq_m128_f32(recip); } // Compute the approximate reciprocal of the lower single-precision (32-bit) // floating-point element in a, store the result in the lower element of dst, // and copy the upper 3 packed elements from a to the upper elements of dst. The // maximum relative error for this approximation is less than 1.5*2^-12. // // dst[31:0] := (1.0 / a[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rcp_ss FORCE_INLINE __m128 _mm_rcp_ss(__m128 a) { return _mm_move_ss(a, _mm_rcp_ps(a)); } // Computes the approximations of square roots of the four single-precision, // floating-point values of a. First computes reciprocal square roots and then // reciprocals of the four values. // // r0 := sqrt(a0) // r1 := sqrt(a1) // r2 := sqrt(a2) // r3 := sqrt(a3) // // https://msdn.microsoft.com/en-us/library/vstudio/8z67bwwk(v=vs.100).aspx FORCE_INLINE __m128 _mm_sqrt_ps(__m128 in) { #if SSE2NEON_PRECISE_SQRT float32x4_t recip = vrsqrteq_f32(vreinterpretq_f32_m128(in)); // Test for vrsqrteq_f32(0) -> positive infinity case. // Change to zero, so that s * 1/sqrt(s) result is zero too. 
const uint32x4_t pos_inf = vdupq_n_u32(0x7F800000); const uint32x4_t div_by_zero = vceqq_u32(pos_inf, vreinterpretq_u32_f32(recip)); recip = vreinterpretq_f32_u32( vandq_u32(vmvnq_u32(div_by_zero), vreinterpretq_u32_f32(recip))); // Additional Netwon-Raphson iteration for accuracy recip = vmulq_f32( vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), recip); recip = vmulq_f32( vrsqrtsq_f32(vmulq_f32(recip, recip), vreinterpretq_f32_m128(in)), recip); // sqrt(s) = s * 1/sqrt(s) return vreinterpretq_m128_f32(vmulq_f32(vreinterpretq_f32_m128(in), recip)); #elif defined(__aarch64__) return vreinterpretq_m128_f32(vsqrtq_f32(vreinterpretq_f32_m128(in))); #else float32x4_t recipsq = vrsqrteq_f32(vreinterpretq_f32_m128(in)); float32x4_t sq = vrecpeq_f32(recipsq); return vreinterpretq_m128_f32(sq); #endif } // Computes the approximation of the square root of the scalar single-precision // floating point value of in. // https://msdn.microsoft.com/en-us/library/ahfsc22d(v=vs.100).aspx FORCE_INLINE __m128 _mm_sqrt_ss(__m128 in) { float32_t value = vgetq_lane_f32(vreinterpretq_f32_m128(_mm_sqrt_ps(in)), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(in), 0)); } // Computes the approximations of the reciprocal square roots of the four // single-precision floating point values of in. // https://msdn.microsoft.com/en-us/library/22hfsh53(v=vs.100).aspx FORCE_INLINE __m128 _mm_rsqrt_ps(__m128 in) { float32x4_t out = vrsqrteq_f32(vreinterpretq_f32_m128(in)); #if SSE2NEON_PRECISE_RSQRT // Additional Netwon-Raphson iteration for accuracy out = vmulq_f32( out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); out = vmulq_f32( out, vrsqrtsq_f32(vmulq_f32(vreinterpretq_f32_m128(in), out), out)); #endif return vreinterpretq_m128_f32(out); } // Compute the approximate reciprocal square root of the lower single-precision // (32-bit) floating-point element in a, store the result in the lower element // of dst, and copy the upper 3 packed elements from a to the upper elements of // dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_rsqrt_ss FORCE_INLINE __m128 _mm_rsqrt_ss(__m128 in) { return vsetq_lane_f32(vgetq_lane_f32(_mm_rsqrt_ps(in), 0), in, 0); } // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 FORCE_INLINE __m64 _mm_max_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vmax_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Compare packed signed 16-bit integers in a and b, and store packed maximum // values in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := MAX(a[i+15:i], b[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pi16 #define _m_pmaxsw(a, b) _mm_max_pi16(a, b) // Computes the maximums of the four single-precision, floating-point values of // a and b. 
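// Configuration note (an assumption based on the usual sse2neon convention,
// not something enforced at this point in the header): the
// SSE2NEON_PRECISE_DIV, SSE2NEON_PRECISE_SQRT, SSE2NEON_PRECISE_RSQRT and
// SSE2NEON_PRECISE_MINMAX switches used above and below trade speed for
// accuracy and would normally be defined before the header is included, e.g.
//
//   #define SSE2NEON_PRECISE_DIV 1   // extra Newton-Raphson step in _mm_div_ps/_mm_rcp_ps
//   #include "sse2neon.h"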
// https://msdn.microsoft.com/en-us/library/vstudio/ff5d607a(v=vs.100).aspx FORCE_INLINE __m128 _mm_max_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX float32x4_t _a = vreinterpretq_f32_m128(a); float32x4_t _b = vreinterpretq_f32_m128(b); return vbslq_f32(vcltq_f32(_b, _a), _a, _b); #else return vreinterpretq_m128_f32( vmaxq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #endif } // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 FORCE_INLINE __m64 _mm_max_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vmax_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Compare packed unsigned 8-bit integers in a and b, and store packed maximum // values in dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := MAX(a[i+7:i], b[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_pu8 #define _m_pmaxub(a, b) _mm_max_pu8(a, b) // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 FORCE_INLINE __m64 _mm_min_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vmin_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Compare packed signed 16-bit integers in a and b, and store packed minimum // values in dst. // // FOR j := 0 to 3 // i := j*16 // dst[i+15:i] := MIN(a[i+15:i], b[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pi16 #define _m_pminsw(a, b) _mm_min_pi16(a, b) // Computes the minima of the four single-precision, floating-point values of a // and b. // https://msdn.microsoft.com/en-us/library/vstudio/wh13kadz(v=vs.100).aspx FORCE_INLINE __m128 _mm_min_ps(__m128 a, __m128 b) { #if SSE2NEON_PRECISE_MINMAX float32x4_t _a = vreinterpretq_f32_m128(a); float32x4_t _b = vreinterpretq_f32_m128(b); return vbslq_f32(vcltq_f32(_a, _b), _a, _b); #else return vreinterpretq_m128_f32( vminq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #endif } // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 FORCE_INLINE __m64 _mm_min_pu8(__m64 a, __m64 b) { return vreinterpret_m64_u8( vmin_u8(vreinterpret_u8_m64(a), vreinterpret_u8_m64(b))); } // Compare packed unsigned 8-bit integers in a and b, and store packed minimum // values in dst. // // FOR j := 0 to 7 // i := j*8 // dst[i+7:i] := MIN(a[i+7:i], b[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_pu8 #define _m_pminub(a, b) _mm_min_pu8(a, b) // Computes the maximum of the two lower scalar single-precision floating point // values of a and b. // https://msdn.microsoft.com/en-us/library/s6db5esz(v=vs.100).aspx FORCE_INLINE __m128 _mm_max_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_max_ps(a, b), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Computes the minimum of the two lower scalar single-precision floating point // values of a and b. 
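// Note on the SSE2NEON_PRECISE_MINMAX paths above (an explanatory aside, not
// part of the upstream header): x86 MINPS/MAXPS return the second operand
// whenever either input is NaN. The vbslq_f32(vcltq_f32(...), _a, _b) form
// reproduces that, because the comparison is false for NaN and therefore
// selects _b, whereas plain vminq_f32/vmaxq_f32 may return a NaN instead.
// Sketch (illustrative only, NAN from <math.h>):
//
//   __m128 a = _mm_set1_ps(NAN);
//   __m128 b = _mm_set1_ps(1.0f);
//   __m128 r = _mm_max_ps(a, b);   // precise path: 1.0f in every lane, as on x86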
// https://msdn.microsoft.com/en-us/library/0a9y7xaa(v=vs.100).aspx FORCE_INLINE __m128 _mm_min_ss(__m128 a, __m128 b) { float32_t value = vgetq_lane_f32(_mm_min_ps(a, b), 0); return vreinterpretq_m128_f32( vsetq_lane_f32(value, vreinterpretq_f32_m128(a), 0)); } // Computes the pairwise maxima of the 16 unsigned 8-bit integers from a and the // 16 unsigned 8-bit integers from b. // https://msdn.microsoft.com/en-us/library/st6634za(v=vs.100).aspx FORCE_INLINE __m128i _mm_max_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vmaxq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Computes the pairwise minima of the 16 unsigned 8-bit integers from a and the // 16 unsigned 8-bit integers from b. // https://msdn.microsoft.com/ko-kr/library/17k8cf58(v=vs.100).aspxx FORCE_INLINE __m128i _mm_min_epu8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vminq_u8(vreinterpretq_u8_m128i(a), vreinterpretq_u8_m128i(b))); } // Computes the pairwise minima of the 8 signed 16-bit integers from a and the 8 // signed 16-bit integers from b. // https://msdn.microsoft.com/en-us/library/vstudio/6te997ew(v=vs.100).aspx FORCE_INLINE __m128i _mm_min_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vminq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compare packed signed 8-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epi8 FORCE_INLINE __m128i _mm_max_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vmaxq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed unsigned 16-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu16 FORCE_INLINE __m128i _mm_max_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vmaxq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Compare packed signed 8-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epi8 FORCE_INLINE __m128i _mm_min_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vminq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed unsigned 16-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_min_epu16 FORCE_INLINE __m128i _mm_min_epu16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vminq_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b))); } // Computes the pairwise maxima of the 8 signed 16-bit integers from a and the 8 // signed 16-bit integers from b. // https://msdn.microsoft.com/en-us/LIBRary/3x060h7c(v=vs.100).aspx FORCE_INLINE __m128i _mm_max_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vmaxq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // epi versions of min/max // Computes the pariwise maximums of the four signed 32-bit integer values of a // and b. // // A 128-bit parameter that can be defined with the following equations: // r0 := (a0 > b0) ? a0 : b0 // r1 := (a1 > b1) ? a1 : b1 // r2 := (a2 > b2) ? a2 : b2 // r3 := (a3 > b3) ? 
a3 : b3 // // https://msdn.microsoft.com/en-us/library/vstudio/bb514055(v=vs.100).aspx FORCE_INLINE __m128i _mm_max_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vmaxq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Computes the pariwise minima of the four signed 32-bit integer values of a // and b. // // A 128-bit parameter that can be defined with the following equations: // r0 := (a0 < b0) ? a0 : b0 // r1 := (a1 < b1) ? a1 : b1 // r2 := (a2 < b2) ? a2 : b2 // r3 := (a3 < b3) ? a3 : b3 // // https://msdn.microsoft.com/en-us/library/vstudio/bb531476(v=vs.100).aspx FORCE_INLINE __m128i _mm_min_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s32( vminq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed unsigned 32-bit integers in a and b, and store packed maximum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 FORCE_INLINE __m128i _mm_max_epu32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vmaxq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); } // Compare packed unsigned 32-bit integers in a and b, and store packed minimum // values in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_max_epu32 FORCE_INLINE __m128i _mm_min_epu32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vminq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b))); } // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_pu16 FORCE_INLINE __m64 _mm_mulhi_pu16(__m64 a, __m64 b) { return vreinterpret_m64_u16(vshrn_n_u32( vmull_u16(vreinterpret_u16_m64(a), vreinterpret_u16_m64(b)), 16)); } // Multiplies the 8 signed 16-bit integers from a by the 8 signed 16-bit // integers from b. // // r0 := (a0 * b0)[31:16] // r1 := (a1 * b1)[31:16] // ... // r7 := (a7 * b7)[31:16] // // https://msdn.microsoft.com/en-us/library/vstudio/59hddw1d(v=vs.100).aspx FORCE_INLINE __m128i _mm_mulhi_epi16(__m128i a, __m128i b) { /* FIXME: issue with large values because of result saturation */ // int16x8_t ret = vqdmulhq_s16(vreinterpretq_s16_m128i(a), // vreinterpretq_s16_m128i(b)); /* =2*a*b */ return // vreinterpretq_m128i_s16(vshrq_n_s16(ret, 1)); int16x4_t a3210 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b3210 = vget_low_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab3210 = vmull_s16(a3210, b3210); /* 3333222211110000 */ int16x4_t a7654 = vget_high_s16(vreinterpretq_s16_m128i(a)); int16x4_t b7654 = vget_high_s16(vreinterpretq_s16_m128i(b)); int32x4_t ab7654 = vmull_s16(a7654, b7654); /* 7777666655554444 */ uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_s32(ab3210), vreinterpretq_u16_s32(ab7654)); return vreinterpretq_m128i_u16(r.val[1]); } // Multiply the packed unsigned 16-bit integers in a and b, producing // intermediate 32-bit integers, and store the high 16 bits of the intermediate // integers in dst. 
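// Illustrative sketch for the mulhi intrinsics here (not part of the upstream
// header): the full 32-bit product is formed with vmull and only its upper
// half is kept. Values chosen for illustration:
//
//   __m128i a  = _mm_set1_epi16(0x4000);    // 16384
//   __m128i b  = _mm_set1_epi16(8);
//   __m128i hi = _mm_mulhi_epi16(a, b);     // 16384*8 = 0x00020000 -> every lane 2
//   __m128i lo = _mm_mullo_epi16(a, b);     // low halves -> every lane 0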
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_mulhi_epu16 FORCE_INLINE __m128i _mm_mulhi_epu16(__m128i a, __m128i b) { uint16x4_t a3210 = vget_low_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b3210 = vget_low_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab3210 = vmull_u16(a3210, b3210); #if defined(__aarch64__) uint32x4_t ab7654 = vmull_high_u16(vreinterpretq_u16_m128i(a), vreinterpretq_u16_m128i(b)); uint16x8_t r = vuzp2q_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); return vreinterpretq_m128i_u16(r); #else uint16x4_t a7654 = vget_high_u16(vreinterpretq_u16_m128i(a)); uint16x4_t b7654 = vget_high_u16(vreinterpretq_u16_m128i(b)); uint32x4_t ab7654 = vmull_u16(a7654, b7654); uint16x8x2_t r = vuzpq_u16(vreinterpretq_u16_u32(ab3210), vreinterpretq_u16_u32(ab7654)); return vreinterpretq_m128i_u16(r.val[1]); #endif } // Computes pairwise add of each argument as single-precision, floating-point // values a and b. // https://msdn.microsoft.com/en-us/library/yd9wecaa.aspx FORCE_INLINE __m128 _mm_hadd_ps(__m128 a, __m128 b) { #if defined(__aarch64__) return vreinterpretq_m128_f32( vpaddq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a10 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t a32 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b10 = vget_low_f32(vreinterpretq_f32_m128(b)); float32x2_t b32 = vget_high_f32(vreinterpretq_f32_m128(b)); return vreinterpretq_m128_f32( vcombine_f32(vpadd_f32(a10, a32), vpadd_f32(b10, b32))); #endif } // Computes pairwise add of each argument as a 16-bit signed or unsigned integer // values a and b. FORCE_INLINE __m128i _mm_hadd_epi16(__m128i _a, __m128i _b) { int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); #if defined(__aarch64__) return vreinterpretq_m128i_s16(vpaddq_s16(a, b)); #else return vreinterpretq_m128i_s16( vcombine_s16(vpadd_s16(vget_low_s16(a), vget_high_s16(a)), vpadd_s16(vget_low_s16(b), vget_high_s16(b)))); #endif } // Horizontally substract adjacent pairs of single-precision (32-bit) // floating-point elements in a and b, and pack the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsub_ps FORCE_INLINE __m128 _mm_hsub_ps(__m128 _a, __m128 _b) { #if defined(__aarch64__) return vreinterpretq_m128_f32(vsubq_f32( vuzp1q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)), vuzp2q_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)))); #else float32x4x2_t c = vuzpq_f32(vreinterpretq_f32_m128(_a), vreinterpretq_f32_m128(_b)); return vreinterpretq_m128_f32(vsubq_f32(c.val[0], c.val[1])); #endif } // Horizontally add adjacent pairs of 16-bit integers in a and b, and pack the // signed 16-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi16 FORCE_INLINE __m64 _mm_hadd_pi16(__m64 a, __m64 b) { return vreinterpret_m64_s16( vpadd_s16(vreinterpret_s16_m64(a), vreinterpret_s16_m64(b))); } // Horizontally add adjacent pairs of 32-bit integers in a and b, and pack the // signed 32-bit results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hadd_pi32 FORCE_INLINE __m64 _mm_hadd_pi32(__m64 a, __m64 b) { return vreinterpret_m64_s32( vpadd_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b))); } // Computes pairwise difference of each argument as a 16-bit signed or unsigned // integer values a and b. 
FORCE_INLINE __m128i _mm_hsub_epi16(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // Interleave using vshrn/vmovn // [a0|a2|a4|a6|b0|b2|b4|b6] // [a1|a3|a5|a7|b1|b3|b5|b7] int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); // Subtract return vreinterpretq_m128i_s16(vsubq_s16(ab0246, ab1357)); } // Computes saturated pairwise sub of each argument as a 16-bit signed // integer values a and b. FORCE_INLINE __m128i _mm_hadds_epi16(__m128i _a, __m128i _b) { #if defined(__aarch64__) int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); return vreinterpretq_s64_s16( vqaddq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // Interleave using vshrn/vmovn // [a0|a2|a4|a6|b0|b2|b4|b6] // [a1|a3|a5|a7|b1|b3|b5|b7] int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); // Saturated add return vreinterpretq_m128i_s16(vqaddq_s16(ab0246, ab1357)); #endif } // Computes saturated pairwise difference of each argument as a 16-bit signed // integer values a and b. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_hsubs_epi16 FORCE_INLINE __m128i _mm_hsubs_epi16(__m128i _a, __m128i _b) { #if defined(__aarch64__) int16x8_t a = vreinterpretq_s16_m128i(_a); int16x8_t b = vreinterpretq_s16_m128i(_b); return vreinterpretq_s64_s16( vqsubq_s16(vuzp1q_s16(a, b), vuzp2q_s16(a, b))); #else int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); // Interleave using vshrn/vmovn // [a0|a2|a4|a6|b0|b2|b4|b6] // [a1|a3|a5|a7|b1|b3|b5|b7] int16x8_t ab0246 = vcombine_s16(vmovn_s32(a), vmovn_s32(b)); int16x8_t ab1357 = vcombine_s16(vshrn_n_s32(a, 16), vshrn_n_s32(b, 16)); // Saturated subtract return vreinterpretq_m128i_s16(vqsubq_s16(ab0246, ab1357)); #endif } // Computes pairwise add of each argument as a 32-bit signed or unsigned integer // values a and b. FORCE_INLINE __m128i _mm_hadd_epi32(__m128i _a, __m128i _b) { int32x4_t a = vreinterpretq_s32_m128i(_a); int32x4_t b = vreinterpretq_s32_m128i(_b); return vreinterpretq_m128i_s32( vcombine_s32(vpadd_s32(vget_low_s32(a), vget_high_s32(a)), vpadd_s32(vget_low_s32(b), vget_high_s32(b)))); } // Computes pairwise difference of each argument as a 32-bit signed or unsigned // integer values a and b. FORCE_INLINE __m128i _mm_hsub_epi32(__m128i _a, __m128i _b) { int64x2_t a = vreinterpretq_s64_m128i(_a); int64x2_t b = vreinterpretq_s64_m128i(_b); // Interleave using vshrn/vmovn // [a0|a2|b0|b2] // [a1|a2|b1|b3] int32x4_t ab02 = vcombine_s32(vmovn_s64(a), vmovn_s64(b)); int32x4_t ab13 = vcombine_s32(vshrn_n_s64(a, 32), vshrn_n_s64(b, 32)); // Subtract return vreinterpretq_m128i_s32(vsubq_s32(ab02, ab13)); } // Kahan summation for accurate summation of floating-point numbers. // http://blog.zachbjornson.com/2019/08/11/fast-float-summation.html FORCE_INLINE void _sse2neon_kadd_f32(float *sum, float *c, float y) { y -= *c; float t = *sum + y; *c = (t - *sum) - y; *sum = t; } // Conditionally multiply the packed single-precision (32-bit) floating-point // elements in a and b using the high 4 bits in imm8, sum the four products, // and conditionally store the sum in dst using the low 4 bits of imm. 
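// Usage sketch for the _sse2neon_kadd_f32 helper above (not part of the
// upstream header): a running compensation term absorbs the rounding error of
// each addition, which is how _mm_dp_ps below accumulates its products. The
// loop bound and array name are illustrative only:
//
//   float sum = 0.0f, comp = 0.0f;
//   for (int i = 0; i < n; i++)
//       _sse2neon_kadd_f32(&sum, &comp, values[i]);
//   sum += comp;   // fold the compensation back in, as _mm_dp_ps does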
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_dp_ps FORCE_INLINE __m128 _mm_dp_ps(__m128 a, __m128 b, const int imm) { #if defined(__aarch64__) /* shortcuts */ if (imm == 0xFF) { return _mm_set1_ps(vaddvq_f32(_mm_mul_ps(a, b))); } if (imm == 0x7F) { float32x4_t m = _mm_mul_ps(a, b); m[3] = 0; return _mm_set1_ps(vaddvq_f32(m)); } #endif float s = 0, c = 0; float32x4_t f32a = vreinterpretq_f32_m128(a); float32x4_t f32b = vreinterpretq_f32_m128(b); /* To improve the accuracy of floating-point summation, Kahan algorithm * is used for each operation. */ if (imm & (1 << 4)) _sse2neon_kadd_f32(&s, &c, f32a[0] * f32b[0]); if (imm & (1 << 5)) _sse2neon_kadd_f32(&s, &c, f32a[1] * f32b[1]); if (imm & (1 << 6)) _sse2neon_kadd_f32(&s, &c, f32a[2] * f32b[2]); if (imm & (1 << 7)) _sse2neon_kadd_f32(&s, &c, f32a[3] * f32b[3]); s += c; float32x4_t res = { (imm & 0x1) ? s : 0, (imm & 0x2) ? s : 0, (imm & 0x4) ? s : 0, (imm & 0x8) ? s : 0, }; return vreinterpretq_m128_f32(res); } /* Compare operations */ // Compares for less than // https://msdn.microsoft.com/en-us/library/vstudio/f330yhc8(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmplt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for less than // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fy94wye7(v=vs.100) FORCE_INLINE __m128 _mm_cmplt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmplt_ps(a, b)); } // Compares for greater than. // // r0 := (a0 > b0) ? 0xffffffff : 0x0 // r1 := (a1 > b1) ? 0xffffffff : 0x0 // r2 := (a2 > b2) ? 0xffffffff : 0x0 // r3 := (a3 > b3) ? 0xffffffff : 0x0 // // https://msdn.microsoft.com/en-us/library/vstudio/11dy102s(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpgt_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for greater than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/1xyyyy9e(v=vs.100) FORCE_INLINE __m128 _mm_cmpgt_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpgt_ps(a, b)); } // Compares for greater than or equal. // https://msdn.microsoft.com/en-us/library/vstudio/fs813y2t(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpge_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for greater than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/kesh3ddc(v=vs.100) FORCE_INLINE __m128 _mm_cmpge_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpge_ps(a, b)); } // Compares for less than or equal. // // r0 := (a0 <= b0) ? 0xffffffff : 0x0 // r1 := (a1 <= b1) ? 0xffffffff : 0x0 // r2 := (a2 <= b2) ? 0xffffffff : 0x0 // r3 := (a3 <= b3) ? 0xffffffff : 0x0 // // https://msdn.microsoft.com/en-us/library/vstudio/1s75w83z(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmple_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for less than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/a7x0hbhw(v=vs.100) FORCE_INLINE __m128 _mm_cmple_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmple_ps(a, b)); } // Compares for equality. 
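// Illustrative sketch for _mm_dp_ps above (not part of the upstream header):
// the high four bits of imm select which lane products enter the sum and the
// low four bits select which destination lanes receive it. A 3-component dot
// product kept in lane 0 only, for some __m128 values a and b:
//
//   __m128 d  = _mm_dp_ps(a, b, 0x71);   // sum a[0..2]*b[0..2], write lane 0
//   float dot = _mm_cvtss_f32(d);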
// https://msdn.microsoft.com/en-us/library/vstudio/36aectz5(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpeq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); } // Compares for equality. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/k423z28e(v=vs.100) FORCE_INLINE __m128 _mm_cmpeq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpeq_ps(a, b)); } // Compares for inequality. // https://msdn.microsoft.com/en-us/library/sf44thbx(v=vs.100).aspx FORCE_INLINE __m128 _mm_cmpneq_ps(__m128 a, __m128 b) { return vreinterpretq_m128_u32(vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)))); } // Compares for inequality. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/ekya8fh4(v=vs.100) FORCE_INLINE __m128 _mm_cmpneq_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpneq_ps(a, b)); } // Compares for not greater than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/wsexys62(v=vs.100) FORCE_INLINE __m128 _mm_cmpnge_ps(__m128 a, __m128 b) { return _mm_cmplt_ps(a, b); } // Compares for not greater than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/fk2y80s8(v=vs.100) FORCE_INLINE __m128 _mm_cmpnge_ss(__m128 a, __m128 b) { return _mm_cmplt_ss(a, b); } // Compares for not greater than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/d0xh7w0s(v=vs.100) FORCE_INLINE __m128 _mm_cmpngt_ps(__m128 a, __m128 b) { return _mm_cmple_ps(a, b); } // Compares for not greater than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) FORCE_INLINE __m128 _mm_cmpngt_ss(__m128 a, __m128 b) { return _mm_cmple_ss(a, b); } // Compares for not less than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/6a330kxw(v=vs.100) FORCE_INLINE __m128 _mm_cmpnle_ps(__m128 a, __m128 b) { return _mm_cmpgt_ps(a, b); } // Compares for not less than or equal. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/z7x9ydwh(v=vs.100) FORCE_INLINE __m128 _mm_cmpnle_ss(__m128 a, __m128 b) { return _mm_cmpgt_ss(a, b); } // Compares for not less than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/4686bbdw(v=vs.100) FORCE_INLINE __m128 _mm_cmpnlt_ps(__m128 a, __m128 b) { return _mm_cmpge_ps(a, b); } // Compares for not less than. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/56b9z2wf(v=vs.100) FORCE_INLINE __m128 _mm_cmpnlt_ss(__m128 a, __m128 b) { return _mm_cmpge_ss(a, b); } // Compares the 16 signed or unsigned 8-bit integers in a and the 16 signed or // unsigned 8-bit integers in b for equality. // https://msdn.microsoft.com/en-us/library/windows/desktop/bz5xk21a(v=vs.90).aspx FORCE_INLINE __m128i _mm_cmpeq_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vceqq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compare packed double-precision (64-bit) floating-point elements in a and b // for equality, and store the results in dst. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cmpeq_pd FORCE_INLINE __m128d _mm_cmpeq_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_u64( vceqq_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128d(a), vreinterpretq_u32_m128d(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128d_u32(vandq_u32(cmp, swapped)); #endif } // Compares the 8 signed or unsigned 16-bit integers in a and the 8 signed or // unsigned 16-bit integers in b for equality. // https://msdn.microsoft.com/en-us/library/2ay060te(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpeq_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vceqq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compare packed 32-bit integers in a and b for equality, and store the results // in dst FORCE_INLINE __m128i _mm_cmpeq_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vceqq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compare packed 64-bit integers in a and b for equality, and store the results // in dst FORCE_INLINE __m128i _mm_cmpeq_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_u64( vceqq_u64(vreinterpretq_u64_m128i(a), vreinterpretq_u64_m128i(b))); #else // ARMv7 lacks vceqq_u64 // (a == b) -> (a_lo == b_lo) && (a_hi == b_hi) uint32x4_t cmp = vceqq_u32(vreinterpretq_u32_m128i(a), vreinterpretq_u32_m128i(b)); uint32x4_t swapped = vrev64q_u32(cmp); return vreinterpretq_m128i_u32(vandq_u32(cmp, swapped)); #endif } // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers // in b for lesser than. // https://msdn.microsoft.com/en-us/library/windows/desktop/9s46csht(v=vs.90).aspx FORCE_INLINE __m128i _mm_cmplt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vcltq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compares the 16 signed 8-bit integers in a and the 16 signed 8-bit integers // in b for greater than. // // r0 := (a0 > b0) ? 0xff : 0x0 // r1 := (a1 > b1) ? 0xff : 0x0 // ... // r15 := (a15 > b15) ? 0xff : 0x0 // // https://msdn.microsoft.com/zh-tw/library/wf45zt2b(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpgt_epi8(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vcgtq_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); } // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers // in b for less than. // // r0 := (a0 < b0) ? 0xffff : 0x0 // r1 := (a1 < b1) ? 0xffff : 0x0 // ... // r7 := (a7 < b7) ? 0xffff : 0x0 // // https://technet.microsoft.com/en-us/library/t863edb2(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmplt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcltq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compares the 8 signed 16-bit integers in a and the 8 signed 16-bit integers // in b for greater than. // // r0 := (a0 > b0) ? 0xffff : 0x0 // r1 := (a1 > b1) ? 0xffff : 0x0 // ... // r7 := (a7 > b7) ? 0xffff : 0x0 // // https://technet.microsoft.com/en-us/library/xd43yfsa(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpgt_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcgtq_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); } // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers // in b for less than. 
// https://msdn.microsoft.com/en-us/library/vstudio/4ak0bf5d(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmplt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcltq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compares the 4 signed 32-bit integers in a and the 4 signed 32-bit integers // in b for greater than. // https://msdn.microsoft.com/en-us/library/vstudio/1s9f2z0y(v=vs.100).aspx FORCE_INLINE __m128i _mm_cmpgt_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u32( vcgtq_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); } // Compares the 2 signed 64-bit integers in a and the 2 signed 64-bit integers // in b for greater than. FORCE_INLINE __m128i _mm_cmpgt_epi64(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_u64( vcgtq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b))); #else // ARMv7 lacks vcgtq_s64. // This is based off of Clang's SSE2 polyfill: // (a > b) -> ((a_hi > b_hi) || (a_lo > b_lo && a_hi == b_hi)) // Mask the sign bit out since we need a signed AND an unsigned comparison // and it is ugly to try and split them. int32x4_t mask = vreinterpretq_s32_s64(vdupq_n_s64(0x80000000ull)); int32x4_t a_mask = veorq_s32(vreinterpretq_s32_m128i(a), mask); int32x4_t b_mask = veorq_s32(vreinterpretq_s32_m128i(b), mask); // Check if a > b int64x2_t greater = vreinterpretq_s64_u32(vcgtq_s32(a_mask, b_mask)); // Copy upper mask to lower mask // a_hi > b_hi int64x2_t gt_hi = vshrq_n_s64(greater, 63); // Copy lower mask to upper mask // a_lo > b_lo int64x2_t gt_lo = vsliq_n_s64(greater, greater, 32); // Compare for equality int64x2_t equal = vreinterpretq_s64_u32(vceqq_s32(a_mask, b_mask)); // Copy upper mask to lower mask // a_hi == b_hi int64x2_t eq_hi = vshrq_n_s64(equal, 63); // a_hi > b_hi || (a_lo > b_lo && a_hi == b_hi) int64x2_t ret = vorrq_s64(gt_hi, vandq_s64(gt_lo, eq_hi)); return vreinterpretq_m128i_s64(ret); #endif } // Compares the four 32-bit floats in a and b to check if any values are NaN. // Ordered compare between each value returns true for "orderable" and false for // "not orderable" (NaN). // https://msdn.microsoft.com/en-us/library/vstudio/0h9w00fx(v=vs.100).aspx see // also: // http://stackoverflow.com/questions/8627331/what-does-ordered-unordered-comparison-mean // http://stackoverflow.com/questions/29349621/neon-isnanval-intrinsics FORCE_INLINE __m128 _mm_cmpord_ps(__m128 a, __m128 b) { // Note: NEON does not have ordered compare builtin // Need to compare a eq a and b eq b to check for NaN // Do AND of results to get final uint32x4_t ceqaa = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t ceqbb = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); return vreinterpretq_m128_u32(vandq_u32(ceqaa, ceqbb)); } // Compares for ordered. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/343t62da(v=vs.100) FORCE_INLINE __m128 _mm_cmpord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpord_ps(a, b)); } // Compares for unordered. // https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/khy6fk1t(v=vs.100) FORCE_INLINE __m128 _mm_cmpunord_ps(__m128 a, __m128 b) { uint32x4_t f32a = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t f32b = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); return vreinterpretq_m128_u32(vmvnq_u32(vandq_u32(f32a, f32b))); } // Compares for unordered. 
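// Illustrative sketch for the ordered/unordered compares above (not part of
// the upstream header): a value compares unequal to itself only when it is
// NaN, which is exactly what _mm_cmpord_ps/_mm_cmpunord_ps exploit
// (NAN from <math.h>, values chosen for illustration):
//
//   __m128 x = _mm_setr_ps(1.0f, NAN, 2.0f, 3.0f);
//   __m128 o = _mm_cmpord_ps(x, x);    // { ~0, 0, ~0, ~0 }: all-ones where orderable
//   __m128 u = _mm_cmpunord_ps(x, x);  // { 0, ~0, 0, 0 }: all-ones where NaN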
// https://docs.microsoft.com/en-us/previous-versions/visualstudio/visual-studio-2010/2as2387b(v=vs.100) FORCE_INLINE __m128 _mm_cmpunord_ss(__m128 a, __m128 b) { return _mm_move_ss(a, _mm_cmpunord_ps(a, b)); } // Compares the lower single-precision floating point scalar values of a and b // using a less than operation. : // https://msdn.microsoft.com/en-us/library/2kwe606b(v=vs.90).aspx Important // note!! The documentation on MSDN is incorrect! If either of the values is a // NAN the docs say you will get a one, but in fact, it will return a zero!! FORCE_INLINE int _mm_comilt_ss(__m128 a, __m128 b) { uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_lt_b = vcltq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_lt_b), 0) != 0) ? 1 : 0; } // Compares the lower single-precision floating point scalar values of a and b // using a greater than operation. : // https://msdn.microsoft.com/en-us/library/b0738e0t(v=vs.100).aspx FORCE_INLINE int _mm_comigt_ss(__m128 a, __m128 b) { // return vgetq_lane_u32(vcgtq_f32(vreinterpretq_f32_m128(a), // vreinterpretq_f32_m128(b)), 0); uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_gt_b = vcgtq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_gt_b), 0) != 0) ? 1 : 0; } // Compares the lower single-precision floating point scalar values of a and b // using a less than or equal operation. : // https://msdn.microsoft.com/en-us/library/1w4t7c57(v=vs.90).aspx FORCE_INLINE int _mm_comile_ss(__m128 a, __m128 b) { // return vgetq_lane_u32(vcleq_f32(vreinterpretq_f32_m128(a), // vreinterpretq_f32_m128(b)), 0); uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_le_b = vcleq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_le_b), 0) != 0) ? 1 : 0; } // Compares the lower single-precision floating point scalar values of a and b // using a greater than or equal operation. : // https://msdn.microsoft.com/en-us/library/8t80des6(v=vs.100).aspx FORCE_INLINE int _mm_comige_ss(__m128 a, __m128 b) { // return vgetq_lane_u32(vcgeq_f32(vreinterpretq_f32_m128(a), // vreinterpretq_f32_m128(b)), 0); uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_ge_b = vcgeq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_ge_b), 0) != 0) ? 1 : 0; } // Compares the lower single-precision floating point scalar values of a and b // using an equality operation. 
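// Illustrative note (not part of the original header): the _mm_comi*_ss family
// above returns a plain int (0 or 1) computed from the lowest lane only, and,
// as the comments point out, any NaN operand makes these ordered comparisons
// return 0 regardless of what the MSDN pages claim. A minimal sketch, assuming
// _mm_set_ss is provided elsewhere in this header:
//   _mm_comilt_ss(_mm_set_ss(1.0f), _mm_set_ss(2.0f));  // 1
//   _mm_comilt_ss(_mm_set_ss(NAN),  _mm_set_ss(2.0f));  // 0, NaN is never "less than"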
: // https://msdn.microsoft.com/en-us/library/93yx2h2b(v=vs.100).aspx FORCE_INLINE int _mm_comieq_ss(__m128 a, __m128 b) { // return vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), // vreinterpretq_f32_m128(b)), 0); uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); uint32x4_t a_and_b_not_nan = vandq_u32(a_not_nan, b_not_nan); uint32x4_t a_eq_b = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b)); return (vgetq_lane_u32(vandq_u32(a_and_b_not_nan, a_eq_b), 0) != 0) ? 1 : 0; } // Compares the lower single-precision floating point scalar values of a and b // using an inequality operation. : // https://msdn.microsoft.com/en-us/library/bafh5e0a(v=vs.90).aspx FORCE_INLINE int _mm_comineq_ss(__m128 a, __m128 b) { // return !vgetq_lane_u32(vceqq_f32(vreinterpretq_f32_m128(a), // vreinterpretq_f32_m128(b)), 0); uint32x4_t a_not_nan = vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(a)); uint32x4_t b_not_nan = vceqq_f32(vreinterpretq_f32_m128(b), vreinterpretq_f32_m128(b)); uint32x4_t a_or_b_nan = vmvnq_u32(vandq_u32(a_not_nan, b_not_nan)); uint32x4_t a_neq_b = vmvnq_u32( vceqq_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); return (vgetq_lane_u32(vorrq_u32(a_or_b_nan, a_neq_b), 0) != 0) ? 1 : 0; } // according to the documentation, these intrinsics behave the same as the // non-'u' versions. We'll just alias them here. #define _mm_ucomieq_ss _mm_comieq_ss #define _mm_ucomige_ss _mm_comige_ss #define _mm_ucomigt_ss _mm_comigt_ss #define _mm_ucomile_ss _mm_comile_ss #define _mm_ucomilt_ss _mm_comilt_ss #define _mm_ucomineq_ss _mm_comineq_ss /* Conversions */ // Convert packed signed 32-bit integers in b to packed single-precision // (32-bit) floating-point elements, store the results in the lower 2 elements // of dst, and copy the upper 2 packed elements from a to the upper elements of // dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[63:32] := Convert_Int32_To_FP32(b[63:32]) // dst[95:64] := a[95:64] // dst[127:96] := a[127:96] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_pi2ps FORCE_INLINE __m128 _mm_cvt_pi2ps(__m128 a, __m64 b) { return vreinterpretq_m128_f32( vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), vget_high_f32(vreinterpretq_f32_m128(a)))); } // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_si2ss FORCE_INLINE __m128 _mm_cvt_si2ss(__m128 a, int b) { return vreinterpretq_m128_f32( vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } // Convert the signed 32-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. 
// // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi32_ss #define _mm_cvtsi32_ss(a, b) _mm_cvt_si2ss(a, b) // Convert the signed 64-bit integer b to a single-precision (32-bit) // floating-point element, store the result in the lower element of dst, and // copy the upper 3 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int64_To_FP32(b[63:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi64_ss FORCE_INLINE __m128 _mm_cvtsi64_ss(__m128 a, int64_t b) { return vreinterpretq_m128_f32( vsetq_lane_f32((float) b, vreinterpretq_f32_m128(a), 0)); } // Convert the lower single-precision (32-bit) floating-point element in a to a // 32-bit integer, and store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ss2si FORCE_INLINE int _mm_cvt_ss2si(__m128 a) { #if defined(__aarch64__) return vgetq_lane_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)), 0); #else float32_t data = vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); float32_t diff = data - floor(data); if (diff > 0.5) return (int32_t) ceil(data); if (unlikely(diff == 0.5)) { int32_t f = (int32_t) floor(data); int32_t c = (int32_t) ceil(data); return c & 1 ? f : c; } return (int32_t) floor(data); #endif } // Convert packed 16-bit integers in a to packed single-precision (32-bit) // floating-point elements, and store the results in dst. // // FOR j := 0 to 3 // i := j*16 // m := j*32 // dst[m+31:m] := Convert_Int16_To_FP32(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi16_ps FORCE_INLINE __m128 _mm_cvtpi16_ps(__m64 a) { return vreinterpretq_m128_f32( vcvtq_f32_s32(vmovl_s16(vreinterpret_s16_m64(a)))); } // Convert packed 32-bit integers in b to packed single-precision (32-bit) // floating-point elements, store the results in the lower 2 elements of dst, // and copy the upper 2 packed elements from a to the upper elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(b[31:0]) // dst[63:32] := Convert_Int32_To_FP32(b[63:32]) // dst[95:64] := a[95:64] // dst[127:96] := a[127:96] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32_ps FORCE_INLINE __m128 _mm_cvtpi32_ps(__m128 a, __m64 b) { return vreinterpretq_m128_f32( vcombine_f32(vcvt_f32_s32(vreinterpret_s32_m64(b)), vget_high_f32(vreinterpretq_f32_m128(a)))); } // Convert packed signed 32-bit integers in a to packed single-precision // (32-bit) floating-point elements, store the results in the lower 2 elements // of dst, then covert the packed signed 32-bit integers in b to // single-precision (32-bit) floating-point element, and store the results in // the upper 2 elements of dst. // // dst[31:0] := Convert_Int32_To_FP32(a[31:0]) // dst[63:32] := Convert_Int32_To_FP32(a[63:32]) // dst[95:64] := Convert_Int32_To_FP32(b[31:0]) // dst[127:96] := Convert_Int32_To_FP32(b[63:32]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi32x2_ps FORCE_INLINE __m128 _mm_cvtpi32x2_ps(__m64 a, __m64 b) { return vreinterpretq_m128_f32(vcvtq_f32_s32( vcombine_s32(vreinterpret_s32_m64(a), vreinterpret_s32_m64(b)))); } // Convert the lower packed 8-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. 
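// Illustrative note (not part of the original header): _mm_cvt_ss2si above uses
// the SSE default rounding mode, round half to even, and the ARMv7 fallback
// reproduces that tie-breaking explicitly with floor/ceil. A minimal sketch,
// assuming _mm_set_ss is provided elsewhere in this header:
//   _mm_cvt_ss2si(_mm_set_ss(2.5f));  // 2, the tie goes to the even integer
//   _mm_cvt_ss2si(_mm_set_ss(3.5f));  // 4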
// // FOR j := 0 to 3 // i := j*8 // m := j*32 // dst[m+31:m] := Convert_Int8_To_FP32(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpi8_ps FORCE_INLINE __m128 _mm_cvtpi8_ps(__m64 a) { return vreinterpretq_m128_f32(vcvtq_f32_s32( vmovl_s16(vget_low_s16(vmovl_s8(vreinterpret_s8_m64(a)))))); } // Convert packed unsigned 16-bit integers in a to packed single-precision // (32-bit) floating-point elements, and store the results in dst. // // FOR j := 0 to 3 // i := j*16 // m := j*32 // dst[m+31:m] := Convert_UInt16_To_FP32(a[i+15:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu16_ps FORCE_INLINE __m128 _mm_cvtpu16_ps(__m64 a) { return vreinterpretq_m128_f32( vcvtq_f32_u32(vmovl_u16(vreinterpret_u16_m64(a)))); } // Convert the lower packed unsigned 8-bit integers in a to packed // single-precision (32-bit) floating-point elements, and store the results in // dst. // // FOR j := 0 to 3 // i := j*8 // m := j*32 // dst[m+31:m] := Convert_UInt8_To_FP32(a[i+7:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpu8_ps FORCE_INLINE __m128 _mm_cvtpu8_ps(__m64 a) { return vreinterpretq_m128_f32(vcvtq_f32_u32( vmovl_u16(vget_low_u16(vmovl_u8(vreinterpret_u8_m64(a)))))); } // Converts the four single-precision, floating-point values of a to signed // 32-bit integer values using truncate. // https://msdn.microsoft.com/en-us/library/vstudio/1h005y6x(v=vs.100).aspx FORCE_INLINE __m128i _mm_cvttps_epi32(__m128 a) { return vreinterpretq_m128i_s32(vcvtq_s32_f32(vreinterpretq_f32_m128(a))); } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64 FORCE_INLINE int64_t _mm_cvttsd_si64(__m128d a) { #if defined(__aarch64__) return vgetq_lane_s64(vcvtq_s64_f64(vreinterpretq_f64_m128d(a)), 0); #else double ret = *((double *) &a); return (int64_t) ret; #endif } // Convert the lower double-precision (64-bit) floating-point element in a to a // 64-bit integer with truncation, and store the result in dst. // // dst[63:0] := Convert_FP64_To_Int64_Truncate(a[63:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvttsd_si64x #define _mm_cvttsd_si64x(a) _mm_cvttsd_si64(a) // Converts the four signed 32-bit integer values of a to single-precision, // floating-point values // https://msdn.microsoft.com/en-us/library/vstudio/36bwxcx5(v=vs.100).aspx FORCE_INLINE __m128 _mm_cvtepi32_ps(__m128i a) { return vreinterpretq_m128_f32(vcvtq_f32_s32(vreinterpretq_s32_m128i(a))); } // Converts the four unsigned 8-bit integers in the lower 16 bits to four // unsigned 32-bit integers. FORCE_INLINE __m128i _mm_cvtepu8_epi16(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ return vreinterpretq_m128i_u16(u16x8); } // Converts the four unsigned 8-bit integers in the lower 32 bits to four // unsigned 32-bit integers. 
// https://msdn.microsoft.com/en-us/library/bb531467%28v=vs.100%29.aspx FORCE_INLINE __m128i _mm_cvtepu8_epi32(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx DCBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0D0C 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000D 000C 000B 000A */ return vreinterpretq_m128i_u32(u32x4); } // Converts the two unsigned 8-bit integers in the lower 16 bits to two // unsigned 64-bit integers. FORCE_INLINE __m128i _mm_cvtepu8_epi64(__m128i a) { uint8x16_t u8x16 = vreinterpretq_u8_m128i(a); /* xxxx xxxx xxxx xxBA */ uint16x8_t u16x8 = vmovl_u8(vget_low_u8(u8x16)); /* 0x0x 0x0x 0x0x 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_u64(u64x2); } // Converts the four unsigned 8-bit integers in the lower 16 bits to four // unsigned 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi8_epi16(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ return vreinterpretq_m128i_s16(s16x8); } // Converts the four unsigned 8-bit integers in the lower 32 bits to four // unsigned 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi8_epi32(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx DCBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0D0C 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000D 000C 000B 000A */ return vreinterpretq_m128i_s32(s32x4); } // Converts the two signed 8-bit integers in the lower 32 bits to four // signed 64-bit integers. FORCE_INLINE __m128i _mm_cvtepi8_epi64(__m128i a) { int8x16_t s8x16 = vreinterpretq_s8_m128i(a); /* xxxx xxxx xxxx xxBA */ int16x8_t s16x8 = vmovl_s8(vget_low_s8(s8x16)); /* 0x0x 0x0x 0x0x 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_s64(s64x2); } // Converts the four signed 16-bit integers in the lower 64 bits to four signed // 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi16_epi32(__m128i a) { return vreinterpretq_m128i_s32( vmovl_s16(vget_low_s16(vreinterpretq_s16_m128i(a)))); } // Converts the two signed 16-bit integers in the lower 32 bits two signed // 32-bit integers. FORCE_INLINE __m128i _mm_cvtepi16_epi64(__m128i a) { int16x8_t s16x8 = vreinterpretq_s16_m128i(a); /* xxxx xxxx xxxx 0B0A */ int32x4_t s32x4 = vmovl_s16(vget_low_s16(s16x8)); /* 000x 000x 000B 000A */ int64x2_t s64x2 = vmovl_s32(vget_low_s32(s32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_s64(s64x2); } // Converts the four unsigned 16-bit integers in the lower 64 bits to four // unsigned 32-bit integers. FORCE_INLINE __m128i _mm_cvtepu16_epi32(__m128i a) { return vreinterpretq_m128i_u32( vmovl_u16(vget_low_u16(vreinterpretq_u16_m128i(a)))); } // Converts the two unsigned 16-bit integers in the lower 32 bits to two // unsigned 64-bit integers. FORCE_INLINE __m128i _mm_cvtepu16_epi64(__m128i a) { uint16x8_t u16x8 = vreinterpretq_u16_m128i(a); /* xxxx xxxx xxxx 0B0A */ uint32x4_t u32x4 = vmovl_u16(vget_low_u16(u16x8)); /* 000x 000x 000B 000A */ uint64x2_t u64x2 = vmovl_u32(vget_low_u32(u32x4)); /* 0000 000B 0000 000A */ return vreinterpretq_m128i_u64(u64x2); } // Converts the two unsigned 32-bit integers in the lower 64 bits to two // unsigned 64-bit integers. 
FORCE_INLINE __m128i _mm_cvtepu32_epi64(__m128i a) { return vreinterpretq_m128i_u64( vmovl_u32(vget_low_u32(vreinterpretq_u32_m128i(a)))); } // Converts the two signed 32-bit integers in the lower 64 bits to two signed // 64-bit integers. FORCE_INLINE __m128i _mm_cvtepi32_epi64(__m128i a) { return vreinterpretq_m128i_s64( vmovl_s32(vget_low_s32(vreinterpretq_s32_m128i(a)))); } // Converts the four single-precision, floating-point values of a to signed // 32-bit integer values. // // r0 := (int) a0 // r1 := (int) a1 // r2 := (int) a2 // r3 := (int) a3 // // https://msdn.microsoft.com/en-us/library/vstudio/xdc42k5e(v=vs.100).aspx // *NOTE*. The default rounding mode on SSE is 'round to even', which ARMv7-A // does not support! It is supported on ARMv8-A however. FORCE_INLINE __m128i _mm_cvtps_epi32(__m128 a) { #if defined(__aarch64__) return vreinterpretq_m128i_s32(vcvtnq_s32_f32(a)); #else uint32x4_t signmask = vdupq_n_u32(0x80000000); float32x4_t half = vbslq_f32(signmask, vreinterpretq_f32_m128(a), vdupq_n_f32(0.5f)); /* +/- 0.5 */ int32x4_t r_normal = vcvtq_s32_f32(vaddq_f32( vreinterpretq_f32_m128(a), half)); /* round to integer: [a + 0.5]*/ int32x4_t r_trunc = vcvtq_s32_f32(vreinterpretq_f32_m128(a)); /* truncate to integer: [a] */ int32x4_t plusone = vreinterpretq_s32_u32(vshrq_n_u32( vreinterpretq_u32_s32(vnegq_s32(r_trunc)), 31)); /* 1 or 0 */ int32x4_t r_even = vbicq_s32(vaddq_s32(r_trunc, plusone), vdupq_n_s32(1)); /* ([a] + {0,1}) & ~1 */ float32x4_t delta = vsubq_f32( vreinterpretq_f32_m128(a), vcvtq_f32_s32(r_trunc)); /* compute delta: delta = (a - [a]) */ uint32x4_t is_delta_half = vceqq_f32(delta, half); /* delta == +/- 0.5 */ return vreinterpretq_m128i_s32(vbslq_s32(is_delta_half, r_even, r_normal)); #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 16-bit integers, and store the results in dst. Note: this intrinsic // will generate 0x7FFF, rather than 0x8000, for input values between 0x7FFF and // 0x7FFFFFFF. // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi16 FORCE_INLINE __m64 _mm_cvtps_pi16(__m128 a) { return vreinterpret_m64_s16( vmovn_s32(vreinterpretq_s32_m128i(_mm_cvtps_epi32(a)))); } // Copy the lower 32-bit integer in a to dst. // // dst[31:0] := a[31:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si32 FORCE_INLINE int _mm_cvtsi128_si32(__m128i a) { return vgetq_lane_s32(vreinterpretq_s32_m128i(a), 0); } // Copy the lower 64-bit integer in a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64 FORCE_INLINE int64_t _mm_cvtsi128_si64(__m128i a) { return vgetq_lane_s64(vreinterpretq_s64_m128i(a), 0); } // Copy the lower 64-bit integer in a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsi128_si64x #define _mm_cvtsi128_si64x(a) _mm_cvtsi128_si64(a) // Moves 32-bit integer a to the least significant 32 bits of an __m128 object, // zero extending the upper bits. // // r0 := a // r1 := 0x0 // r2 := 0x0 // r3 := 0x0 // // https://msdn.microsoft.com/en-us/library/ct3539ha%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_cvtsi32_si128(int a) { return vreinterpretq_m128i_s32(vsetq_lane_s32(a, vdupq_n_s32(0), 0)); } // Moves 64-bit integer a to the least significant 64 bits of an __m128 object, // zero extending the upper bits. 
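// Illustrative note (not part of the original header): the ARMv7 branch of
// _mm_cvtps_epi32 above emulates round half to even without vcvtnq. Worked
// through for a lane holding 2.5f: r_trunc = 2, plusone = 1, so
// r_even = (2 + 1) & ~1 = 2, while r_normal = (int)(2.5 + 0.5) = 3; since
// delta = 2.5 - 2 = 0.5 equals half, r_even (2) is selected. For 3.5f the same
// path produces 4, matching the SSE result.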
// // r0 := a // r1 := 0x0 FORCE_INLINE __m128i _mm_cvtsi64_si128(int64_t a) { return vreinterpretq_m128i_s64(vsetq_lane_s64(a, vdupq_n_s64(0), 0)); } // Cast vector of type __m128 to type __m128d. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castps_pd FORCE_INLINE __m128d _mm_castps_pd(__m128 a) { return vreinterpretq_m128d_s32(vreinterpretq_s32_m128(a)); } // Applies a type cast to reinterpret four 32-bit floating point values passed // in as a 128-bit parameter as packed 32-bit integers. // https://msdn.microsoft.com/en-us/library/bb514099.aspx FORCE_INLINE __m128i _mm_castps_si128(__m128 a) { return vreinterpretq_m128i_s32(vreinterpretq_s32_m128(a)); } // Cast vector of type __m128i to type __m128d. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castsi128_pd FORCE_INLINE __m128d _mm_castsi128_pd(__m128i a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vreinterpretq_f64_m128i(a)); #else return vreinterpretq_m128d_f32(vreinterpretq_f32_m128i(a)); #endif } // Applies a type cast to reinterpret four 32-bit integers passed in as a // 128-bit parameter as packed 32-bit floating point values. // https://msdn.microsoft.com/en-us/library/bb514029.aspx FORCE_INLINE __m128 _mm_castsi128_ps(__m128i a) { return vreinterpretq_m128_s32(vreinterpretq_s32_m128i(a)); } // Loads 128-bit value. : // https://msdn.microsoft.com/en-us/library/atzzad1h(v=vs.80).aspx FORCE_INLINE __m128i _mm_load_si128(const __m128i *p) { return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load1_pd FORCE_INLINE __m128d _mm_load1_pd(const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64(vld1q_dup_f64(p)); #else return vreinterpretq_m128d_s64(vdupq_n_s64(*(const int64_t *) p)); #endif } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 #define _mm_load_pd1 _mm_load1_pd // Load a double-precision (64-bit) floating-point element from memory into the // upper element of dst, and copy the lower element from a to dst. mem_addr does // not need to be aligned on any particular boundary. // // dst[63:0] := a[63:0] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadh_pd FORCE_INLINE __m128d _mm_loadh_pd(__m128d a, const double *p) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcombine_f64(vget_low_f64(vreinterpretq_f64_m128d(a)), vld1_f64(p))); #else return vreinterpretq_m128d_f32(vcombine_f32( vget_low_f32(vreinterpretq_f32_m128d(a)), vld1_f32((const float *) p))); #endif } // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. 
// // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_load_pd1 #define _mm_load_pd1 _mm_load1_pd // Load a double-precision (64-bit) floating-point element from memory into both // elements of dst. // // dst[63:0] := MEM[mem_addr+63:mem_addr] // dst[127:64] := MEM[mem_addr+63:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loaddup_pd #define _mm_loaddup_pd _mm_load1_pd // Loads 128-bit value. : // https://msdn.microsoft.com/zh-cn/library/f4k12ae8(v=vs.90).aspx FORCE_INLINE __m128i _mm_loadu_si128(const __m128i *p) { return vreinterpretq_m128i_s32(vld1q_s32((const int32_t *) p)); } // Load unaligned 32-bit integer from memory into the first element of dst. // // dst[31:0] := MEM[mem_addr+31:mem_addr] // dst[MAX:32] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_loadu_si32 FORCE_INLINE __m128i _mm_loadu_si32(const void *p) { return vreinterpretq_m128i_s32( vsetq_lane_s32(*(const int32_t *) p, vdupq_n_s32(0), 0)); } // Convert packed double-precision (64-bit) floating-point elements in a to // packed single-precision (32-bit) floating-point elements, and store the // results in dst. // // FOR j := 0 to 1 // i := 32*j // k := 64*j // dst[i+31:i] := Convert_FP64_To_FP32(a[k+64:k]) // ENDFOR // dst[127:64] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtpd_ps FORCE_INLINE __m128 _mm_cvtpd_ps(__m128d a) { #if defined(__aarch64__) float32x2_t tmp = vcvt_f32_f64(vreinterpretq_f64_m128d(a)); return vreinterpretq_m128_f32(vcombine_f32(tmp, vdup_n_f32(0))); #else float a0 = (float) ((double *) &a)[0]; float a1 = (float) ((double *) &a)[1]; return _mm_set_ps(0, 0, a1, a0); #endif } // Copy the lower double-precision (64-bit) floating-point element of a to dst. // // dst[63:0] := a[63:0] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtsd_f64 FORCE_INLINE double _mm_cvtsd_f64(__m128d a) { #if defined(__aarch64__) return (double) vgetq_lane_f64(vreinterpretq_f64_m128d(a), 0); #else return ((double *) &a)[0]; #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed double-precision (64-bit) floating-point elements, and store the // results in dst. // // FOR j := 0 to 1 // i := 64*j // k := 32*j // dst[i+63:i] := Convert_FP32_To_FP64(a[k+31:k]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pd FORCE_INLINE __m128d _mm_cvtps_pd(__m128 a) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vcvt_f64_f32(vget_low_f32(vreinterpretq_f32_m128(a)))); #else double a0 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 0); double a1 = (double) vgetq_lane_f32(vreinterpretq_f32_m128(a), 1); return _mm_set_pd(a1, a0); #endif } // Cast vector of type __m128d to type __m128i. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_si128 FORCE_INLINE __m128i _mm_castpd_si128(__m128d a) { return vreinterpretq_m128i_s64(vreinterpretq_s64_m128d(a)); } // Cast vector of type __m128d to type __m128. This intrinsic is only used for // compilation and does not generate any instructions, thus it has zero latency. 
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_castpd_ps FORCE_INLINE __m128 _mm_castpd_ps(__m128d a) { return vreinterpretq_m128_s64(vreinterpretq_s64_m128d(a)); } // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_ps FORCE_INLINE __m128 _mm_blendv_ps(__m128 _a, __m128 _b, __m128 _mask) { // Use a signed shift right to create a mask with the sign bit uint32x4_t mask = vreinterpretq_u32_s32(vshrq_n_s32(vreinterpretq_s32_m128(_mask), 31)); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } // Blend packed single-precision (32-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blend_ps FORCE_INLINE __m128 _mm_blend_ps(__m128 _a, __m128 _b, const char imm8) { const uint32_t ALIGN_STRUCT(16) data[4] = {((imm8) & (1 << 0)) ? UINT32_MAX : 0, ((imm8) & (1 << 1)) ? UINT32_MAX : 0, ((imm8) & (1 << 2)) ? UINT32_MAX : 0, ((imm8) & (1 << 3)) ? UINT32_MAX : 0}; uint32x4_t mask = vld1q_u32(data); float32x4_t a = vreinterpretq_f32_m128(_a); float32x4_t b = vreinterpretq_f32_m128(_b); return vreinterpretq_m128_f32(vbslq_f32(mask, b, a)); } // Blend packed double-precision (64-bit) floating-point elements from a and b // using mask, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_blendv_pd FORCE_INLINE __m128d _mm_blendv_pd(__m128d _a, __m128d _b, __m128d _mask) { uint64x2_t mask = vreinterpretq_u64_s64(vshrq_n_s64(vreinterpretq_s64_m128d(_mask), 63)); #if defined(__aarch64__) float64x2_t a = vreinterpretq_f64_m128d(_a); float64x2_t b = vreinterpretq_f64_m128d(_b); return vreinterpretq_m128d_f64(vbslq_f64(mask, b, a)); #else uint64x2_t a = vreinterpretq_u64_m128d(_a); uint64x2_t b = vreinterpretq_u64_m128d(_b); return vreinterpretq_m128d_u64(vbslq_u64(mask, b, a)); #endif } typedef struct { uint16_t res0; uint8_t res1 : 6; uint8_t bit22 : 1; uint8_t bit23 : 1; uint8_t res2; #if defined(__aarch64__) uint32_t res3; #endif } fpcr_bitfield; // Macro: Set the rounding mode bits of the MXCSR control and status register to // the value in unsigned 32-bit integer a. 
The rounding mode may contain any of // the following flags: _MM_ROUND_NEAREST, _MM_ROUND_DOWN, _MM_ROUND_UP, // _MM_ROUND_TOWARD_ZERO // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_MM_SET_ROUNDING_MODE FORCE_INLINE void _MM_SET_ROUNDING_MODE(int rounding) { union { fpcr_bitfield field; #if defined(__aarch64__) uint64_t value; #else uint32_t value; #endif } r; #if defined(__aarch64__) asm volatile("mrs %0, FPCR" : "=r"(r.value)); /* read */ #else asm volatile("vmrs %0, FPSCR" : "=r"(r.value)); /* read */ #endif switch (rounding) { case _MM_ROUND_TOWARD_ZERO: r.field.bit22 = 1; r.field.bit23 = 1; break; case _MM_ROUND_DOWN: r.field.bit22 = 0; r.field.bit23 = 1; break; case _MM_ROUND_UP: r.field.bit22 = 1; r.field.bit23 = 0; break; default: //_MM_ROUND_NEAREST r.field.bit22 = 0; r.field.bit23 = 0; } #if defined(__aarch64__) asm volatile("msr FPCR, %0" ::"r"(r)); /* write */ #else asm volatile("vmsr FPSCR, %0" ::"r"(r)); /* write */ #endif } FORCE_INLINE void _mm_setcsr(unsigned int a) { _MM_SET_ROUNDING_MODE(a); } // Round the packed single-precision (32-bit) floating-point elements in a using // the rounding parameter, and store the results as packed single-precision // floating-point elements in dst. // software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_round_ps FORCE_INLINE __m128 _mm_round_ps(__m128 a, int rounding) { #if defined(__aarch64__) switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndnq_f32(vreinterpretq_f32_m128(a))); case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndmq_f32(vreinterpretq_f32_m128(a))); case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndpq_f32(vreinterpretq_f32_m128(a))); case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): return vreinterpretq_m128_f32(vrndq_f32(vreinterpretq_f32_m128(a))); default: //_MM_FROUND_CUR_DIRECTION return vreinterpretq_m128_f32(vrndiq_f32(vreinterpretq_f32_m128(a))); } #else float *v_float = (float *) &a; __m128 zero, neg_inf, pos_inf; switch (rounding) { case (_MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC): return _mm_cvtepi32_ps(_mm_cvtps_epi32(a)); case (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC): return (__m128){floorf(v_float[0]), floorf(v_float[1]), floorf(v_float[2]), floorf(v_float[3])}; case (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC): return (__m128){ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), ceilf(v_float[3])}; case (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC): zero = _mm_set_ps(0.0f, 0.0f, 0.0f, 0.0f); neg_inf = _mm_set_ps(floorf(v_float[0]), floorf(v_float[1]), floorf(v_float[2]), floorf(v_float[3])); pos_inf = _mm_set_ps(ceilf(v_float[0]), ceilf(v_float[1]), ceilf(v_float[2]), ceilf(v_float[3])); return _mm_blendv_ps(pos_inf, neg_inf, _mm_cmple_ps(a, zero)); default: //_MM_FROUND_CUR_DIRECTION return (__m128){roundf(v_float[0]), roundf(v_float[1]), roundf(v_float[2]), roundf(v_float[3])}; } #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. 
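// Illustrative note (not part of the original header): _MM_SET_ROUNDING_MODE
// above only maps the four rounding modes onto the FPCR/FPSCR RMode bits; the
// exception-mask bits of a real MXCSR are not modelled, and _mm_setcsr simply
// forwards to it. _mm_round_ps can be used directly for per-lane floor/ceil,
// for example, given a __m128 v:
//   _mm_round_ps(v, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC);  // floor each lane
//   _mm_round_ps(v, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC);  // ceil each lane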
// // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvt_ps2pi FORCE_INLINE __m64 _mm_cvt_ps2pi(__m128 a) { #if defined(__aarch64__) return vreinterpret_m64_s32( vget_low_s32(vcvtnq_s32_f32(vreinterpretq_f32_m128(a)))); #else return vreinterpret_m64_s32( vcvt_s32_f32(vget_low_f32(vreinterpretq_f32_m128( _mm_round_ps(a, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC))))); #endif } // Convert packed single-precision (32-bit) floating-point elements in a to // packed 32-bit integers, and store the results in dst. // // FOR j := 0 to 1 // i := 32*j // dst[i+31:i] := Convert_FP32_To_Int32(a[i+31:i]) // ENDFOR // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_cvtps_pi32 #define _mm_cvtps_pi32(a) _mm_cvt_ps2pi(a) // Round the packed single-precision (32-bit) floating-point elements in a up to // an integer value, and store the results as packed single-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ps FORCE_INLINE __m128 _mm_ceil_ps(__m128 a) { return _mm_round_ps(a, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC); } // Round the lower single-precision (32-bit) floating-point element in b up to // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. // // dst[31:0] := CEIL(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_ceil_ss FORCE_INLINE __m128 _mm_ceil_ss(__m128 a, __m128 b) { return _mm_move_ss( a, _mm_round_ps(b, _MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC)); } // Round the packed single-precision (32-bit) floating-point elements in a down // to an integer value, and store the results as packed single-precision // floating-point elements in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ps FORCE_INLINE __m128 _mm_floor_ps(__m128 a) { return _mm_round_ps(a, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC); } // Round the lower single-precision (32-bit) floating-point element in b down to // an integer value, store the result as a single-precision floating-point // element in the lower element of dst, and copy the upper 3 packed elements // from a to the upper elements of dst. // // dst[31:0] := FLOOR(b[31:0]) // dst[127:32] := a[127:32] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_floor_ss FORCE_INLINE __m128 _mm_floor_ss(__m128 a, __m128 b) { return _mm_move_ss( a, _mm_round_ps(b, _MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC)); } // Load 128-bits of integer data from unaligned memory into dst. This intrinsic // may perform better than _mm_loadu_si128 when the data crosses a cache line // boundary. // // dst[127:0] := MEM[mem_addr+127:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_lddqu_si128 #define _mm_lddqu_si128 _mm_loadu_si128 /* Miscellaneous Operations */ // Shifts the 8 signed 16-bit integers in a right by count bits while shifting // in the sign bit. // // r0 := a0 >> count // r1 := a1 >> count // ... 
// r7 := a7 >> count // // https://msdn.microsoft.com/en-us/library/3c9997dk(v%3dvs.90).aspx FORCE_INLINE __m128i _mm_sra_epi16(__m128i a, __m128i count) { int64_t c = (int64_t) vget_low_s64((int64x2_t) count); if (unlikely(c > 15)) return _mm_cmplt_epi16(a, _mm_setzero_si128()); return vreinterpretq_m128i_s16(vshlq_s16((int16x8_t) a, vdupq_n_s16(-c))); } // Shifts the 4 signed 32-bit integers in a right by count bits while shifting // in the sign bit. // // r0 := a0 >> count // r1 := a1 >> count // r2 := a2 >> count // r3 := a3 >> count // // https://msdn.microsoft.com/en-us/library/ce40009e(v%3dvs.100).aspx FORCE_INLINE __m128i _mm_sra_epi32(__m128i a, __m128i count) { int64_t c = (int64_t) vget_low_s64((int64x2_t) count); if (unlikely(c > 31)) return _mm_cmplt_epi32(a, _mm_setzero_si128()); return vreinterpretq_m128i_s32(vshlq_s32((int32x4_t) a, vdupq_n_s32(-c))); } // Packs the 16 signed 16-bit integers from a and b into 8-bit integers and // saturates. // https://msdn.microsoft.com/en-us/library/k4y4f7w5%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_packs_epi16(__m128i a, __m128i b) { return vreinterpretq_m128i_s8( vcombine_s8(vqmovn_s16(vreinterpretq_s16_m128i(a)), vqmovn_s16(vreinterpretq_s16_m128i(b)))); } // Packs the 16 signed 16 - bit integers from a and b into 8 - bit unsigned // integers and saturates. // // r0 := UnsignedSaturate(a0) // r1 := UnsignedSaturate(a1) // ... // r7 := UnsignedSaturate(a7) // r8 := UnsignedSaturate(b0) // r9 := UnsignedSaturate(b1) // ... // r15 := UnsignedSaturate(b7) // // https://msdn.microsoft.com/en-us/library/07ad1wx4(v=vs.100).aspx FORCE_INLINE __m128i _mm_packus_epi16(const __m128i a, const __m128i b) { return vreinterpretq_m128i_u8( vcombine_u8(vqmovun_s16(vreinterpretq_s16_m128i(a)), vqmovun_s16(vreinterpretq_s16_m128i(b)))); } // Packs the 8 signed 32-bit integers from a and b into signed 16-bit integers // and saturates. // // r0 := SignedSaturate(a0) // r1 := SignedSaturate(a1) // r2 := SignedSaturate(a2) // r3 := SignedSaturate(a3) // r4 := SignedSaturate(b0) // r5 := SignedSaturate(b1) // r6 := SignedSaturate(b2) // r7 := SignedSaturate(b3) // // https://msdn.microsoft.com/en-us/library/393t56f9%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_packs_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_s16( vcombine_s16(vqmovn_s32(vreinterpretq_s32_m128i(a)), vqmovn_s32(vreinterpretq_s32_m128i(b)))); } // Packs the 8 unsigned 32-bit integers from a and b into unsigned 16-bit // integers and saturates. // // r0 := UnsignedSaturate(a0) // r1 := UnsignedSaturate(a1) // r2 := UnsignedSaturate(a2) // r3 := UnsignedSaturate(a3) // r4 := UnsignedSaturate(b0) // r5 := UnsignedSaturate(b1) // r6 := UnsignedSaturate(b2) // r7 := UnsignedSaturate(b3) FORCE_INLINE __m128i _mm_packus_epi32(__m128i a, __m128i b) { return vreinterpretq_m128i_u16( vcombine_u16(vqmovun_s32(vreinterpretq_s32_m128i(a)), vqmovun_s32(vreinterpretq_s32_m128i(b)))); } // Interleaves the lower 8 signed or unsigned 8-bit integers in a with the lower // 8 signed or unsigned 8-bit integers in b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // ... 
// r14 := a7 // r15 := b7 // // https://msdn.microsoft.com/en-us/library/xf7k860c%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_unpacklo_epi8(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s8( vzip1q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else int8x8_t a1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(a))); int8x8_t b1 = vreinterpret_s8_s16(vget_low_s16(vreinterpretq_s16_m128i(b))); int8x8x2_t result = vzip_s8(a1, b1); return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); #endif } // Interleaves the lower 4 signed or unsigned 16-bit integers in a with the // lower 4 signed or unsigned 16-bit integers in b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // r4 := a2 // r5 := b2 // r6 := a3 // r7 := b3 // // https://msdn.microsoft.com/en-us/library/btxb17bw%28v=vs.90%29.aspx FORCE_INLINE __m128i _mm_unpacklo_epi16(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s16( vzip1q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else int16x4_t a1 = vget_low_s16(vreinterpretq_s16_m128i(a)); int16x4_t b1 = vget_low_s16(vreinterpretq_s16_m128i(b)); int16x4x2_t result = vzip_s16(a1, b1); return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); #endif } // Interleaves the lower 2 signed or unsigned 32 - bit integers in a with the // lower 2 signed or unsigned 32 - bit integers in b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // // https://msdn.microsoft.com/en-us/library/x8atst9d(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpacklo_epi32(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s32( vzip1q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else int32x2_t a1 = vget_low_s32(vreinterpretq_s32_m128i(a)); int32x2_t b1 = vget_low_s32(vreinterpretq_s32_m128i(b)); int32x2x2_t result = vzip_s32(a1, b1); return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); #endif } FORCE_INLINE __m128i _mm_unpacklo_epi64(__m128i a, __m128i b) { int64x1_t a_l = vget_low_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_l = vget_low_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_l, b_l)); } // Selects and interleaves the lower two single-precision, floating-point values // from a and b. // // r0 := a0 // r1 := b0 // r2 := a1 // r3 := b1 // // https://msdn.microsoft.com/en-us/library/25st103b%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_unpacklo_ps(__m128 a, __m128 b) { #if defined(__aarch64__) return vreinterpretq_m128_f32( vzip1q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a1 = vget_low_f32(vreinterpretq_f32_m128(a)); float32x2_t b1 = vget_low_f32(vreinterpretq_f32_m128(b)); float32x2x2_t result = vzip_f32(a1, b1); return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); #endif } // Unpack and interleave double-precision (64-bit) floating-point elements from // the low half of a and b, and store the results in dst. 
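// Illustrative note (not part of the original header): a common use of the
// interleave intrinsics in this block is widening by interleaving with zero,
// e.g. zero-extending the low 8 bytes of a vector into 16-bit lanes (a minimal
// sketch, assuming _mm_setzero_si128 is provided elsewhere in this header; the
// matching _mm_unpackhi_epi8 below handles the high half):
//   __m128i zero  = _mm_setzero_si128();
//   __m128i lo_16 = _mm_unpacklo_epi8(v, zero);  // low 8 bytes of v, zero-extended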
// // DEFINE INTERLEAVE_QWORDS(src1[127:0], src2[127:0]) { // dst[63:0] := src1[63:0] // dst[127:64] := src2[63:0] // RETURN dst[127:0] // } // dst[127:0] := INTERLEAVE_QWORDS(a[127:0], b[127:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpacklo_pd FORCE_INLINE __m128d _mm_unpacklo_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vzip1q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else return vreinterpretq_m128d_s64( vcombine_s64(vget_low_s64(vreinterpretq_s64_m128d(a)), vget_low_s64(vreinterpretq_s64_m128d(b)))); #endif } // Unpack and interleave double-precision (64-bit) floating-point elements from // the high half of a and b, and store the results in dst. // // DEFINE INTERLEAVE_HIGH_QWORDS(src1[127:0], src2[127:0]) { // dst[63:0] := src1[127:64] // dst[127:64] := src2[127:64] // RETURN dst[127:0] // } // dst[127:0] := INTERLEAVE_HIGH_QWORDS(a[127:0], b[127:0]) // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_unpackhi_pd FORCE_INLINE __m128d _mm_unpackhi_pd(__m128d a, __m128d b) { #if defined(__aarch64__) return vreinterpretq_m128d_f64( vzip2q_f64(vreinterpretq_f64_m128d(a), vreinterpretq_f64_m128d(b))); #else return vreinterpretq_m128d_s64( vcombine_s64(vget_high_s64(vreinterpretq_s64_m128d(a)), vget_high_s64(vreinterpretq_s64_m128d(b)))); #endif } // Selects and interleaves the upper two single-precision, floating-point values // from a and b. // // r0 := a2 // r1 := b2 // r2 := a3 // r3 := b3 // // https://msdn.microsoft.com/en-us/library/skccxx7d%28v=vs.90%29.aspx FORCE_INLINE __m128 _mm_unpackhi_ps(__m128 a, __m128 b) { #if defined(__aarch64__) return vreinterpretq_m128_f32( vzip2q_f32(vreinterpretq_f32_m128(a), vreinterpretq_f32_m128(b))); #else float32x2_t a1 = vget_high_f32(vreinterpretq_f32_m128(a)); float32x2_t b1 = vget_high_f32(vreinterpretq_f32_m128(b)); float32x2x2_t result = vzip_f32(a1, b1); return vreinterpretq_m128_f32(vcombine_f32(result.val[0], result.val[1])); #endif } // Interleaves the upper 8 signed or unsigned 8-bit integers in a with the upper // 8 signed or unsigned 8-bit integers in b. // // r0 := a8 // r1 := b8 // r2 := a9 // r3 := b9 // ... // r14 := a15 // r15 := b15 // // https://msdn.microsoft.com/en-us/library/t5h7783k(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpackhi_epi8(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s8( vzip2q_s8(vreinterpretq_s8_m128i(a), vreinterpretq_s8_m128i(b))); #else int8x8_t a1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(a))); int8x8_t b1 = vreinterpret_s8_s16(vget_high_s16(vreinterpretq_s16_m128i(b))); int8x8x2_t result = vzip_s8(a1, b1); return vreinterpretq_m128i_s8(vcombine_s8(result.val[0], result.val[1])); #endif } // Interleaves the upper 4 signed or unsigned 16-bit integers in a with the // upper 4 signed or unsigned 16-bit integers in b. 
// // r0 := a4 // r1 := b4 // r2 := a5 // r3 := b5 // r4 := a6 // r5 := b6 // r6 := a7 // r7 := b7 // // https://msdn.microsoft.com/en-us/library/03196cz7(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpackhi_epi16(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s16( vzip2q_s16(vreinterpretq_s16_m128i(a), vreinterpretq_s16_m128i(b))); #else int16x4_t a1 = vget_high_s16(vreinterpretq_s16_m128i(a)); int16x4_t b1 = vget_high_s16(vreinterpretq_s16_m128i(b)); int16x4x2_t result = vzip_s16(a1, b1); return vreinterpretq_m128i_s16(vcombine_s16(result.val[0], result.val[1])); #endif } // Interleaves the upper 2 signed or unsigned 32-bit integers in a with the // upper 2 signed or unsigned 32-bit integers in b. // https://msdn.microsoft.com/en-us/library/65sa7cbs(v=vs.100).aspx FORCE_INLINE __m128i _mm_unpackhi_epi32(__m128i a, __m128i b) { #if defined(__aarch64__) return vreinterpretq_m128i_s32( vzip2q_s32(vreinterpretq_s32_m128i(a), vreinterpretq_s32_m128i(b))); #else int32x2_t a1 = vget_high_s32(vreinterpretq_s32_m128i(a)); int32x2_t b1 = vget_high_s32(vreinterpretq_s32_m128i(b)); int32x2x2_t result = vzip_s32(a1, b1); return vreinterpretq_m128i_s32(vcombine_s32(result.val[0], result.val[1])); #endif } // Interleaves the upper signed or unsigned 64-bit integer in a with the // upper signed or unsigned 64-bit integer in b. // // r0 := a1 // r1 := b1 FORCE_INLINE __m128i _mm_unpackhi_epi64(__m128i a, __m128i b) { int64x1_t a_h = vget_high_s64(vreinterpretq_s64_m128i(a)); int64x1_t b_h = vget_high_s64(vreinterpretq_s64_m128i(b)); return vreinterpretq_m128i_s64(vcombine_s64(a_h, b_h)); } // Horizontally compute the minimum amongst the packed unsigned 16-bit integers // in a, store the minimum and index in dst, and zero the remaining bits in dst. // // index[2:0] := 0 // min[15:0] := a[15:0] // FOR j := 0 to 7 // i := j*16 // IF a[i+15:i] < min[15:0] // index[2:0] := j // min[15:0] := a[i+15:i] // FI // ENDFOR // dst[15:0] := min[15:0] // dst[18:16] := index[2:0] // dst[127:19] := 0 // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_minpos_epu16 FORCE_INLINE __m128i _mm_minpos_epu16(__m128i a) { __m128i dst; uint16_t min, idx = 0; // Find the minimum value #if defined(__aarch64__) min = vminvq_u16(vreinterpretq_u16_m128i(a)); #else __m64 tmp; tmp = vreinterpret_m64_u16( vmin_u16(vget_low_u16(vreinterpretq_u16_m128i(a)), vget_high_u16(vreinterpretq_u16_m128i(a)))); tmp = vreinterpret_m64_u16( vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); tmp = vreinterpret_m64_u16( vpmin_u16(vreinterpret_u16_m64(tmp), vreinterpret_u16_m64(tmp))); min = vget_lane_u16(vreinterpret_u16_m64(tmp), 0); #endif // Get the index of the minimum value int i; for (i = 0; i < 8; i++) { if (min == vgetq_lane_u16(vreinterpretq_u16_m128i(a), 0)) { idx = (uint16_t) i; break; } a = _mm_srli_si128(a, 2); } // Generate result dst = _mm_setzero_si128(); dst = vreinterpretq_m128i_u16( vsetq_lane_u16(min, vreinterpretq_u16_m128i(dst), 0)); dst = vreinterpretq_m128i_u16( vsetq_lane_u16(idx, vreinterpretq_u16_m128i(dst), 1)); return dst; } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return the CF value. 
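// Illustrative note (not part of the original header): _mm_minpos_epu16 above
// packs both results into the destination: the minimum value lands in 16-bit
// lane 0 and its index in lane 1, with all remaining bits zeroed. A minimal
// sketch, assuming _mm_set_epi16 is provided elsewhere in this header:
//   __m128i v = _mm_set_epi16(9, 8, 7, 3, 7, 6, 5, 4);  // minimum 3 sits in lane 4
//   __m128i r = _mm_minpos_epu16(v);                     // lane 0 of r == 3, lane 1 == 4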
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testc_si128 FORCE_INLINE int _mm_testc_si128(__m128i a, __m128i b) { int64x2_t s64 = vandq_s64(vreinterpretq_s64_s32(vmvnq_s32(vreinterpretq_s32_m128i(a))), vreinterpretq_s64_m128i(b)); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } // Compute the bitwise AND of 128 bits (representing integer data) in a and b, // and set ZF to 1 if the result is zero, otherwise set ZF to 0. Compute the // bitwise NOT of a and then AND with b, and set CF to 1 if the result is zero, // otherwise set CF to 0. Return the ZF value. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_testz_si128 FORCE_INLINE int _mm_testz_si128(__m128i a, __m128i b) { int64x2_t s64 = vandq_s64(vreinterpretq_s64_m128i(a), vreinterpretq_s64_m128i(b)); return !(vgetq_lane_s64(s64, 0) | vgetq_lane_s64(s64, 1)); } // Extracts the selected signed or unsigned 8-bit integer from a and zero // extends. // FORCE_INLINE int _mm_extract_epi8(__m128i a, __constrange(0,16) int imm) #define _mm_extract_epi8(a, imm) vgetq_lane_u8(vreinterpretq_u8_m128i(a), (imm)) // Inserts the least significant 8 bits of b into the selected 8-bit integer // of a. // FORCE_INLINE __m128i _mm_insert_epi8(__m128i a, int b, // __constrange(0,16) int imm) #define _mm_insert_epi8(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s8( \ vsetq_lane_s8((b), vreinterpretq_s8_m128i(a), (imm))); \ }) // Extracts the selected signed or unsigned 16-bit integer from a and zero // extends. // https://msdn.microsoft.com/en-us/library/6dceta0c(v=vs.100).aspx // FORCE_INLINE int _mm_extract_epi16(__m128i a, __constrange(0,8) int imm) #define _mm_extract_epi16(a, imm) \ vgetq_lane_u16(vreinterpretq_u16_m128i(a), (imm)) // Inserts the least significant 16 bits of b into the selected 16-bit integer // of a. // https://msdn.microsoft.com/en-us/library/kaze8hz1%28v=vs.100%29.aspx // FORCE_INLINE __m128i _mm_insert_epi16(__m128i a, int b, // __constrange(0,8) int imm) #define _mm_insert_epi16(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s16( \ vsetq_lane_s16((b), vreinterpretq_s16_m128i(a), (imm))); \ }) // Copy a to dst, and insert the 16-bit integer i into dst at the location // specified by imm8. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_insert_pi16 #define _mm_insert_pi16(a, b, imm) \ __extension__({ \ vreinterpret_m64_s16( \ vset_lane_s16((b), vreinterpret_s16_m64(a), (imm))); \ }) // Extracts the selected signed or unsigned 32-bit integer from a and zero // extends. // FORCE_INLINE int _mm_extract_epi32(__m128i a, __constrange(0,4) int imm) #define _mm_extract_epi32(a, imm) \ vgetq_lane_s32(vreinterpretq_s32_m128i(a), (imm)) // Extracts the selected single-precision (32-bit) floating-point from a. // FORCE_INLINE int _mm_extract_ps(__m128 a, __constrange(0,4) int imm) #define _mm_extract_ps(a, imm) vgetq_lane_s32(vreinterpretq_s32_m128(a), (imm)) // Inserts the least significant 32 bits of b into the selected 32-bit integer // of a. // FORCE_INLINE __m128i _mm_insert_epi32(__m128i a, int b, // __constrange(0,4) int imm) #define _mm_insert_epi32(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s32( \ vsetq_lane_s32((b), vreinterpretq_s32_m128i(a), (imm))); \ }) // Extracts the selected signed or unsigned 64-bit integer from a and zero // extends. 
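// Illustrative note (not part of the original header): _mm_testz_si128 and
// _mm_testc_si128 above reduce to single flags, which makes them convenient
// set-style predicates on 128-bit masks, for example:
//   int any_common = !_mm_testz_si128(a, b);  // 1 if a & b has any bit set
//   int b_in_a     = _mm_testc_si128(a, b);   // 1 if every set bit of b is also set in a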
// FORCE_INLINE __int64 _mm_extract_epi64(__m128i a, __constrange(0,2) int imm) #define _mm_extract_epi64(a, imm) \ vgetq_lane_s64(vreinterpretq_s64_m128i(a), (imm)) // Inserts the least significant 64 bits of b into the selected 64-bit integer // of a. // FORCE_INLINE __m128i _mm_insert_epi64(__m128i a, __int64 b, // __constrange(0,2) int imm) #define _mm_insert_epi64(a, b, imm) \ __extension__({ \ vreinterpretq_m128i_s64( \ vsetq_lane_s64((b), vreinterpretq_s64_m128i(a), (imm))); \ }) // Count the number of bits set to 1 in unsigned 32-bit integer a, and // return that count in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u32 FORCE_INLINE int _mm_popcnt_u32(unsigned int a) { #if defined(__aarch64__) #if __has_builtin(__builtin_popcount) return __builtin_popcount(a); #else return (int) vaddlv_u8(vcnt_u8(vcreate_u8((uint64_t) a))); #endif #else uint32_t count = 0; uint8x8_t input_val, count8x8_val; uint16x4_t count16x4_val; uint32x2_t count32x2_val; input_val = vld1_u8((uint8_t *) &a); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); vst1_u32(&count, count32x2_val); return count; #endif } // Count the number of bits set to 1 in unsigned 64-bit integer a, and // return that count in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_popcnt_u64 FORCE_INLINE int64_t _mm_popcnt_u64(uint64_t a) { #if defined(__aarch64__) #if __has_builtin(__builtin_popcountll) return __builtin_popcountll(a); #else return (int64_t) vaddlv_u8(vcnt_u8(vcreate_u8(a))); #endif #else uint64_t count = 0; uint8x8_t input_val, count8x8_val; uint16x4_t count16x4_val; uint32x2_t count32x2_val; uint64x1_t count64x1_val; input_val = vld1_u8((uint8_t *) &a); count8x8_val = vcnt_u8(input_val); count16x4_val = vpaddl_u8(count8x8_val); count32x2_val = vpaddl_u16(count16x4_val); count64x1_val = vpaddl_u32(count32x2_val); vst1_u64(&count, count64x1_val); return count; #endif } // Macro: Transpose the 4x4 matrix formed by the 4 rows of single-precision // (32-bit) floating-point elements in row0, row1, row2, and row3, and store the // transposed matrix in these vectors (row0 now contains column 0, etc.). // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=MM_TRANSPOSE4_PS #define _MM_TRANSPOSE4_PS(row0, row1, row2, row3) \ do { \ float32x4x2_t ROW01 = vtrnq_f32(row0, row1); \ float32x4x2_t ROW23 = vtrnq_f32(row2, row3); \ row0 = vcombine_f32(vget_low_f32(ROW01.val[0]), \ vget_low_f32(ROW23.val[0])); \ row1 = vcombine_f32(vget_low_f32(ROW01.val[1]), \ vget_low_f32(ROW23.val[1])); \ row2 = vcombine_f32(vget_high_f32(ROW01.val[0]), \ vget_high_f32(ROW23.val[0])); \ row3 = vcombine_f32(vget_high_f32(ROW01.val[1]), \ vget_high_f32(ROW23.val[1])); \ } while (0) /* Crypto Extensions */ #if defined(__ARM_FEATURE_CRYPTO) // Wraps vmull_p64 FORCE_INLINE uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly64_t a = vget_lane_p64(vreinterpret_p64_u64(_a), 0); poly64_t b = vget_lane_p64(vreinterpret_p64_u64(_b), 0); return vreinterpretq_u64_p128(vmull_p64(a, b)); } #else // ARMv7 polyfill // ARMv7/some A64 lacks vmull_p64, but it has vmull_p8. // // vmull_p8 calculates 8 8-bit->16-bit polynomial multiplies, but we need a // 64-bit->128-bit polynomial multiply. // // It needs some work and is somewhat slow, but it is still faster than all // known scalar methods. 
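// Illustrative note (not part of the original header): "polynomial" or
// carry-less multiplication XORs partial products instead of adding them, so,
// for example, 0b11 * 0b11 = 0b101 ((x+1)^2 = x^2 + 1 over GF(2)); this is what
// _mm_clmulepi64_si128 below computes on 64-bit operands, and what the ARMv7
// path has to assemble from 8-bit vmull_p8 products.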
// // Algorithm adapted to C from // https://www.workofard.com/2017/07/ghash-for-low-end-cores/, which is adapted // from "Fast Software Polynomial Multiplication on ARM Processors Using the // NEON Engine" by Danilo Camara, Conrado Gouvea, Julio Lopez and Ricardo Dahab // (https://hal.inria.fr/hal-01506572) static uint64x2_t _sse2neon_vmull_p64(uint64x1_t _a, uint64x1_t _b) { poly8x8_t a = vreinterpret_p8_u64(_a); poly8x8_t b = vreinterpret_p8_u64(_b); // Masks uint8x16_t k48_32 = vcombine_u8(vcreate_u8(0x0000ffffffffffff), vcreate_u8(0x00000000ffffffff)); uint8x16_t k16_00 = vcombine_u8(vcreate_u8(0x000000000000ffff), vcreate_u8(0x0000000000000000)); // Do the multiplies, rotating with vext to get all combinations uint8x16_t d = vreinterpretq_u8_p16(vmull_p8(a, b)); // D = A0 * B0 uint8x16_t e = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 1))); // E = A0 * B1 uint8x16_t f = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 1), b)); // F = A1 * B0 uint8x16_t g = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 2))); // G = A0 * B2 uint8x16_t h = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 2), b)); // H = A2 * B0 uint8x16_t i = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 3))); // I = A0 * B3 uint8x16_t j = vreinterpretq_u8_p16(vmull_p8(vext_p8(a, a, 3), b)); // J = A3 * B0 uint8x16_t k = vreinterpretq_u8_p16(vmull_p8(a, vext_p8(b, b, 4))); // L = A0 * B4 // Add cross products uint8x16_t l = veorq_u8(e, f); // L = E + F uint8x16_t m = veorq_u8(g, h); // M = G + H uint8x16_t n = veorq_u8(i, j); // N = I + J // Interleave. Using vzip1 and vzip2 prevents Clang from emitting TBL // instructions. #if defined(__aarch64__) uint8x16_t lm_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t lm_p1 = vreinterpretq_u8_u64( vzip2q_u64(vreinterpretq_u64_u8(l), vreinterpretq_u64_u8(m))); uint8x16_t nk_p0 = vreinterpretq_u8_u64( vzip1q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); uint8x16_t nk_p1 = vreinterpretq_u8_u64( vzip2q_u64(vreinterpretq_u64_u8(n), vreinterpretq_u64_u8(k))); #else uint8x16_t lm_p0 = vcombine_u8(vget_low_u8(l), vget_low_u8(m)); uint8x16_t lm_p1 = vcombine_u8(vget_high_u8(l), vget_high_u8(m)); uint8x16_t nk_p0 = vcombine_u8(vget_low_u8(n), vget_low_u8(k)); uint8x16_t nk_p1 = vcombine_u8(vget_high_u8(n), vget_high_u8(k)); #endif // t0 = (L) (P0 + P1) << 8 // t1 = (M) (P2 + P3) << 16 uint8x16_t t0t1_tmp = veorq_u8(lm_p0, lm_p1); uint8x16_t t0t1_h = vandq_u8(lm_p1, k48_32); uint8x16_t t0t1_l = veorq_u8(t0t1_tmp, t0t1_h); // t2 = (N) (P4 + P5) << 24 // t3 = (K) (P6 + P7) << 32 uint8x16_t t2t3_tmp = veorq_u8(nk_p0, nk_p1); uint8x16_t t2t3_h = vandq_u8(nk_p1, k16_00); uint8x16_t t2t3_l = veorq_u8(t2t3_tmp, t2t3_h); // De-interleave #if defined(__aarch64__) uint8x16_t t0 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t1 = vreinterpretq_u8_u64( vuzp2q_u64(vreinterpretq_u64_u8(t0t1_l), vreinterpretq_u64_u8(t0t1_h))); uint8x16_t t2 = vreinterpretq_u8_u64( vuzp1q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); uint8x16_t t3 = vreinterpretq_u8_u64( vuzp2q_u64(vreinterpretq_u64_u8(t2t3_l), vreinterpretq_u64_u8(t2t3_h))); #else uint8x16_t t1 = vcombine_u8(vget_high_u8(t0t1_l), vget_high_u8(t0t1_h)); uint8x16_t t0 = vcombine_u8(vget_low_u8(t0t1_l), vget_low_u8(t0t1_h)); uint8x16_t t3 = vcombine_u8(vget_high_u8(t2t3_l), vget_high_u8(t2t3_h)); uint8x16_t t2 = vcombine_u8(vget_low_u8(t2t3_l), vget_low_u8(t2t3_h)); #endif // Shift the cross products uint8x16_t t0_shift 
= vextq_u8(t0, t0, 15); // t0 << 8 uint8x16_t t1_shift = vextq_u8(t1, t1, 14); // t1 << 16 uint8x16_t t2_shift = vextq_u8(t2, t2, 13); // t2 << 24 uint8x16_t t3_shift = vextq_u8(t3, t3, 12); // t3 << 32 // Accumulate the products uint8x16_t cross1 = veorq_u8(t0_shift, t1_shift); uint8x16_t cross2 = veorq_u8(t2_shift, t3_shift); uint8x16_t mix = veorq_u8(d, cross1); uint8x16_t r = veorq_u8(mix, cross2); return vreinterpretq_u64_u8(r); } #endif // ARMv7 polyfill // Perform a carry-less multiplication of two 64-bit integers, selected from a // and b according to imm8, and store the results in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_clmulepi64_si128 FORCE_INLINE __m128i _mm_clmulepi64_si128(__m128i _a, __m128i _b, const int imm) { uint64x2_t a = vreinterpretq_u64_m128i(_a); uint64x2_t b = vreinterpretq_u64_m128i(_b); switch (imm & 0x11) { case 0x00: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_low_u64(a), vget_low_u64(b))); case 0x01: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_high_u64(a), vget_low_u64(b))); case 0x10: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_low_u64(a), vget_high_u64(b))); case 0x11: return vreinterpretq_m128i_u64( _sse2neon_vmull_p64(vget_high_u64(a), vget_high_u64(b))); default: abort(); } } #if !defined(__ARM_FEATURE_CRYPTO) /* clang-format off */ #define SSE2NEON_AES_DATA(w) \ { \ w(0x63), w(0x7c), w(0x77), w(0x7b), w(0xf2), w(0x6b), w(0x6f), \ w(0xc5), w(0x30), w(0x01), w(0x67), w(0x2b), w(0xfe), w(0xd7), \ w(0xab), w(0x76), w(0xca), w(0x82), w(0xc9), w(0x7d), w(0xfa), \ w(0x59), w(0x47), w(0xf0), w(0xad), w(0xd4), w(0xa2), w(0xaf), \ w(0x9c), w(0xa4), w(0x72), w(0xc0), w(0xb7), w(0xfd), w(0x93), \ w(0x26), w(0x36), w(0x3f), w(0xf7), w(0xcc), w(0x34), w(0xa5), \ w(0xe5), w(0xf1), w(0x71), w(0xd8), w(0x31), w(0x15), w(0x04), \ w(0xc7), w(0x23), w(0xc3), w(0x18), w(0x96), w(0x05), w(0x9a), \ w(0x07), w(0x12), w(0x80), w(0xe2), w(0xeb), w(0x27), w(0xb2), \ w(0x75), w(0x09), w(0x83), w(0x2c), w(0x1a), w(0x1b), w(0x6e), \ w(0x5a), w(0xa0), w(0x52), w(0x3b), w(0xd6), w(0xb3), w(0x29), \ w(0xe3), w(0x2f), w(0x84), w(0x53), w(0xd1), w(0x00), w(0xed), \ w(0x20), w(0xfc), w(0xb1), w(0x5b), w(0x6a), w(0xcb), w(0xbe), \ w(0x39), w(0x4a), w(0x4c), w(0x58), w(0xcf), w(0xd0), w(0xef), \ w(0xaa), w(0xfb), w(0x43), w(0x4d), w(0x33), w(0x85), w(0x45), \ w(0xf9), w(0x02), w(0x7f), w(0x50), w(0x3c), w(0x9f), w(0xa8), \ w(0x51), w(0xa3), w(0x40), w(0x8f), w(0x92), w(0x9d), w(0x38), \ w(0xf5), w(0xbc), w(0xb6), w(0xda), w(0x21), w(0x10), w(0xff), \ w(0xf3), w(0xd2), w(0xcd), w(0x0c), w(0x13), w(0xec), w(0x5f), \ w(0x97), w(0x44), w(0x17), w(0xc4), w(0xa7), w(0x7e), w(0x3d), \ w(0x64), w(0x5d), w(0x19), w(0x73), w(0x60), w(0x81), w(0x4f), \ w(0xdc), w(0x22), w(0x2a), w(0x90), w(0x88), w(0x46), w(0xee), \ w(0xb8), w(0x14), w(0xde), w(0x5e), w(0x0b), w(0xdb), w(0xe0), \ w(0x32), w(0x3a), w(0x0a), w(0x49), w(0x06), w(0x24), w(0x5c), \ w(0xc2), w(0xd3), w(0xac), w(0x62), w(0x91), w(0x95), w(0xe4), \ w(0x79), w(0xe7), w(0xc8), w(0x37), w(0x6d), w(0x8d), w(0xd5), \ w(0x4e), w(0xa9), w(0x6c), w(0x56), w(0xf4), w(0xea), w(0x65), \ w(0x7a), w(0xae), w(0x08), w(0xba), w(0x78), w(0x25), w(0x2e), \ w(0x1c), w(0xa6), w(0xb4), w(0xc6), w(0xe8), w(0xdd), w(0x74), \ w(0x1f), w(0x4b), w(0xbd), w(0x8b), w(0x8a), w(0x70), w(0x3e), \ w(0xb5), w(0x66), w(0x48), w(0x03), w(0xf6), w(0x0e), w(0x61), \ w(0x35), w(0x57), w(0xb9), w(0x86), w(0xc1), w(0x1d), w(0x9e), \ w(0xe1), w(0xf8), w(0x98), w(0x11), w(0x69), w(0xd9), w(0x8e), \ w(0x94), 
w(0x9b), w(0x1e), w(0x87), w(0xe9), w(0xce), w(0x55), \ w(0x28), w(0xdf), w(0x8c), w(0xa1), w(0x89), w(0x0d), w(0xbf), \ w(0xe6), w(0x42), w(0x68), w(0x41), w(0x99), w(0x2d), w(0x0f), \ w(0xb0), w(0x54), w(0xbb), w(0x16) \ } /* clang-format on */ /* X Macro trick. See https://en.wikipedia.org/wiki/X_Macro */ #define SSE2NEON_AES_H0(x) (x) static const uint8_t SSE2NEON_sbox[256] = SSE2NEON_AES_DATA(SSE2NEON_AES_H0); #undef SSE2NEON_AES_H0 // In the absence of crypto extensions, implement aesenc using regular neon // intrinsics instead. See: // https://www.workofard.com/2017/01/accelerated-aes-for-the-arm64-linux-kernel/ // https://www.workofard.com/2017/07/ghash-for-low-end-cores/ and // https://github.com/ColinIanKing/linux-next-mirror/blob/b5f466091e130caaf0735976648f72bd5e09aa84/crypto/aegis128-neon-inner.c#L52 // for more information Reproduced with permission of the author. FORCE_INLINE __m128i _mm_aesenc_si128(__m128i EncBlock, __m128i RoundKey) { #if defined(__aarch64__) static const uint8_t shift_rows[] = {0x0, 0x5, 0xa, 0xf, 0x4, 0x9, 0xe, 0x3, 0x8, 0xd, 0x2, 0x7, 0xc, 0x1, 0x6, 0xb}; static const uint8_t ror32by8[] = {0x1, 0x2, 0x3, 0x0, 0x5, 0x6, 0x7, 0x4, 0x9, 0xa, 0xb, 0x8, 0xd, 0xe, 0xf, 0xc}; uint8x16_t v; uint8x16_t w = vreinterpretq_u8_m128i(EncBlock); // shift rows w = vqtbl1q_u8(w, vld1q_u8(shift_rows)); // sub bytes v = vqtbl4q_u8(_sse2neon_vld1q_u8_x4(SSE2NEON_sbox), w); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x40), w - 0x40); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0x80), w - 0x80); v = vqtbx4q_u8(v, _sse2neon_vld1q_u8_x4(SSE2NEON_sbox + 0xc0), w - 0xc0); // mix columns w = (v << 1) ^ (uint8x16_t)(((int8x16_t) v >> 7) & 0x1b); w ^= (uint8x16_t) vrev32q_u16((uint16x8_t) v); w ^= vqtbl1q_u8(v ^ w, vld1q_u8(ror32by8)); // add round key return vreinterpretq_m128i_u8(w) ^ RoundKey; #else /* ARMv7-A NEON implementation */ #define SSE2NEON_AES_B2W(b0, b1, b2, b3) \ (((uint32_t)(b3) << 24) | ((uint32_t)(b2) << 16) | ((uint32_t)(b1) << 8) | \ (b0)) #define SSE2NEON_AES_F2(x) ((x << 1) ^ (((x >> 7) & 1) * 0x011b /* WPOLY */)) #define SSE2NEON_AES_F3(x) (SSE2NEON_AES_F2(x) ^ x) #define SSE2NEON_AES_U0(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F2(p), p, p, SSE2NEON_AES_F3(p)) #define SSE2NEON_AES_U1(p) \ SSE2NEON_AES_B2W(SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p, p) #define SSE2NEON_AES_U2(p) \ SSE2NEON_AES_B2W(p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p), p) #define SSE2NEON_AES_U3(p) \ SSE2NEON_AES_B2W(p, p, SSE2NEON_AES_F3(p), SSE2NEON_AES_F2(p)) static const uint32_t ALIGN_STRUCT(16) aes_table[4][256] = { SSE2NEON_AES_DATA(SSE2NEON_AES_U0), SSE2NEON_AES_DATA(SSE2NEON_AES_U1), SSE2NEON_AES_DATA(SSE2NEON_AES_U2), SSE2NEON_AES_DATA(SSE2NEON_AES_U3), }; #undef SSE2NEON_AES_B2W #undef SSE2NEON_AES_F2 #undef SSE2NEON_AES_F3 #undef SSE2NEON_AES_U0 #undef SSE2NEON_AES_U1 #undef SSE2NEON_AES_U2 #undef SSE2NEON_AES_U3 uint32_t x0 = _mm_cvtsi128_si32(EncBlock); uint32_t x1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0x55)); uint32_t x2 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xAA)); uint32_t x3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(EncBlock, 0xFF)); __m128i out = _mm_set_epi32( (aes_table[0][x3 & 0xff] ^ aes_table[1][(x0 >> 8) & 0xff] ^ aes_table[2][(x1 >> 16) & 0xff] ^ aes_table[3][x2 >> 24]), (aes_table[0][x2 & 0xff] ^ aes_table[1][(x3 >> 8) & 0xff] ^ aes_table[2][(x0 >> 16) & 0xff] ^ aes_table[3][x1 >> 24]), (aes_table[0][x1 & 0xff] ^ aes_table[1][(x2 >> 8) & 0xff] ^ aes_table[2][(x3 >> 16) & 0xff] ^ aes_table[3][x0 >> 24]), (aes_table[0][x0 & 0xff] 
^ aes_table[1][(x1 >> 8) & 0xff] ^ aes_table[2][(x2 >> 16) & 0xff] ^ aes_table[3][x3 >> 24])); return _mm_xor_si128(out, RoundKey); #endif } // Perform the last round of an AES encryption flow on data (state) in a using // the round key in RoundKey, and store the result in dst. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { /* FIXME: optimized for NEON */ uint8_t v[4][4] = { [0] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 0)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 5)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 10)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 15)]}, [1] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 4)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 9)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 14)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 3)]}, [2] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 8)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 13)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 2)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 7)]}, [3] = {SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 12)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 1)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 6)], SSE2NEON_sbox[vreinterpretq_nth_u8_m128i(a, 11)]}, }; for (int i = 0; i < 16; i++) vreinterpretq_nth_u8_m128i(a, i) = v[i / 4][i % 4] ^ vreinterpretq_nth_u8_m128i(RoundKey, i); return a; } // Emits the Advanced Encryption Standard (AES) instruction aeskeygenassist. // This instruction generates a round key for AES encryption. See // https://kazakov.life/2017/11/01/cryptocurrency-mining-on-ios-devices/ // for details. // // https://msdn.microsoft.com/en-us/library/cc714138(v=vs.120).aspx FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i key, const int rcon) { uint32_t X1 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0x55)); uint32_t X3 = _mm_cvtsi128_si32(_mm_shuffle_epi32(key, 0xFF)); for (int i = 0; i < 4; ++i) { ((uint8_t *) &X1)[i] = SSE2NEON_sbox[((uint8_t *) &X1)[i]]; ((uint8_t *) &X3)[i] = SSE2NEON_sbox[((uint8_t *) &X3)[i]]; } return _mm_set_epi32(((X3 >> 8) | (X3 << 24)) ^ rcon, X3, ((X1 >> 8) | (X1 << 24)) ^ rcon, X1); } #undef SSE2NEON_AES_DATA #else /* __ARM_FEATURE_CRYPTO */ // Implements equivalent of 'aesenc' by combining AESE (with an empty key) and // AESMC and then manually applying the real key as an xor operation. This // unfortunately means an additional xor op; the compiler should be able to // optimize this away for repeated calls however. See // https://blog.michaelbrase.com/2018/05/08/emulating-x86-aes-intrinsics-on-armv8-a // for more details. 
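// Illustrative sketch (not from the original source): the emulation described
// above relies on the identity
//   _mm_aesenc_si128(state, key) == AESMC(AESE(state, 0)) ^ key,
// so a caller holding a hypothetical 11-entry AES-128 key schedule roundKey[]
// would typically drive it as:
//   __m128i s = _mm_xor_si128(block, roundKey[0]);   // initial key whitening
//   for (int r = 1; r < 10; r++)                     // nine full rounds
//     s = _mm_aesenc_si128(s, roundKey[r]);
//   s = _mm_aesenclast_si128(s, roundKey[10]);       // last round, no MixColumns
// 'block' and 'roundKey[]' are assumed caller-provided inputs.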
FORCE_INLINE __m128i _mm_aesenc_si128(__m128i a, __m128i b) { return vreinterpretq_m128i_u8( vaesmcq_u8(vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0))) ^ vreinterpretq_u8_m128i(b)); } // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_aesenclast_si128 FORCE_INLINE __m128i _mm_aesenclast_si128(__m128i a, __m128i RoundKey) { return _mm_xor_si128(vreinterpretq_m128i_u8(vaeseq_u8( vreinterpretq_u8_m128i(a), vdupq_n_u8(0))), RoundKey); } FORCE_INLINE __m128i _mm_aeskeygenassist_si128(__m128i a, const int rcon) { // AESE does ShiftRows and SubBytes on A uint8x16_t u8 = vaeseq_u8(vreinterpretq_u8_m128i(a), vdupq_n_u8(0)); uint8x16_t dest = { // Undo ShiftRows step from AESE and extract X1 and X3 u8[0x4], u8[0x1], u8[0xE], u8[0xB], // SubBytes(X1) u8[0x1], u8[0xE], u8[0xB], u8[0x4], // ROT(SubBytes(X1)) u8[0xC], u8[0x9], u8[0x6], u8[0x3], // SubBytes(X3) u8[0x9], u8[0x6], u8[0x3], u8[0xC], // ROT(SubBytes(X3)) }; uint32x4_t r = {0, (unsigned) rcon, 0, (unsigned) rcon}; return vreinterpretq_m128i_u8(dest) ^ vreinterpretq_m128i_u32(r); } #endif /* Streaming Extensions */ // Guarantees that every preceding store is globally visible before any // subsequent store. // https://msdn.microsoft.com/en-us/library/5h2w73d1%28v=vs.90%29.aspx FORCE_INLINE void _mm_sfence(void) { __sync_synchronize(); } // Store 128-bits (composed of 4 packed single-precision (32-bit) floating- // point elements) from a into memory using a non-temporal memory hint. // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_ps FORCE_INLINE void _mm_stream_ps(float *p, __m128 a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, (float32x4_t *) p); #else vst1q_f32(p, vreinterpretq_f32_m128(a)); #endif } // Stores the data in a to the address p without polluting the caches. If the // cache line containing address p is already in the cache, the cache will be // updated. // https://msdn.microsoft.com/en-us/library/ba08y07y%28v=vs.90%29.aspx FORCE_INLINE void _mm_stream_si128(__m128i *p, __m128i a) { #if __has_builtin(__builtin_nontemporal_store) __builtin_nontemporal_store(a, p); #else vst1q_s64((int64_t *) p, vreinterpretq_s64_m128i(a)); #endif } // Load 128-bits of integer data from memory into dst using a non-temporal // memory hint. mem_addr must be aligned on a 16-byte boundary or a // general-protection exception may be generated. // // dst[127:0] := MEM[mem_addr+127:mem_addr] // // https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_stream_load_si128 FORCE_INLINE __m128i _mm_stream_load_si128(__m128i *p) { #if __has_builtin(__builtin_nontemporal_store) return __builtin_nontemporal_load(p); #else return vreinterpretq_m128i_s64(vld1q_s64((int64_t *) p)); #endif } // Cache line containing p is flushed and invalidated from all caches in the // coherency domain. : // https://msdn.microsoft.com/en-us/library/ba08y07y(v=vs.100).aspx FORCE_INLINE void _mm_clflush(void const *p) { (void) p; // no corollary for Neon? } // Allocate aligned blocks of memory. // https://software.intel.com/en-us/ // cpp-compiler-developer-guide-and-reference-allocating-and-freeing-aligned-memory-blocks FORCE_INLINE void *_mm_malloc(size_t size, size_t align) { void *ptr; if (align == 1) return malloc(size); if (align == 2 || (sizeof(void *) == 8 && align == 4)) align = sizeof(void *); if (!posix_memalign(&ptr, align, size)) return ptr; return NULL; } // Free aligned memory that was allocated with _mm_malloc. 
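// Minimal pairing sketch (illustrative only, not part of the original source):
// every _mm_malloc must be released with _mm_free, e.g.
//   float *buf = (float *) _mm_malloc(256 * sizeof(float), 64);
//   if (buf) {
//     /* ... work with the 64-byte-aligned buffer ... */
//     _mm_free(buf);
//   }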
// https://software.intel.com/sites/landingpage/IntrinsicsGuide/#text=_mm_free FORCE_INLINE void _mm_free(void *addr) { free(addr); } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 8-bit integer v. // https://msdn.microsoft.com/en-us/library/bb514036(v=vs.100) FORCE_INLINE uint32_t _mm_crc32_u8(uint32_t crc, uint8_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cb %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc ^= v; for (int bit = 0; bit < 8; bit++) { if (crc & 1) crc = (crc >> 1) ^ UINT32_C(0x82f63b78); else crc = (crc >> 1); } #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 16-bit integer v. // https://msdn.microsoft.com/en-us/library/bb531411(v=vs.100) FORCE_INLINE uint32_t _mm_crc32_u16(uint32_t crc, uint16_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32ch %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc = _mm_crc32_u8(crc, v & 0xff); crc = _mm_crc32_u8(crc, (v >> 8) & 0xff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 32-bit integer v. // https://msdn.microsoft.com/en-us/library/bb531394(v=vs.100) FORCE_INLINE uint32_t _mm_crc32_u32(uint32_t crc, uint32_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cw %w[c], %w[c], %w[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc = _mm_crc32_u16(crc, v & 0xffff); crc = _mm_crc32_u16(crc, (v >> 16) & 0xffff); #endif return crc; } // Starting with the initial value in crc, accumulates a CRC32 value for // unsigned 64-bit integer v. // https://msdn.microsoft.com/en-us/library/bb514033(v=vs.100) FORCE_INLINE uint64_t _mm_crc32_u64(uint64_t crc, uint64_t v) { #if defined(__aarch64__) && defined(__ARM_FEATURE_CRC32) __asm__ __volatile__("crc32cx %w[c], %w[c], %x[v]\n\t" : [c] "+r"(crc) : [v] "r"(v)); #else crc = _mm_crc32_u32((uint32_t)(crc), v & 0xffffffff); crc = _mm_crc32_u32((uint32_t)(crc), (v >> 32) & 0xffffffff); #endif return crc; } #if defined(__GNUC__) || defined(__clang__) #pragma pop_macro("ALIGN_STRUCT") #pragma pop_macro("FORCE_INLINE") #endif #if defined(__GNUC__) && !defined(__clang__) #pragma GCC pop_options #endif #endif ospray-rkcommon-538f8a2/rkcommon/math/box.h000066400000000000000000000076361456117377200207640ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "range.h" #include "vec.h" namespace rkcommon { namespace math { // box declaration //////////////////////////////////////////////////////// template using box_t = range_t>; // box free functions ///////////////////////////////////////////////////// template inline scalar_t area(const box_t &b) { return b.size().product(); } template inline scalar_t area(const box_t &b) { const auto size = b.size(); return 2.f * (size.x * size.y + size.x * size.z + size.y * size.z); } /*! return the volume of the 3D box - undefined for empty boxes */ template inline scalar_t volume(const box_t &b) { return b.size().product(); } /*! computes whether two boxes are either touching OR overlapping; ie, the case where boxes just barely touch side-by side (even if they do not have any actual overlapping _volume_!) 
then this is still true */ template inline bool touchingOrOverlapping(const box_t &a, const box_t &b) { if (a.lower.x > b.upper.x) return false; if (a.lower.y > b.upper.y) return false; if (a.lower.z > b.upper.z) return false; if (b.lower.x > a.upper.x) return false; if (b.lower.y > a.upper.y) return false; if (b.lower.z > a.upper.z) return false; return true; } template inline bool touchingOrOverlapping(const box_t &a, const box_t &b) { if (a.lower.x > b.upper.x) return false; if (a.lower.y > b.upper.y) return false; if (b.lower.x > a.upper.x) return false; if (b.lower.y > a.upper.y) return false; return true; } /*! compute the intersection of two boxes */ template inline box_t intersectionOf(const box_t &a, const box_t &b) { return box_t(max(a.lower, b.lower), min(a.upper, b.upper)); } template inline bool disjoint(const box_t &a, const box_t &b) { return anyLessThan(a.upper, b.lower) || anyLessThan(b.upper, a.lower); } /*! returns the center of the box (not valid for empty boxes) */ template inline vec_t center(const box_t &b) { return b.center(); } template inline range_t intersectRayBox( const vec_t &org, const vec_t &dir, const box_t &box, const range_t &tRange = range_t(0, inf)) { const auto mins = (box.lower - org) * rcp_safe(dir); const auto maxs = (box.upper - org) * rcp_safe(dir); return range_t( reduce_max(vec_t(min(mins, maxs), tRange.lower)), reduce_min(vec_t(max(mins, maxs), tRange.upper))); } using box1i = range_t; using box2i = box_t; using box3i = box_t; using box4i = box_t; using box1f = range_t; using box2f = box_t; using box3f = box_t; using box4f = box_t; using box3fa = box_t; // this is just a renaming - in some cases the code reads cleaner if // we're talking about 'regions' than about boxes using region2i = box2i; } // namespace math } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/math/box.ih000066400000000000000000000403171456117377200211260ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "vec.ih" #ifndef ISPC namespace ispc { #endif // a 1-d int bounding box (ie, a range struct box1i { int32 lower; int32 upper; #ifndef ISPC box1i() = default; box1i(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box1i(const int32 v) : lower(v), upper(v) {} box1i(const int32 l, const int32 u) : lower(l), upper(u) {} #endif }; // a 1-d float bounding box (ie, a range struct box1f { float lower; float upper; #ifndef ISPC box1f() = default; box1f(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box1f(const float v) : lower(v), upper(v) {} box1f(const float l, const float u) : lower(l), upper(u) {} #endif }; // a 2-d float bounding box struct box2f { vec2f lower; vec2f upper; #ifndef ISPC box2f() = default; box2f(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box2f(const float v) : lower(v), upper(v) {} box2f(const float l, const float u) : lower(l), upper(u) {} #endif }; // a 2-d integer bounding box struct box2i { vec2i lower; vec2i upper; #ifndef ISPC box2i() = default; box2i(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box2i(const int v) : lower(v), upper(v) {} box2i(const int l, const int u) : lower(l), upper(u) {} #endif }; // a 3-d float bounding box struct box3f { vec3f lower; vec3f upper; #ifndef ISPC box3f() = default; box3f(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box3f(const vec3f v) : lower(v), upper(v) {} box3f(const vec3f l, const vec3f u) : lower(l), upper(u) {} #endif }; // a 3-d int bounding box struct box3i { vec3i lower; vec3i upper; #ifndef ISPC 
box3i() = default; box3i(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box3i(const int v) : lower(v), upper(v) {} box3i(const int l, const int u) : lower(l), upper(u) {} #endif }; // a 3-d float bounding box with aligned vec3f coordinates struct box3fa { vec3f lower; int32 align0; vec3f upper; int32 align1; #ifndef ISPC box3fa() = default; box3fa(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box3fa(const vec3f v) : lower(v), upper(v) {} box3fa(const vec3f l, const vec3f u) : lower(l), upper(u) {} #endif }; // a 4-d int bounding box struct box4i { vec4i lower; vec4i upper; #ifndef ISPC box4i() = default; box4i(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box4i(const vec4i v) : lower(v), upper(v) {} box4i(const vec4i l, const vec4i u) : lower(l), upper(u) {} #endif }; // a 4-d float bounding box struct box4f { vec4f lower; vec4f upper; #ifndef ISPC box4f() = default; box4f(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} box4f(const vec4f v) : lower(v), upper(v) {} box4f(const vec4f l, const vec4f u) : lower(l), upper(u) {} #endif }; // this is just a renaming - in some cases the code reads cleaner if we're // talking about 'regions' than about boxes typedef box1f range1f; typedef box2i range2i; // ------------------------------------------------------- // all box1f operations // ------------------------------------------------------- #define MAKE_BOX1F_uv(univary) \ inline univary box1f make_box1f(const univary float f) \ { \ univary box1f bb; \ bb.lower = bb.upper = f; \ return bb; \ } \ \ inline univary box1f make_box1f( \ const univary float lo, const univary float hi) \ { \ univary box1f bb; \ bb.lower = lo; \ bb.upper = hi; \ return bb; \ } \ \ inline univary float box_size(const univary box1f &bb) \ { \ return bb.upper - bb.lower; \ } \ \ inline univary box1f box_extend( \ const univary box1f &a, const univary box1f &b) \ { \ return make_box1f(min(a.lower, b.lower), max(a.upper, b.upper)); \ } \ \ inline univary bool isEmpty(const univary box1f &bb) \ { \ return bb.upper < bb.lower; \ } #ifdef ISPC MAKE_BOX1F_uv(uniform); MAKE_BOX1F_uv(varying); #else MAKE_BOX1F_uv(); #endif #undef MAKE_BOX1F_uv // ------------------------------------------------------- // box2 'constructors' // ------------------------------------------------------- #define MAKE_BOX_CONSTRUCTORS_uv_2T_fromVec2(univary, Tabb, otherT) \ inline univary box2##Tabb make_box2##Tabb( \ const univary vec2##otherT lower, const univary vec2##otherT upper) \ { \ univary box2##Tabb bb; \ bb.lower.x = lower.x; \ bb.lower.y = lower.y; \ bb.upper.x = upper.x; \ bb.upper.y = upper.y; \ return bb; \ } #define MAKE_BOX_CONSTRUCTORS_uv_2T_fromBox2(univary, Tabb, otherT) \ inline univary box2##Tabb make_box2##Tabb(const univary box2##otherT other) \ { \ univary box2##Tabb bb; \ bb.lower.x = other.lower.x; \ bb.lower.y = other.lower.y; \ bb.upper.x = other.upper.x; \ bb.upper.y = other.upper.y; \ return bb; \ } #define MAKE_BOX_CONSTRUCTORS_uv_2T_empty(univary, Tabb) \ inline univary box2##Tabb make_box2##Tabb##_empty() \ { \ return make_box2##Tabb(make_vec2##Tabb(inf), make_vec2##Tabb(neg_inf)); \ } #define MAKE_BOX_CONSTRUCTORS_uv_2T(univary, Tabb) \ MAKE_BOX_CONSTRUCTORS_uv_2T_fromVec2(univary, Tabb, f); \ MAKE_BOX_CONSTRUCTORS_uv_2T_fromVec2(univary, Tabb, i); \ MAKE_BOX_CONSTRUCTORS_uv_2T_fromBox2(univary, Tabb, f); \ MAKE_BOX_CONSTRUCTORS_uv_2T_fromBox2(univary, Tabb, i) #define MAKE_BOX_CONSTRUCTORS_uv_2(univary) \ MAKE_BOX_CONSTRUCTORS_uv_2T(univary, i); \ MAKE_BOX_CONSTRUCTORS_uv_2T(univary, f) #ifdef 
ISPC MAKE_BOX_CONSTRUCTORS_uv_2(uniform); MAKE_BOX_CONSTRUCTORS_uv_2(varying); MAKE_BOX_CONSTRUCTORS_uv_2T_empty(uniform, f); #else MAKE_BOX_CONSTRUCTORS_uv_2(); MAKE_BOX_CONSTRUCTORS_uv_2T_empty(, f); #endif #undef MAKE_BOX_CONSTRUCTORS_uv_2T_fromVec2 #undef MAKE_BOX_CONSTRUCTORS_uv_2T_fromBox2 #undef MAKE_BOX_CONSTRUCTORS_uv_2T_empty #undef MAKE_BOX_CONSTRUCTORS_uv_2T #undef MAKE_BOX_CONSTRUCTORS_uv_2 // ------------------------------------------------------- // box3 'constructors' // ------------------------------------------------------- #define MAKE_BOX_CONSTRUCTORS_uv_3T_fromVec3(univary, Tabb, otherT) \ inline univary box3##Tabb make_box3##Tabb( \ const univary vec3##otherT lower, const univary vec3##otherT upper) \ { \ univary box3##Tabb bb; \ bb.lower.x = lower.x; \ bb.lower.y = lower.y; \ bb.lower.z = lower.z; \ bb.upper.x = upper.x; \ bb.upper.y = upper.y; \ bb.upper.z = upper.z; \ return bb; \ } #define MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3(univary, Tabb, otherT) \ inline univary box3##Tabb make_box3##Tabb(const univary box3##otherT other) \ { \ univary box3##Tabb bb; \ bb.lower.x = other.lower.x; \ bb.lower.y = other.lower.y; \ bb.lower.z = other.lower.z; \ bb.upper.x = other.upper.x; \ bb.upper.y = other.upper.y; \ bb.upper.z = other.upper.z; \ return bb; \ } #define MAKE_BOX_CONSTRUCTORS_uv_3T_empty(univary, Tabb) \ inline univary box3##Tabb make_box3##Tabb##_empty() \ { \ return make_box3##Tabb(make_vec3f(inf), make_vec3f(neg_inf)); \ } #define MAKE_BOX_CONSTRUCTORS_uv_3T(univary, Tabb) \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromVec3(univary, Tabb, f); \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromVec3(univary, Tabb, i); \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3(univary, Tabb, f); \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3(univary, Tabb, fa); \ MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3(univary, Tabb, i) #define MAKE_BOX_CONSTRUCTORS_uv_3(univary) \ MAKE_BOX_CONSTRUCTORS_uv_3T(univary, i); \ MAKE_BOX_CONSTRUCTORS_uv_3T(univary, f); \ MAKE_BOX_CONSTRUCTORS_uv_3T(univary, fa) #ifdef ISPC MAKE_BOX_CONSTRUCTORS_uv_3(uniform); MAKE_BOX_CONSTRUCTORS_uv_3(varying); MAKE_BOX_CONSTRUCTORS_uv_3T_empty(uniform, f); MAKE_BOX_CONSTRUCTORS_uv_3T_empty(uniform, fa); #else MAKE_BOX_CONSTRUCTORS_uv_3(); MAKE_BOX_CONSTRUCTORS_uv_3T_empty(, f); MAKE_BOX_CONSTRUCTORS_uv_3T_empty(, fa); #endif #undef MAKE_BOX_CONSTRUCTORS_uv_3T_fromVec3 #undef MAKE_BOX_CONSTRUCTORS_uv_3T_fromBox3 #undef MAKE_BOX_CONSTRUCTORS_uv_3T_empty #undef MAKE_BOX_CONSTRUCTORS_uv_3T #undef MAKE_BOX_CONSTRUCTORS_uv_3 // ------------------------------------------------------- // box 'operations' // ------------------------------------------------------- #define BOX_OPERATIONS_uv_N_T(univary, N, T) \ inline univary vec##N##T box_size(const univary box##N##T &bb) \ { \ return bb.upper - bb.lower; \ } \ \ inline univary bool isEmpty(const univary box##N##T &bb) \ { \ return anyLessThan(bb.upper, bb.lower); \ } \ \ inline univary box##N##T box_extend( \ const univary box##N##T bb, const univary vec##N##T v) \ { \ return make_box##N##T(min(bb.lower, v), max(bb.upper, v)); \ } \ \ inline univary box##N##T box_extend( \ const univary box##N##T bb, const univary box##N##T other) \ { \ return make_box##N##T( \ min(bb.lower, other.lower), max(bb.upper, other.upper)); \ } #define BOX_OPERATIONS_uv_3fa(univary) \ inline univary box3fa box_extend( \ const univary box3fa bb, const univary vec3f v) \ { \ return make_box3fa(min(bb.lower, v), max(bb.upper, v)); \ } \ \ inline univary box3fa box_extend( \ const univary box3fa bb, const univary box3fa other) \ { \ return 
make_box3fa( \ min(bb.lower, other.lower), max(bb.upper, other.upper)); \ } #define BOX_OPERATIONS_uv_N(univary, N) \ BOX_OPERATIONS_uv_N_T(univary, N, i); \ BOX_OPERATIONS_uv_N_T(univary, N, f) #define BOX_OPERATIONS_uv(univary) \ BOX_OPERATIONS_uv_N(univary, 2); \ BOX_OPERATIONS_uv_N(univary, 3); \ BOX_OPERATIONS_uv_3fa(univary) #ifdef ISPC BOX_OPERATIONS_uv(uniform); BOX_OPERATIONS_uv(varying); #else BOX_OPERATIONS_uv(); #endif #undef BOX_OPERATIONS_uv_N_T #undef BOX_OPERATIONS_uv_N #undef BOX_OPERATIONS_uv inline bool box_contains(const ISPC_UNIFORM box3f &bbox, const vec3f &p) { return p.x >= bbox.lower.x && p.y >= bbox.lower.y && p.z >= bbox.lower.z && p.x <= bbox.upper.x && p.y <= bbox.upper.y && p.z <= bbox.upper.z; } #ifdef ISPC inline void extend(uniform range1f &r, uniform float v) { r.lower = min(r.lower, v); r.upper = max(r.upper, v); } inline void extend(uniform range1f &r, varying float v) { r.lower = min(r.lower, reduce_min(v)); r.upper = max(r.upper, reduce_max(v)); } #endif inline void extend(range1f &r, float v) { r.lower = min(r.lower, v); r.upper = max(r.upper, v); } #ifndef ISPC } #endif ospray-rkcommon-538f8a2/rkcommon/math/constants.h000066400000000000000000000177071456117377200222100ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include "../platform.h" #ifndef _USE_MATH_DEFINES #define _USE_MATH_DEFINES #endif #include // using cmath causes issues under Windows #include namespace rkcommon { namespace math { static const float one_over_255 = 1.0f / 255.0f; static struct ZeroTy { __forceinline operator double() const { return 0; } __forceinline operator float() const { return 0; } __forceinline operator long long() const { return 0; } __forceinline operator unsigned long long() const { return 0; } __forceinline operator long() const { return 0; } __forceinline operator unsigned long() const { return 0; } __forceinline operator int() const { return 0; } __forceinline operator unsigned int() const { return 0; } __forceinline operator short() const { return 0; } __forceinline operator unsigned short() const { return 0; } __forceinline operator char() const { return 0; } __forceinline operator unsigned char() const { return 0; } } zero MAYBE_UNUSED; static struct OneTy { __forceinline operator double() const { return 1; } __forceinline operator float() const { return 1; } __forceinline operator long long() const { return 1; } __forceinline operator unsigned long long() const { return 1; } __forceinline operator long() const { return 1; } __forceinline operator unsigned long() const { return 1; } __forceinline operator int() const { return 1; } __forceinline operator unsigned int() const { return 1; } __forceinline operator short() const { return 1; } __forceinline operator unsigned short() const { return 1; } __forceinline operator char() const { return 1; } __forceinline operator unsigned char() const { return 1; } } one MAYBE_UNUSED; static struct NegInfTy { __forceinline operator double() const { return -std::numeric_limits::infinity(); } __forceinline operator float() const { return -std::numeric_limits::infinity(); } __forceinline operator long long() const { return std::numeric_limits::min(); } __forceinline operator unsigned long long() const { return std::numeric_limits::min(); } __forceinline operator long() const { return std::numeric_limits::min(); } __forceinline operator unsigned long() const { return std::numeric_limits::min(); } __forceinline operator int() const { return 
std::numeric_limits::min(); } __forceinline operator unsigned int() const { return std::numeric_limits::min(); } __forceinline operator short() const { return std::numeric_limits::min(); } __forceinline operator unsigned short() const { return std::numeric_limits::min(); } __forceinline operator char() const { return std::numeric_limits::min(); } __forceinline operator unsigned char() const { return std::numeric_limits::min(); } } neg_inf MAYBE_UNUSED; static struct PosInfTy { __forceinline operator double() const { return std::numeric_limits::infinity(); } __forceinline operator float() const { return std::numeric_limits::infinity(); } __forceinline operator long long() const { return std::numeric_limits::max(); } __forceinline operator unsigned long long() const { return std::numeric_limits::max(); } __forceinline operator long() const { return std::numeric_limits::max(); } __forceinline operator unsigned long() const { return std::numeric_limits::max(); } __forceinline operator int() const { return std::numeric_limits::max(); } __forceinline operator unsigned int() const { return std::numeric_limits::max(); } __forceinline operator short() const { return std::numeric_limits::max(); } __forceinline operator unsigned short() const { return std::numeric_limits::max(); } __forceinline operator char() const { return std::numeric_limits::max(); } __forceinline operator unsigned char() const { return std::numeric_limits::max(); } } inf MAYBE_UNUSED, pos_inf MAYBE_UNUSED; static struct NaNTy { __forceinline operator double() const { return std::numeric_limits::quiet_NaN(); } __forceinline operator float() const { return std::numeric_limits::quiet_NaN(); } } nan MAYBE_UNUSED; static struct UlpTy { __forceinline operator double() const { return std::numeric_limits::epsilon(); } __forceinline operator float() const { return std::numeric_limits::epsilon(); } } ulp MAYBE_UNUSED; static struct PiTy { __forceinline operator double() const { return M_PI; } __forceinline operator float() const { return M_PI; } } pi MAYBE_UNUSED; static struct OneOverPiTy { __forceinline operator double() const { return M_1_PI; } __forceinline operator float() const { return M_1_PI; } } one_over_pi MAYBE_UNUSED; static struct TwoPiTy { __forceinline operator double() const { return 2.0 * M_PI; } __forceinline operator float() const { return 2.0 * M_PI; } } two_pi MAYBE_UNUSED; static struct HalfPiTy { __forceinline operator double() const { return M_PI_2; } __forceinline operator float() const { return M_PI_2; } } half_pi MAYBE_UNUSED; static struct OneOverTwoPiTy { __forceinline operator double() const { return 0.5 * M_1_PI; } __forceinline operator float() const { return 0.5 * M_1_PI; } } one_over_two_pi MAYBE_UNUSED; static struct FourPiTy { __forceinline operator double() const { return 4.0 * M_PI; } __forceinline operator float() const { return 4.0 * M_PI; } } four_pi MAYBE_UNUSED; static struct QuarterPiTy { __forceinline operator double() const { return M_PI_4; } __forceinline operator float() const { return M_PI_4; } } quarter_pi MAYBE_UNUSED; static struct OneOverFourPiTy { __forceinline operator double() const { return 0.25 * M_1_PI; } __forceinline operator float() const { return 0.25 * M_1_PI; } } one_over_four_pi MAYBE_UNUSED; static struct StepTy { } step MAYBE_UNUSED; static struct ReverseStepTy { } reverse_step MAYBE_UNUSED; static struct EmptyTy { } empty MAYBE_UNUSED; static struct FullTy { } full MAYBE_UNUSED; } // namespace math } // namespace rkcommon 
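// Usage sketch (illustrative only; not part of the original header): the
// sentinel objects above convert to whatever numeric type the surrounding
// expression asks for, e.g.
//   float a = rkcommon::math::neg_inf;  // -infinity
//   int   b = rkcommon::math::neg_inf;  // smallest representable int
//   float c = rkcommon::math::ulp;      // float machine epsilon
// This per-type conversion is what lets range_t default-construct its bounds
// to +inf/-inf for any element type (see range.h further below).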
ospray-rkcommon-538f8a2/rkcommon/math/math.ih000066400000000000000000000367061456117377200212760ustar00rootroot00000000000000// Copyright 2019 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #ifdef ISPC #define ISPC_UNIFORM uniform #define ISPC_VARYING varying #define ISPC_OR(a, b) or (a, b) #else #include #define ISPC_UNIFORM #define ISPC_VARYING #define ISPC_OR(a, b) (a || b) namespace ispc { using uint64 = uint64_t; using uint32 = uint32_t; using uint16 = uint16_t; using uint8 = uint8_t; using int64 = int64_t; using int32 = int32_t; using int16 = int16_t; using int8 = int8_t; #endif // ------------------------------------------------------------------ // Constants // ------------------------------------------------------------------ #define inf floatbits(0x7F800000) #define pos_inf floatbits(0x7F800000) #define neg_inf floatbits(0xFF800000) #define nan floatbits(0x7FBFFFFF) // smallest positive normal number 2^-126 ~ 1.17549435e-38 #define flt_min 0x1.0p-126f #define M_PI 3.14159265358979323846f #define pi 3.14159265358979323846f #define two_pi 6.283185307179586232f #define four_pi 12.566370614359172464f #define one_over_pi 0.31830988618379069122f #define one_over_two_pi 0.15915494309189534561f #define one_over_four_pi 0.079577471545947672804f #define one_over_two_pi_sqr 0.050660591821168885722f #define lntwo_over_two 0.346573590279972654709f #ifndef ISPC static struct OneTy { inline operator float() const { return 1.f; } } one; static struct EmptyTy { } empty; // Native math functions, precision implementation defined inline float sin(const float a) { return sycl::native::sin(a); } inline float cos(const float a) { return sycl::native::cos(a); } inline float tan(const float a) { return sycl::native::tan(a); } inline float rcp(const float a) { return sycl::native::recip(a); } inline float exp(const float a) { return sycl::native::exp(a); } inline float log(const float a) { return sycl::native::log(a); } inline float pow(float a, float b) { return sycl::native::powr(a, b); } inline float sqrt(const float a) { return sycl::native::sqrt(a); } inline float rsqrt(const float a) { return sycl::native::rsqrt(a); } inline void sincos(const float phi, float *sinPhi, float *cosPhi) { *sinPhi = sycl::native::sin(phi); *cosPhi = sycl::native::cos(phi); } inline float roundf(const float f) { return sycl::round(f); } // Math functions with precision guaranteed both on host and device inline float abs(const float a) { return sycl::fabs(a); } inline float floor(const float a) { return sycl::floor(a); } inline float ceil(const float a) { return sycl::ceil(a); } inline float acos(const float a) { return sycl::acos(a); } inline float atan(const float a) { return sycl::atan(a); } inline float atan2(const float a, const float b) { return sycl::atan2(a, b); } inline float isnan(const float a) { return sycl::isnan(a); } inline float nextafter(const float a, const float b) { return sycl::nextafter(a, b); } inline float floatbits(unsigned int a) { return sycl::bit_cast(a); } inline unsigned int intbits(float a) { return sycl::bit_cast(a); } inline unsigned int signbits(float a) { return sycl::signbit(a); } template ::value>::type> inline T min(const T &a, const T &b) { return sycl::min(a, b); } template ::value>::type> inline T max(const T &a, const T &b) { return sycl::max(a, b); } template ::value>::type> inline T extract(const T &t, int) { return t; } template ::value>::type> inline T exclusive_scan_add(const T &) { return 0; } template ::value>::type> inline T reduce_add(const T &t) { 
return t; } template ::value>::type> inline T reduce_max(const T &t) { return t; } template ::value>::type> inline T reduce_min(const T &t) { return t; } #endif #define __define_functions(univary) \ inline univary float absf(const univary float f) \ { \ return abs(f); \ } \ /* c-style reciprocal. required since ispc 1.7 due to type changes in this \ * version */ \ inline univary float rcpf(const univary float f) \ { \ return rcp(f); \ } \ /* c-style square root */ \ inline univary float sqrtf(const univary float f) \ { \ return sqrt(f); \ } \ /* c-style reciprocal square root */ \ inline univary float rsqrtf(const univary float f) \ { \ return rsqrt(f); \ } \ /* square */ \ inline univary float sqr(const univary float f) \ { \ return f * f; \ } \ /* c-style square */ \ inline univary float sqrf(const univary float f) \ { \ return f * f; \ } \ /* c-style pow function */ \ inline univary float powf(const univary float a, const univary float b) \ { \ return pow(a, b); \ } \ /* c-style cos */ \ inline univary float cosf(const univary float f) \ { \ return cos(f); \ } \ /* c-style sin */ \ inline univary float sinf(const univary float f) \ { \ return sin(f); \ } \ /* c-style exp */ \ inline univary float expf(const univary float f) \ { \ return exp(f); \ } \ /* c-style log */ \ inline univary float logf(const univary float f) \ { \ return log(f); \ } \ inline univary float divide_safe(univary float f) \ { \ return 1.f / (abs(f) < flt_min ? (f >= 0.f ? flt_min : -flt_min) : f); \ } \ inline univary float rcp_safe(univary float f) \ { \ return rcpf(abs(f) < flt_min ? (f >= 0.f ? flt_min : -flt_min) : f); \ } \ inline univary float sqrt_safe(univary float f) \ { \ return sqrt(max(f, 0.0f)); \ } \ inline univary float clamp(const univary float v) \ { \ return max(0.0f, min(v, 1.0f)); \ } \ inline univary float clamp(const univary float v, \ const univary float lower, \ const univary float upper) \ { \ return max(lower, min(v, upper)); \ } \ inline univary int clamp( \ const univary int v, const univary int lower, const univary int upper) \ { \ return max(lower, min(v, upper)); \ } \ inline univary float frac(const univary float x) \ { \ return x - floor(x); \ } \ inline univary float deg2rad(const univary float x) \ { \ return x * 1.74532925199432957692e-2f; \ } \ inline univary float rad2deg(const univary float x) \ { \ return x * 5.72957795130823208768e1f; \ } #ifdef ISPC __define_functions(uniform); __define_functions(varying); #else __define_functions(); #endif inline float cos2sin(const float f) { return sqrt(max(0.f, 1.f - sqr(f))); } inline float sin2cos(const float f) { return cos2sin(f); } #ifdef ISPC inline float roundf(const float f) { return round(f); } inline uniform float roundf(const uniform float f) { return round(f); } inline uniform float nextafter(const uniform float a, const uniform float b) { // Match the behavior of the C99 math.h function if (a == b) return (b); // We will compute the smallest representable floating increment or decrement // around 'a' uniform float delta = (b > a) ? 
1.0f : -1.0f; // Iteratively compute the positive or negative increment while (a + 0.5f * delta != a) delta *= 0.5f; // Return the smallest number greater than 'a' or the largest number smaller // than 'a' return (a + delta); } #endif #define __define_lerp(univary, type) \ inline univary type lerp( \ univary float factor, univary type a, univary type b) \ { \ return (1.f - factor) * a + factor * b; \ } #define __define_lerp_type(univary) \ __define_lerp(univary, int8); \ __define_lerp(univary, int32); \ __define_lerp(univary, float); \ __define_lerp(univary, uint8); \ __define_lerp(univary, uint32) #ifdef ISPC __define_lerp_type(uniform); __define_lerp_type(varying); #else __define_lerp_type(); #endif #undef __define_lerp_type #undef __define_lerp // ------------------------------------------------------------------ // min4/max4, for all types // ------------------------------------------------------------------ #define __define_op4(univary, type, op) \ inline univary type op##4( \ univary type a, univary type b, univary type c, univary type d) \ { \ return op(a, op(b, op(c, d))); \ } #define __define_op4_op(univary, type) \ __define_op4(univary, type, min); \ __define_op4(univary, type, max) #define __define_op4_type(univary) \ __define_op4_op(univary, int8); \ __define_op4_op(univary, int32); \ __define_op4_op(univary, uint8); \ __define_op4_op(univary, uint32); \ __define_op4_op(univary, float) #ifdef ISPC __define_op4_type(uniform); __define_op4_type(varying); #else __define_op4_type(); #endif #undef __define_op4_type #undef __define_op4_op #undef __define_op4 #define SIMILAR_EPSILON .00001f #define __define_similar(univary) \ inline univary float similar(univary float a, univary float b) \ { \ return abs(a - b) <= SIMILAR_EPSILON; \ } #ifdef ISPC __define_similar(uniform); __define_similar(varying); #else __define_similar(); #endif #undef __define_similar #undef SIMILAR_EPSILON // convert 32bit unsigned int into float in [0..1] inline float to_float_unorm(uint32 a) { return a * 0x1.0p-32f; } #ifndef ISPC } #endif ospray-rkcommon-538f8a2/rkcommon/math/range.h000066400000000000000000000106611456117377200212600ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // stl #include // common #include "constants.h" #include "vec.h" namespace rkcommon { namespace math { using std::max; using std::min; /*! default implementatoin of 'anyLessThan' for scalar types, so we can make a ranges etc. Vec-types will overwrite that and test if _any_ dimension is less */ template inline bool anyLessThan(const TA &a, const TB &b) { return a < b; } template struct range_t { using bound_t = T; range_t() : lower(pos_inf), upper(neg_inf) {} range_t(const EmptyTy &) : lower(pos_inf), upper(neg_inf) {} range_t(const ZeroTy &) : lower(zero), upper(zero) {} range_t(const OneTy &) : lower(zero), upper(one) {} range_t(const T &t) : lower(t), upper(t) {} range_t(const T &_lower, const T &_upper) : lower(_lower), upper(_upper) { } range_t(const T *v) : lower(v[0]), upper(v[1]) {} template explicit range_t(const range_t &other) : lower(T(other.lower)), upper(T(other.upper)) { } inline T size() const { return upper - lower; } inline T center() const { return .5f * (lower + upper); } inline void extend(const T &t) { lower = min(lower, t); upper = max(upper, t); } inline void extend(const range_t &t) { lower = min(lower, t.lower); upper = max(upper, t.upper); } /*! 
take given value t, and 'clamp' it to 'this->'range; ie, if it already is inside the range return as is, otherwise move it to either lower or upper of this range. */ inline T clamp(const T &t) const { return max(lower, min(t, upper)); } /*! Try to parse given string into a range; and return if successful. if not, return defaultvalue */ static range_t fromString( const std::string &string, const range_t &defaultValue = rkcommon::math::empty); inline bool empty() const { return anyLessThan(upper, lower); } inline bool contains(const T &t) const { return !anyLessThan(t, lower) && !anyLessThan(upper, t); } inline operator T*() { return static_cast(&lower); } inline operator const T*() const { return static_cast(&lower); } T lower, upper; }; template inline std::ostream &operator<<(std::ostream &o, const range_t &r) { o << "[" << r.lower << "," << r.upper << "]"; return o; } /*! scale range, per dimension */ template inline range_t operator*(const range_t &range, const T &scale) { return range_t(range.lower * scale, range.upper * scale); } /*! scale range, per dimension */ template inline range_t operator*(const T &scale, const range_t &range) { return range_t(range.lower * scale, range.upper * scale); } /*! translate a range, per dimension */ template inline range_t operator+(const range_t &range, const T &translation) { return range_t(range.lower + translation, range.upper + translation); } /*! translate a range, per dimension */ template inline range_t operator+(const T &translation, const range_t &range) { return range_t(range.lower + translation, range.upper + translation); } // comparison operators /////////////////////////////////////////////////// template inline bool operator==(const range_t &a, const range_t &b) { return a.lower == b.lower && a.upper == b.upper; } template inline bool operator!=(const range_t &a, const range_t &b) { return !(a == b); } // range_t aliases //////////////////////////////////////////////////////// using range1f = range_t; using range2f = range_t; using range3f = range_t; using range4f = range_t; using range1i = range_t; using range2i = range_t; using range3i = range_t; using range4i = range_t; } // namespace math } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/math/rkmath.h000066400000000000000000000060271456117377200214530ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../platform.h" #include "constants.h" // std #include // std::min()/std::max() on Windows #include // Include vector intrinsics #ifndef RKCOMMON_NO_SIMD #if defined(_WIN32) #include #elif defined(__ARM_NEON) #include "arm/emulation.h" #else #include #include #endif #endif namespace rkcommon { namespace math { using std::cos; using std::sin; using std::tan; using std::max; using std::min; using std::fmod; __forceinline float sign(const float x) { return x < 0 ? -1.0f : 1.0f; } __forceinline float rcp(const float x) { #ifdef RKCOMMON_NO_SIMD return 1.f / x; #else const __m128 a = _mm_set_ss(x); const __m128 r = _mm_rcp_ss(a); return _mm_cvtss_f32( _mm_mul_ss(r, _mm_sub_ss(_mm_set_ss(2.0f), _mm_mul_ss(r, a)))); #endif } __forceinline double rcp(const double x) { return 1. / x; } template __forceinline T rcp_safe_t(const T x) { const T flt_min = std::numeric_limits::min(); return rcp(std::abs(x) < flt_min ? (x >= 0.f ? 
flt_min : -flt_min) : x); } __forceinline float rcp_safe(const float x) { return rcp_safe_t(x); } __forceinline double rcp_safe(const double x) { return rcp_safe_t(x); } __forceinline float rsqrt(const float x) { #ifdef RKCOMMON_NO_SIMD return 1.f / std::sqrt(x); #else const __m128 a = _mm_set_ss(x); const __m128 r = _mm_rsqrt_ss(a); const __m128 c = _mm_add_ss(_mm_mul_ss(_mm_set_ss(1.5f), r), _mm_mul_ss(_mm_mul_ss(_mm_mul_ss(a, _mm_set_ss(-0.5f)), r), _mm_mul_ss(r, r))); return _mm_cvtss_f32(c); #endif } __forceinline double rsqrt(const double x) { return 1. / std::sqrt(x); } template __forceinline T clamp(const T &x, const T &lower = T(zero), const T &upper = T(one)) { return max(min(x, upper), lower); } template __forceinline T deg2rad(const T &x) { return x * T(1.745329251994329576923690768489e-2); } __forceinline float madd(const float a, const float b, const float c) { return a * b + c; } template inline T lerp(const float factor, const T &a, const T &b) { return (1.f - factor) * a + factor * b; } template inline T divRoundUp(T a, T b) { return (a + b - 1) / b; } #define APPROXIMATE_SRGB inline float linear_to_srgb(const float f) { const float c = std::max(f, 0.f); #ifdef APPROXIMATE_SRGB return std::pow(c, 1.f / 2.2f); #else return c <= 0.0031308f ? 12.92f * c : std::pow(c, 1.f / 2.4f) * 1.055f - 0.055f; #endif } } // namespace math } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/math/vec.h000066400000000000000000001107571456117377200207500ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "constants.h" #include "rkmath.h" #include "../traits/rktraits.h" namespace rkcommon { // NOTE: only for identifying vec_t types at compile-time struct vec_base { }; // ------------------------------------------------------- // type traits relevant to vec_t<> type compile-time logic // ------------------------------------------------------- namespace traits { template struct is_vec { const static bool value = std::is_base_of::value; }; template struct is_valid_vec_constructor_type { const static bool value = std::is_constructible::value && !std::is_same::value && !is_vec::value; }; template using is_valid_vec_constructor_type_t = enable_if_t< is_valid_vec_constructor_type::value>; } // namespace traits namespace math { // vec_t<> types ////////////////////////////////////////////////////////// template > struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; }; template struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; vec_t() = default; vec_t(const scalar_t *v) : x(v[0]), y(v[1]) {} vec_t(scalar_t s) : x(s), y(s) {} template > vec_t(const OT &s) : x(s), y(s) { } vec_t(scalar_t x, scalar_t y) : x(x), y(y) {} template vec_t(const vec_t &o) : x(o.x), y(o.y) { } const T &operator[](const size_t idx) const { assert(idx < 2); return (&x)[idx]; } T &operator[](const size_t idx) { assert(idx < 2); return (&x)[idx]; } operator T *() { return &x; } operator const T *() const { return &x; } /*! return result of reduce_add() across all components */ scalar_t sum() const { return x + y; } /*! 
return result of reduce_mul() across all components */ scalar_t product() const { return x * y; } size_t long_product() const { return size_t(x) * size_t(y); } // conversion constructor to other types to enable static_cast template explicit operator vec_t() const { return vec_t(*this); } T x, y; }; template struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; vec_t() = default; vec_t(const scalar_t *v) : x(v[0]), y(v[1]), z(v[2]) {} vec_t(scalar_t s) : x(s), y(s), z(s) {} template > vec_t(const OT &s) : x(s), y(s), z(s) { } vec_t(scalar_t x, scalar_t y, scalar_t z) : x(x), y(y), z(z) {} template vec_t(const vec_t &o, scalar_t z) : x(o.x), y(o.y), z(z) { } template vec_t(const vec_t &o) : x(o.x), y(o.y), z(o.z) { } const T &operator[](const size_t axis) const { assert(axis < 3); return (&x)[axis]; } T &operator[](const size_t axis) { assert(axis < 3); return (&x)[axis]; } operator T *() { return &x; } operator const T *() const { return &x; } /*! return result of reduce_add() across all components */ scalar_t sum() const { return x + y + z; } /*! return result of reduce_mul() across all components */ scalar_t product() const { return x * y * z; } size_t long_product() const { return size_t(x) * size_t(y) * size_t(z); } // conversion constructor to other types to enable static_cast template explicit operator vec_t() const { return vec_t(*this); } T x, y, z; }; template struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; vec_t() = default; vec_t(const scalar_t *v) : x(v[0]), y(v[1]), z(v[2]) {} vec_t(scalar_t s) : x(s), y(s), z(s) {} template > vec_t(const OT &s) : x(s), y(s), z(s) { } vec_t(scalar_t x, scalar_t y, scalar_t z) : x(x), y(y), z(z) {} template vec_t(const vec_t &o, scalar_t z) : x(o.x), y(o.y), z(z) { } template vec_t(const vec_t &o) : x(o.x), y(o.y), z(o.z) { } const T &operator[](const size_t axis) const { assert(axis < 3); return (&x)[axis]; } T &operator[](const size_t axis) { assert(axis < 3); return (&x)[axis]; } operator T *() { return &x; } operator const T *() const { return &x; } /*! return result of reduce_add() across all components */ scalar_t sum() const { return x + y + z; } /*! return result of reduce_mul() across all components */ scalar_t product() const { return x * y * z; } size_t long_product() const { return size_t(x) * size_t(y) * size_t(z); } operator vec_t() const { return vec_t(x, y, z); } // conversion constructor to other types to enable static_cast template explicit operator vec_t() const { return vec_t(*this); } T x, y, z; T padding_; }; template struct vec_t : public vec_base { using scalar_t = T; using Scalar = T; vec_t() = default; vec_t(const scalar_t *v) : x(v[0]), y(v[1]), z(v[2]), w(v[3]) {} vec_t(scalar_t s) : x(s), y(s), z(s), w(s) {} template > vec_t(const OT &s) : x(s), y(s), z(s), w(s) { } vec_t(scalar_t x, scalar_t y, scalar_t z, scalar_t w) : x(x), y(y), z(z), w(w) { } template vec_t(const vec_t &o1, const vec_t &o2) : x(o1.x), y(o1.y), z(o2.x), w(o2.y) { } template vec_t(const vec_t &o, scalar_t w) : x(o.x), y(o.y), z(o.z), w(w) { } template vec_t(const vec_t &o) : x(o.x), y(o.y), z(o.z), w(o.w) { } const T &operator[](const size_t idx) const { assert(idx < 4); return (&x)[idx]; } T &operator[](const size_t idx) { assert(idx < 4); return (&x)[idx]; } operator T *() { return &x; } operator const T *() const { return &x; } /*! return result of reduce_add() across all components */ scalar_t sum() const { return x + y + z + w; } /*! 
return result of reduce_mul() across all components */ scalar_t product() const { return x * y * z * w; } size_t long_product() const { return size_t(x) * size_t(y) * size_t(z) * size_t(w); } // conversion constructor to other types to enable static_cast template explicit operator vec_t() const { return vec_t(*this); } T x, y, z, w; }; // ------------------------------------------------------- // unary operators // ------------------------------------------------------- template inline vec_t operator-(const vec_t &v) { return vec_t(-v.x, -v.y); } template inline vec_t operator-(const vec_t &v) { return vec_t(-v.x, -v.y, -v.z); } template inline vec_t operator-(const vec_t &v) { return vec_t(-v.x, -v.y, -v.z); } template inline vec_t operator-(const vec_t &v) { return vec_t(-v.x, -v.y, -v.z, -v.w); } template inline vec_t operator+(const vec_t &v) { return vec_t(+v.x, +v.y); } template inline vec_t operator+(const vec_t &v) { return vec_t(+v.x, +v.y, +v.z); } template inline vec_t operator+(const vec_t &v) { return vec_t(+v.x, +v.y, +v.z); } template inline vec_t operator+(const vec_t &v) { return vec_t(+v.x, +v.y, +v.z, +v.w); } using std::abs; // ------------------------------------------------------- // unary functors // ------------------------------------------------------- #define unary_functor(op) \ template \ inline vec_t op(const vec_t &v) \ { \ return vec_t(op(v.x), op(v.y)); \ } \ template \ inline vec_t op(const vec_t &v) \ { \ return vec_t(op(v.x), op(v.y), op(v.z)); \ } \ template \ inline vec_t op(const vec_t &v) \ { \ return vec_t(op(v.x), op(v.y), op(v.z)); \ } \ template \ inline vec_t op(const vec_t &v) \ { \ return vec_t(op(v.x), op(v.y), op(v.z), op(v.w)); \ } // clang-format off unary_functor(rcp) unary_functor(rcp_safe) unary_functor(abs) unary_functor(sin) unary_functor(cos) // clang-format on #undef unary_functor // ------------------------------------------------------- // binary arithmetic operators // ------------------------------------------------------- #define binary_operator(name, op) \ /* "vec op vec" */ \ template \ inline vec_t name(const vec_t &a, const vec_t &b) \ { \ return vec_t(a.x op b.x, a.y op b.y); \ } \ \ template \ inline vec_t name(const vec_t &a, const vec_t &b) \ { \ return vec_t(a.x op b.x, a.y op b.y, a.z op b.z); \ } \ \ template \ inline vec_t name(const vec_t &a, const vec_t &b) \ { \ return vec_t(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \ } \ \ /* "vec op vec" (element types don't match) */ \ template > \ inline auto name(const vec_t &a, const vec_t &b) \ ->vec_t \ { \ using vector_t = vec_t; \ return vector_t(vector_t(a) op vector_t(b)); \ } \ \ /* "vec op scalar" */ \ template \ inline vec_t name(const vec_t &a, const T &b) \ { \ return vec_t(a.x op b, a.y op b); \ } \ \ template \ inline vec_t name(const vec_t &a, const T &b) \ { \ return vec_t(a.x op b, a.y op b, a.z op b); \ } \ \ template \ inline vec_t name(const vec_t &a, const T &b) \ { \ return vec_t(a.x op b, a.y op b, a.z op b, a.w op b); \ } \ \ /* "vec op U" (element types don't match) */ \ template > \ inline auto name(const vec_t &a, const U &b) \ ->vec_t \ { \ using scalar_t = decltype(T() op U()); \ using vector_t = vec_t; \ return vector_t(vector_t(a) op scalar_t(b)); \ } \ \ /* "scalar op vec" */ \ template \ inline vec_t name(const T &a, const vec_t &b) \ { \ return vec_t(a op b.x, a op b.y); \ } \ \ template \ inline vec_t name(const T &a, const vec_t &b) \ { \ return vec_t(a op b.x, a op b.y, a op b.z); \ } \ \ template \ inline vec_t name(const T &a, 
const vec_t &b) \ { \ return vec_t(a op b.x, a op b.y, a op b.z, a op b.w); \ } \ \ /* "T op vec" (element types don't match) */ \ template > \ inline auto name(const T &a, const vec_t &b) \ ->vec_t \ { \ using scalar_t = decltype(T() op U()); \ using vector_t = vec_t; \ return vector_t(scalar_t(a) op vector_t(b)); \ } // clang-format off binary_operator(operator+, +) binary_operator(operator-, -) binary_operator(operator*, *) binary_operator(operator/, /) binary_operator(operator%, %) // clang-format on #undef binary_operator // ------------------------------------------------------- // binary arithmetic assignment operators // ------------------------------------------------------- #define binary_operator(name, op) \ /* "vec op vec" */ \ template \ inline vec_t &name(vec_t &a, const vec_t &b) \ { \ a.x op b.x; \ a.y op b.y; \ return a; \ } \ \ template \ inline vec_t &name(vec_t &a, const vec_t &b) \ { \ a.x op b.x; \ a.y op b.y; \ a.z op b.z; \ return a; \ } \ \ template \ inline vec_t &name(vec_t &a, const vec_t &b) \ { \ a.x op b.x; \ a.y op b.y; \ a.z op b.z; \ a.w op b.w; \ return a; \ } \ \ /* "vec op scalar" */ \ template > \ inline vec_t &name(vec_t &a, const U &b) \ { \ a.x op b; \ a.y op b; \ return a; \ } \ \ template > \ inline vec_t &name(vec_t &a, const U &b) \ { \ a.x op b; \ a.y op b; \ a.z op b; \ return a; \ } \ \ template > \ inline vec_t &name(vec_t &a, const U &b) \ { \ a.x op b; \ a.y op b; \ a.z op b; \ a.w op b; \ return a; \ } // clang-format off binary_operator(operator+=, +=) binary_operator(operator-=, -=) binary_operator(operator*=, *=) binary_operator(operator/=, /=) binary_operator(operator%=, %=) // clang-format on #undef binary_operator // ------------------------------------------------------- // ternary operators (just for compatibility with old embree // ------------------------------------------------------- template inline vec_t madd(const vec_t &a, const vec_t &b, const vec_t &c) { return vec_t( madd(a.x, b.x, c.x), madd(a.y, b.y, c.y), madd(a.z, b.z, c.z)); } // ------------------------------------------------------- // comparison operators // ------------------------------------------------------- template inline bool operator==(const vec_t &a, const vec_t &b) { return a.x == b.x && a.y == b.y; } template inline bool operator==(const vec_t &a, const vec_t &b) { return a.x == b.x && a.y == b.y && a.z == b.z; } template inline bool operator==(const vec_t &a, const vec_t &b) { return a.x == b.x && a.y == b.y && a.z == b.z && a.w == b.w; } template inline bool operator!=(const vec_t &a, const vec_t &b) { return !(a == b); } template inline bool operator!=(const vec_t &a, const vec_t &b) { return !(a == b); } template inline bool operator!=(const vec_t &a, const vec_t &b) { return !(a == b); } // 'anyLessThan' - return true if any component is less than the other vec's template inline bool anyLessThan(const vec_t &a, const vec_t &b) { return a.x < b.x || a.y < b.y; } template inline bool anyLessThan(const vec_t &a, const vec_t &b) { return a.x < b.x || a.y < b.y || a.z < b.z; } template inline bool anyLessThan(const vec_t &a, const vec_t &b) { return a.x < b.x || a.y < b.y || a.z < b.z || a.w < b.w; } // ------------------------------------------------------- // dot functions // ------------------------------------------------------- template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y; } template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z; } template inline T dot(const vec_t 
&a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z; } template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z; } template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z; } template inline T dot(const vec_t &a, const vec_t &b) { return a.x * b.x + a.y * b.y + a.z * b.z + a.w * b.w; } // ------------------------------------------------------- // length functions // ------------------------------------------------------- template inline T length(const vec_t &v) { return sqrt(dot(v, v)); } // ------------------------------------------------------- // cross product // ------------------------------------------------------- template inline vec_t cross(const vec_t &a, const vec_t &b) { return vec_t( a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); } // ------------------------------------------------------- // normalize() // ------------------------------------------------------- template inline vec_t normalize(const vec_t &v) { return v * rsqrt(dot(v, v)); } template inline vec_t safe_normalize(const vec_t &v) { return v * rsqrt(max(T(ulp), dot(v, v))); } // ------------------------------------------------------- // interpolation // ------------------------------------------------------- // barycentric interpolation template inline vec_t interpolate_uv(const vec_t &f, const vec_t &a, const vec_t &b, const vec_t &c) { return f.x * a + f.y * b + f.z * c; } // ------------------------------------------------------- // ostream operators // ------------------------------------------------------- template inline std::ostream &operator<<(std::ostream &o, const vec_t &v) { o << "(" << v.x << "," << v.y << ")"; return o; } template inline std::ostream &operator<<(std::ostream &o, const vec_t &v) { o << "(" << v.x << "," << v.y << "," << v.z << ")"; return o; } template inline std::ostream &operator<<(std::ostream &o, const vec_t &v) { o << "(" << v.x << "," << v.y << "," << v.z << "," << v.w << ")"; return o; } // "inherit" std::min/max/etc for basic types using std::max; using std::min; // ------------------------------------------------------- // binary functors // ------------------------------------------------------- #define define_functor(f) \ template \ inline vec_t f(const vec_t &a, const vec_t &b) \ { \ return vec_t(f(a.x, b.x), f(a.y, b.y)); \ } \ \ template \ inline vec_t f(const vec_t &a, const vec_t &b) \ { \ return vec_t(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z)); \ } \ \ template \ inline vec_t f(const vec_t &a, const vec_t &b) \ { \ return vec_t(f(a.x, b.x), f(a.y, b.y), f(a.z, b.z), f(a.w, b.w)); \ } // clang-format off define_functor(min) define_functor(max) define_functor(divRoundUp) // clang-format on #undef define_functor // ------------------------------------------------------- // reductions // ------------------------------------------------------- template inline T reduce_add(const vec_t &v) { return v.x + v.y; } template inline T reduce_add(const vec_t &v) { return v.x + v.y + v.z; } template inline T reduce_add(const vec_t &v) { return v.x + v.y + v.z + v.w; } template inline T reduce_mul(const vec_t &v) { return v.x * v.y; } template inline T reduce_mul(const vec_t &v) { return v.x * v.y * v.z; } template inline T reduce_mul(const vec_t &v) { return v.x * v.y * v.z * v.w; } template inline T reduce_min(const vec_t &v) { return min(v.x, v.y); } template inline T reduce_min(const vec_t &v) { return min(min(v.x, v.y), v.z); } template inline T reduce_min(const vec_t &v) 
{ return min(min(v.x, v.y), min(v.z, v.w)); } template inline T reduce_max(const vec_t &v) { return max(v.x, v.y); } template inline T reduce_max(const vec_t &v) { return max(max(v.x, v.y), v.z); } template inline T reduce_max(const vec_t &v) { return max(max(v.x, v.y), max(v.z, v.w)); } // ------------------------------------------------------- // all vec2 variants // ------------------------------------------------------- typedef vec_t vec2uc; typedef vec_t vec2c; typedef vec_t vec2us; typedef vec_t vec2s; typedef vec_t vec2ui; typedef vec_t vec2i; typedef vec_t vec2ul; typedef vec_t vec2l; typedef vec_t vec2f; typedef vec_t vec2d; // ------------------------------------------------------- // all vec3 variants // ------------------------------------------------------- typedef vec_t vec3uc; typedef vec_t vec3c; typedef vec_t vec3us; typedef vec_t vec3s; typedef vec_t vec3ui; typedef vec_t vec3i; typedef vec_t vec3ul; typedef vec_t vec3l; typedef vec_t vec3f; typedef vec_t vec3d; typedef vec_t vec3fa; typedef vec_t vec3ia; // ------------------------------------------------------- // all vec4 variants // ------------------------------------------------------- typedef vec_t vec4uc; typedef vec_t vec4c; typedef vec_t vec4us; typedef vec_t vec4s; typedef vec_t vec4ui; typedef vec_t vec4i; typedef vec_t vec4ul; typedef vec_t vec4l; typedef vec_t vec4f; typedef vec_t vec4d; template inline size_t arg_max(const vec_t &v) { size_t maxIdx = 0; for (size_t i = 1; i < N; i++) if (v[i] > v[maxIdx]) maxIdx = i; return maxIdx; } inline vec4f linear_to_srgba(const vec4f c) { return vec4f(linear_to_srgb(c.x), linear_to_srgb(c.y), linear_to_srgb(c.z), std::max(c.w, 0.f)); // alpha is never gamma-corrected } inline uint32_t cvt_uint32(const float f) { return (uint32_t)round(255.f * clamp(f, 0.f, 1.f)); } inline uint32_t cvt_uint32(const vec4f &v) { return (cvt_uint32(v.x) << 0) | (cvt_uint32(v.y) << 8) | (cvt_uint32(v.z) << 16) | (cvt_uint32(v.w) << 24); } inline uint32_t linear_to_srgba8(const vec4f c) { return cvt_uint32(linear_to_srgba(c)); } } // namespace math } // namespace rkcommon /*! template specialization for std::less comparison operator; * we need those to be able to put vec's in std::map etc @{ */ /* Defining just operator< is prone to bugs, because a definition of an * ordering of vectors is a bit arbitrary and depends on the context. * For example, in box::extend we certainly want the element-wise min/max and * not the std::min/std::max made applicable by vec3f::operator<. 
*/ namespace std { template struct less> { inline bool operator()(const rkcommon::math::vec_t &a, const rkcommon::math::vec_t &b) const { return (a.x < b.x) || ((a.x == b.x) && (a.y < b.y)); } }; template struct less> { inline bool operator()(const rkcommon::math::vec_t &a, const rkcommon::math::vec_t &b) const { return (a.x < b.x) || ((a.x == b.x) && ((a.y < b.y) || ((a.y == b.y) && (a.z < b.z)))); } }; template struct less> { inline bool operator()(const rkcommon::math::vec_t &a, const rkcommon::math::vec_t &b) const { return (a.x < b.x) || ((a.x == b.x) && ((a.y < b.y) || ((a.y == b.y) && ((a.z < b.z) || ((a.z == b.z) && (a.w < b.w)))))); } }; } // namespace std ospray-rkcommon-538f8a2/rkcommon/math/vec.ih000066400000000000000000001301161456117377200211100ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "math.ih" #ifndef ISPC namespace ispc { #endif #ifdef ISPC #define __define_vectors(type, abb) \ struct vec2##abb \ { \ type x, y; \ }; \ struct vec3##abb \ { \ type x, y, z; \ }; \ struct vec4##abb \ { \ type x, y, z, w; \ } __define_vectors(int32, i); __define_vectors(uint32, ui); __define_vectors(uint8, uc); __define_vectors(float, f); __define_vectors(int64, l); __define_vectors(uint64, ul); #undef __define_vectors #else template struct vec_t { }; template struct vec_t { using scalar_t = T; vec_t() = default; vec_t(scalar_t s) : x(s), y(s) {} vec_t(scalar_t x, scalar_t y) : x(x), y(y) {} T x, y; }; template struct vec_t { using scalar_t = T; vec_t() = default; vec_t(scalar_t s) : x(s), y(s), z(s) {} vec_t(scalar_t x, scalar_t y, scalar_t z) : x(x), y(y), z(z) {} T x, y, z; }; template struct vec_t { using scalar_t = T; vec_t() = default; vec_t(scalar_t s) : x(s), y(s), z(s), w(s) {} vec_t(scalar_t x, scalar_t y, scalar_t z, scalar_t w) : x(x), y(y), z(z), w(w) {} T x, y, z, w; }; // vec2 variants typedef vec_t vec2uc; typedef vec_t vec2ui; typedef vec_t vec2i; typedef vec_t vec2l; typedef vec_t vec2ul; typedef vec_t vec2f; // vec3 variants typedef vec_t vec3uc; typedef vec_t vec3ui; typedef vec_t vec3i; typedef vec_t vec3l; typedef vec_t vec3ul; typedef vec_t vec3f; // vec4 variants typedef vec_t vec4uc; typedef vec_t vec4ui; typedef vec_t vec4i; typedef vec_t vec4l; typedef vec_t vec4ul; typedef vec_t vec4f; #endif // ============================================================================ /* defines all constructors "make_vec2[T]" for 2-vector type */ #define __define_ispc_constructors2(univary, abb, itype, iabb) \ inline univary vec2##abb make_vec2##abb( \ const univary itype x, const univary itype y) \ { \ univary vec2##abb ret; \ ret.x = x; \ ret.y = y; \ return ret; \ } \ inline univary vec2##abb make_vec2##abb(const univary itype x) \ { \ univary vec2##abb ret; \ ret.x = x; \ ret.y = x; \ return ret; \ } /* defines all constructors "make_vec3[T]" for 3-vector type */ #define __define_ispc_constructors3(univary, abb, itype, iabb) \ inline univary vec3##abb make_vec3##abb(const univary itype x) \ { \ univary vec3##abb ret; \ ret.x = x; \ ret.y = x; \ ret.z = x; \ return ret; \ } \ inline univary vec3##abb make_vec3##abb(const univary vec3##iabb v) \ { \ univary vec3##abb ret; \ ret.x = v.x; \ ret.y = v.y; \ ret.z = v.z; \ return ret; \ } \ inline univary vec3##abb make_vec3##abb( \ const univary itype x, const univary itype y, const univary itype z) \ { \ univary vec3##abb ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ return ret; \ } \ inline univary vec3##abb make_vec3##abb(const univary vec4##iabb 
v) \ { \ univary vec3##abb ret; \ ret.x = v.x; \ ret.y = v.y; \ ret.z = v.z; \ return ret; \ } /* defines all constructors "make_vec4[T]" for 4-vector type */ #define __define_ispc_constructors4(univary, abb, itype, iabb) \ /* construct vec4 from a single scalar */ \ inline univary vec4##abb make_vec4##abb(const univary itype f) \ { \ univary vec4##abb ret; \ ret.x = f; \ ret.y = f; \ ret.z = f; \ ret.w = f; \ return ret; \ } \ /* construct vec4 from 4 scalars */ \ inline univary vec4##abb make_vec4##abb(const univary itype x, \ const univary itype y, \ const univary itype z, \ const univary itype w) \ { \ univary vec4##abb ret; \ ret.x = x; \ ret.y = y; \ ret.z = z; \ ret.w = w; \ return ret; \ } \ /* construct vec4 from another vec4 (of another type) */ \ inline univary vec4##abb make_vec4##abb(const univary vec4##iabb v) \ { \ univary vec4##abb ret; \ ret.x = v.x; \ ret.y = v.y; \ ret.z = v.z; \ ret.w = v.w; \ return ret; \ } #define __define_ispc_lift_constructors4(univary, type, abb) \ /* lift vec4 from vec3; fill in with 0es */ \ inline univary vec4##abb make_vec4##abb(const univary vec3##abb v) \ { \ univary vec4##abb ret; \ ret.x = (type)v.x; \ ret.y = (type)v.y; \ ret.z = (type)v.z; \ ret.w = (type)0; \ return ret; \ } #define __define_ispc_constructors_uv_t(univary, oabb, itype, iabb) \ __define_ispc_constructors2(univary, oabb, itype, iabb); \ __define_ispc_constructors3(univary, oabb, itype, iabb); \ __define_ispc_constructors4(univary, oabb, itype, iabb) #define __define_ispc_constructors_uv(univary, type, abb) \ __define_ispc_constructors_uv_t(univary, abb, int32, i); \ __define_ispc_constructors_uv_t(univary, abb, uint32, ui); \ __define_ispc_constructors_uv_t(univary, abb, uint8, uc); \ __define_ispc_constructors_uv_t(univary, abb, float, f); \ __define_ispc_lift_constructors4(univary, type, abb) #define __define_ispc_constructors(univary) \ __define_ispc_constructors_uv(univary, int32, i); \ __define_ispc_constructors_uv(univary, uint32, ui); \ __define_ispc_constructors_uv(univary, uint8, uc); \ __define_ispc_constructors_uv(univary, float, f) #ifdef ISPC __define_ispc_constructors(uniform); __define_ispc_constructors(varying); #else __define_ispc_constructors(); #endif #undef __define_ispc_constructors2 #undef __define_ispc_constructors3 #undef __define_ispc_constructors4 #undef __define_ispc_lift_constructors4 #undef __define_ispc_constructors_uv #undef __define_ispc_constructors // ============================================================================ // define 'lifted' binary operators (min/max/...) 
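// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header; the function
// name is hypothetical): how the make_vecN* constructor overloads generated
// above are typically called from the C++ (non-ISPC) compilation path.
// ---------------------------------------------------------------------------
#ifndef ISPC
inline vec4f example_make_vec_usage()
{
  const vec3f color = make_vec3f(0.2f, 0.5f, 0.8f); // from three scalars
  const vec3i icol = make_vec3i(color); // element-type conversion
  const vec3f back = make_vec3f(icol); // and back again
  return make_vec4f(back); // lift vec3 -> vec4, w is filled with 0
}
#endif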
#define __define_binary_fct_dims(univary_r, univary_a, univary_b, fct, abb) \ inline univary_r vec2##abb fct( \ const univary_a vec2##abb a, const univary_b vec2##abb b) \ { \ return make_vec2##abb(fct(a.x, b.x), fct(a.y, b.y)); \ } \ inline univary_r vec3##abb fct( \ const univary_a vec3##abb a, const univary_b vec3##abb b) \ { \ return make_vec3##abb(fct(a.x, b.x), fct(a.y, b.y), fct(a.z, b.z)); \ } \ inline univary_r vec4##abb fct( \ const univary_a vec4##abb a, const univary_b vec4##abb b) \ { \ return make_vec4##abb( \ fct(a.x, b.x), fct(a.y, b.y), fct(a.z, b.z), fct(a.w, b.w)); \ } #define __define_binary_fct_types(univary_r, univary_a, univary_b, fct) \ __define_binary_fct_dims(univary_r, univary_a, univary_b, fct, f); \ __define_binary_fct_dims(univary_r, univary_a, univary_b, fct, i); \ __define_binary_fct_dims(univary_r, univary_a, univary_b, fct, ui) #define __define_binary_fct(univary_r, univary_a, univary_b) \ __define_binary_fct_types(univary_r, univary_a, univary_b, min); \ __define_binary_fct_types(univary_r, univary_a, univary_b, max) #ifdef ISPC __define_binary_fct(uniform, uniform, uniform); __define_binary_fct(varying, varying, varying); __define_binary_fct(varying, varying, uniform); __define_binary_fct(varying, uniform, varying); #else __define_binary_fct(, , ); #endif #undef __define_binary_fct #undef __define_binary_fct_types #undef __define_binary_fct_dims // ============================================================================ #define __define_binary_operator_dims(uv, opname, op, abb, type) \ /* vec2##abb */ \ inline uv vec2##abb opname(const uv vec2##abb a, const uv vec2##abb b) \ { \ return make_vec2##abb(a.x op b.x, a.y op b.y); \ } \ inline uv vec2##abb opname(const uv vec2##abb a, const uv type b) \ { \ return make_vec2##abb(a.x op b, a.y op b); \ } \ inline uv vec2##abb opname(const uv type a, const uv vec2##abb b) \ { \ return make_vec2##abb(a op b.x, a op b.y); \ } \ /* vec3##abb */ \ inline uv vec3##abb opname(const uv vec3##abb a, const uv vec3##abb b) \ { \ return make_vec3##abb(a.x op b.x, a.y op b.y, a.z op b.z); \ } \ inline uv vec3##abb opname(const uv vec3##abb a, const uv type b) \ { \ return make_vec3##abb(a.x op b, a.y op b, a.z op b); \ } \ inline uv vec3##abb opname(const uv type a, const uv vec3##abb b) \ { \ return make_vec3##abb(a op b.x, a op b.y, a op b.z); \ } \ /* vec4##abb */ \ inline uv vec4##abb opname(const uv vec4##abb a, const uv vec4##abb b) \ { \ return make_vec4##abb(a.x op b.x, a.y op b.y, a.z op b.z, a.w op b.w); \ } \ inline uv vec4##abb opname(const uv vec4##abb a, const uv type b) \ { \ return make_vec4##abb(a.x op b, a.y op b, a.z op b, a.w op b); \ } \ inline uv vec4##abb opname(const uv type a, const uv vec4##abb b) \ { \ return make_vec4##abb(a op b.x, a op b.y, a op b.z, a op b.w); \ } #define __define_binary_operator_types(uv, opname, op) \ __define_binary_operator_dims(uv, opname, op, f, float); \ __define_binary_operator_dims(uv, opname, op, i, int32); \ __define_binary_operator_dims(uv, opname, op, ui, uint32) // define 'regular' operators #define __define_binary_operator(uv) \ __define_binary_operator_types(uv, operator+, +); \ __define_binary_operator_types(uv, operator-, -); \ __define_binary_operator_types(uv, operator*, *); \ __define_binary_operator_types(uv, operator/, /) #ifdef ISPC __define_binary_operator(uniform); __define_binary_operator(varying); #else __define_binary_operator(); #endif #undef __define_binary_operator #undef __define_binary_operator_types #undef __define_binary_operator_dims // 
============================================================================ #define __define_comp_fn(univary) \ inline univary bool eq(const univary vec2f a, const univary vec2f b) \ { \ return a.x == b.x && a.y == b.y; \ } \ inline univary bool eq(const univary vec3f a, const univary float b) \ { \ return a.x == b && a.y == b && a.z == b; \ } \ inline univary bool eq(const univary vec3f a, const univary vec3f b) \ { \ return a.x == b.x && a.y == b.y && a.z == b.z; \ } \ inline univary bool ne(const univary vec2f a, const univary vec2f b) \ { \ return !eq(a, b); \ } \ inline univary bool ne(const univary vec3f a, const univary float b) \ { \ return !eq(a, b); \ } \ inline univary bool ne(const univary vec3f a, const univary vec3f b) \ { \ return !eq(a, b); \ } \ inline univary vec3f neg(const univary vec3f v) \ { \ return make_vec3f(-v.x, -v.y, -v.z); \ } #ifdef ISPC __define_comp_fn(uniform); __define_comp_fn(varying); #else __define_comp_fn(); #endif #undef __define_comp_fn // ------------------------------------------------------------------ // anyLessThan() // ------------------------------------------------------------------ #define __define_anyLessThan(univary, abb) \ inline univary bool anyLessThan( \ const univary vec2##abb &a, const univary vec2##abb &b) \ { \ return ISPC_OR(a.x < b.x, a.y < b.y); \ } \ inline univary bool anyLessThan( \ const univary vec3##abb &a, const univary vec3##abb &b) \ { \ return ISPC_OR(a.x < b.x, ISPC_OR(a.y < b.y, a.z < b.z)); \ } #define __define_anyLessThan_type(univary) \ __define_anyLessThan(univary, f); \ __define_anyLessThan(univary, i); #ifdef ISPC __define_anyLessThan_type(uniform); __define_anyLessThan_type(varying); #else __define_anyLessThan_type(); #endif #undef __define_anyLessThan_type #undef __define_anyLessThan // ------------------------------------------------------------------ // dot product // ------------------------------------------------------------------ #define __define_dot_product(univary) \ /*! 
computes 3D dot product for vec3fs */ \ inline univary float dot(const univary vec3f a, const univary vec3f b) \ { \ return a.x * b.x + a.y * b.y + a.z * b.z; \ } \ inline univary float length(const univary vec3f a) \ { \ return sqrtf(dot(a, a)); \ } \ inline univary float distance(const univary vec3f a, const univary vec3f b) \ { \ return length(a - b); \ } #ifdef ISPC __define_dot_product(uniform); __define_dot_product(varying); #else __define_dot_product(); #endif #undef __define_dot_product // ------------------------------------------------------------------ // cross product // ------------------------------------------------------------------ #define __define_cross(univary_r, univary_a, univary_b) \ inline univary_r vec3f cross( \ const univary_a vec3f &a, const univary_b vec3f &b) \ { \ return make_vec3f( \ a.y * b.z - a.z * b.y, a.z * b.x - a.x * b.z, a.x * b.y - a.y * b.x); \ } #ifdef ISPC __define_cross(uniform, uniform, uniform); __define_cross(varying, varying, varying); __define_cross(varying, varying, uniform); __define_cross(varying, uniform, varying); #else __define_cross(, , ); #endif #undef __define_cross // ------------------------------------------------------------------ // rotate // ------------------------------------------------------------------ #ifdef ISPC /* rotates vector around axis for *all-uniform* vec3fs */ inline uniform vec3f rotate( const uniform vec3f &v, const uniform vec3f &axis, uniform float theta) { return v * cos(theta) + cross(axis, v) * sin(theta) + axis * dot(axis, v) * (1.f - cos(theta)); } #endif /* rotates vector around axis for vec3fs that produce varying results */ inline vec3f rotate(const vec3f &v, const vec3f &axis, float theta) { return v * cos(theta) + cross(axis, v) * sin(theta) + axis * dot(axis, v) * (1.f - cos(theta)); } // ------------------------------------------------------------------ // normalize // ------------------------------------------------------------------ #ifdef ISPC /* compute and return normalized version of uniform vec3f passed to this fct */ inline uniform vec3f normalize(const uniform vec3f &v) { return v * (1.f / sqrt(dot(v, v))); } #endif /* compute and return normalized version of varying vec3f passed to this fct */ inline vec3f normalize(const vec3f v) { return v * (1.f / sqrt(dot(v, v))); } /* compute and return normalized version of varying vec3f passed to this fct */ inline vec3f normalize(const vec3f v, float &len) { len = sqrtf(dot(v, v)); return v * rcpf(len); } inline vec3f safe_normalize(const vec3f v) { return v * (1.f / sqrtf(max(flt_min, dot(v, v)))); } /* differentiated normalization */ inline vec3f dnormalize(const vec3f &p, const vec3f &dp) { const float pp = dot(p, p); const float pdp = dot(p, dp); return (pp * dp - pdp * p) * rcp(pp) * rsqrt(pp); } // ------------------------------------------------------------------ // unary functions // ------------------------------------------------------------------ #define __define_unary_fct_dims(univary, fct) \ inline univary vec2f fct(const univary vec2f v) \ { \ return make_vec2f(fct(v.x), fct(v.y)); \ } \ inline univary vec3f fct(const univary vec3f v) \ { \ return make_vec3f(fct(v.x), fct(v.y), fct(v.z)); \ } \ inline univary vec4f fct(const univary vec4f v) \ { \ return make_vec4f(fct(v.x), fct(v.y), fct(v.z), fct(v.w)); \ } #define __define_unary_fct(univary) \ __define_unary_fct_dims(univary, abs); \ __define_unary_fct_dims(univary, absf); \ __define_unary_fct_dims(univary, rcpf); \ __define_unary_fct_dims(univary, expf); \ 
__define_unary_fct_dims(univary, logf); \ __define_unary_fct_dims(univary, floor); \ __define_unary_fct_dims(univary, divide_safe); \ __define_unary_fct_dims(univary, rcp); \ __define_unary_fct_dims(univary, rcp_safe); \ __define_unary_fct_dims(univary, exp); \ __define_unary_fct_dims(univary, frac); \ __define_unary_fct_dims(univary, sqr); \ __define_unary_fct_dims(univary, sqrt); \ __define_unary_fct_dims(univary, sqrt_safe) #ifdef ISPC __define_unary_fct(uniform); __define_unary_fct(varying); #else __define_unary_fct(); #endif #undef __define_unary_fct #undef __define_unary_fct_dims // ------------------------------------------------------------------ // lerp // ------------------------------------------------------------------ #define __define_lerp_fn(univary, abb) \ inline univary vec2##abb lerp(univary float factor, \ const univary vec2##abb a, \ const univary vec2##abb b) \ { \ return make_vec2##abb(lerp(factor, a.x, b.x), lerp(factor, a.y, b.y)); \ } \ inline univary vec2##abb lerp(univary vec2f factor, \ const univary vec2##abb a, \ const univary vec2##abb b) \ { \ return make_vec2##abb(lerp(factor.x, a.x, b.x), lerp(factor.y, a.y, b.y)); \ } \ inline univary vec3##abb lerp(univary float factor, \ const univary vec3##abb a, \ const univary vec3##abb b) \ { \ return make_vec3##abb(lerp(factor, a.x, b.x), \ lerp(factor, a.y, b.y), \ lerp(factor, a.z, b.z)); \ } \ inline univary vec3##abb lerp(univary vec3f factor, \ const univary vec3##abb a, \ const univary vec3##abb b) \ { \ return make_vec3##abb(lerp(factor.x, a.x, b.x), \ lerp(factor.y, a.y, b.y), \ lerp(factor.z, a.z, b.z)); \ } \ inline univary vec4##abb lerp(univary float factor, \ const univary vec4##abb a, \ const univary vec4##abb b) \ { \ return make_vec4##abb(lerp(factor, a.x, b.x), \ lerp(factor, a.y, b.y), \ lerp(factor, a.z, b.z), \ lerp(factor, a.w, b.w)); \ } \ inline univary vec4##abb lerp(univary vec4f factor, \ const univary vec4##abb a, \ const univary vec4##abb b) \ { \ return make_vec4##abb(lerp(factor.x, a.x, b.x), \ lerp(factor.y, a.y, b.y), \ lerp(factor.z, a.z, b.z), \ lerp(factor.w, a.w, b.w)); \ } #define __define_lerp_type(univary) \ __define_lerp_fn(univary, f); \ __define_lerp_fn(univary, i); \ __define_lerp_fn(univary, ui); \ __define_lerp_fn(univary, uc) #ifdef ISPC __define_lerp_type(varying); __define_lerp_type(uniform); #else __define_lerp_type(); #endif #undef __define_lerp_type #undef __define_lerp_fn // ------------------------------------------------------------------ // interpolate // ------------------------------------------------------------------ #define __define_interpolate_fn(univary, type) \ inline type interpolate(const vec3f &f, \ const univary type a, \ const univary type b, \ const univary type c) \ { \ return f.x * a + f.y * b + f.z * c; \ } #define __define_interpolate_type(univary) \ __define_interpolate_fn(univary, vec2f); \ __define_interpolate_fn(univary, vec3f); \ __define_interpolate_fn(univary, vec4f); #ifdef ISPC __define_interpolate_type(varying); __define_interpolate_type(uniform); #else __define_interpolate_type(); #endif #undef __define_interpolate_type #undef __define_interpolate_fn // ------------------------------------------------------------------ // clamp // ------------------------------------------------------------------ inline vec3f clamp(const vec3f &a) { return (make_vec3f(clamp(a.x), clamp(a.y), clamp(a.z))); } #define __define_clamp_dims(univary_v, univary_l, abb) \ inline univary_v vec2##abb clamp(const univary_v vec2##abb &a, \ const univary_l 
vec2##abb &b, \ const univary_l vec2##abb &c) \ { \ return (make_vec2##abb(clamp(a.x, b.x, c.x), clamp(a.y, b.y, c.y))); \ } \ inline univary_v vec3##abb clamp(const univary_v vec3##abb &a, \ const univary_l vec3##abb &b, \ const univary_l vec3##abb &c) \ { \ return (make_vec3##abb( \ clamp(a.x, b.x, c.x), clamp(a.y, b.y, c.y), clamp(a.z, b.z, c.z))); \ } #define __define_clamp_types(univary_v, univary_l) \ __define_clamp_dims(univary_v, univary_l, f); \ __define_clamp_dims(univary_v, univary_l, i) #ifdef ISPC __define_clamp_types(varying, varying); __define_clamp_types(uniform, uniform); __define_clamp_types(varying, uniform); #else __define_clamp_types(, ); #endif #undef __define_clamp_types #undef __define_clamp_dims #define __define_reduce_op_dims(univary, op, abb, type) \ inline univary type reduce_##op(const univary vec3##abb &a) \ { \ return op(op(a.x, a.y), a.z); \ } \ inline univary type reduce_##op(const univary vec4##abb &a) \ { \ return op(op(a.x, a.y), op(a.z, a.w)); \ } #define __define_reduce_op_types(univary, op) \ __define_reduce_op_dims(univary, op, i, int) \ __define_reduce_op_dims(univary, op, f, float) #define __define_reduce_op(univary) \ __define_reduce_op_types(univary, min) __define_reduce_op_types(univary, max) #ifdef ISPC __define_reduce_op(varying); __define_reduce_op(uniform); #else __define_reduce_op(); #endif #undef __define_reduce_op #undef __define_reduce_op_types #undef __define_reduce_op_dims // ------------------------------------------------------------------ // other // ------------------------------------------------------------------ #define __define_other(univary) \ inline univary vec4f make_vec4f( \ const univary vec3f rgb, const univary float a) \ { \ return make_vec4f(rgb.x, rgb.y, rgb.z, a); \ } \ inline univary vec3f to_float(const univary vec3i &a) \ { \ return make_vec3f(a); \ } \ inline univary vec3i to_int(const univary vec3f &a) \ { \ return make_vec3i(a); \ } \ inline univary vec3i operator>>(const univary vec3i &a, const univary int b) \ { \ return (make_vec3i(a.x >> b, a.y >> b, a.z >> b)); \ } \ inline univary vec3i operator<<(const univary vec3i &a, const univary int b) \ { \ return (make_vec3i(a.x << b, a.y << b, a.z << b)); \ } \ inline univary vec3i bitwise_AND( \ const univary vec3i &a, const univary int b) \ { \ return (make_vec3i(a.x & b, a.y & b, a.z & b)); \ } \ inline univary vec3f powf(const univary vec3f v, const univary float f) \ { \ return make_vec3f(powf(v.x, f), powf(v.y, f), powf(v.z, f)); \ } \ inline univary float reduce_mul(const univary vec3f &a) \ { \ return a.x * a.y * a.z; \ } \ inline univary float reduce_add(const univary vec3f &a) \ { \ return a.x + a.y + a.z; \ } \ inline univary float reduce_add(const univary vec4f &a) \ { \ return (a.x + a.y) + (a.z + a.w); \ } \ inline univary float reduce_avg(const univary vec3f &a) \ { \ return reduce_add(a) * (1.0f / 3.0f); \ } \ inline univary float luminance(const univary vec3f &c) \ { \ return 0.212671f * c.x + 0.715160f * c.y + 0.072169f * c.z; \ } \ inline univary bool isnan(const univary vec3f v) \ { \ return isnan(v.x + v.y + v.z); \ } #ifdef ISPC __define_other(varying); __define_other(uniform); #else __define_other(); #endif #undef __define_other // The next machine representable number from 'a' in the direction of 'b' inline ISPC_UNIFORM vec3f nextafter( const ISPC_UNIFORM vec3i &a, const ISPC_UNIFORM vec3i &b) { return (make_vec3f( nextafter(a.x, b.x), nextafter(a.y, b.y), nextafter(a.z, b.z))); } inline vec2i make_vec2i(const vec2f &a) { return 
make_vec2i((int)a.x, (int)a.y); } inline vec2i to_int(const vec2f &a) { return make_vec2i(a); } inline vec2f to_float_unorm(const vec2ui &a) { return make_vec2f(to_float_unorm(a.x), to_float_unorm(a.y)); } inline vec3f to_float_unorm(const vec3ui &a) { return make_vec3f( to_float_unorm(a.x), to_float_unorm(a.y), to_float_unorm(a.z)); } inline vec3f floatbits(const vec3i &a) { return make_vec3f(floatbits(a.x), floatbits(a.y), floatbits(a.z)); } inline vec3ui intbits(const vec3f &a) { return make_vec3ui(intbits(a.x), intbits(a.y), intbits(a.z)); } inline vec3f pow(const vec3f &a, const float b) { return make_vec3f(pow(a.x, b), pow(a.y, b), pow(a.z, b)); } inline vec4f pow(const vec4f &a, const float b) { return make_vec4f(pow(a.x, b), pow(a.y, b), pow(a.z, b), pow(a.w, b)); } // ------------------------------------------------------- // float / int conversion functions // ------------------------------------------------------- /* convert float-color into rgba-uint format, i.e. normalized fixed-point * round to nearest, see "2.3.5 Fixed-Point Data Conversions" of OpenGL 4.6 */ inline uint32 cvt_uint32(const float f) { return (uint32)roundf(255.f * clamp(f, 0.f, 1.f)); } inline uint32 cvt_uint32(const vec4f &v) { return (cvt_uint32(v.x) << 0) | (cvt_uint32(v.y) << 8) | (cvt_uint32(v.z) << 16) | (cvt_uint32(v.w) << 24); } inline uint32 cvt_uint32(const vec3f &v) { return (cvt_uint32(v.x) << 0) | (cvt_uint32(v.y) << 8) | (cvt_uint32(v.z) << 16); } // ------------------------------------------------------- // sRGB conversion functions // ------------------------------------------------------- #define APPROXIMATE_SRGB inline float linear_to_srgb(const float f) { const float c = max(f, 0.f); #ifdef APPROXIMATE_SRGB return pow(c, 1.f / 2.2f); #else return c <= 0.0031308f ? 12.92f * c : pow(c, 1.f / 2.4f) * 1.055f - 0.055f; #endif } inline vec4f linear_to_srgba(const vec4f c) { return make_vec4f(linear_to_srgb(c.x), linear_to_srgb(c.y), linear_to_srgb(c.z), max(c.w, 0.f)); // alpha is never gamma-corrected } inline uint32 linear_to_srgba8(const vec4f c) { #if 1 return cvt_uint32(linear_to_srgba(c)); #else // TODO use ISPC's float_to_srgb8 once it is fixed (issue #1198) return (float_to_srgb8(c.x) << 0) | (float_to_srgb8(c.y) << 8) | (float_to_srgb8(c.z) << 16) | ((uint32)clamp(c.w, 0.f, 1.f) << 24); // alpha is never gamma-corrected #endif } inline float srgb_to_linear(const float f) { const float c = max(f, 0.f); #ifdef APPROXIMATE_SRGB return pow(c, 2.2f); #else return c <= 0.04045f ? c / 12.92f : pow((c + 0.055f) / 1.055f, 2.4f); #endif } inline vec4f srgba_to_linear(const vec4f c) { return make_vec4f(srgb_to_linear(c.x), srgb_to_linear(c.y), srgb_to_linear(c.z), max(c.w, 0.f)); // alpha is never gamma-corrected } // TODO implement srgba8_to_linear with a 256 entry LUT #undef APPROXIMATE_SRGB #ifndef ISPC } #endif ospray-rkcommon-538f8a2/rkcommon/memory/000077500000000000000000000000001456117377200203665ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/memory/DeletedUniquePtr.h000066400000000000000000000012031456117377200237560ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include namespace rkcommon { namespace memory { template using DeletedUniquePtr = std::unique_ptr>; template inline DeletedUniquePtr make_deleted_unique(DELETE_FCN &&deleter, Args &&... 
args) { return DeletedUniquePtr(new T(std::forward(args)...), deleter); } } // namespace memory } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/memory/IntrusivePtr.h000066400000000000000000000106561456117377200232250ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include namespace rkcommon { namespace memory { class RefCountedObject { public: RefCountedObject() = default; virtual ~RefCountedObject() = default; RefCountedObject(const RefCountedObject &) = delete; RefCountedObject &operator=(const RefCountedObject &) = delete; RefCountedObject(RefCountedObject &&) = delete; RefCountedObject &operator=(RefCountedObject &&) = delete; void refInc() const; void refDec() const; long long useCount() const; private: mutable std::atomic refCounter{1}; }; // Inlined definitions // inline void RefCountedObject::refInc() const { refCounter++; } inline void RefCountedObject::refDec() const { if ((--refCounter) == 0) delete this; } inline long long RefCountedObject::useCount() const { return refCounter.load(); } /////////////////////////////////////////////////////////////////////////// // Pointer to a RefCountedObject //////////////////////////////////////// /////////////////////////////////////////////////////////////////////////// template class IntrusivePtr { static_assert(std::is_base_of::value, "IntrusivePtr can only be used with objects derived " "from RefCountedObject"); public: T *ptr{nullptr}; IntrusivePtr() = default; ~IntrusivePtr(); IntrusivePtr(const IntrusivePtr &input); IntrusivePtr(IntrusivePtr &&input); template IntrusivePtr(const IntrusivePtr &input); IntrusivePtr(T *const input); IntrusivePtr &operator=(const IntrusivePtr &input); IntrusivePtr &operator=(IntrusivePtr &&input); IntrusivePtr &operator=(T *input); operator bool() const; T &operator*() const; T *operator->() const; }; // Inlined definitions // template inline IntrusivePtr::~IntrusivePtr() { if (ptr) ptr->refDec(); } template inline IntrusivePtr::IntrusivePtr(const IntrusivePtr &input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } template inline IntrusivePtr::IntrusivePtr(IntrusivePtr &&input) : ptr(input.ptr) { input.ptr = nullptr; } template template inline IntrusivePtr::IntrusivePtr(const IntrusivePtr &input) : ptr(input.ptr) { if (ptr) ptr->refInc(); } template inline IntrusivePtr::IntrusivePtr(T *const input) : ptr(input) { if (ptr) ptr->refInc(); } template inline IntrusivePtr &IntrusivePtr::operator=( const IntrusivePtr &input) { if (input.ptr) input.ptr->refInc(); if (ptr) ptr->refDec(); ptr = input.ptr; return *this; } template inline IntrusivePtr &IntrusivePtr::operator=(IntrusivePtr &&input) { if (ptr) ptr->refDec(); ptr = input.ptr; input.ptr = nullptr; return *this; } template inline IntrusivePtr &IntrusivePtr::operator=(T *input) { if (input) input->refInc(); if (ptr) ptr->refDec(); ptr = input; return *this; } template inline IntrusivePtr::operator bool() const { return ptr != nullptr; } template inline T &IntrusivePtr::operator*() const { return *ptr; } template inline T *IntrusivePtr::operator->() const { return ptr; } // Inlined operators ////////////////////////////////////////////////////// template inline bool operator<(const IntrusivePtr &a, const IntrusivePtr &b) { return a.ptr < b.ptr; } template bool operator==(const IntrusivePtr &a, const IntrusivePtr &b) { return a.ptr == b.ptr; } template bool operator!=(const IntrusivePtr &a, const IntrusivePtr &b) { return a.ptr != b.ptr; } } // namespace memory } 
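// ---------------------------------------------------------------------------
// Illustrative usage sketch (not part of the original header; 'Node' and
// 'useIntrusivePtr' are hypothetical names): a type opts into intrusive
// reference counting by deriving from RefCountedObject, and IntrusivePtr then
// drives refInc()/refDec() automatically on copy, assignment and destruction.
// ---------------------------------------------------------------------------
namespace example {

struct Node : public rkcommon::memory::RefCountedObject
{
  int value{0};
};

inline int useIntrusivePtr()
{
  // a freshly new'ed object starts with a reference count of 1; wrapping the
  // raw pointer bumps it to 2, so drop the extra reference explicitly
  rkcommon::memory::IntrusivePtr<Node> a(new Node);
  a->refDec();

  rkcommon::memory::IntrusivePtr<Node> b = a; // shared ownership, count is 2
  b->value = 42;
  return a->value; // object is deleted once the last IntrusivePtr is gone
}

} // namespace example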
// namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/memory/RefCount.h000066400000000000000000000005471456117377200222720ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "./IntrusivePtr.h" namespace rkcommon { namespace memory { // Type aliases for backward compatibility template using Ref = IntrusivePtr; using RefCount = RefCountedObject; } // namespace memory } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/memory/malloc.cpp000066400000000000000000000017061456117377200223450ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "malloc.h" #if defined(RKCOMMON_TASKING_TBB) #define __TBB_NO_IMPLICIT_LINKAGE 1 #define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 #include "tbb/scalable_allocator.h" #else #ifdef _WIN32 #include #else #include #endif #endif namespace rkcommon { namespace memory { void *alignedMalloc(size_t size, size_t align) { assert((align & (align - 1)) == 0); #if defined(RKCOMMON_TASKING_TBB) return scalable_aligned_malloc(size, align); #else #ifdef _WIN32 return _aligned_malloc(size, align); #else // __UNIX__ return _mm_malloc(size, align); #endif #endif } void alignedFree(void *ptr) { #if defined(RKCOMMON_TASKING_TBB) scalable_aligned_free(ptr); #else #ifdef _WIN32 return _aligned_free(ptr); #else // __UNIX__ _mm_free(ptr); #endif #endif } } // namespace memory } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/memory/malloc.h000066400000000000000000000022401456117377200220040ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" namespace rkcommon { namespace memory { #define ALIGN_PTR(ptr, alignment) \ ((((size_t)ptr) + alignment - 1) & ((size_t) - (ssize_t)alignment)) /*! 
aligned allocation */ RKCOMMON_INTERFACE void *alignedMalloc(size_t size, size_t align = 64); RKCOMMON_INTERFACE void alignedFree(void *ptr); template __forceinline T *alignedMalloc(size_t nElements, size_t align = 64) { return (T *)alignedMalloc(nElements * sizeof(T), align); } inline bool isAligned(void *ptr, int alignment = 64) { return reinterpret_cast(ptr) % alignment == 0; } // NOTE(jda) - can't use function wrapped alloca solution as Clang won't // inline a function containing alloca()...but works w/ gcc+icc #if 0 template __forceinline T* stackBuffer(size_t nElements) { return static_cast(alloca(sizeof(T) * nElements)); } #else #define STACK_BUFFER(TYPE, nElements) (TYPE *)alloca(sizeof(TYPE) * nElements) #endif } // namespace memory } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/networking/000077500000000000000000000000001456117377200212455ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/networking/DataStreaming.cpp000066400000000000000000000045161456117377200245020ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "DataStreaming.h" #include "../common.h" #include namespace rkcommon { namespace networking { BufferWriter::BufferWriter() : buffer(std::make_shared>()) { } void BufferWriter::write(const void *mem, size_t size) { const size_t bsize = buffer->size(); buffer->resize(buffer->size() + size, 0); if (mem && size > 0) std::memcpy(buffer->begin() + bsize, mem, size); } BufferReader::BufferReader( const std::shared_ptr> &buf) : buffer(buf) { } void BufferReader::read(void *mem, size_t size) { if (cursor + size > buffer->size()) throw std::runtime_error("Attempt to read past end of BufferReader!"); if (mem && size > 0) std::memcpy(mem, buffer->begin() + cursor, size); cursor += size; } bool BufferReader::end() { return cursor >= buffer->size(); } void WriteSizeCalculator::write(const void *, size_t size) { writtenSize += size; } FixedBufferWriter::FixedBufferWriter(size_t size) : buffer(std::make_shared>(size)) { } void FixedBufferWriter::write(const void *mem, size_t size) { if (cursor + size >= buffer->size()) { throw std::runtime_error( "FixedBufferWriter::write size exceeds buffer"); } if (mem && size > 0) std::memcpy(buffer->begin() + cursor, mem, size); cursor += size; } void *FixedBufferWriter::reserve(size_t size) { if (cursor + size >= buffer->size()) { throw std::runtime_error( "FixedBufferWriter::reserve size exceeds buffer"); } void *mem = buffer->begin() + cursor; cursor += size; return mem; } std::shared_ptr::View> FixedBufferWriter::getWrittenView() { return std::make_shared::View>( buffer, 0, cursor); } size_t FixedBufferWriter::available() const { return buffer->size() - cursor; } size_t FixedBufferWriter::capacity() const { return buffer->size(); } } // namespace networking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/networking/DataStreaming.h000066400000000000000000000130051456117377200241400ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "../utility/AbstractArray.h" #include "../utility/ArrayView.h" #include "../utility/FixedArray.h" #include "../utility/FixedArrayView.h" #include "../utility/OwnedArray.h" #include namespace rkcommon { namespace networking { /*! 
abstraction of an object that we can serailize/write (raw) data into */ struct RKCOMMON_INTERFACE WriteStream { virtual ~WriteStream() = default; virtual void write(const void *mem, size_t size) = 0; virtual void flush() {} }; /*! abstraction of an object that we can read (raw) data from to then de-serialize into work objects */ struct RKCOMMON_INTERFACE ReadStream { virtual ~ReadStream() = default; virtual void read(void *mem, size_t size) = 0; virtual bool end() = 0; }; struct RKCOMMON_INTERFACE BufferWriter : WriteStream { BufferWriter(); void write(const void *mem, size_t size) override; std::shared_ptr> buffer; }; struct RKCOMMON_INTERFACE BufferReader : ReadStream { BufferReader(const std::shared_ptr> &buf); void read(void *mem, size_t size) override; /* Get a view of the buffer at the current cursor with the desired number of * elements. This creates a view, not a copy of the data, so the underlying * buffer must be kept valid while the view is in use. The cursor will be * advanced to the data following this view */ template std::shared_ptr> getView(size_t count); bool end() override; size_t cursor = 0; const std::shared_ptr> buffer; }; /*! Utility which behaves as a write stream, but just computes the number of * bytes which have been written to it */ struct RKCOMMON_INTERFACE WriteSizeCalculator : public WriteStream { void write(const void *mem, size_t size) override; size_t writtenSize = 0; }; /*! Buffer writer for writing to a fixed size output buffer. The cursor * points to the next location to write at. Trying to write more than the * fixed buffer's size will throw an error */ struct RKCOMMON_INTERFACE FixedBufferWriter : public WriteStream { FixedBufferWriter() = default; FixedBufferWriter(size_t size); void write(const void *mem, size_t size) override; // Reserve space in the buffer and return the pointer to the start of it void *reserve(size_t size); // Get a view of the region written so far of the buffer std::shared_ptr::View> getWrittenView(); // Get the space available to write in the buffer size_t available() const; // Get the underlying buffer size being written to size_t capacity() const; size_t cursor = 0; std::shared_ptr> buffer; }; /*! generic stream operators into/out of streams, for raw data blocks */ template inline WriteStream &operator<<(WriteStream &buf, const T &rh) { buf.write((const byte_t *)&rh, sizeof(T)); return buf; } template inline ReadStream &operator>>(ReadStream &buf, T &rh) { buf.read((byte_t *)&rh, sizeof(T)); return buf; } /*! @{ stream operators into/out of read/write streams, for std::vectors * of non-POD types*/ template inline WriteStream &operator<<(WriteStream &buf, const std::vector &rh) { const size_t sz = rh.size(); buf << sz; for (const auto &x : rh) buf << x; return buf; } template inline ReadStream &operator>>(ReadStream &buf, std::vector &rh) { size_t sz; buf >> sz; rh.resize(sz); for (size_t i = 0; i < sz; ++i) buf >> rh[i]; return buf; } /*! @} */ /*! @{ stream operators into/out of read/write streams, for AbstractArray */ template inline WriteStream &operator<<(WriteStream &buf, const utility::AbstractArray &rh) { const size_t sz = rh.size(); buf << sz; buf.write((const byte_t *)rh.data(), sizeof(T) * sz); return buf; } /*! @} */ /*! 
@{ serialize operations for strings */ inline WriteStream &operator<<(WriteStream &buf, const std::string &rh) { const size_t sz = rh.size(); buf << sz; buf.write((const void *)rh.data(), sz); return buf; } inline WriteStream &operator<<(WriteStream &buf, const char *rh) { const size_t sz = std::strlen(rh); buf << sz; buf.write((const void *)rh, sz); return buf; } inline ReadStream &operator>>(ReadStream &buf, std::string &rh) { size_t sz; buf >> sz; rh.resize(sz); buf.read((void *)rh.data(), sz); return buf; } template std::shared_ptr> BufferReader::getView(size_t count) { const size_t size = count * sizeof(T); if (cursor + size > buffer->size()) { throw std::runtime_error("Attempt to read past end of BufferReader!"); } auto view = std::make_shared>(buffer->begin() + cursor, size); cursor += size; return view; } } // namespace networking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/networking/Fabric.cpp000066400000000000000000000004201456117377200231330ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include #include #include "../common.h" #include "../utility/AbstractArray.h" #include "Fabric.h" namespace rkcommon { namespace networking { Fabric::Fabric() {} } } ospray-rkcommon-538f8a2/rkcommon/networking/Fabric.h000066400000000000000000000025621456117377200226110ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include "../common.h" #include "../utility/AbstractArray.h" namespace rkcommon { namespace networking { /*! abstraction for a physical fabric that can transmit data - sockets, mpi, etc */ struct RKCOMMON_INTERFACE Fabric { Fabric(); virtual ~Fabric() = default; // Broadcast the data to all clients on the other end of the fabric // TODO: only makes sense to call on the root rank, so maybe a separate // "send" fabric ? virtual void sendBcast( std::shared_ptr> buf) = 0; virtual void flushBcastSends() = 0; // Receive a broadcast of data from the fabric sender // TODO: only makes sense to call on the receivers, so maybe a separate // "recv" fabric ? virtual void recvBcast(utility::AbstractArray &buf) = 0; // Send data to a specific rank in the fabric (callable on any rank) virtual void send(std::shared_ptr> buf, int rank) = 0; // Receive data from a specific rank on the fabric (callable on any rank) virtual void recv(utility::AbstractArray &buf, int rank) = 0; }; } // namespace networking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/os/000077500000000000000000000000001456117377200174775ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/os/FileName.cpp000066400000000000000000000107061456117377200216670ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "FileName.h" namespace rkcommon { #ifdef _WIN32 const char path_sep = '\\'; #else const char path_sep = '/'; #endif /*! create an empty filename */ FileName::FileName() {} /*! create a valid filename from a string */ FileName::FileName(const char *in) { filename = in; for (size_t i = 0; i < filename.size(); i++) if (filename[i] == '\\' || filename[i] == '/') filename[i] = path_sep; while (!filename.empty() && filename[filename.size() - 1] == path_sep) filename.resize(filename.size() - 1); } /*! 
create a valid filename from a string */ FileName::FileName(const std::string &in) { filename = in; for (size_t i = 0; i < filename.size(); i++) if (filename[i] == '\\' || filename[i] == '/') filename[i] = path_sep; while (!filename.empty() && filename[filename.size() - 1] == path_sep) filename.resize(filename.size() - 1); } /*! returns path to home folder */ FileName FileName::homeFolder() { #ifdef _WIN32 const char *home = getenv("UserProfile"); #else const char *home = getenv("HOME"); #endif if (home) return home; return ""; } /*! returns the canonical absolute path to filename */ FileName FileName::canonical() { /* pre-C++17 implementation of std::filesystem::canonical */ char *cTemp = nullptr; #ifdef _WIN32 cTemp = _fullpath(NULL, filename.c_str(), 0); #else // POSIX cTemp = realpath(filename.c_str(), NULL); #endif rkcommon::FileName canonical(cTemp ? cTemp : ""); free(cTemp); return canonical; } /*! returns the path */ std::string FileName::path() const { size_t pos = filename.find_last_of(path_sep); if (pos == std::string::npos) return ""; return filename.substr(0, pos + 1); } /*! returns the basename */ std::string FileName::base() const { size_t pos = filename.find_last_of(path_sep); if (pos == std::string::npos) return filename; return filename.substr(pos + 1); } /*! returns the extension */ std::string FileName::ext() const { size_t pos = filename.find_last_of('.'); if (pos == std::string::npos) return ""; return filename.substr(pos + 1); } /*! returns the extension */ FileName FileName::dropExt() const { size_t pos = filename.find_last_of('.'); if (pos == std::string::npos) return filename; return filename.substr(0, pos); } /*! returns the basename without extension */ std::string FileName::name() const { size_t start = filename.find_last_of(path_sep); if (start == std::string::npos) start = 0; else start++; size_t end = filename.find_last_of('.'); if (end == std::string::npos || end < start) end = filename.size(); return filename.substr(start, end - start); } /*! replaces the extension */ FileName FileName::setExt(const std::string &ext) const { size_t start = filename.find_last_of(path_sep); if (start == std::string::npos) start = 0; else start++; size_t end = filename.find_last_of('.'); if (end == std::string::npos || end < start) return FileName(filename + ext); return FileName(filename.substr(0, end) + ext); } /*! adds the extension */ FileName FileName::addExt(const std::string &ext) const { return FileName(filename + ext); } /*! concatenates two filenames to this/other */ FileName FileName::operator+(const FileName &other) const { if (filename == "") return FileName(other); else return FileName(filename + path_sep + other.filename); } /*! concatenates two filenames to this/other */ FileName FileName::operator+(const std::string &other) const { return operator+(FileName(other)); } /*! removes the base from a filename (if possible) */ FileName FileName::operator-(const FileName &base) const { size_t pos = filename.find_first_of(base); if (pos == std::string::npos) return *this; return FileName(filename.substr(pos + 1)); } /*! == operator */ bool operator==(const FileName &a, const FileName &b) { return a.filename == b.filename; } /*! != operator */ bool operator!=(const FileName &a, const FileName &b) { return a.filename != b.filename; } /*! 
output operator */ std::ostream &operator<<(std::ostream &cout, const FileName &filename) { return cout << filename.filename; } } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/os/FileName.h000066400000000000000000000053611456117377200213350ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" namespace rkcommon { /*! Convenience class for handling file names and paths. */ class FileName { public: /*! create an empty filename */ RKCOMMON_INTERFACE FileName(); /*! create a valid filename from a string */ RKCOMMON_INTERFACE FileName(const char *filename); /*! create a valid filename from a string */ RKCOMMON_INTERFACE FileName(const std::string &filename); /*! returns path to home folder */ RKCOMMON_INTERFACE static FileName homeFolder(); /*! auto convert into a string */ RKCOMMON_INTERFACE operator std::string() const { return filename; } /*! returns a string of the filename */ RKCOMMON_INTERFACE const std::string &str() const { return filename; } /*! returns a c-string of the filename */ RKCOMMON_INTERFACE const char *c_str() const { return filename.c_str(); } /*! returns the canonical absolute path to filename */ /*! pre-C++17 implementation of std::filesystem::canonical */ RKCOMMON_INTERFACE FileName canonical(); /*! returns the path of a filename with separator at the end */ RKCOMMON_INTERFACE std::string path() const; /*! returns the file of a filename */ RKCOMMON_INTERFACE std::string base() const; /*! returns the base of a filename without extension */ RKCOMMON_INTERFACE std::string name() const; /*! returns the file extension */ RKCOMMON_INTERFACE std::string ext() const; /*! drops the file extension */ RKCOMMON_INTERFACE FileName dropExt() const; /*! replaces the file extension */ RKCOMMON_INTERFACE FileName setExt(const std::string &ext = "") const; /*! adds file extension */ RKCOMMON_INTERFACE FileName addExt(const std::string &ext = "") const; /*! concatenates two filenames to this/other */ RKCOMMON_INTERFACE FileName operator+(const FileName &other) const; /*! concatenates two filenames to this/other */ RKCOMMON_INTERFACE FileName operator+(const std::string &other) const; /*! removes the base from a filename (if possible) */ RKCOMMON_INTERFACE FileName operator-(const FileName &base) const; /*! == operator */ RKCOMMON_INTERFACE friend bool operator==(const FileName &a, const FileName &b); /*! != operator */ RKCOMMON_INTERFACE friend bool operator!=(const FileName &a, const FileName &b); /*! 
output operator */ RKCOMMON_INTERFACE friend std::ostream &operator<<( std::ostream &cout, const FileName &filename); private: std::string filename; }; } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/os/library.cpp000066400000000000000000000177461456117377200216660ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "library.h" #include "FileName.h" #include #ifndef _WIN32 #include #include #endif namespace { std::string directory_from_path(const std::string &path) { // Remove the filename from the path const size_t lastPathSep = path.find_last_of("/\\"); if (lastPathSep == std::string::npos) throw std::runtime_error("could not get absolute path of module directory"); return path.substr(0, lastPathSep + 1); } std::string library_location(const void *address) { // implementation taken from OIDN module.cpp if (address == nullptr) throw std::runtime_error("library_location(): NULL address provided"); #if defined(_WIN32) // Get the handle of the module which contains the address HMODULE module; const DWORD flags = GET_MODULE_HANDLE_EX_FLAG_FROM_ADDRESS | GET_MODULE_HANDLE_EX_FLAG_UNCHANGED_REFCOUNT; if (!GetModuleHandleExA(flags, reinterpret_cast(address), &module)) throw std::runtime_error("GetModuleHandleExA failed"); // Get the path of the module // Since we don't know the length of the path, we use a buffer of increasing // size DWORD pathSize = MAX_PATH + 1; for (;;) { std::vector path(pathSize); DWORD result = GetModuleFileNameA(module, path.data(), pathSize); if (result == 0) throw std::runtime_error("GetModuleFileNameA failed"); else if (result < pathSize) return directory_from_path(path.data()); else pathSize *= 2; } #else // dladdr should return an absolute path on Linux except for the main // executable On macOS it should always return an absolute path Dl_info info; if (dladdr(address, &info)) { // Check whether the path is absolute if (info.dli_fname && info.dli_fname[0] == '/') return directory_from_path(info.dli_fname); } #if defined(__APPLE__) // This shouldn't happen throw std::runtime_error("failed to get absolute path with dladdr"); #else // We failed to get an absolute path, so we try to parse /proc/self/maps std::ifstream file("/proc/self/maps"); if (!file) throw std::runtime_error("could not open /proc/self/maps"); // Parse the lines for (std::string lineStr; std::getline(file, lineStr);) { std::istringstream line(lineStr); // Parse the address range uintptr_t addressBegin, addressEnd; line >> std::hex; line >> addressBegin; if (line.get() != '-') continue; // parse error line >> addressEnd; if (!isspace(line.peek()) || !line) continue; // parse error // Check whether the address is in this range if (reinterpret_cast(address) < addressBegin || reinterpret_cast(address) >= addressEnd) continue; // Skip the permissions, offset, device, inode std::string str; for (int i = 0; i < 4; ++i) line >> str; // Parse the path line >> std::ws; if (!std::getline(line, str)) continue; // no path or parse error // Check whether the path is absolute if (str[0] == '/') return directory_from_path(str); } throw std::runtime_error("could not find address in /proc/self/maps"); #endif #endif } } // namespace namespace rkcommon { Library::Library( const void *anchorAddress, const std::string &name, const Version &version) : libraryName(name), libraryVersion(version) { bool success = false; try { success = loadLibrary(anchorAddress); } catch (const std::exception &e) { // handle exceptions from e.g. 
library_location() throw std::runtime_error( "Load of " + name + " failed due to: '" + e.what() + "'"); } if (!success) { throw std::runtime_error( "Load of " + name + " failed due to: '" + errorMessage + "'"); } } Library::Library(void *const _lib) : libraryName(""), lib(_lib), freeLibOnDelete(false) { } bool Library::loadLibrary(const void *anchorAddress) { std::string file = libraryName; std::string errorMsg; std::string libLocation = anchorAddress != nullptr ? library_location(anchorAddress) : std::string(); #ifdef _WIN32 std::string fullName = libLocation + file + ".dll"; lib = LoadLibrary(fullName.c_str()); if (lib == nullptr) { DWORD err = GetLastError(); LPTSTR lpMsgBuf; FormatMessage(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM | FORMAT_MESSAGE_IGNORE_INSERTS, NULL, err, MAKELANGID(LANG_NEUTRAL, SUBLANG_DEFAULT), (LPTSTR)&lpMsgBuf, 0, NULL); errorMsg = lpMsgBuf; LocalFree(lpMsgBuf); } #else std::string versionStr; for (int i: libraryVersion) versionStr += "." + std::to_string(i); std::string fullName = libLocation + "lib" + file; #if defined(__MACOSX__) || defined(__APPLE__) fullName += versionStr + ".dylib"; #else fullName += ".so" + versionStr; #endif lib = dlopen(fullName.c_str(), RTLD_LAZY | RTLD_LOCAL); if (lib == nullptr) errorMsg = dlerror(); #endif if (lib == nullptr) { errorMessage = "could not open module lib " + libraryName + ": " + errorMsg; return false; } return true; } Library::~Library() { /* Only dlclose/free libraries if we're not running through addrsan * so that the shared library symbols remain accessible to addrsan * at exit (see https://github.com/google/sanitizers/issues/89) */ #ifndef RKCOMMON_ADDRSAN if (freeLibOnDelete && lib) { #ifdef _WIN32 FreeLibrary((HMODULE)lib); #else dlclose(lib); #endif } #endif } void *Library::getSymbol(const std::string &sym) const { #ifdef _WIN32 return GetProcAddress((HMODULE)lib, sym.c_str()); #else return dlsym(lib, sym.c_str()); #endif } std::unique_ptr LibraryRepository::instance; LibraryRepository *LibraryRepository::getInstance() { if (instance.get() == nullptr) instance = std::unique_ptr(new LibraryRepository); return instance.get(); } void LibraryRepository::cleanupInstance() { LibraryRepository::instance.reset(); } LibraryRepository::~LibraryRepository() { // Close libraries in the opposite order that they were opened while (!repo.empty()) { repo.pop_back(); } } void LibraryRepository::add(const void *anchorAddress, const std::string &name, const Library::Version &version) { if (libraryExists(name)) return; // lib already loaded. 
repo.push_back(rkcommon::make_unique( anchorAddress, name, version)); } void LibraryRepository::remove(const std::string &name) { auto lib = findLibrary(name); if (lib != repo.end()) { repo.erase(lib); } } void *LibraryRepository::getSymbol(const std::string &name) const { void *sym = nullptr; for (auto lib = repo.cbegin(); sym == nullptr && lib != repo.end(); ++lib) { sym = (*lib)->getSymbol(name); } return sym; } bool LibraryRepository::libraryExists(const std::string &name) const { return findLibrary(name) != repo.end(); } LibraryRepository::const_library_iterator_t LibraryRepository::findLibrary( const std::string &name) const { auto fnd = std::find_if( repo.begin(), repo.end(), [&](const std::unique_ptr &l) { return l->libraryName == name; }); return fnd; } LibraryRepository::library_iterator_t LibraryRepository::findLibrary( const std::string &name) { auto fnd = std::find_if( repo.begin(), repo.end(), [&](const std::unique_ptr &l) { return l->libraryName == name; }); return fnd; } } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/os/library.h000066400000000000000000000041201456117377200213110ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "../common.h" // std #include #include #include namespace rkcommon { class RKCOMMON_INTERFACE Library { public: using Version = std::vector; // Opens a shared library; anchorAddress = nullptr will disable anchored loads Library(const void *anchorAddress, const std::string &name, const Version &version); ~Library(); // Returns address of a symbol from the library void *getSymbol(const std::string &sym) const; private: Library(void *const lib); bool loadLibrary(const void *anchorAddress); std::string libraryName; Version libraryVersion; std::string errorMessage; void *lib{nullptr}; bool freeLibOnDelete{true}; friend class LibraryRepository; template friend inline std::unique_ptr make_unique(Args &&... 
args); }; class RKCOMMON_INTERFACE LibraryRepository { public: static LibraryRepository *getInstance(); static void cleanupInstance(); ~LibraryRepository(); LibraryRepository(const LibraryRepository &) = delete; LibraryRepository &operator=(const LibraryRepository &) = delete; // add/remove a library to/from the repo void add(const void *anchorAddress, const std::string &name, const Library::Version &version = {}); void remove(const std::string &name); // Returns address of a symbol from any library in the repo void *getSymbol(const std::string &sym) const; bool libraryExists(const std::string &name) const; private: using const_library_iterator_t = std::vector>::const_iterator; using library_iterator_t = std::vector>::iterator; const_library_iterator_t findLibrary(const std::string &name) const; library_iterator_t findLibrary(const std::string &name); static std::unique_ptr instance; LibraryRepository() = default; std::vector> repo; }; } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/platform.h000066400000000000000000000223641456117377200210620ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS #endif #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _WIN32 #include #ifndef NOMINMAX #define NOMINMAX #endif #ifndef WIN32_LEAN_AND_MEAN #define WIN32_LEAN_AND_MEAN #endif #include #undef NOMINMAX #undef WIN32_LEAN_AND_MEAN #endif //////////////////////////////////////////////////////////////////////////////// /// Macros //////////////////////////////////////////////////////////////////////////////// #ifdef _WIN32 #undef __noinline #define __noinline __declspec(noinline) //#define __forceinline __forceinline //#define __restrict __restrict #ifdef __INTEL_COMPILER #define __restrict__ __restrict #else #define __restrict__ //__restrict // causes issues with MSVC #endif #define __thread __declspec(thread) #define __aligned(...) __declspec(align(__VA_ARGS__)) //#define __FUNCTION__ __FUNCTION__ #define debugbreak() __debugbreak() #else #undef __noinline #undef __forceinline #define __noinline __attribute__((noinline)) #define __forceinline inline __attribute__((always_inline)) //#define __restrict __restrict //#define __thread __thread #define __aligned(...) 
__attribute__((aligned(__VA_ARGS__))) #define __FUNCTION__ __PRETTY_FUNCTION__ #define debugbreak() asm("int $3") #endif #ifdef __GNUC__ #define MAYBE_UNUSED __attribute__((unused)) #else #define MAYBE_UNUSED #endif #if defined(_MSC_VER) && !defined(__INTEL_COMPILER) #define likely(expr) (expr) #define unlikely(expr) (expr) #else #define likely(expr) __builtin_expect((bool)(expr), true) #define unlikely(expr) __builtin_expect((bool)(expr), false) #endif //////////////////////////////////////////////////////////////////////////////// /// Error handling and debugging //////////////////////////////////////////////////////////////////////////////// /* debug printing macros */ #define STRING(x) #x #define TOSTRING(x) STRING(x) #define CODE_LOCATION __FILE__ " (" TOSTRING(__LINE__) ")" #define PING \ { \ std::stringstream msg; \ msg << CODE_LOCATION << ": " << __FUNCTION__ << std::endl; \ std::cout << msg.str(); \ } #define PRINT(x) \ { \ std::stringstream msg; \ msg << STRING(x) << " = " << (x) << std::endl; \ std::cout << msg.str(); \ } #define PRINT2(x, y) \ { \ std::stringstream msg; \ msg << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) \ << std::endl; \ std::cout << msg.str(); \ } #define PRINT3(x, y, z) \ { \ std::stringstream msg; \ msg << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) \ << ", " << STRING(z) << " = " << (z) << std::endl; \ std::cout << msg.str(); \ } #define PRINT4(x, y, z, w) \ { \ std::stringstream msg; \ msg << STRING(x) << " = " << (x) << ", " << STRING(y) << " = " << (y) \ << ", " << STRING(z) << " = " << (z) << ", " << STRING(w) << " = " \ << (w) << std::endl; \ std::cout << msg.str(); \ } #define THROW_RUNTIME_ERROR(str) \ throw std::runtime_error(std::string(__FILE__) + " (" + \ std::to_string((long long)__LINE__) + \ "): " + std::string(str)) #define FATAL(x) THROW_RUNTIME_ERROR(x) #define WARNING(x) std::cerr << "Warning:" << std::string(x) << std::endl #define NOT_IMPLEMENTED FATAL(std::string(__FUNCTION__) + " not implemented") // NOTE(jda) - These macros are used to construct the last UNUSED(...) macro, // used to mark a variable number of arguments as unused so the // compiler doesn't warn when -Wextra (gcc/clang/icc) is used. Only // works with 1 to 5 passed arguments. #define UNUSED_1(x) (void)x #define UNUSED_2(x, y) \ UNUSED_1(x); \ UNUSED_1(y) #define UNUSED_3(x, ...) UNUSED_2(x, UNUSED_2(__VA_ARGS__)) #define UNUSED_4(x, ...) UNUSED_2(x, UNUSED_3(__VA_ARGS__)) #define UNUSED_5(x, ...) UNUSED_2(x, UNUSED_4(__VA_ARGS__)) // NUM_ARGS(...) evaluates to the literal number of the passed-in arguments. #define _NUM_ARGS2(X, X5, X4, X3, X2, X1, N, ...) N #define NUM_ARGS(...) _NUM_ARGS2(0, __VA_ARGS__, 5, 4, 3, 2, 1, 0) #define _UNUSED_N3(N, ...) UNUSED_##N(__VA_ARGS__) #define _UNUSED_N2(N, ...) _UNUSED_N3(N, __VA_ARGS__) #define UNUSED(...) 
_UNUSED_N2(NUM_ARGS(__VA_ARGS__), __VA_ARGS__) #if defined(__x86_64__) || defined(__ia64__) || defined(_M_X64) #define __X86_64__ #endif //////////////////////////////////////////////////////////////////////////////// /// Basic Types //////////////////////////////////////////////////////////////////////////////// /* windows does not have ssize_t */ #ifdef __WIN32 #ifdef __X86_64__ typedef int64_t ssize_t; #else typedef int32_t ssize_t; #endif #endif //////////////////////////////////////////////////////////////////////////////// /// Disable some compiler warnings //////////////////////////////////////////////////////////////////////////////// #if defined(__INTEL_COMPILER) #pragma warning( \ disable : 265) // floating-point operation result is out of range #pragma warning( \ disable : 383) // value copied to temporary, reference to temporary used #pragma warning(disable : 869) // parameter was never referenced #pragma warning(disable : 981) // operands are evaluated in unspecified order #pragma warning( \ disable : 1418) // external function definition with no prior declaration #pragma warning(disable : 1419) // external declaration in primary source file #pragma warning(disable : 1572) // floating-point equality and inequality // comparisons are unreliable #pragma warning(disable : 94) // the size of an array must be greater than zero #pragma warning(disable : 1599) // declaration hides parameter #pragma warning(disable : 424) // extra ";" ignored #pragma warning(disable : 2196) // routine is both "inline" and "noinline" #pragma warning(disable : 177) // label was declared but never referenced #pragma warning(disable : 114) // function was referenced but not defined #endif #if defined(_MSC_VER) #pragma warning(disable : 4200) // nonstandard extension used : zero-sized // array in struct/union #pragma warning(disable : 4800) // forcing value to bool 'true' or 'false' // (performance warning) #pragma warning(disable : 4267) // '=' : conversion from 'size_t' to 'unsigned // long', possible loss of data #pragma warning(disable : 4244) // 'argument' : conversion from 'ssize_t' to // 'unsigned int', possible loss of data #pragma warning( \ disable : 4355) // 'this' : used in base member initializer list #pragma warning(disable : 391) // '<=' : signed / unsigned mismatch #pragma warning(disable : 4018) // '<' : signed / unsigned mismatch #pragma warning( \ disable : 4305) // 'initializing' : truncation from 'double' to 'float' #pragma warning(disable : 4068) // unknown pragma #pragma warning(disable : 4146) // unary minus operator applied to unsigned // type, result still unsigned #pragma warning(disable : 4838) // conversion from 'unsigned int' to 'const // int' requires a narrowing conversion) #pragma warning( \ disable : 4227) // anachronism used : qualifiers on reference are ignored #pragma warning( \ disable : 4251) // class 'type1' needs to have dll-interface // to be used by clients of class 'type2' #endif #if defined(__clang__) && !defined(__INTEL_COMPILER) #pragma clang diagnostic ignored "-Wunknown-pragmas" #pragma clang diagnostic ignored "-Wunused-variable" #pragma clang diagnostic ignored "-Wreorder" #pragma clang diagnostic ignored "-Wmicrosoft" #pragma clang diagnostic ignored "-Wunused-private-field" #pragma clang diagnostic ignored "-Wunused-local-typedef" #pragma clang diagnostic ignored "-Wunused-function" #endif ospray-rkcommon-538f8a2/rkcommon/rkcommon.rc000066400000000000000000000037041456117377200212350ustar00rootroot00000000000000’ž// Copyright 2016 Intel Corporation // 
SPDX-License-Identifier: Apache-2.0 #include "rkcommon/version.h" 1 VERSIONINFO FILEVERSION RKCOMMON_VERSION_MAJOR,RKCOMMON_VERSION_MINOR,RKCOMMON_VERSION_PATCH,0 PRODUCTVERSION RKCOMMON_VERSION_MAJOR,RKCOMMON_VERSION_MINOR,RKCOMMON_VERSION_PATCH,0 FILEFLAGSMASK 0x3fL #ifdef _DEBUG FILEFLAGS 0x1L #else FILEFLAGS 0x0L #endif FILEOS 0x40004L FILETYPE 0x2L FILESUBTYPE 0x0L BEGIN BLOCK "StringFileInfo" BEGIN BLOCK "040904b0" BEGIN VALUE "CompanyName", "Intel" VALUE "FileDescription", "Intel® oneAPI Rendering Toolkit Common Library" VALUE "FileVersion", RKCOMMON_VERSION VALUE "ProductVersion", RKCOMMON_VERSION VALUE "LegalCopyright", "© 2009 Intel Corporation" VALUE "InternalName", "rkCommon" VALUE "ProductName", "rkCommon" END END BLOCK "VarFileInfo" BEGIN VALUE "Translation", 0x409, 1200 END END ospray-rkcommon-538f8a2/rkcommon/tasking/000077500000000000000000000000001456117377200205165ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/tasking/AsyncLoop.h000066400000000000000000000106301456117377200225760ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include #include #include "../traits/rktraits.h" #include "schedule.h" #include "tasking_system_init.h" namespace rkcommon { namespace tasking { /*! This calls a given function in a continuous loop on a background thread owned by AsyncLoop. While it is running, the function it was constructed with is called over and over in a loop. When stopped, the thread is put to sleep until it is started again. An AsyncLoop has to be explicitly started, it is not automatically started on construction. */ class AsyncLoop { public: enum LaunchMethod { AUTO = 0, THREAD = 1, TASK = 2 }; template AsyncLoop(LOOP_BODY_FCN &&fcn, LaunchMethod m = AUTO); ~AsyncLoop(); void start(); void stop(); private: // Struct shared with the background thread to avoid dangling ptrs or // tricky synchronization when destroying the AsyncLoop and scheduling // threads with TBB, since we don't have a join point to sync with // the running thread struct AsyncLoopData { std::atomic threadShouldBeAlive{true}; std::atomic shouldBeRunning{false}; std::atomic insideLoopBody{false}; std::condition_variable runningCond; std::mutex runningMutex; }; std::shared_ptr loop; std::thread backgroundThread; }; // Inlined members // ////////////////////////////////////////////////////////// template inline AsyncLoop::AsyncLoop(LOOP_BODY_FCN &&fcn, AsyncLoop::LaunchMethod m) : loop(nullptr) { static_assert(traits::has_operator_method::value, "rkcommon::AsyncLoop() requires the implementation of " "method 'void LOOP_BODY_FCN::operator()' in order to " "construct the loop instance."); std::shared_ptr l = std::make_shared(); loop = l; auto mainLoop = [l, fcn]() { while (l->threadShouldBeAlive) { if (!l->threadShouldBeAlive) return; if (l->shouldBeRunning) { l->insideLoopBody = true; fcn(); l->insideLoopBody = false; } else { std::unique_lock lock(l->runningMutex); l->runningCond.wait(lock, [&] { return l->shouldBeRunning.load() || !l->threadShouldBeAlive.load(); }); } } }; if (m == AUTO) m = tasking::numTaskingThreads() > 4 ? TASK : THREAD; if (m == THREAD) backgroundThread = std::thread(mainLoop); else // m == TASK tasking::schedule(mainLoop); } inline AsyncLoop::~AsyncLoop() { // Note that the mutex here is still required even though these vars // are atomic, because we need to sync with the condition variable waiting // state on the async thread. 
Otherwise we might signal and the thread // will miss it, since it wasn't watching. { std::unique_lock lock(loop->runningMutex); loop->threadShouldBeAlive = false; loop->shouldBeRunning = false; } loop->runningCond.notify_one(); if (backgroundThread.joinable()) { backgroundThread.join(); } } inline void AsyncLoop::start() { if (!loop->shouldBeRunning) { // Note that the mutex here is still required even though these vars // are atomic, because we need to sync with the condition variable // waiting state on the async thread. Otherwise we might signal and the // thread will miss it, since it wasn't watching. { std::unique_lock lock(loop->runningMutex); loop->shouldBeRunning = true; } loop->runningCond.notify_one(); } } inline void AsyncLoop::stop() { if (loop->shouldBeRunning) { loop->shouldBeRunning = false; while (loop->insideLoopBody.load()) { std::this_thread::yield(); } } } } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/AsyncTask.h000066400000000000000000000020771456117377200225750ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "detail/async_task.inl" #include #include namespace rkcommon { namespace tasking { template struct AsyncTask { AsyncTask(std::function fcn) : taskImpl([this, fcn]() { retValue = fcn(); jobFinished = true; }) { } virtual ~AsyncTask() noexcept { wait(); } bool finished() const { return jobFinished; } bool valid() const { return jobFinished; } void wait() { taskImpl.wait(); } T get() { if (!jobFinished) wait(); return retValue; } private: // declaration before taskImpl: ensure initialization before task finishes std::atomic jobFinished{false}; detail::AsyncTaskImpl> taskImpl; T retValue; }; } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/async.h000066400000000000000000000025401456117377200220050ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include "schedule.h" namespace rkcommon { namespace tasking { template #if defined(__cpp_lib_is_invocable) && __cpp_lib_is_invocable >= 201703 using operator_return_t = std::invoke_result_t; #else using operator_return_t = typename std::result_of::type; #endif // NOTE(jda) - This abstraction takes a lambda which should take captured // variables by *value* to ensure no captured references race // with the task itself. 
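    // A minimal usage sketch of async() (illustrative only; 'offset' is a
    // hypothetical value), following the capture-by-value advice above:
    //
    //   int offset = 41;  // captured by value, so it cannot dangle
    //   std::future<int> result =
    //       rkcommon::tasking::async([offset]() { return offset + 1; });
    //   // ... do other work ...
    //   const int value = result.get();  // blocks until the task has run (42)
    //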
template inline auto async(TASK_T &&fcn) -> std::future> { static_assert(traits::has_operator_method::value, "rkcommon::tasking::async() requires the implementation of" "method 'RETURN_T TASK_T::operator()', where RETURN_T " "is the return value of the passed in task."); using package_t = std::packaged_task()>; auto task = new package_t(std::forward(fcn)); auto future = task->get_future(); schedule([=]() { (*task)(); delete task; }); return future; } } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/detail/000077500000000000000000000000001456117377200217605ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/tasking/detail/TaskSys.cpp000066400000000000000000000021711456117377200240660ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "TaskSys.h" // ospray #include "../../platform.h" // stl #include namespace rkcommon { namespace tasking { namespace detail { // TaskSys definitions ////////////////////////////////////////////////// static std::unique_ptr g_ts; // Interface definitions //////////////////////////////////////////////// void initTaskSystemInternal(int nThreads) { g_ts = std::unique_ptr(new enki::TaskScheduler()); if (nThreads < 1) nThreads = enki::GetNumHardwareThreads(); g_ts->Initialize(nThreads); } int numThreadsTaskSystemInternal() { return g_ts->GetNumTaskThreads(); } void scheduleTaskInternal(Task *task) { if (g_ts.get() == nullptr) initTaskSystemInternal(-1); g_ts->AddTaskSetToPipe(task); } void waitInternal(Task *task) { g_ts->WaitforTask(task); } } // namespace detail } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/detail/TaskSys.h000066400000000000000000000035601456117377200235360ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../../common.h" // enkiTS #include "enkiTS/TaskScheduler.h" namespace rkcommon { namespace tasking { namespace detail { // Public interface to the tasking system /////////////////////////////// using Task = enki::ITaskSet; void RKCOMMON_INTERFACE initTaskSystemInternal(int numThreads = -1); int RKCOMMON_INTERFACE numThreadsTaskSystemInternal(); void RKCOMMON_INTERFACE scheduleTaskInternal(Task *task); void RKCOMMON_INTERFACE waitInternal(Task *task); template inline void parallel_for_internal(int nTasks, TASK_T &&fcn) { struct LocalTask : public Task { const TASK_T &t; LocalTask(int nunTasks, TASK_T &&fcn) : Task(nunTasks), t(std::forward(fcn)) { } ~LocalTask() override = default; void ExecuteRange(enki::TaskSetPartition tp, uint32_t) override { for (auto i = tp.start; i < tp.end; ++i) t(i); } }; LocalTask task(nTasks, std::forward(fcn)); scheduleTaskInternal(&task); waitInternal(&task); } template inline void schedule_internal(TASK_T &&fcn) { struct LocalTask : public Task { TASK_T t; LocalTask(TASK_T &&fcn) : Task(1), t(std::forward(fcn)) {} ~LocalTask() override = default; void ExecuteRange(enki::TaskSetPartition, uint32_t) override { t(); delete this; } }; auto *task = new LocalTask(std::forward(fcn)); scheduleTaskInternal(task); } } // namespace detail } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/detail/async_task.inl000066400000000000000000000037711456117377200246330ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #if defined(RKCOMMON_TASKING_TBB) #define __TBB_NO_IMPLICIT_LINKAGE 1 #define 
__TBBMALLOC_NO_IMPLICIT_LINKAGE 1 #include #elif defined(RKCOMMON_TASKING_OMP) #include #elif defined(RKCOMMON_TASKING_INTERNAL) #include "TaskSys.h" #endif namespace rkcommon { namespace tasking { namespace detail { template struct AsyncTaskImpl { AsyncTaskImpl(TASK_T &&fcn); void wait(); private: #if defined(RKCOMMON_TASKING_TBB) tbb::task_group taskGroup; #elif defined(RKCOMMON_TASKING_OMP) std::thread thread; #elif defined(RKCOMMON_TASKING_INTERNAL) struct LocalTask : public enki::ITaskSet { TASK_T t; LocalTask(TASK_T &&fcn) : t(std::forward(fcn)) {} void ExecuteRange(enki::TaskSetPartition, uint32_t) override { t(); } }; LocalTask task; #endif }; // Inlined definitions // //////////////////////////////////////////////////// template inline AsyncTaskImpl::AsyncTaskImpl(TASK_T &&fcn) #if defined(RKCOMMON_TASKING_TBB) { taskGroup.run(std::forward(fcn)); } #elif defined(RKCOMMON_TASKING_OMP) : thread(std::forward(fcn)) { } #elif defined(RKCOMMON_TASKING_INTERNAL) : task(std::forward(fcn)) { detail::scheduleTaskInternal(&task); } #else { fcn(); } #endif template inline void AsyncTaskImpl::wait() { #if defined(RKCOMMON_TASKING_TBB) taskGroup.wait(); #elif defined(RKCOMMON_TASKING_OMP) if (thread.joinable()) thread.join(); #elif defined(RKCOMMON_TASKING_INTERNAL) detail::waitInternal(&task); #endif } } // namespace detail } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/detail/enkiTS/000077500000000000000000000000001456117377200231555ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/tasking/detail/enkiTS/Atomics.h000066400000000000000000000066701456117377200247360ustar00rootroot00000000000000// Copyright (c) 2013 Doug Binks // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. 
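// Illustrative note on the compare-and-swap helpers declared below, using a
// hypothetical value: if *pDest currently holds 5, then
//   AtomicCompareAndSwap( pDest, /*swapTo*/ 7, /*compareWith*/ 5 )
// stores 7 and returns 5 (the old value, equal to compareWith on success);
// if *pDest had instead held 6, nothing is written and 6 is returned.
// AtomicAdd( pDest, 1 ) likewise returns the value held *before* the add.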
#pragma once #include #ifdef _WIN32 #define NOMINMAX #define WIN32_LEAN_AND_MEAN #include #undef GetObject #include extern "C" void _ReadWriteBarrier(); #pragma intrinsic(_ReadWriteBarrier) #pragma intrinsic(_InterlockedCompareExchange) #pragma intrinsic(_InterlockedExchangeAdd) // Memory Barriers to prevent CPU and Compiler re-ordering #define BASE_MEMORYBARRIER_ACQUIRE() _ReadWriteBarrier() #define BASE_MEMORYBARRIER_RELEASE() _ReadWriteBarrier() #define BASE_ALIGN(x) __declspec( align( x ) ) #else #define BASE_MEMORYBARRIER_ACQUIRE() __asm__ __volatile__("": : :"memory") #define BASE_MEMORYBARRIER_RELEASE() __asm__ __volatile__("": : :"memory") #define BASE_ALIGN(x) __attribute__ ((aligned( x ))) #endif namespace enki { // Atomically performs: if( *pDest == compareWith ) { *pDest = swapTo; } // returns old *pDest (so if successfull, returns compareWith) inline uint32_t AtomicCompareAndSwap( volatile uint32_t* pDest, uint32_t swapTo, uint32_t compareWith ) { #ifdef _WIN32 return _InterlockedCompareExchange( (volatile long*)pDest,swapTo, compareWith ); #else return __sync_val_compare_and_swap( pDest, compareWith, swapTo ); #endif } inline uint64_t AtomicCompareAndSwap( volatile uint64_t* pDest, uint64_t swapTo, uint64_t compareWith ) { #ifdef _WIN32 return _InterlockedCompareExchange64( (__int64 volatile*)pDest, swapTo, compareWith ); #else return __sync_val_compare_and_swap( pDest, compareWith, swapTo ); #endif } inline void* AtomicCompareAndSwapPtr( void* volatile* pDest, void* swapTo, void* compareWith ) { #ifdef _WIN32 return _InterlockedCompareExchangePointer( pDest, swapTo, compareWith ); #else return __sync_val_compare_and_swap( pDest, compareWith, swapTo ); #endif } // exchange ptr and return previous value inline void* AtomicExchangePtr( void* volatile* pDest, void* swapTo ) { #ifdef _WIN32 return _InterlockedExchangePointer( pDest, swapTo ); #else return __sync_lock_test_and_set( pDest, swapTo ); #endif } // Atomically performs: tmp = *pDest; *pDest += value; return tmp; inline int32_t AtomicAdd( volatile int32_t* pDest, int32_t value ) { #ifdef _WIN32 return _InterlockedExchangeAdd( (long*)pDest, value ); #else return __sync_fetch_and_add( pDest, value ); #endif } }ospray-rkcommon-538f8a2/rkcommon/tasking/detail/enkiTS/LockLessMultiReadPipe.h000066400000000000000000000251631456117377200275010ustar00rootroot00000000000000// Copyright (c) 2013 Doug Binks // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. 
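// A sketch of the intended calling pattern for the pipe declared below
// (illustrative only; Item is a placeholder element type). The single owning
// thread writes to the front and may also read from the front,
//
//   LockLessMultiReadPipe<8, Item> pipe;           // 2^8 = 256 slots
//   if( !pipe.WriterTryWriteFront( item ) ) { /* pipe full: run item inline */ }
//   Item i;
//   if( pipe.WriterTryReadFront( &i ) )      { /* owner took the newest item */ }
//
// while any other (reader) thread may only take from the back:
//
//   if( pipe.ReaderTryReadBack( &i ) )       { /* stole the oldest item */ }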
#pragma once #include #include #include "Atomics.h" #include namespace enki { // LockLessMultiReadPipe - Single writer, multiple reader thread safe pipe using (semi) lockless programming // Readers can only read from the back of the pipe // The single writer can write to the front of the pipe, and read from both ends (a writer can be a reader) // for many of the principles used here, see http://msdn.microsoft.com/en-us/library/windows/desktop/ee418650(v=vs.85).aspx // Note: using log2 sizes so we do not need to clamp (multi-operation) // T is the contained type // Note this is not true lockless as the use of flags as a form of lock state. template class LockLessMultiReadPipe { public: LockLessMultiReadPipe(); ~LockLessMultiReadPipe() {} // ReaderTryReadBack returns false if we were unable to read // This is thread safe for both multiple readers and the writer bool ReaderTryReadBack( T* pOut ); // WriterTryReadFront returns false if we were unable to read // This is thread safe for the single writer, but should not be called by readers bool WriterTryReadFront( T* pOut ); // WriterTryWriteFront returns false if we were unable to write // This is thread safe for the single writer, but should not be called by readers bool WriterTryWriteFront( const T& in ); // IsPipeEmpty() is a utility function, not intended for general use // Should only be used very prudently. bool IsPipeEmpty() const { return 0 == m_WriteIndex - m_ReadCount; } void Clear() { m_WriteIndex = 0; m_ReadIndex = 0; m_ReadCount = 0; memset( (void*)m_Flags, 0, sizeof( m_Flags ) ); } private: const static uint32_t ms_cSize = ( 1 << cSizeLog2 ); const static uint32_t ms_cIndexMask = ms_cSize - 1; const static uint32_t FLAG_INVALID = 0xFFFFFFFF; // 32bit for CAS const static uint32_t FLAG_CAN_WRITE = 0x00000000; // 32bit for CAS const static uint32_t FLAG_CAN_READ = 0x11111111; // 32bit for CAS T m_Buffer[ ms_cSize ]; // read and write indexes allow fast access to the pipe, but actual access // controlled by the access flags. volatile uint32_t BASE_ALIGN(4) m_WriteIndex; volatile uint32_t BASE_ALIGN(4) m_ReadCount; volatile uint32_t m_Flags[ ms_cSize ]; volatile uint32_t BASE_ALIGN(4) m_ReadIndex; }; template inline LockLessMultiReadPipe::LockLessMultiReadPipe() : m_WriteIndex(0) , m_ReadCount(0) , m_ReadIndex(0) { assert( cSizeLog2 < 32 ); memset( (void*)m_Flags, 0, sizeof( m_Flags ) ); } template inline bool LockLessMultiReadPipe::ReaderTryReadBack( T* pOut ) { uint32_t actualReadIndex; uint32_t readCount = m_ReadCount; // We get hold of read index for consistency, // and do first pass starting at read count uint32_t readIndexToUse = readCount; while(true) { uint32_t writeIndex = m_WriteIndex; // power of two sizes ensures we can use a simple calc without modulus uint32_t numInPipe = writeIndex - readCount; if( 0 == numInPipe ) { return false; } if( readIndexToUse >= writeIndex ) { // move back to start readIndexToUse = m_ReadIndex; } // power of two sizes ensures we can perform AND for a modulus actualReadIndex = readIndexToUse & ms_cIndexMask; // Multiple potential readers mean we should check if the data is valid, // using an atomic compare exchange uint32_t previous = AtomicCompareAndSwap( &m_Flags[ actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ ); if( FLAG_CAN_READ == previous ) { break; } ++readIndexToUse; //update known readcount readCount = m_ReadCount; } // we update the read index using an atomic add, as we've only read one piece of data. 
// this ensure consistency of the read index, and the above loop ensures readers // only read from unread data AtomicAdd( (volatile int32_t*)&m_ReadCount, 1 ); BASE_MEMORYBARRIER_ACQUIRE(); // now read data, ensuring we do so after above reads & CAS *pOut = m_Buffer[ actualReadIndex ]; m_Flags[ actualReadIndex ] = FLAG_CAN_WRITE; return true; } template inline bool LockLessMultiReadPipe::WriterTryReadFront( T* pOut ) { uint32_t writeIndex = m_WriteIndex; uint32_t frontReadIndex = writeIndex; // Multiple potential readers mean we should check if the data is valid, // using an atomic compare exchange - which acts as a form of lock (so not quite lockless really). uint32_t previous = FLAG_INVALID; uint32_t actualReadIndex = 0; while( true ) { // power of two sizes ensures we can use a simple calc without modulus uint32_t readCount = m_ReadCount; uint32_t numInPipe = writeIndex - readCount; if( 0 == numInPipe || 0 == frontReadIndex ) { // frontReadIndex can get to 0 here if that item was just being read by another thread. m_ReadIndex = readCount; return false; } --frontReadIndex; actualReadIndex = frontReadIndex & ms_cIndexMask; previous = AtomicCompareAndSwap( &m_Flags[ actualReadIndex ], FLAG_INVALID, FLAG_CAN_READ ); if( FLAG_CAN_READ == previous ) { break; } else if( m_ReadIndex >= frontReadIndex ) { return false; } } // now read data, ensuring we do so after above reads & CAS *pOut = m_Buffer[ actualReadIndex ]; m_Flags[ actualReadIndex ] = FLAG_CAN_WRITE; BASE_MEMORYBARRIER_RELEASE(); // 32-bit aligned stores are atomic, and writer owns the write index // we only move one back as this is as many as we have read, not where we have read from. --m_WriteIndex; return true; } template inline bool LockLessMultiReadPipe::WriterTryWriteFront( const T& in ) { // The writer 'owns' the write index, and readers can only reduce // the amount of data in the pipe. // We get hold of both values for consistency and to reduce false sharing // impacting more than one access uint32_t writeIndex = m_WriteIndex; // power of two sizes ensures we can perform AND for a modulus uint32_t actualWriteIndex = writeIndex & ms_cIndexMask; // a reader may still be reading this item, as there are multiple readers if( m_Flags[ actualWriteIndex ] != FLAG_CAN_WRITE ) { return false; // still being read, so have caught up with tail. 
} // as we are the only writer we can update the data without atomics // whilst the write index has not been updated m_Buffer[ actualWriteIndex ] = in; m_Flags[ actualWriteIndex ] = FLAG_CAN_READ; // We need to ensure the above writes occur prior to updating the write index, // otherwise another thread might read before it's finished BASE_MEMORYBARRIER_RELEASE(); // 32-bit aligned stores are atomic, and the writer controls the write index ++writeIndex; m_WriteIndex = writeIndex; return true; } // Lockless multiwriter intrusive list // Type T must implement T* volatile pNext; template class LocklessMultiWriteIntrusiveList { T* volatile pHead; T tail; public: LocklessMultiWriteIntrusiveList() : pHead( &tail ) { tail.pNext = NULL; } bool IsListEmpty() const { return pHead == &tail; } // Add - safe to perform from any thread void WriterWriteFront( T* pNode_ ) { assert( pNode_ ); pNode_->pNext = NULL; T* pPrev = (T*)AtomicExchangePtr( (void* volatile*)&pHead, (void*)pNode_ ); pPrev->pNext = pNode_; } // Remove - only thread safe for owner T* ReaderReadBack() { T* pTailPlus1 = tail.pNext; if( pTailPlus1 ) { T* pTailPlus2 = pTailPlus1->pNext; if( pTailPlus2 ) { //not head tail.pNext = pTailPlus2; } else { // pTailPlus1 is the head, attempt swap with tail tail.pNext = NULL; T* pPrev = (T*)AtomicCompareAndSwapPtr( (void* volatile*)&pHead, (void*)&tail, (void*)pTailPlus1 ); if( pPrev != pTailPlus1 ) { // pTailPlus1 is no longer the head, so pTailPlus1->pNext should be non NULL assert( pTailPlus1->pNext ); tail.pNext = pTailPlus1->pNext; } } } return pTailPlus1; } }; } ospray-rkcommon-538f8a2/rkcommon/tasking/detail/enkiTS/TaskScheduler.cpp000066400000000000000000000345261456117377200264340ustar00rootroot00000000000000// Copyright (c) 2013 Doug Binks // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. 
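// Worked example of the partitioning math used below (illustrative numbers):
// with numThreads_ = 4, StartThreads() sets m_NumPartitions = 4*(4-1) = 12 and
// m_NumInitialPartitions = 4-1 = 3 (below MAX_NUM_INITIAL_PARTITIONS = 8, so no
// clamping). AddTaskSetToPipe() on a task set with m_SetSize = 120 and
// m_MinRange = 1 then gives m_RangeToRun = 120/12 = 10 and an initial split of
// 120/3 = 40, so three 40-element partitions are pushed to the pipe and are
// further split into 10-element runs as idle threads pick them up in TryRunTask().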
#include #include "TaskScheduler.h" #include "LockLessMultiReadPipe.h" #if defined __i386__ || defined __x86_64__ #include "x86intrin.h" #elif defined _WIN32 #include #endif using namespace enki; static const uint32_t PIPESIZE_LOG2 = 8; static const uint32_t SPIN_COUNT = 100; static const uint32_t SPIN_BACKOFF_MULTIPLIER = 10; static const uint32_t MAX_NUM_INITIAL_PARTITIONS = 8; // each software thread gets it's own copy of gtl_threadNum, so this is safe to use as a static variable static THREAD_LOCAL uint32_t gtl_threadNum = 0; namespace enki { struct SubTaskSet { ITaskSet* pTask; TaskSetPartition partition; }; // we derive class TaskPipe rather than typedef to get forward declaration working easily class TaskPipe : public LockLessMultiReadPipe {}; struct ThreadArgs { uint32_t threadNum; TaskScheduler* pTaskScheduler; }; class PinnedTaskList : public LocklessMultiWriteIntrusiveList {}; } namespace { SubTaskSet SplitTask( SubTaskSet& subTask_, uint32_t rangeToSplit_ ) { SubTaskSet splitTask = subTask_; uint32_t rangeLeft = subTask_.partition.end - subTask_.partition.start; if( rangeToSplit_ > rangeLeft ) { rangeToSplit_ = rangeLeft; } splitTask.partition.end = subTask_.partition.start + rangeToSplit_; subTask_.partition.start = splitTask.partition.end; return splitTask; } #if ( defined _WIN32 && ( defined _M_IX86 || defined _M_X64 ) ) || ( defined __i386__ || defined __x86_64__ ) static void SpinWait( uint32_t spinCount_ ) { uint64_t end = __rdtsc() + spinCount_; while( __rdtsc() < end ) { _mm_pause(); } } #else static void SpinWait( uint32_t spinCount_ ) { while( spinCount_ ) { // TODO: may have NOP or yield equiv --spinCount_; } } #endif } static void SafeCallback(ProfilerCallbackFunc func_, uint32_t threadnum_) { if( func_ ) { func_(threadnum_); } } ProfilerCallbacks* TaskScheduler::GetProfilerCallbacks() { return &m_ProfilerCallbacks; } THREADFUNC_DECL TaskScheduler::TaskingThreadFunction( void* pArgs ) { ThreadArgs args = *(ThreadArgs*)pArgs; uint32_t threadNum = args.threadNum; TaskScheduler* pTS = args.pTaskScheduler; gtl_threadNum = threadNum; SafeCallback( pTS->m_ProfilerCallbacks.threadStart, threadNum ); uint32_t spinCount = SPIN_COUNT + 1; uint32_t hintPipeToCheck_io = threadNum + 1; // does not need to be clamped. 
while( pTS->m_bRunning ) { if(!pTS->TryRunTask( threadNum, hintPipeToCheck_io ) ) { // no tasks, will spin then wait ++spinCount; if( spinCount > SPIN_COUNT ) { pTS->WaitForTasks( threadNum ); spinCount = 0; } else { // Note: see https://software.intel.com/en-us/articles/a-common-construct-to-avoid-the-contention-of-threads-architecture-agnostic-spin-wait-loops uint32_t spinBackoffCount = spinCount * SPIN_BACKOFF_MULTIPLIER; SpinWait( spinBackoffCount ); } } else { spinCount = 0; } } AtomicAdd( &pTS->m_NumThreadsRunning, -1 ); SafeCallback( pTS->m_ProfilerCallbacks.threadStop, threadNum ); return 0; } void TaskScheduler::StartThreads() { if( m_bHaveThreads ) { return; } m_bRunning = true; SemaphoreCreate( m_NewTaskSemaphore ); // we create one less thread than m_NumThreads as the main thread counts as one m_pThreadArgStore = new ThreadArgs[m_NumThreads]; m_pThreadIDs = new threadid_t[m_NumThreads]; m_pThreadArgStore[0].threadNum = 0; m_pThreadArgStore[0].pTaskScheduler = this; m_pThreadIDs[0] = 0; m_NumThreadsWaiting = 0; m_NumThreadsRunning = 1;// acount for main thread for( uint32_t thread = 1; thread < m_NumThreads; ++thread ) { m_pThreadArgStore[thread].threadNum = thread; m_pThreadArgStore[thread].pTaskScheduler = this; ThreadCreate( &m_pThreadIDs[thread], TaskingThreadFunction, &m_pThreadArgStore[thread] ); ++m_NumThreadsRunning; } // ensure we have sufficient tasks to equally fill either all threads including main // or just the threads we've launched, this is outside the firstinit as we want to be able // to runtime change it if( 1 == m_NumThreads ) { m_NumPartitions = 1; m_NumInitialPartitions = 1; } else { m_NumPartitions = m_NumThreads * (m_NumThreads - 1); m_NumInitialPartitions = m_NumThreads - 1; if( m_NumInitialPartitions > MAX_NUM_INITIAL_PARTITIONS ) { m_NumInitialPartitions = MAX_NUM_INITIAL_PARTITIONS; } } m_bHaveThreads = true; } void TaskScheduler::StopThreads( bool bWait_ ) { if( m_bHaveThreads ) { // wait for them threads quit before deleting data m_bRunning = false; while( bWait_ && m_NumThreadsRunning > 1 ) { // keep firing event to ensure all threads pick up state of m_bRunning SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsRunning ); } for( uint32_t thread = 1; thread < m_NumThreads; ++thread ) { ThreadTerminate( m_pThreadIDs[thread] ); } m_NumThreads = 0; delete[] m_pThreadArgStore; delete[] m_pThreadIDs; m_pThreadArgStore = 0; m_pThreadIDs = 0; SemaphoreClose( m_NewTaskSemaphore ); m_bHaveThreads = false; m_NumThreadsWaiting = 0; m_NumThreadsRunning = 0; } } bool TaskScheduler::TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ ) { // Run any tasks for this thread RunPinnedTasks( threadNum ); // check for tasks SubTaskSet subTask; bool bHaveTask = m_pPipesPerThread[ threadNum ].WriterTryReadFront( &subTask ); uint32_t threadToCheck = hintPipeToCheck_io_; uint32_t checkCount = 0; while( !bHaveTask && checkCount < m_NumThreads ) { threadToCheck = ( hintPipeToCheck_io_ + checkCount ) % m_NumThreads; if( threadToCheck != threadNum ) { bHaveTask = m_pPipesPerThread[ threadToCheck ].ReaderTryReadBack( &subTask ); } ++checkCount; } if( bHaveTask ) { // update hint, will preserve value unless actually got task from another thread. 
hintPipeToCheck_io_ = threadToCheck; uint32_t partitionSize = subTask.partition.end - subTask.partition.start; if( subTask.pTask->m_RangeToRun < partitionSize ) { SubTaskSet taskToRun = SplitTask( subTask, subTask.pTask->m_RangeToRun ); SplitAndAddTask( threadNum, subTask, subTask.pTask->m_RangeToRun ); taskToRun.pTask->ExecuteRange( taskToRun.partition, threadNum ); AtomicAdd( &taskToRun.pTask->m_RunningCount, -1 ); } else { // the task has already been divided up by AddTaskSetToPipe, so just run it subTask.pTask->ExecuteRange( subTask.partition, threadNum ); AtomicAdd( &subTask.pTask->m_RunningCount, -1 ); } } return bHaveTask; } void TaskScheduler::WaitForTasks( uint32_t threadNum ) { // We incrememt the number of threads waiting here in order // to ensure that the check for tasks occurs after the increment // to prevent a task being added after a check, then the thread waiting. // This will occasionally result in threads being mistakenly awoken, // but they will then go back to sleep. AtomicAdd( &m_NumThreadsWaiting, 1 ); bool bHaveTasks = false; for( uint32_t thread = 0; thread < m_NumThreads; ++thread ) { if( !m_pPipesPerThread[ thread ].IsPipeEmpty() ) { bHaveTasks = true; break; } } if( !bHaveTasks && !m_pPinnedTaskListPerThread[ threadNum ].IsListEmpty() ) { bHaveTasks = true; } if( !bHaveTasks ) { SafeCallback( m_ProfilerCallbacks.waitStart, threadNum ); SemaphoreWait( m_NewTaskSemaphore ); SafeCallback( m_ProfilerCallbacks.waitStop, threadNum ); } AtomicAdd( &m_NumThreadsWaiting, -1 ); } void TaskScheduler::WakeThreads( int32_t maxToWake_ ) { if( maxToWake_ > 0 && maxToWake_ < m_NumThreadsWaiting ) { SemaphoreSignal( m_NewTaskSemaphore, maxToWake_ ); } else { SemaphoreSignal( m_NewTaskSemaphore, m_NumThreadsWaiting ); } } void TaskScheduler::SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_, uint32_t rangeToSplit_ ) { while( subTask_.partition.start != subTask_.partition.end ) { SubTaskSet taskToAdd = SplitTask( subTask_, rangeToSplit_ ); // add the partition to the pipe AtomicAdd( &subTask_.pTask->m_RunningCount, 1 ); if( !m_pPipesPerThread[ threadNum_ ].WriterTryWriteFront( taskToAdd ) ) { // alter range to run the appropriate fraction if( taskToAdd.pTask->m_RangeToRun < rangeToSplit_ ) { taskToAdd.partition.end = taskToAdd.partition.start + taskToAdd.pTask->m_RangeToRun; subTask_.partition.start = taskToAdd.partition.end; } taskToAdd.pTask->ExecuteRange( taskToAdd.partition, threadNum_ ); AtomicAdd( &subTask_.pTask->m_RunningCount, -1 ); } else { WakeThreads( 1 ); } } } void TaskScheduler::AddTaskSetToPipe( ITaskSet* pTaskSet ) { pTaskSet->m_RunningCount = 0; // divide task up and add to pipe pTaskSet->m_RangeToRun = pTaskSet->m_SetSize / m_NumPartitions; if( pTaskSet->m_RangeToRun < pTaskSet->m_MinRange ) { pTaskSet->m_RangeToRun = pTaskSet->m_MinRange; } uint32_t rangeToSplit = pTaskSet->m_SetSize / m_NumInitialPartitions; if( rangeToSplit < pTaskSet->m_MinRange ) { rangeToSplit = pTaskSet->m_MinRange; } SubTaskSet subTask; subTask.pTask = pTaskSet; subTask.partition.start = 0; subTask.partition.end = pTaskSet->m_SetSize; SplitAndAddTask( gtl_threadNum, subTask, rangeToSplit ); } void TaskScheduler::AddPinnedTask( IPinnedTask* pTask_ ) { pTask_->m_RunningCount = 1; m_pPinnedTaskListPerThread[ pTask_->threadNum ].WriterWriteFront( pTask_ ); WakeThreads(); } void TaskScheduler::RunPinnedTasks() { uint32_t threadNum = gtl_threadNum; RunPinnedTasks( threadNum ); } void TaskScheduler::RunPinnedTasks( uint32_t threadNum ) { IPinnedTask* pPinnedTaskSet = NULL; do { 
pPinnedTaskSet = m_pPinnedTaskListPerThread[ threadNum ].ReaderReadBack(); if( pPinnedTaskSet ) { pPinnedTaskSet->Execute(); pPinnedTaskSet->m_RunningCount = 0; } } while( pPinnedTaskSet ); } void TaskScheduler::WaitforTask( const ICompletable* pCompletable_ ) { uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped. if( pCompletable_ ) { while( pCompletable_->m_RunningCount ) { TryRunTask( gtl_threadNum, hintPipeToCheck_io ); // should add a spin then wait for task completion event. } } else { TryRunTask( gtl_threadNum, hintPipeToCheck_io ); } } void TaskScheduler::WaitforAll() { bool bHaveTasks = true; uint32_t hintPipeToCheck_io = gtl_threadNum + 1; // does not need to be clamped. int32_t threadsRunning = m_NumThreadsRunning - 1; while( bHaveTasks || m_NumThreadsWaiting < threadsRunning ) { bHaveTasks = TryRunTask( gtl_threadNum, hintPipeToCheck_io ); if( !bHaveTasks ) { for( uint32_t thread = 0; thread < m_NumThreads; ++thread ) { if( !m_pPipesPerThread[ thread ].IsPipeEmpty() ) { bHaveTasks = true; break; } } } } } void TaskScheduler::WaitforAllAndShutdown() { WaitforAll(); StopThreads(true); delete[] m_pPipesPerThread; m_pPipesPerThread = 0; delete[] m_pPinnedTaskListPerThread; m_pPinnedTaskListPerThread = 0; } uint32_t TaskScheduler::GetNumTaskThreads() const { return m_NumThreads; } TaskScheduler::TaskScheduler() : m_pPipesPerThread(NULL) , m_pPinnedTaskListPerThread(NULL) , m_NumThreads(0) , m_pThreadArgStore(NULL) , m_pThreadIDs(NULL) , m_bRunning(false) , m_NumThreadsRunning(0) , m_NumThreadsWaiting(0) , m_NumPartitions(0) , m_bHaveThreads(false) { memset(&m_ProfilerCallbacks, 0, sizeof(m_ProfilerCallbacks)); } TaskScheduler::~TaskScheduler() { #ifndef _WIN32 WaitforAllAndShutdown(); #endif } void TaskScheduler::Initialize( uint32_t numThreads_ ) { assert( numThreads_ ); StopThreads( true ); // Stops threads, waiting for them. delete[] m_pPipesPerThread; delete[] m_pPinnedTaskListPerThread; m_NumThreads = numThreads_; m_pPipesPerThread = new TaskPipe[ m_NumThreads ]; m_pPinnedTaskListPerThread = new PinnedTaskList[ m_NumThreads ]; StartThreads(); } void TaskScheduler::Initialize() { Initialize( GetNumHardwareThreads() ); } ospray-rkcommon-538f8a2/rkcommon/tasking/detail/enkiTS/TaskScheduler.h000066400000000000000000000232511456117377200260720ustar00rootroot00000000000000// Copyright (c) 2013 Doug Binks // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. 
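// A minimal usage sketch of the interface declared below (illustrative only;
// ParallelWork and its body are hypothetical):
//
//   struct ParallelWork : enki::ITaskSet
//   {
//       ParallelWork( uint32_t n ) : ITaskSet( n ) {}
//       void ExecuteRange( enki::TaskSetPartition range, uint32_t threadnum ) override
//       {
//           for( uint32_t i = range.start; i < range.end; ++i ) { /* work on item i */ }
//       }
//   };
//
//   enki::TaskScheduler ts;
//   ts.Initialize();                // creates GetNumHardwareThreads()-1 worker threads
//   ParallelWork task( 1024 );
//   ts.AddTaskSetToPipe( &task );   // may run parts of the task inline if the pipe is full
//   ts.WaitforTask( &task );        // helps run tasks until task.GetIsComplete()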
#pragma once #include #include "Threads.h" #include "Atomics.h" #if defined(_WIN32) && defined(ENKITS_BUILD_DLL) // Building enkiTS as a DLL #define ENKITS_API __declspec(dllexport) #elif defined(_WIN32) && defined(ENKITS_DLL) // Using enkiTS as a DLL #define ENKITS_API __declspec(dllimport) #elif defined(__GNUC__) && defined(ENKITS_BUILD_DLL) // Building enkiTS as a shared library #define ENKITS_API __attribute__((visibility("default"))) #else #define ENKITS_API #endif namespace enki { struct TaskSetPartition { uint32_t start; uint32_t end; }; class TaskScheduler; class TaskPipe; class PinnedTaskList; struct ThreadArgs; struct SubTaskSet; // ICompletable is a base class used to check for completion. // Do not use this class directly, instead derive from ITaskSet or IPinnedTask. class ICompletable { public: ICompletable() : m_RunningCount(0) {} virtual ~ICompletable() = default; bool GetIsComplete() { bool bRet = ( 0 == m_RunningCount ); BASE_MEMORYBARRIER_ACQUIRE(); return bRet; } private: friend class TaskScheduler; volatile int32_t m_RunningCount; }; // Subclass ITaskSet to create tasks. // TaskSets can be re-used, but check completion first. class ITaskSet : public ICompletable { public: ITaskSet() : m_SetSize(1) , m_MinRange(1) , m_RangeToRun(1) {} ITaskSet( uint32_t setSize_ ) : m_SetSize( setSize_ ) , m_MinRange(1) , m_RangeToRun(1) {} ITaskSet( uint32_t setSize_, uint32_t minRange_ ) : m_SetSize( setSize_ ) , m_MinRange( minRange_ ) , m_RangeToRun(minRange_) {} virtual ~ITaskSet() override = default; // Execute range should be overloaded to process tasks. It will be called with a // range_ where range.start >= 0; range.start < range.end; and range.end < m_SetSize; // The range values should be mapped so that linearly processing them in order is cache friendly // i.e. neighbouring values should be close together. // threadnum should not be used for changing processing of data, it's intended purpose // is to allow per-thread data buckets for output. virtual void ExecuteRange( TaskSetPartition range, uint32_t threadnum ) = 0; // Size of set - usually the number of data items to be processed, see ExecuteRange. Defaults to 1 uint32_t m_SetSize; // Minimum size of of TaskSetPartition range when splitting a task set into partitions. // This should be set to a value which results in computation effort of at least 10k // clock cycles to minimize tast scheduler overhead. // NOTE: The last partition will be smaller than m_MinRange if m_SetSize is not a multiple // of m_MinRange. // Also known as grain size in literature. uint32_t m_MinRange; private: friend class TaskScheduler; uint32_t m_RangeToRun; }; // Subclass IPinnedTask to create tasks which cab be run on a given thread only. class IPinnedTask : public ICompletable { public: IPinnedTask() : threadNum(0), pNext(NULL) {} // default is to run a task on main thread IPinnedTask( uint32_t threadNum_ ) : threadNum(threadNum_), pNext(NULL) {} // default is to run a task on main thread virtual ~IPinnedTask() override = default; // IPinnedTask needs to be non abstract for intrusive list functionality. // Should never be called as should be overridden. virtual void Execute() { assert(false); } uint32_t threadNum; // thread to run this pinned task on IPinnedTask* volatile pNext; // Do not use. For intrusive list only. 
}; // TaskScheduler implements several callbacks intended for profilers typedef void (*ProfilerCallbackFunc)( uint32_t threadnum_ ); struct ProfilerCallbacks { ProfilerCallbackFunc threadStart; ProfilerCallbackFunc threadStop; ProfilerCallbackFunc waitStart; ProfilerCallbackFunc waitStop; }; class TaskScheduler { public: ENKITS_API TaskScheduler(); ENKITS_API ~TaskScheduler(); // Call either Initialize() or Initialize( numThreads_ ) before adding tasks. // Initialize() will create GetNumHardwareThreads()-1 threads, which is // sufficient to fill the system when including the main thread. // Initialize can be called multiple times - it will wait for completion // before re-initializing. ENKITS_API void Initialize(); // Initialize( numThreads_ ) - numThreads_ (must be > 0) // will create numThreads_-1 threads, as thread 0 is // the thread on which the initialize was called. ENKITS_API void Initialize( uint32_t numThreads_ ); // Adds the TaskSet to pipe and returns if the pipe is not full. // If the pipe is full, pTaskSet is run. // should only be called from main thread, or within a task ENKITS_API void AddTaskSetToPipe( ITaskSet* pTaskSet ); // Thread 0 is main thread, otherwise use threadNum ENKITS_API void AddPinnedTask( IPinnedTask* pTask_ ); // This function will run any IPinnedTask* for current thread, but not run other // Main thread should call this or use a wait to ensure it's tasks are run. ENKITS_API void RunPinnedTasks(); // Runs the TaskSets in pipe until true == pTaskSet->GetIsComplete(); // should only be called from thread which created the taskscheduler , or within a task // if called with 0 it will try to run tasks, and return if none available. ENKITS_API void WaitforTask( const ICompletable* pCompletable_ ); // WaitforTaskSet, deprecated interface use WaitforTask inline void WaitforTaskSet( const ICompletable* pCompletable_ ) { WaitforTask( pCompletable_ ); } // Waits for all task sets to complete - not guaranteed to work unless we know we // are in a situation where tasks aren't being continuosly added. ENKITS_API void WaitforAll(); // Waits for all task sets to complete and shutdown threads - not guaranteed to work unless we know we // are in a situation where tasks aren't being continuosly added. ENKITS_API void WaitforAllAndShutdown(); // Returns the number of threads created for running tasks + 1 // to account for the main thread. ENKITS_API uint32_t GetNumTaskThreads() const; // Returns the ProfilerCallbacks structure so that it can be modified to // set the callbacks. 
ENKITS_API ProfilerCallbacks* GetProfilerCallbacks(); private: static THREADFUNC_DECL TaskingThreadFunction( void* pArgs ); void WaitForTasks( uint32_t threadNum ); void RunPinnedTasks( uint32_t threadNum ); bool TryRunTask( uint32_t threadNum, uint32_t& hintPipeToCheck_io_ ); void StartThreads(); void StopThreads( bool bWait_ ); void SplitAndAddTask( uint32_t threadNum_, SubTaskSet subTask_, uint32_t rangeToSplit_ ); void WakeThreads( int32_t maxToWake_ = 0 ); TaskPipe* m_pPipesPerThread; PinnedTaskList* m_pPinnedTaskListPerThread; uint32_t m_NumThreads; ThreadArgs* m_pThreadArgStore; threadid_t* m_pThreadIDs; volatile bool m_bRunning; volatile int32_t m_NumThreadsRunning; volatile int32_t m_NumThreadsWaiting; uint32_t m_NumPartitions; uint32_t m_NumInitialPartitions; semaphoreid_t m_NewTaskSemaphore; bool m_bHaveThreads; ProfilerCallbacks m_ProfilerCallbacks; TaskScheduler( const TaskScheduler& nocopy ); TaskScheduler& operator=( const TaskScheduler& nocopy ); }; } ospray-rkcommon-538f8a2/rkcommon/tasking/detail/enkiTS/Threads.h000066400000000000000000000120401456117377200247150ustar00rootroot00000000000000// Copyright (c) 2013 Doug Binks // // This software is provided 'as-is', without any express or implied // warranty. In no event will the authors be held liable for any damages // arising from the use of this software. // // Permission is granted to anyone to use this software for any purpose, // including commercial applications, and to alter it and redistribute it // freely, subject to the following restrictions: // // 1. The origin of this software must not be misrepresented; you must not // claim that you wrote the original software. If you use this software // in a product, an acknowledgement in the product documentation would be // appreciated but is not required. // 2. Altered source versions must be plainly marked as such, and must not be // misrepresented as being the original software. // 3. This notice may not be removed or altered from any source distribution. 
#pragma once #include #include #ifdef _WIN32 #include "Atomics.h" #define NOMINMAX #define WIN32_LEAN_AND_MEAN #include #define THREADFUNC_DECL DWORD WINAPI #define THREAD_LOCAL __declspec( thread ) namespace enki { typedef HANDLE threadid_t; // declare the thread start function as: // THREADFUNC_DECL MyThreadStart( void* pArg ); inline bool ThreadCreate( threadid_t* returnid, DWORD ( WINAPI *StartFunc) (void* ), void* pArg ) { // posix equiv pthread_create DWORD threadid; *returnid = CreateThread( 0, 0, StartFunc, pArg, 0, &threadid ); return *returnid != NULL; } inline bool ThreadTerminate( threadid_t threadid ) { // posix equiv pthread_cancel return CloseHandle( threadid ) == 0; } inline uint32_t GetNumHardwareThreads() { SYSTEM_INFO sysInfo; GetSystemInfo(&sysInfo); return sysInfo.dwNumberOfProcessors; } } #else // posix #include #include #define THREADFUNC_DECL void* #define THREAD_LOCAL __thread namespace enki { typedef pthread_t threadid_t; // declare the thread start function as: // THREADFUNC_DECL MyThreadStart( void* pArg ); inline bool ThreadCreate( threadid_t* returnid, void* ( *StartFunc) (void* ), void* pArg ) { // posix equiv pthread_create int32_t retval = pthread_create( returnid, NULL, StartFunc, pArg ); return retval == 0; } inline bool ThreadTerminate( threadid_t threadid ) { // posix equiv pthread_cancel pthread_cancel( threadid ); return pthread_detach( threadid ) == 0; } inline uint32_t GetNumHardwareThreads() { return (uint32_t)sysconf( _SC_NPROCESSORS_ONLN ); } } #endif // posix // Semaphore implementation #ifdef _WIN32 namespace enki { struct semaphoreid_t { HANDLE sem; }; inline void SemaphoreCreate( semaphoreid_t& semaphoreid ) { semaphoreid.sem = CreateSemaphore(NULL, 0, MAXLONG, NULL ); } inline void SemaphoreClose( semaphoreid_t& semaphoreid ) { CloseHandle( semaphoreid.sem ); } inline void SemaphoreWait( semaphoreid_t& semaphoreid ) { DWORD retval = WaitForSingleObject( semaphoreid.sem, INFINITE ); assert( retval != WAIT_FAILED ); } inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ) { if( countWaiting ) { ReleaseSemaphore( semaphoreid.sem, countWaiting, NULL ); } } } #elif defined(__MACH__) // OS X does not have POSIX semaphores // see https://developer.apple.com/library/content/documentation/Darwin/Conceptual/KernelProgramming/synchronization/synchronization.html #include namespace enki { struct semaphoreid_t { semaphore_t sem; }; inline void SemaphoreCreate( semaphoreid_t& semaphoreid ) { semaphore_create( mach_task_self(), &semaphoreid.sem, SYNC_POLICY_FIFO, 0 ); } inline void SemaphoreClose( semaphoreid_t& semaphoreid ) { semaphore_destroy( mach_task_self(), semaphoreid.sem ); } inline void SemaphoreWait( semaphoreid_t& semaphoreid ) { semaphore_wait( semaphoreid.sem ); } inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ) { while( countWaiting-- > 0 ) { semaphore_signal( semaphoreid.sem ); } } } #else // POSIX #include namespace enki { struct semaphoreid_t { sem_t sem; }; inline void SemaphoreCreate( semaphoreid_t& semaphoreid ) { sem_init( &semaphoreid.sem, 0, 0 ); } inline void SemaphoreClose( semaphoreid_t& semaphoreid ) { sem_destroy( &semaphoreid.sem ); } inline void SemaphoreWait( semaphoreid_t& semaphoreid ) { sem_wait( &semaphoreid.sem ); } inline void SemaphoreSignal( semaphoreid_t& semaphoreid, int32_t countWaiting ) { while( countWaiting-- > 0 ) { sem_post( &semaphoreid.sem ); } } } #endif 
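// ---------------------------------------------------------------------------
// Illustrative sketch (not part of the original sources): using the thread and
// semaphore wrappers defined above. The worker is declared with
// THREADFUNC_DECL so the same code builds against both the Win32 and the
// POSIX branch of this header; the include path is assumed.
#include "Threads.h" // assumed path to the header above

static enki::semaphoreid_t g_exampleSemaphore;

// Thread entry point, declared exactly as the comment above ThreadCreate()
// recommends.
THREADFUNC_DECL ExampleWorker(void * /*pArg_*/)
{
  // ... do some work ...
  enki::SemaphoreSignal(g_exampleSemaphore, 1); // wake one waiter
  return 0;
}

void exampleLaunchWorker()
{
  enki::SemaphoreCreate(g_exampleSemaphore);

  enki::threadid_t tid;
  if (enki::ThreadCreate(&tid, ExampleWorker, nullptr)) {
    enki::SemaphoreWait(g_exampleSemaphore); // block until the worker signals
    enki::ThreadTerminate(tid); // CloseHandle on Win32, cancel+detach on POSIX
  }

  enki::SemaphoreClose(g_exampleSemaphore);
}
// ---------------------------------------------------------------------------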
ospray-rkcommon-538f8a2/rkcommon/tasking/detail/parallel_for.inl000066400000000000000000000022131456117377200251240ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #ifdef RKCOMMON_TASKING_TBB # define __TBB_NO_IMPLICIT_LINKAGE 1 # define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 # include #elif defined(RKCOMMON_TASKING_INTERNAL) # include "TaskSys.h" #endif namespace rkcommon { namespace tasking { namespace detail { template inline void parallel_for_impl(INDEX_T nTasks, TASK_T&& fcn) { #ifdef RKCOMMON_TASKING_TBB tbb::parallel_for(INDEX_T(0), nTasks, std::forward(fcn)); #elif defined(RKCOMMON_TASKING_OMP) # pragma omp parallel for schedule(dynamic) for (INDEX_T taskIndex = 0; taskIndex < nTasks; ++taskIndex) { fcn(taskIndex); } #elif defined(RKCOMMON_TASKING_INTERNAL) detail::parallel_for_internal(nTasks, std::forward(fcn)); #else // Debug (no tasking system) for (INDEX_T taskIndex = 0; taskIndex < nTasks; ++taskIndex) { fcn(taskIndex); } #endif } } // ::rkcommon::tasking::detail } // ::rkcommon::tasking } // ::rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/detail/schedule.inl000066400000000000000000000017271456117377200242670ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #ifdef RKCOMMON_TASKING_TBB # define __TBB_NO_IMPLICIT_LINKAGE 1 # define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 # include "tbb/task_arena.h" #elif defined(RKCOMMON_TASKING_OMP) # include #elif defined(RKCOMMON_TASKING_INTERNAL) # include "TaskSys.h" #endif namespace rkcommon { namespace tasking { namespace detail { template inline void schedule_impl(TASK_T fcn) { #ifdef RKCOMMON_TASKING_TBB tbb::task_arena ta = tbb::task_arena(tbb::task_arena::attach()); ta.enqueue(fcn); #elif defined(RKCOMMON_TASKING_OMP) std::thread thread(fcn); thread.detach(); #elif defined(RKCOMMON_TASKING_INTERNAL) detail::schedule_internal(std::move(fcn)); #else// Debug --> synchronous! 
fcn(); #endif } } // ::rkcommon::tasking::detail } // ::rkcommon::tasking } // ::rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/detail/tasking_system_init.cpp000066400000000000000000000060111456117377200265510ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "../tasking_system_init.h" // tasking system internals #if defined(RKCOMMON_TASKING_TBB) #define __TBB_NO_IMPLICIT_LINKAGE 1 #define __TBBMALLOC_NO_IMPLICIT_LINKAGE 1 #define TBB_PREVIEW_GLOBAL_CONTROL 1 #include #elif defined(RKCOMMON_TASKING_OMP) #include #elif defined(RKCOMMON_TASKING_INTERNAL) #include "TaskSys.h" #endif // std #include // intrinsics #ifndef RKCOMMON_NO_SIMD #if !defined(__ARM_NEON) #include #elif !defined(_WIN32) #include "math/arm/emulation.h" #endif /* normally defined in pmmintrin.h, but we always need this */ #if !defined(_MM_SET_DENORMALS_ZERO_MODE) #define _MM_DENORMALS_ZERO_ON (0x0040) #define _MM_DENORMALS_ZERO_OFF (0x0000) #define _MM_DENORMALS_ZERO_MASK (0x0040) #define _MM_SET_DENORMALS_ZERO_MODE(x) \ (_mm_setcsr((_mm_getcsr() & ~_MM_DENORMALS_ZERO_MASK) | (x))) #endif #else #if !defined(_MM_SET_DENORMALS_ZERO_MODE) #define _MM_SET_FLUSH_ZERO_MODE(x) \ do { \ } while (0) #define _MM_SET_DENORMALS_ZERO_MODE(x) \ do { \ } while (0) #endif #endif // rkcommon #include "../../common.h" namespace rkcommon { namespace tasking { struct tasking_system_handle { tasking_system_handle(int numThreads) : numThreads(numThreads) { #if defined(RKCOMMON_TASKING_TBB) if (numThreads > 0) tbb_gc = make_unique( tbb::global_control::max_allowed_parallelism, numThreads); #elif defined(RKCOMMON_TASKING_OMP) if (numThreads > 0) omp_set_num_threads(numThreads); #elif defined(RKCOMMON_TASKING_INTERNAL) detail::initTaskSystemInternal(numThreads <= 0 ? -1 : numThreads); #endif } int num_threads() { #if defined(RKCOMMON_TASKING_TBB) return tbb::global_control::active_value( tbb::global_control::max_allowed_parallelism); #elif defined(RKCOMMON_TASKING_OMP) return omp_get_max_threads(); #elif defined(RKCOMMON_TASKING_INTERNAL) return detail::numThreadsTaskSystemInternal(); #else return 1; #endif } int numThreads{-1}; #if defined(RKCOMMON_TASKING_TBB) std::unique_ptr tbb_gc; #endif }; static std::unique_ptr g_tasking_handle; void initTaskingSystem(int numThreads, bool flushDenormals) { if (flushDenormals) { _MM_SET_FLUSH_ZERO_MODE(_MM_FLUSH_ZERO_ON); _MM_SET_DENORMALS_ZERO_MODE(_MM_DENORMALS_ZERO_ON); } g_tasking_handle = make_unique(numThreads); } int numTaskingThreads() { if (!g_tasking_handle.get()) return 0; else return g_tasking_handle->num_threads(); } } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/parallel_for.h000066400000000000000000000062711456117377200233370ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../traits/rktraits.h" #include "detail/parallel_for.inl" #include namespace rkcommon { namespace tasking { // NOTE(jda) - This abstraction wraps "fork-join" parallelism, with an // implied synchronizsation after all of the tasks have run. 
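// ---------------------------------------------------------------------------
// Illustrative usage sketch (standalone, not part of this header) for the
// parallel_for() and parallel_in_blocks_of() wrappers declared just below.
// The include path is assumed, and parallel_in_blocks_of() is assumed to take
// its block size as the leading template parameter (the template headers in
// this excerpt have lost their parameter lists).
#include "rkcommon/tasking/parallel_for.h" // assumed path

#include <vector>

inline void scaleValues(std::vector<float> &values, float scale)
{
  // One task per element; the call returns only after every index has been
  // processed (the implied synchronization mentioned above).
  rkcommon::tasking::parallel_for(values.size(), [&](size_t i) {
    values[i] *= scale;
  });

  // Same data, but grouped into blocks of 64 indices: blocks run in parallel
  // while each block is processed serially by one task.
  rkcommon::tasking::parallel_in_blocks_of<64>(
      values.size(), [&](size_t begin, size_t end) {
        for (size_t i = begin; i < end; ++i)
          values[i] += 1.f;
      });
}
// ---------------------------------------------------------------------------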
template inline void parallel_for(INDEX_T nTasks, TASK_T &&fcn) { using namespace traits; static_assert(is_valid_index::value, "rkcommon::tasking::parallel_for() requires the type" " INDEX_T to be unsigned char, short, int, uint, long," " long long, unsigned long long or size_t."); static_assert(has_operator_method_matching_param::value, "rkcommon::tasking::parallel_for() requires the " "implementation of method " "'void TASK_T::operator(P taskIndex), where P is of " "type INDEX_T [first parameter of parallel_for()]."); detail::parallel_for_impl(nTasks, std::forward(fcn)); } // NOTE(jda) - Allow serial version of parallel_for() without the need to // change the entire tasking system backend template inline void serial_for(INDEX_T nTasks, const TASK_T &fcn) { using namespace traits; static_assert(is_valid_index::value, "rkcommon::tasking::serial_for() requires the type" " INDEX_T to be unsigned char, short, int, uint, long," " long long, unsigned long long or size_t."); static_assert(has_operator_method_matching_param::value, "rkcommon::tasking::serial_for() requires the " "implementation of method " "'void TASK_T::operator(P taskIndex), where P is of " "type INDEX_T [first parameter of serial_for()]."); for (INDEX_T taskIndex = 0; taskIndex < nTasks; ++taskIndex) fcn(taskIndex); } /* NOTE(iw) - This abstraction extends the 'parallel_for' to mixed parallel/serial: we logically view the domain of N input tasks as grouped into roundUp(N/M) blocks of (at most) M items each; then 'itearte over the N/M blocks in parallel, and process each block serailly */ template inline void parallel_in_blocks_of(INDEX_T nTasks, TASK_T &&fcn) { using namespace traits; static_assert(is_valid_index::value, "rkcommon::tasking::parallel_for() requires the type" " INDEX_T to be unsigned char, short, int, uint, long," " or size_t."); INDEX_T numBlocks = (nTasks + BLOCK_SIZE - 1) / BLOCK_SIZE; parallel_for(numBlocks, [&](INDEX_T blockID) { INDEX_T begin = blockID * (INDEX_T)BLOCK_SIZE; INDEX_T end = std::min(begin + (INDEX_T)BLOCK_SIZE, nTasks); fcn(begin, end); }); } } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/parallel_foreach.h000066400000000000000000000020071456117377200241510ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "parallel_for.h" #include namespace rkcommon { namespace tasking { template inline void parallel_foreach(ITERATOR_T begin, ITERATOR_T end, TASK_T &&f) { using ITERATOR_KIND = typename std::iterator_traits::iterator_category; static_assert( std::is_same::value, "rkcommon::tasking::parallel_foreach() requires random-" "access iterators!"); const size_t count = std::distance(begin, end); auto *v = &(*begin); parallel_for(count, [&](size_t i) { f(v[i]); }); } template inline void parallel_foreach(CONTAINER_T &&c, TASK_T &&f) { parallel_foreach(std::begin(c), std::end(c), std::forward(f)); } } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/schedule.h000066400000000000000000000017371456117377200224730ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../traits/rktraits.h" #include "detail/schedule.inl" namespace rkcommon { namespace tasking { // NOTE(jda) - This abstraction takes a lambda which should take captured // variables by *value* to ensure no captured references race // with the task itself. 
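// ---------------------------------------------------------------------------
// Illustrative usage sketch (standalone, not part of this header) for
// parallel_foreach() declared above and schedule() declared just below. As
// the note above recommends, the scheduled lambda captures by value so the
// fire-and-forget task cannot race on dangling references. Include paths are
// assumed.
#include "rkcommon/tasking/parallel_foreach.h" // assumed paths
#include "rkcommon/tasking/schedule.h"

#include <memory>
#include <vector>

inline void exampleForeachAndSchedule(std::vector<int> &values)
{
  // Visit every element of a random-access range in parallel.
  rkcommon::tasking::parallel_foreach(values, [](int &v) { v += 1; });

  // Run some background work asynchronously; the shared_ptr is captured by
  // value, keeping the snapshot alive until the task has finished.
  auto snapshot = std::make_shared<std::vector<int>>(values);
  rkcommon::tasking::schedule([snapshot]() {
    // ... write 'snapshot' to disk, send it over the network, etc. ...
  });
}
// ---------------------------------------------------------------------------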
// NOTE(jda) - No priority is associated with this call, but could be added // later with a hint enum, using a default value for the // priority to not require specifying it. template inline void schedule(TASK_T fcn) { static_assert(traits::has_operator_method::value, "rkcommon::tasking::schedule() requires the " "implementation of method 'void TASK_T::operator()'."); detail::schedule_impl(std::move(fcn)); } } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tasking/tasking_system_init.h000066400000000000000000000006301456117377200247550ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" namespace rkcommon { namespace tasking { void RKCOMMON_INTERFACE initTaskingSystem(int numThreads = -1, bool flushDenormals = false); int RKCOMMON_INTERFACE numTaskingThreads(); } // namespace tasking } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tracing/000077500000000000000000000000001456117377200205055ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/tracing/Tracing.cpp000066400000000000000000000254661456117377200226150ustar00rootroot00000000000000// Copyright 2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include #include #include #include #include #include #include #ifdef _WIN32 #include #endif // We force the define on here to build the right header // at compile time, but apps that build with profiling off // will see the empty defines #define RKCOMMON_ENABLE_PROFILING #include "Tracing.h" #define THREAD_EVENT_CHUNK_SIZE 8192 namespace rkcommon { namespace tracing { using namespace std::chrono; static std::unique_ptr traceRecorder = rkcommon::make_unique(); static thread_local std::shared_ptr threadEventList = nullptr; std::ostream &operator<<(std::ostream &os, const EventType &ty) { switch (ty) { case EventType::INVALID: os << "INVALID"; break; case EventType::BEGIN: os << "B"; break; case EventType::END: os << "E"; break; case EventType::MARKER: os << "i"; break; case EventType::COUNTER: os << "C"; break; default: break; } return os; } TraceEvent::TraceEvent() { #ifdef __linux__ rusage usage; getrusage(RUSAGE_SELF, &usage); ru_utime = usage.ru_utime; ru_stime = usage.ru_stime; #endif time = steady_clock::now(); } TraceEvent::TraceEvent(const EventType ty) : TraceEvent() { type = ty; } TraceEvent::TraceEvent(const EventType type, const char *n, const char *c) : TraceEvent(type) { name = n; category = c; } TraceEvent::TraceEvent( const EventType type, const char *name, const uint64_t value) : TraceEvent(type, name, nullptr) { counterValue = value; } void ThreadEventList::beginEvent(const char *name, const char *category) { getCurrentEventList().push_back(TraceEvent( EventType::BEGIN, getCachedString(name), getCachedString(category))); } void ThreadEventList::endEvent() { getCurrentEventList().push_back(TraceEvent(EventType::END)); } void ThreadEventList::setMarker(const char *name, const char *category) { getCurrentEventList().push_back(TraceEvent( EventType::MARKER, getCachedString(name), getCachedString(category))); } void ThreadEventList::setCounter(const char *name, const uint64_t counterValue) { getCurrentEventList().push_back( TraceEvent(EventType::COUNTER, getCachedString(name), counterValue)); } std::vector &ThreadEventList::getCurrentEventList() { if (events.empty() || events.back().size() >= THREAD_EVENT_CHUNK_SIZE) { events.push_back(std::vector()); events.back().reserve(THREAD_EVENT_CHUNK_SIZE); } return events.back(); } const char 
*ThreadEventList::getCachedString(const char *str) { if (!str) { return nullptr; } // Lookup string in the uniqueEventNames list, since most strings are likely // to just be static/constant data strings (e.g., rkTraceBeginEvent("X")) // this caching is just based on the pointer and we skip doing more // expensive string comparison. Dynamically generated strings will likely // have different ptrs, though this will be wrong if some memory is // re-used with different text content. auto fnd = stringCache.find(str); if (fnd == stringCache.end()) { auto en = std::make_shared(str); stringCache[str] = en; return en->c_str(); } return fnd->second->c_str(); } std::shared_ptr TraceRecorder::getThreadTraceList( const std::thread::id &id) { std::lock_guard lock(threadTraceMutex); auto fnd = threadTrace.find(id); if (fnd == threadTrace.end()) { auto threadEventList = std::make_shared(); threadTrace[id] = threadEventList; return threadEventList; } return fnd->second; } void TraceRecorder::saveLog(const char *logFile, const char *processName) { std::lock_guard lock(threadTraceMutex); // chrome:://tracing / ui.perfetto.dev takes a JSON array of events, but to // keep dependencies down we don't need a JSON library to produce this simple // format std::ofstream fout(logFile); #ifdef _WIN32 const int pid = _getpid(); #else const int pid = getpid(); #endif fout << "["; // Emit metadata about the process name if (processName) { // Emit metadata event for the thread's ID/name fout << "{" << "\"ph\": \"M\"," << "\"pid\":" << pid << "," << "\"tid\":" << 0 << "," << "\"name\":" << "\"process_name\"," << "\"args\":{\"name\":\"" << processName << "\"}" << "},"; } // Go through each thread and output its data // We renumber thread IDs here because chrome:://tracing UI doesn't display // the true thread ID numbers well int nextTid = 0; for (const auto &trace : threadTrace) { const std::thread::id tid = trace.first; // Emit metadata event for the thread's ID/name fout << "{" << "\"ph\": \"M\"," << "\"pid\":" << pid << "," << "\"tid\":" << nextTid << "," << "\"name\":" << "\"thread_name\"," << "\"args\":{\"name\":\""; if (!trace.second->threadName.empty()) { fout << trace.second->threadName << "\"}"; } else { fout << tid << "\"}"; } fout << "},"; // Track the begin events so that when we hit an end we can compute CPU % // and other stats to include std::stack beginEvents; for (const auto &evtChunk : trace.second->events) { for (const auto &evt : evtChunk) { if (evt.type == EventType::INVALID) { std::cerr << "Got invalid event type!?\n"; } if (evt.type == EventType::BEGIN) { beginEvents.push(&evt); } if (evt.type == EventType::END && beginEvents.empty()) { std::cerr << "Tracing Error: Too many rkTraceEndEvent calls!\n"; break; } const uint64_t timestamp = std::chrono::duration_cast( evt.time.time_since_epoch()) .count(); fout << "{" << "\"ph\": \"" << evt.type << "\"," << "\"pid\":" << pid << "," << "\"tid\":" << nextTid << "," << "\"ts\":" << timestamp << "," << "\"name\":\"" << (evt.name ? 
evt.name : "") << "\""; if (evt.type != EventType::END && evt.category) { fout << ",\"cat\":\"" << evt.category << "\""; } // Compute CPU utilization % over the begin/end interval for end events float utilization = 0.f; uint64_t duration = 0; const TraceEvent *begin = nullptr; if (evt.type == EventType::END) { begin = beginEvents.top(); utilization = cpuUtilization(*begin, evt); duration = std::chrono::duration_cast( evt.time - begin->time) .count(); fout << ",\"args\":{\"cpuUtilization\":" << utilization << "}"; beginEvents.pop(); } else if (evt.type == EventType::COUNTER) { fout << ",\"args\":{\"value\":" << evt.counterValue << "}"; } fout << "},"; // For each end event also emit an update of the CPU % utilization // counter for events that were long enough to reasonably measure // utilization. CPU % is emitted at the time of the beginning of the // event to display the counter properly over the interval if (evt.type == EventType::END && duration > 100 && begin) { const uint64_t beginTimestamp = std::chrono::duration_cast( begin->time.time_since_epoch()) .count(); fout << "{" << "\"ph\": \"C\"," << "\"pid\":" << pid << "," << "\"tid\":" << nextTid << "," << "\"ts\":" << beginTimestamp << "," << "\"name\":\"cpuUtilization\"," << "\"cat\":\"builtin\"," << "\"args\":{\"value\":" << utilization << "}},"; } } } if (!beginEvents.empty()) { std::cerr << "Tracing Error: Missing end for some events!\n"; while (!beginEvents.empty()) { std::cerr << "\t" << beginEvents.top()->name << "\n"; beginEvents.pop(); } } ++nextTid; } // We need to remove the last , we output to ensure the JSON array is correct // Overwrite it with the ] character. fout.seekp(-1, std::ios::cur); fout << "]"; } float cpuUtilization(const TraceEvent &start, const TraceEvent &end) { #ifdef __linux__ const double elapsed_cpu = end.ru_utime.tv_sec + end.ru_stime.tv_sec - (start.ru_utime.tv_sec + start.ru_stime.tv_sec) + 1e-6f * (end.ru_utime.tv_usec + end.ru_stime.tv_usec - (start.ru_utime.tv_usec + start.ru_stime.tv_usec)); const double elapsed_wall = duration_cast>(end.time - start.time).count(); return elapsed_cpu / elapsed_wall * 100.0; #else return -1.f; #endif } std::string getProcStatus() { // Note: this file doesn't exist on OS X, would we want some alternative to // fetch this info? std::ifstream file("/proc/self/status"); if (!file.is_open()) { return ""; } return std::string( std::istreambuf_iterator(file), std::istreambuf_iterator()); } void getProcMemUse(uint64_t &virtMem, uint64_t &resMem) { virtMem = 0; resMem = 0; #ifdef __linux__ // TODO: Windows? 
FILE *file = std::fopen("/proc/self/statm", "r"); if (file) { // These values are measured in pages if (std::fscanf(file, "%lu %lu", &virtMem, &resMem) == 2) { const int pageSize = getpagesize(); virtMem *= pageSize; resMem *= pageSize; } std::fclose(file); } #endif } void initThreadEventList() { if (!threadEventList) { threadEventList = traceRecorder->getThreadTraceList(std::this_thread::get_id()); } } void beginEvent(const char *name, const char *category) { initThreadEventList(); threadEventList->beginEvent(name, category); } void endEvent() { // Begin takes care of getting the threadEventList set // in thread_local storage so we can assume it exists here threadEventList->endEvent(); } void setMarker(const char *name, const char *category) { initThreadEventList(); threadEventList->setMarker(name, category); } void setCounter(const char *name, uint64_t value) { initThreadEventList(); threadEventList->setCounter(name, value); } void recordMemUse() { initThreadEventList(); uint64_t virtMem = 0; uint64_t resMem = 0; getProcMemUse(virtMem, resMem); threadEventList->setCounter("rkTraceVirtMem_B", virtMem); threadEventList->setCounter("rkTraceRssMem_B", resMem); } void setThreadName(const char *name) { initThreadEventList(); threadEventList->threadName = name; } void saveLog(const char *logFile, const char *processName) { traceRecorder->saveLog(logFile, processName); } } // namespace tracing } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/tracing/Tracing.h000066400000000000000000000074561456117377200222610ustar00rootroot00000000000000// Copyright 2023 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include #include #include #include #include #include #ifdef __linux__ #include #include #include #include #include #endif #include "rkcommon/common.h" namespace rkcommon { namespace tracing { enum class EventType { INVALID, BEGIN, END, MARKER, COUNTER }; struct RKCOMMON_INTERFACE TraceEvent { EventType type = EventType::INVALID; // Refers to a string in the thread's stringCache, nullptr for end events const char *name = nullptr; // Refers to the event category in the thread's stringCache, may be null const char *category = nullptr; #ifdef __linux__ timeval ru_utime; timeval ru_stime; #endif std::chrono::steady_clock::time_point time; uint64_t counterValue = 0; TraceEvent(); TraceEvent(const EventType type); TraceEvent(const EventType type, const char *name, const char *category); TraceEvent( const EventType type, const char *name, const uint64_t counterValue); }; struct RKCOMMON_INTERFACE ThreadEventList { // We store events in chunks to reduce memory copy // costs when when tracking very large numbers of events std::list> events; std::string threadName; // Applications are typically running a rendering loop, emitting // the same event name repeatedly. 
If these names are inline // strings they will have the same pointer and we can cache // them in a map to reduce string copying costs and overhead // Note: the string is wrapped in a shared/unique ptr // to guard against copy ctor use when adding to the map which would // invalidate the pointer to the string data std::unordered_map> stringCache; void beginEvent(const char *name, const char *category); void endEvent(); void setMarker(const char *name, const char *category); void setCounter(const char *name, const uint64_t value); private: std::vector &getCurrentEventList(); const char *getCachedString(const char *str); }; class RKCOMMON_INTERFACE TraceRecorder { std::unordered_map> threadTrace; std::mutex threadTraceMutex; public: /* Get the thread trace list, creating it if this is the first time * this thread has requested its list. This call locks the TraceRecorder, * so threads cache the returned value in thread_local storage to avoid * calling this each event. */ std::shared_ptr getThreadTraceList( const std::thread::id &id); void saveLog(const char *logFile, const char *processName); }; float cpuUtilization(const TraceEvent &start, const TraceEvent &end); std::string getProcStatus(); // Begin an event, must be paired with an end event. Name is required, // category is optional void beginEvent(const char *name, const char *category); void endEvent(); // Set a marker in the trace timeline, e.g., for things that have no duration // Name is required, category is optional void setMarker(const char *name, const char *category); // Counter values are displayed per-process by chrome:://tracing // but are recorded per-thread without synchronization void setCounter(const char *name, uint64_t value); // Record the built-in counters traceVirtMem and traceRssMem tracking the // virtual and resident memory sizes respectively void recordMemUse(); void setThreadName(const char *name); void saveLog(const char *logFile, const char *processName); } // namespace tracing } // namespace rkcommon #ifdef RKCOMMON_ENABLE_PROFILING #define RKCOMMON_IF_TRACING_ENABLED(CMD) CMD #else #define RKCOMMON_IF_TRACING_ENABLED(CMD) #endif ospray-rkcommon-538f8a2/rkcommon/traits/000077500000000000000000000000001456117377200203645ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/traits/rktraits.h000066400000000000000000000110101456117377200223710ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include namespace rkcommon { namespace traits { using byte_t = unsigned char; // C++14 traits for C++11 ///////////////////////////////////////////////// template using enable_if_t = typename std::enable_if::type; // Helper operators /////////////////////////////////////////////////////// template std::true_type operator==(const T &, const Arg &); // type 'T' having '==' operator ////////////////////////////////////////// template struct HasOperatorEqualsT { enum { value = !std::is_same::value }; }; template using HasOperatorEquals = typename std::enable_if::value, TYPE>::type; template using NoOperatorEquals = typename std::enable_if::value, TYPE>::type; // type 'T' (decayed) is a valid parallel_for() index type //////////////// template struct is_valid_index { using TYPE = typename std::decay::type; enum { value = std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value }; }; // type 'T' implementing T::operator() 
//////////////////////////////////// // NOTE(jda) - This checks at compile time if T implements the method // 'void T::operator()'. template struct has_operator_method { using TASK_T = typename std::decay::type; template class checker; template static std::true_type test(checker *); template static std::false_type test(...); using type = decltype(test(nullptr)); static const bool value = std::is_same::value; }; // type 'T' implementing T::operator(P) with P being integral ///////////// #ifdef _WIN32 template using has_operator_method_matching_param = has_operator_method; #else // NOTE(jda) - This checks at compile time if T implements the method // 'void T::operator(P taskIndex)', where P matches the second // template parameter 'EXPECTED_PARAM_T' template struct has_operator_method_matching_param { using TASK_T = typename std::decay::type; template using t_param = void (TASK_T::*)(P) const; using operator_t = decltype(&TASK_T::operator()); using valid_param = std::is_same, operator_t>; static const bool value = has_operator_method::value && valid_param::value; }; #endif // type 'DERIVED' (decayed) comes from 'BASE' ///////////////////////////// template using is_base_of_t = enable_if_t< std::is_base_of::type>::value>; // type 'T' (decayed) is a class/struct /////////////////////////////////// template using is_class_t = enable_if_t::type>::value>; // type 'T1' and 'T2' are not the same //////////////////////////////////// template using is_not_same_t = enable_if_t::value>; // If a single type is convertible to another ///////////////////////////// template using can_convert = std::is_convertible; template using can_convert_t = enable_if_t::value>; // type 'T' is arithmetic ///////////////////////////////////////////////// template using is_arithmetic_t = enable_if_t::value>; } // namespace traits } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/000077500000000000000000000000001456117377200205615ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/utility/AbstractArray.h000066400000000000000000000050751456117377200235030ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include #include #include namespace rkcommon { namespace utility { /* 'AbstractArray' implements an array interface on a pointer * to data which may be owned by the object. 
*/ template struct AbstractArray { virtual ~AbstractArray() = default; size_t size() const; T &operator[](size_t offset) const; // with bounds checking T &at(size_t offset) const; operator bool() const; explicit operator T *() const; T *data() const; T *begin() const; T *end() const; const T *cbegin() const; const T *cend() const; protected: // Can only be constructed by child classes AbstractArray() = default; // Called by children to initialize the ptr/numItems values void setPtr(T *ptr, size_t numItems); private: T *ptr{nullptr}; size_t numItems{0}; }; // Inlined definitions //////////////////////////////////////////////////// template inline size_t AbstractArray::size() const { return numItems; } template inline T &AbstractArray::operator[](size_t offset) const { return *(begin() + offset); } template inline T &AbstractArray::at(size_t offset) const { if (offset >= size()) throw std::runtime_error("ArrayView: out of bounds access!"); return *(begin() + offset); } template inline AbstractArray::operator bool() const { return size() != 0; } template inline AbstractArray::operator T *() const { return begin(); } template inline T *AbstractArray::data() const { return begin(); } template inline T *AbstractArray::begin() const { return ptr; } template inline T *AbstractArray::end() const { return ptr + size(); } template inline const T *AbstractArray::cbegin() const { return begin(); } template inline const T *AbstractArray::cend() const { return end(); } template inline void AbstractArray::setPtr(T *ptr, size_t numItems) { this->ptr = numItems > 0 ? ptr : nullptr; this->numItems = numItems; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/Any.h000066400000000000000000000156371456117377200214750ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include "../common.h" #include "../traits/rktraits.h" #include "demangle.h" namespace rkcommon { namespace utility { /* 'Any' implements a single item container which erases its type (can hold * any value which is copyable). The value can be extracted successfully * only if the correct type is queried for the held value, where an * exception is thrown otherwise. Similar (but perhaps not identical to) * 'boost::any' or C++17's 'std::any'. 
* * Example: * * Any myAny = 1; // myAny is an 'int' w/ value of '1' * int value = myAny.get(); // get value of '1' out of myAny * char bad = myAny.get(); // throws exception */ struct Any { Any() = default; Any(const Any ©); template Any(T value); ~Any() = default; Any &operator=(const Any &rhs); template Any &operator=(T rhs); bool operator==(const Any &rhs) const; bool operator!=(const Any &rhs) const; template T &get(); template const T &get() const; template bool is() const; bool valid() const; std::string toString() const; private: // Helper types // struct handle_base { virtual ~handle_base() = default; virtual handle_base *clone() const = 0; virtual const std::type_info &valueTypeID() const = 0; virtual bool isSame(handle_base *other) const = 0; virtual void *data() = 0; }; template struct handle : public handle_base { handle(T value); handle_base *clone() const override; const std::type_info &valueTypeID() const override; bool isSame(handle_base *other) const override; void *data() override; T value; // NOTE(jda) - Use custom type trait to select a real implementation of // isSame(), or one that always returns 'false' if the // template type 'T' does not implement operator==() with // itself. template inline traits::HasOperatorEquals //<-- substitues to 'bool' isSameImpl(handle_base *other) const; template inline traits::NoOperatorEquals //<-- substitutes to 'bool' isSameImpl(handle_base *other) const; }; // Data members // std::unique_ptr currentValue; }; // Inlined Any definitions //////////////////////////////////////////////// template inline Any::Any(T value) : currentValue(new handle::type>( std::forward(value))) { static_assert(std::is_copy_constructible::value && std::is_copy_assignable::value, "Any can only be constructed with copyable values!"); } inline Any::Any(const Any ©) : currentValue(copy.valid() ? copy.currentValue->clone() : nullptr) { } inline Any &Any::operator=(const Any &rhs) { Any temp(rhs); currentValue = std::move(temp.currentValue); return *this; } template inline Any &Any::operator=(T rhs) { static_assert(std::is_copy_constructible::value && std::is_copy_assignable::value, "Any can only be assigned values which are copyable!"); currentValue = std::unique_ptr( new handle::type>( std::forward(rhs))); return *this; } inline bool Any::operator==(const Any &rhs) const { return currentValue->isSame(rhs.currentValue.get()); } inline bool Any::operator!=(const Any &rhs) const { return !(*this == rhs); } template inline T &Any::get() { if (!valid()) throw std::runtime_error("Can't query value from an empty Any!"); if (is()) return *(static_cast(currentValue->data())); else { std::stringstream msg; msg << "Incorrect type queried for Any!" << '\n'; msg << " queried type == " << nameOf() << '\n'; msg << " current type == " << demangle(currentValue->valueTypeID().name()) << '\n'; throw std::runtime_error(msg.str()); } } template inline const T &Any::get() const { if (!valid()) throw std::runtime_error("Can't query value from an empty Any!"); if (is()) return *(static_cast(currentValue->data())); else { std::stringstream msg; msg << "Incorrect type queried for Any!" 
<< '\n'; msg << " queried type == " << nameOf() << '\n'; msg << " current type == " << demangle(currentValue->valueTypeID().name()) << '\n'; throw std::runtime_error(msg.str()); } } template inline bool Any::is() const { return valid() && (strcmp(typeid(T).name(), currentValue->valueTypeID().name()) == 0); } inline bool Any::valid() const { return currentValue.get() != nullptr; } inline std::string Any::toString() const { std::stringstream retval; retval << "Any : (currently holds value of type) --> " << demangle(currentValue->valueTypeID().name()); return retval.str(); } template inline Any::handle::handle(T v) : value(std::move(v)) { } template inline Any::handle_base *Any::handle::clone() const { return new handle(value); } template inline const std::type_info &Any::handle::valueTypeID() const { return typeid(T); } template inline bool Any::handle::isSame(Any::handle_base *other) const { return isSameImpl(other); } template template inline traits::HasOperatorEquals Any::handle::isSameImpl( Any::handle_base *other) const { handle *otherHandle = dynamic_cast *>(other); return (otherHandle != nullptr) && (otherHandle->value == this->value); } template template inline traits::NoOperatorEquals Any::handle::isSameImpl( Any::handle_base *) const { return false; } template inline void *Any::handle::data() { return &value; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/ArgumentList.h000066400000000000000000000074441456117377200233610ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 /*! \file ArgumentList.h Defines an interface for storing - and consuming - command line parameters The way this is supposed to work is that the app creates an arglist from the ac/av parameters passed to it, then individual modules can scan through this class, check if they recognize any, parse those, and 'consume' them (it, take them off this list), thus indicating that those have been properly processed. This in particular allows an app to determine if any command lin eparameters have _not_ been processed by any modules, which typically indicates a user having used a deprecated way of specifying an input parameter (or simply, had a typo :-/ ). */ #pragma once #include #include namespace rkcommon { namespace utility { /*! class that abstracts command line arguments */ struct ArgumentList { /*! initialize a new argument list. note that we will _drop_ av[0], as this isn't actually an argument */ ArgumentList(int ac, const char **av); /*! return (a copy of) the idx'th argument. Note that unlike 'real' ac/av numbering of the args starts at 0, not 1 (because we drop the binary name, and only store arguments) */ std::string operator[](const int idx) const; /*! return number of arguments still in list */ int size() const; /*! return number of arguments still in list */ bool empty() const; /*! remove given number of arguments at given index in list */ void remove(int where, int howMany = 1); private: std::vector arg; }; /*! helper abstraction for any class that can wants to arguments - rather than having to manually iterate over all arguments, this class allows any class that is derived from it to simply call ArgsParser::parseAndConsume(), and do all its detection of parsing of command line arguments by overriding 'tryConsume()' */ struct ArgumentsParser { virtual ~ArgumentsParser() = default; /*! check if given arg is one of ours. 
if so, consume it (and all its successive parameters that depend on it, and return the total number of arguments consumed */ virtual int tryConsume(ArgumentList &argList, int argID) = 0; /*! This function goes over an argument list, and calls 'tryConsume()' for every argument, then takes those that _have_ been indicated as 'yes, we rcongized those' from the argument list. (usually does not have to be overridden) */ void parseAndRemove(ArgumentList &args); }; // ------------------------------------------------------------------ // (header-only) implementatoin section from here on: // ------------------------------------------------------------------ inline ArgumentList::ArgumentList(int ac, const char **av) { for (int i = 1; i < ac; i++) arg.push_back(av[i]); } inline std::string ArgumentList::operator[](int idx) const { return arg.at(idx); } inline int ArgumentList::size() const { return static_cast(arg.size()); } inline bool ArgumentList::empty() const { return arg.empty(); } inline void ArgumentList::remove(int where, int howMany) { for (int i = 0; i < howMany; i++) arg.erase(arg.begin() + where, arg.begin() + where + 1); } inline void ArgumentsParser::parseAndRemove(ArgumentList &argList) { for (int argID = 0; argID < argList.size(); /*no-op*/) { int numConsumed = tryConsume(argList, argID); if (numConsumed == 0) ++argID; else argList.remove(argID, numConsumed); } } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/ArrayView.h000066400000000000000000000046421456117377200226510ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "AbstractArray.h" #include #include namespace rkcommon { namespace utility { /* 'ArrayView' implements an array interface on a pointer to data which * is *NOT* owned by ArrayView. If you want ArrayView to own data, then * instead use std::array or std::vector. 
*/ template struct ArrayView : public AbstractArray { ArrayView() = default; ~ArrayView() override = default; template ArrayView(std::array &init); ArrayView(std::vector &init); explicit ArrayView(T *data, size_t size); void reset(); void reset(T *data, size_t size); template ArrayView &operator=(std::array &rhs); ArrayView &operator=(std::vector &rhs); }; // Inlined ArrayView definitions ////////////////////////////////////////// template inline ArrayView::ArrayView(T *_data, size_t _size) { AbstractArray::setPtr(_data, _size); } template template inline ArrayView::ArrayView(std::array &init) { AbstractArray::setPtr(init.data(), init.size()); } template inline ArrayView::ArrayView(std::vector &init) { AbstractArray::setPtr(init.data(), init.size()); } template inline void ArrayView::reset() { AbstractArray::setPtr(nullptr, 0); } template inline void ArrayView::reset(T *_data, size_t _size) { AbstractArray::setPtr(_data, _size); } template template inline ArrayView &ArrayView::operator=(std::array &rhs) { AbstractArray::setPtr(rhs.data(), rhs.size()); return *this; } template inline ArrayView &ArrayView::operator=(std::vector &rhs) { AbstractArray::setPtr(rhs.data(), rhs.size()); return *this; } // ArrayView utility functions //////////////////////////////////////////// template inline ArrayView make_ArrayView(T *data, size_t size) { return ArrayView(data, size); } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/CodeTimer.h000066400000000000000000000036221456117377200226100ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // std #include namespace rkcommon { namespace utility { /*! Helper class that assists with timing a region of code. */ struct CodeTimer { void start(); void stop(); double seconds() const; double milliseconds() const; double perSecond() const; double secondsSmoothed() const; double millisecondsSmoothed() const; double perSecondSmoothed() const; private: double smooth_nom{0.0}; double smooth_den{0.0}; std::chrono::time_point frameEndTime; std::chrono::time_point frameStartTime; }; // Inlined CodeTimer definitions ////////////////////////////////////////// inline void CodeTimer::start() { frameStartTime = std::chrono::steady_clock::now(); } inline void CodeTimer::stop() { frameEndTime = std::chrono::steady_clock::now(); smooth_nom = smooth_nom * 0.8f + seconds(); smooth_den = smooth_den * 0.8f + 1.f; } inline double CodeTimer::seconds() const { auto diff = frameEndTime - frameStartTime; return std::chrono::duration(diff).count(); } inline double CodeTimer::milliseconds() const { auto diff = frameEndTime - frameStartTime; return std::chrono::duration(diff).count(); } inline double CodeTimer::perSecond() const { return 1.0 / seconds(); } inline double CodeTimer::secondsSmoothed() const { return 1.0 / perSecondSmoothed(); } inline double CodeTimer::millisecondsSmoothed() const { return secondsSmoothed() * 1000.0; } inline double CodeTimer::perSecondSmoothed() const { return smooth_den / smooth_nom; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/DataView.h000066400000000000000000000022511456117377200224360ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" namespace rkcommon { namespace utility { template struct DataView { DataView() = default; ~DataView() = default; DataView(const void *data, size_t stride = sizeof(T)); void reset(const void 
*data, size_t stride = sizeof(T)); const T &operator[](size_t index) const; protected: const byte_t *ptr{nullptr}; size_t stride{1}; }; // Inlined member definitions // /////////////////////////////////////////////// template inline DataView::DataView(const void *_data, size_t _stride) : ptr(static_cast(_data)), stride(_stride) { } template inline void DataView::reset(const void *_data, size_t _stride) { ptr = static_cast(_data); stride = _stride; } template inline const T &DataView::operator[](size_t index) const { return *reinterpret_cast(ptr + (index * stride)); } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/DoubleBufferedValue.h000066400000000000000000000041431456117377200246060ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include namespace rkcommon { namespace utility { /*! This class represents two values which are double buffered. This is useful if one thread wants to work on a piece of data while another "uses" it. Then at some point, the caller can swap() the front and back values, where front() and back() references will be exchanged. Example: A rendering thread wants to work on a framebuffer while a GUI thread wants to continuously draw the latest complete framebuffer. Once the new frame is ready, they are swapped. NOTE: This isn't thread safe! Any references to front() and back() must be synchronized with when swap() gets called. */ template class DoubleBufferedValue { public: // This assumes that T is default constructable. If you want to use this // abstraction with non default constructable types, you will need to add // additional constructors. DoubleBufferedValue() = default; ~DoubleBufferedValue() = default; T &front(); const T &front() const; T &back(); const T &back() const; void swap(); private: int front_value{0}; int back_value{1}; T values[2]; }; // Inlined members //////////////////////////////////////////////////////// template inline T &DoubleBufferedValue::front() { return values[front_value]; } template inline const T &DoubleBufferedValue::front() const { return values[front_value]; } template inline T &DoubleBufferedValue::back() { return values[back_value]; } template inline const T &DoubleBufferedValue::back() const { return values[back_value]; } template inline void DoubleBufferedValue::swap() { std::swap(front_value, back_value); } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/FixedArray.h000066400000000000000000000061721456117377200227760ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "AbstractArray.h" #include #include #include namespace rkcommon { namespace utility { template struct FixedArrayView; /* 'FixedArray' implements an array interface on a pointer to * data which is owned by the FixedArray. 
The array is not * initialized on creation and cannot be resized, though it can * be recreated with a new size */ template struct FixedArray : public AbstractArray { using View = FixedArrayView; FixedArray() = default; ~FixedArray() override = default; explicit FixedArray(size_t size); explicit FixedArray(T *data, size_t size); template FixedArray(std::array &init); FixedArray(std::vector &init); template FixedArray &operator=(std::array &rhs); FixedArray &operator=(std::vector &rhs); private: // We use a shared ptr to actually manage lifetime the data lifetime std::shared_ptr array = nullptr; }; // Inlined FixedArray definitions ///////////////////////////////////////// template inline FixedArray::FixedArray(size_t _size) : array(std::shared_ptr(new T[_size], std::default_delete())) { AbstractArray::setPtr(array.get(), _size); } template inline FixedArray::FixedArray(T *_data, size_t _size) : FixedArray(_size) { // Note: // UB in memcpy if // 1) source / destination are NULL, even if _size is 0 // 2) buffers overlap // 3) destination is too small. // We catch the first case here, and the others are impossible // since we just allocated the destination. if (_data && _size > 0) std::memcpy(array.get(), _data, _size * sizeof(T)); } template template inline FixedArray::FixedArray(std::array &init) : FixedArray(init.data(), init.size()) { } template inline FixedArray::FixedArray(std::vector &init) : FixedArray(init.data(), init.size()) { } template template inline FixedArray &FixedArray::operator=(std::array &rhs) { array = std::shared_ptr(new T[rhs.size()], std::default_delete()); AbstractArray::setPtr(array.get(), rhs.size()); if (rhs.data() && rhs.size() > 0) std::memcpy(array.get(), rhs.data(), rhs.size() * sizeof(T)); return *this; } template inline FixedArray &FixedArray::operator=(std::vector &rhs) { array = std::shared_ptr(new T[rhs.size()], std::default_delete()); AbstractArray::setPtr(array.get(), rhs.size()); if (rhs.data() && rhs.size() > 0) std::memcpy(array.get(), rhs.data(), rhs.size() * sizeof(T)); return *this; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/FixedArrayView.h000066400000000000000000000025151456117377200236260ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "AbstractArray.h" #include "FixedArrayView.h" namespace rkcommon { namespace utility { /* 'FixedArrayView' implements an array interface on a pointer to * data which is owned by the FixedArrayView. 
The array is not * initialized on creation and cannot be resized, though it can * be recreated with a new size */ template struct FixedArrayView : public AbstractArray { FixedArrayView() = default; ~FixedArrayView() override = default; FixedArrayView(std::shared_ptr> &data, size_t offset, size_t size); private: // The underlying array from the fixed array being viewed, to keep // the data alive for the view's lifetime std::shared_ptr> data; }; // Inlined FixedArrayView definitions template FixedArrayView::FixedArrayView(std::shared_ptr> &_data, size_t offset, size_t size) : data(_data) { AbstractArray::setPtr(data->begin() + offset, size); } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/Observer.h000066400000000000000000000045611456117377200225270ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "TimeStamp.h" #include #include namespace rkcommon { namespace utility { struct Observer; // Something that an observer looks at // // NOTE(jda) - This can either be used as a base class or as a stand-alone // member of a class. It is up to the user to decide best how to // use this abstraction. struct Observable { Observable() = default; virtual ~Observable(); void notifyObservers(); private: friend Observer; void registerObserver(Observer &newObserver); void removeObserver(Observer &toRemove); TimeStamp lastNotified; std::vector observers; }; // Something that looks an an observable instance. // // NOTE(jda) - I think this makes more sense for objects to hold an instance // of an Observer and avoid _being_ and observer. struct Observer { Observer(Observable &observee); ~Observer(); bool wasNotified(); private: friend Observable; TimeStamp lastObserved; Observable *observee{nullptr}; }; // Inlined definitions //////////////////////////////////////////////////// // Observable // inline Observable::~Observable() { for (auto *observer : observers) observer->observee = nullptr; } inline void Observable::notifyObservers() { lastNotified.renew(); } inline void Observable::registerObserver(Observer &newObserver) { observers.push_back(&newObserver); } inline void Observable::removeObserver(Observer &toRemove) { auto &o = observers; o.erase(std::remove(o.begin(), o.end(), &toRemove), o.end()); } // Observer // inline Observer::Observer(Observable &_observee) : observee(&_observee) { observee->registerObserver(*this); } inline Observer::~Observer() { if (observee) observee->removeObserver(*this); } inline bool Observer::wasNotified() { if (!observee) return false; bool notified = lastObserved < observee->lastNotified; if (notified) lastObserved.renew(); return notified; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/OnScopeExit.h000066400000000000000000000015641456117377200231400ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include "../traits/rktraits.h" namespace rkcommon { namespace utility { /* Execute a given function when a scope exits */ struct OnScopeExit { template OnScopeExit(FCN_T &&_fcn); ~OnScopeExit(); private: std::function fcn; }; // Inlined OnScopeExit definitions //////////////////////////////////////// template inline OnScopeExit::OnScopeExit(FCN_T &&_fcn) { static_assert(traits::has_operator_method::value, "FCN_T must implement operator() with no arguments!"); fcn = std::forward(_fcn); } inline OnScopeExit::~OnScopeExit() { 
fcn(); } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/Optional.h000066400000000000000000000244541456117377200225300ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "../traits/rktraits.h" #include namespace rkcommon { namespace utility { /* 'Optional' implements a single item container which only _may_ have a * value in it. Use Optional<>::value_or() as a way to get either the value * or some default if the value doesn't exist --> makes it easier to read * "branchy" code. * * NOTE: Similar (but perhaps not identical to) 'boost::optional' or C++17's * 'std::optional'. * * Example: * * Optional myOpt; // 'myOpt' doesn't contain a value * assert(!myOpt.has_value()); // true. * assert(!myOpt); // true, shorthand for has_value(). * * myOpt = 5; // 'myOpt' now has a valid value, above * // asserts no longer true. * * assert(myOpt.value()); // true. * assert(*myOpt == 5); // true, shorthand for value(). * assert(myOpt.value_or(3) // true because myOpt has a value. * == 5); * * myOpt.reset(); // destroy held value. * assert(myOpt.value_or(3) // now true because myOpt had value * == 3); // removed by reset(). */ template struct Optional { // Members in C++17 specified std::optional interface in C++11 // Optional() = default; Optional(const Optional &other); template Optional(const Optional &other); Optional(Optional &&other); template Optional(Optional &&other); #if 0 // NOTE(jda) - can't get this to NOT conflict with copy/move ctors... template Optional(Args&&... args); #else Optional(const T &value); #endif ~Optional(); Optional &operator=(const Optional &other); Optional &operator=(Optional &&other); template Optional &operator=(U &&value); template Optional &operator=(const Optional &other); template Optional &operator=(Optional &&other); const T *operator->() const; T *operator->(); const T &operator*() const; T &operator*(); bool has_value() const; explicit operator bool() const; const T &value() const; T &value(); template T value_or(U &&default_value) const; void reset(); template T &emplace(Args &&... args); // Extra members // std::string toString() const; private: // Helper functions // void default_construct_storage_if_needed(); // Data members // std::array storage; bool hasValue{false}; }; // Inlined Optional definitions /////////////////////////////////////////// template inline Optional::Optional(const Optional &other) : Optional() { if (other.has_value()) *this = other.value(); } template template inline Optional::Optional(const Optional &other) : Optional() { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " parameter of an instance being copied-from be" " convertible to the type parameter of the destination" " Optional<>."); if (other.has_value()) *this = other.value(); } template inline Optional::Optional(Optional &&other) : Optional() { if (other.has_value()) { reset(); value() = std::move(other.value()); hasValue = true; } } template template inline Optional::Optional(Optional &&other) : Optional() { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " parameter of an instance being copied-from be" " convertible to the type parameter of the destination" " Optional<>."); if (other.has_value()) { reset(); value() = std::move(other.value()); hasValue = true; } } #if 0 // NOTE(jda) - see comment in declaration... 
template template inline Optional::Optional(Args&&... args) { emplace(std::forward(args)...); } #else template inline Optional::Optional(const T &value) { emplace(value); } #endif template inline Optional::~Optional() { reset(); } template inline Optional &Optional::operator=(const Optional &other) { default_construct_storage_if_needed(); value() = other.value(); hasValue = true; return *this; } template inline Optional &Optional::operator=(Optional &&other) { default_construct_storage_if_needed(); value() = std::move(other.value()); hasValue = true; return *this; } template template inline Optional &Optional::operator=(U &&rhs) { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " being assigned from be convertible to the type parameter" " of the destination Optional<>."); default_construct_storage_if_needed(); this->value() = rhs; hasValue = true; return *this; } template template inline Optional &Optional::operator=(const Optional &other) { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " parameter of an instance being copied-from be" " convertible to the type parameter of the destination" " Optional<>."); default_construct_storage_if_needed(); value() = other.value(); hasValue = true; return *this; } template template inline Optional &Optional::operator=(Optional &&other) { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type" " parameter of an instance being moved-from be" " convertible to the type parameter of the destination" " Optional<>."); default_construct_storage_if_needed(); value() = other.value(); hasValue = true; return *this; } template inline const T *Optional::operator->() const { return &value(); } template inline T *Optional::operator->() { return &value(); } template inline const T &Optional::operator*() const { return value(); } template inline T &Optional::operator*() { return value(); } template inline bool Optional::has_value() const { return hasValue; } template inline Optional::operator bool() const { return has_value(); } template inline const T &Optional::value() const { return *(reinterpret_cast(storage.data())); } template inline T &Optional::value() { return *(reinterpret_cast(storage.data())); } template template inline T Optional::value_or(U &&default_value) const { static_assert(std::is_convertible::value, "rkcommon::utility::Optional requires the type given" " to value_or() to be convertible to type T, the type" " parameter of Optional<>."); return has_value() ? value() : static_cast(std::forward(default_value)); } template inline void Optional::reset() { if (!std::is_trivially_destructible::value && has_value()) value().~T(); hasValue = false; } template template inline T &Optional::emplace(Args &&... 
args) { reset(); new (storage.data()) T(std::forward(args)...); hasValue = true; return value(); } template inline std::string Optional::toString() const { return "rkcommon::utility::Optional"; } template inline void Optional::default_construct_storage_if_needed() { if (!has_value()) new (storage.data()) T(); } // Comparison functions /////////////////////////////////////////////////// template inline bool operator==(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs == *rhs); } template inline bool operator!=(const Optional &lhs, const Optional &rhs) { return !(lhs == rhs); } template inline bool operator<(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs < *rhs); } template inline bool operator<=(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs <= *rhs); } template inline bool operator>(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs > *rhs); } template inline bool operator>=(const Optional &lhs, const Optional &rhs) { return (lhs && rhs) && (*lhs >= *rhs); } template inline Optional make_optional(Args &&... args) { Optional ret; ret.emplace(std::forward(args)...); return ret; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/OwnedArray.h000066400000000000000000000054441456117377200230140ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include "AbstractArray.h" #include #include namespace rkcommon { namespace utility { /* 'OwnedArray' implements an array interface on a pointer to * data which is owned by the OwnedArray. */ template struct OwnedArray : public AbstractArray { OwnedArray() = default; ~OwnedArray() override = default; template OwnedArray(std::array &init); OwnedArray(std::vector &init); explicit OwnedArray(T *data, size_t size); template OwnedArray &operator=(std::array &rhs); OwnedArray &operator=(std::vector &rhs); void reset(); void reset(T *_data, size_t _size); void resize(size_t size, const T &val); private: std::vector dataBuf; }; // Inlined OwnedArray definitions ///////////////////////////////////////// template inline OwnedArray::OwnedArray(T *_data, size_t _size) : dataBuf(_data, _data + _size) { AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } template template inline OwnedArray::OwnedArray(std::array &init) : dataBuf(init.begin(), init.end()) { AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } template inline OwnedArray::OwnedArray(std::vector &init) : dataBuf(init) { AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } template template inline OwnedArray &OwnedArray::operator=(std::array &rhs) { dataBuf = std::vector(rhs.begin(), rhs.end()); AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); return *this; } template inline OwnedArray &OwnedArray::operator=(std::vector &rhs) { dataBuf = std::vector(rhs.begin(), rhs.end()); AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); return *this; } template inline void OwnedArray::reset() { dataBuf.clear(); dataBuf.shrink_to_fit(); AbstractArray::setPtr(nullptr, 0); } template inline void OwnedArray::reset(T *_data, size_t _size) { dataBuf = std::vector(_data, _data + _size); AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } template inline void OwnedArray::resize(size_t size, const T &val) { dataBuf.resize(size, val); AbstractArray::setPtr(dataBuf.data(), dataBuf.size()); } } // namespace utility } // namespace rkcommon 
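// A minimal usage sketch for the OwnedArray<> defined above. Illustrative
// only, not part of rkcommon; the include path assumes the installed header
// layout and sumPositive() is a hypothetical helper.

#include <cstddef>
#include <vector>

#include "rkcommon/utility/OwnedArray.h"

inline float sumPositive(const std::vector<float> &input)
{
  // OwnedArray's vector constructor takes a non-const reference and copies
  // the elements, so hand it a local copy.
  std::vector<float> tmp = input;
  rkcommon::utility::OwnedArray<float> arr(tmp);

  float sum = 0.f;
  const float *p = arr.data();  // contiguous storage owned by 'arr'
  for (size_t i = 0; i < arr.size(); ++i) {
    if (p[i] > 0.f)
      sum += p[i];
  }
  return sum;
}

// Example: sumPositive({-1.f, 2.f, 3.f}) == 5.f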
ospray-rkcommon-538f8a2/rkcommon/utility/ParameterizedObject.cpp000066400000000000000000000022701456117377200252110ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "ParameterizedObject.h" #include namespace rkcommon { namespace utility { ParameterizedObject::Param::Param(const std::string &_name) : name(_name) {} void ParameterizedObject::removeParam(const std::string &name) { auto foundParam = std::find_if( paramList.begin(), paramList.end(), [&](const std::shared_ptr &p) { return p->name == name; }); if (foundParam != paramList.end()) { paramList.erase(foundParam); } } ParameterizedObject::Param *ParameterizedObject::findParam( const std::string &name, bool addIfNotExist) { auto foundParam = std::find_if( paramList.begin(), paramList.end(), [&](const std::shared_ptr &p) { return p->name == name; }); if (foundParam != paramList.end()) return foundParam->get(); else if (addIfNotExist) { paramList.push_back(std::make_shared(name)); return paramList[paramList.size() - 1].get(); } else return nullptr; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/ParameterizedObject.h000066400000000000000000000064531456117377200246650ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // stl #include // rkcommon #include "Any.h" namespace rkcommon { namespace utility { /*! \brief defines a basic object whose lifetime is managed by ospray */ struct RKCOMMON_INTERFACE ParameterizedObject { ParameterizedObject() = default; virtual ~ParameterizedObject() = default; /*! \brief container for _any_ sort of parameter an app can assign to an ospray object */ struct RKCOMMON_INTERFACE Param { Param(const std::string &name); ~Param() = default; template void set(const T &v); utility::Any data; std::string name; bool query = false; }; /*! \brief check if a given parameter is available */ bool hasParam(const std::string &name); /*! set a parameter with given name to given value, create param if not * existing */ template void setParam(const std::string &name, const T &t); template T getParam(const std::string &name, T valIfNotFound); void removeParam(const std::string &name); void resetAllParamQueryStatus(); protected: Param *findParam(const std::string &name, bool addIfNotExist = false); std::vector>::iterator params_begin(); std::vector>::iterator params_end(); private: // Data members // /*! 
\brief list of parameters attached to this object */ // NOTE(jda) - Use std::shared_ptr because copy/move of a // ParameterizedObject would end up copying parameters, where // destruction of each copy should only result in freeing the // parameters *once* std::vector> paramList; }; // Inlined ParameterizedObject definitions //////////////////////////////// template inline void ParameterizedObject::Param::set(const T &v) { data = v; } inline bool ParameterizedObject::hasParam(const std::string &name) { return findParam(name, false) != nullptr; } template inline void ParameterizedObject::setParam(const std::string &name, const T &t) { findParam(name, true)->set(t); } template inline T ParameterizedObject::getParam(const std::string &name, T valIfNotFound) { Param *param = findParam(name); if (!param) return valIfNotFound; if (!param->data.is()) return valIfNotFound; param->query = true; return param->data.get(); } inline void ParameterizedObject::resetAllParamQueryStatus() { for (auto p = params_begin(); p != params_end(); ++p) (*p)->query = false; } inline std::vector>::iterator ParameterizedObject::params_begin() { return paramList.begin(); } inline std::vector>::iterator ParameterizedObject::params_end() { return paramList.end(); } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/PseudoURL.cpp000066400000000000000000000102421456117377200231060ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 /*! \file PseudoURL: splits a 'pseudo-url' of the form '://[:name=value]*' into its components of 'type' (e.g, 'points', 'lines', etc), filename, and 'name=value' argument pairs (e.g., 'format=xyzrgb') */ #include "PseudoURL.h" namespace rkcommon { namespace utility { void tokenize(const std::string &str, const char delim, std::vector &tokens) { size_t prev = 0; size_t fnd = str.find(delim); for (; fnd != std::string::npos; prev = fnd + 1, fnd = str.find(delim, prev)) { // Discard repeated tokens in the string, e.g. tokeninzing a::c::b on // ':' should just return a, c, b if (fnd - prev > 1) { tokens.push_back(str.substr(prev, fnd - prev)); } } // Grab the last token in the string, if the string didn't terminate with // a delimiter if (str.size() - prev > 1) { tokens.push_back(str.substr(prev)); } } /*! 
constructor - parse the given string into its components */ PseudoURL::PseudoURL(const std::string &inputString) { std::string tmp = inputString; const size_t separator = tmp.find("://"); if (separator != std::string::npos) { // separator specified: cut off 'type' before that separator, // and reset 'tmp' to everything behind it type = tmp.substr(0, separator); tmp = tmp.substr(separator + 3); } else { // no separator -> empty type specifier string, tmp returns // un-modified type = ""; } /* now, split remainder into its colon-separated components (the first of those is the filename, all other ones are params */ std::vector colonSeparatedComponents; tokenize(tmp, ':', colonSeparatedComponents); if (colonSeparatedComponents.empty()) // degenerate case of "type://" - return empty filename and // empty params return; fileName = colonSeparatedComponents[0]; for (size_t arg_it = 1; arg_it < colonSeparatedComponents.size(); arg_it++) { std::string arg = colonSeparatedComponents[arg_it]; const size_t equalSign = arg.find('='); if (equalSign != std::string::npos) { params.push_back(std::make_pair(arg.substr(0, equalSign), arg.substr(equalSign + 1))); } else { params.push_back(std::make_pair(arg, std::string(""))); } } } /*! return the parsed type. may we empty string if none was specified */ std::string PseudoURL::getType() const { return type; } /*! return the parsed file name specifier. cannot be empty string */ std::string PseudoURL::getFileName() const { return fileName; } /*! return value for given parameters name, or throw an exception if not specified */ std::string PseudoURL::getValue(const std::string &name) const { /* note(iw) - we do _not_ do a immediate 'return' upon the first param with mathcin gname we find so as to ensure that we use the _last_ time any parameter was written. it's more intuitive to have the last value override earlier ones, but i didn't want the parser (ie, constructor) to mess with the input data (maybe in some cases a class using this _wants_ to have multiple instances of the same parameter!?), so let's fix that here */ int found = -1; for (size_t i = 0; i < params.size(); i++) { if (params[i].first == name) found = i; } if (found < 0) { throw std::runtime_error( "PseudoURL::getValue queried value of " "not-specified parameter"); } return params[found].second; } /*! check if the given parameter was specified */ bool PseudoURL::hasParam(const std::string &name) { for (auto ¶m : params) if (param.first == name) return true; return false; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/PseudoURL.h000066400000000000000000000040311456117377200225520ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once /*! \file PseudoURL: splits a 'pseudo-url' of the form '://[:name=value]*' into its components of 'type' (e.g, 'points', 'slines', etc), filename, and 'name=value' argument pairs (e.g., 'format=xyzrgb') */ #include #include #include "../common.h" namespace rkcommon { namespace utility { //! \brief Tokenize the string passed on the desired delimiter void tokenize(const std::string &str, const char delim, std::vector &tokens); /* a pseudo-url is of the form '://[:name=value]*' into its components of 'type' (e.g, 'points', 'lines', etc), filename, and 'name=value' argument pairs (e.g., 'format=xyzrgb'). This class takes a string and splits it into these components */ struct PseudoURL { /*! 
constructor - parse the given string into its components */ PseudoURL(const std::string &inputString); /*! return the parsed type. may we empty string if none was specified */ std::string getType() const; /*! return the parsed file name specifier. cannot be empty string */ std::string getFileName() const; /*! return value for given parameters name, or throw an exception if not specified */ std::string getValue(const std::string &name) const; /*! check if the given parameter was specified */ bool hasParam(const std::string &name); private: /*! the type of the psueod-url, eg, for 'points://file.raw' this would be 'points'. If no "://" is specified, this gets set to "" */ std::string type; /*! the filename - the thing after the ://, and before the ":" that starts parameters */ std::string fileName; /*! the name-value pairs specified as parameters */ std::vector> params; }; } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/SaveImage.h000066400000000000000000000063551456117377200226040ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // Quiet `fopen` MSVC warning #ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS #endif #include #include #include #include "../math/vec.h" #include "../memory/malloc.h" namespace rkcommon { namespace utility { template inline void writeImage(const std::string &fileName, const char *const header, const int sizeX, const int sizeY, const PIXEL_T *const pixel) { FILE *file = fopen(fileName.c_str(), "wb"); if (file == nullptr) throw std::runtime_error("Can't open file for writeP[FP]M!"); fprintf(file, header, sizeX, sizeY); auto out = STACK_BUFFER(COMP_T, N_COMP * sizeX); for (int y = 0; y < sizeY; y++) { auto *in = (const COMP_T *)&pixel[(FLIP ? sizeY - 1 - y : y) * sizeX]; for (int x = 0; x < sizeX; x++) for (int c = 0; c < N_COMP; c++) out[N_COMP * x + c] = in[PIXEL_COMP * x + (N_COMP == 1 ? 
3 : c)]; fwrite(out, N_COMP * sizeX, sizeof(COMP_T), file); } fprintf(file, "\n"); fclose(file); } inline void writePPM(const std::string &fileName, const int sizeX, const int sizeY, const uint32_t *pixel) { writeImage( fileName, "P6\n%i %i\n255\n", sizeX, sizeY, pixel); } inline void writePGM(const std::string &fileName, const int sizeX, const int sizeY, const uint32_t *pixel) { writeImage( fileName, "P5\n%i %i\n255\n", sizeX, sizeY, pixel); } template inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const T *p) = delete; using namespace rkcommon::math; template <> inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const float *p) { writeImage( fName, "Pf\n%i %i\n-1.0\n", sizeX, sizeY, p); } template <> inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const vec3f *p) { writeImage( fName, "PF\n%i %i\n-1.0\n", sizeX, sizeY, p); } template <> inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const vec3fa *p) { writeImage( fName, "PF\n%i %i\n-1.0\n", sizeX, sizeY, p); } template <> inline void writePFM(const std::string &fName, const int sizeX, const int sizeY, const vec4f *p) { writeImage( fName, "PF4\n%i %i\n-1.0\n", sizeX, sizeY, p); } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/StringManip.h000066400000000000000000000053001456117377200231630ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include #include namespace rkcommon { namespace utility { /* return a string which is the two inputs match from the beginning of each */ inline std::string longestBeginningMatch(const std::string &first, const std::string &second) { // NOTE(jda) - If length of the second string is shorter than the first, // then we can only iterate through the first string the // number of characters of the second string. auto maxMatchLength = std::min(first.size(), second.size()); auto start1 = first.begin(); auto start2 = second.begin(); auto end = first.begin() + maxMatchLength; return std::string(start1, std::mismatch(start1, end, start2).first); } inline bool beginsWith(const std::string &inputString, const std::string &startsWithString) { auto startingMatch = longestBeginningMatch(inputString, startsWithString); return startingMatch.size() == startsWithString.size(); } /* split a string on a single character delimiter */ inline std::vector split(const std::string &input, char delim) { std::stringstream ss(input); std::string item; std::vector elems; while (std::getline(ss, item, delim)) elems.push_back(std::move(item)); return elems; } /* split a string on a set of delimiters */ inline std::vector split(const std::string &input, const std::string &delim) { std::vector tokens; size_t pos = 0; while (1) { size_t begin = input.find_first_not_of(delim, pos); if (begin == input.npos) return tokens; size_t end = input.find_first_of(delim, begin); tokens.push_back(input.substr( begin, (end == input.npos) ? 
input.npos : (end - begin))); pos = end; } } /* return lower case version of the input string */ inline std::string lowerCase(const std::string &str) { std::string retval = str; std::transform(retval.begin(), retval.end(), retval.begin(), ::tolower); return retval; } /* return upper case version of the input string */ inline std::string upperCase(const std::string &str) { std::string retval = str; std::transform(retval.begin(), retval.end(), retval.begin(), ::toupper); return retval; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/TimeStamp.cpp000066400000000000000000000016201456117377200231670ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "TimeStamp.h" namespace rkcommon { namespace utility { std::atomic TimeStamp::global{0}; TimeStamp::TimeStamp(const TimeStamp &other) { this->value = other.value.load(); } TimeStamp::TimeStamp(TimeStamp &&other) { this->value = other.value.load(); } TimeStamp &TimeStamp::operator=(const TimeStamp &other) { this->value = other.value.load(); return *this; } TimeStamp &TimeStamp::operator=(TimeStamp &&other) { this->value = other.value.load(); return *this; } TimeStamp::operator size_t() const { return value; } void TimeStamp::renew() { value = nextValue(); } size_t TimeStamp::nextValue() { return global++; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/TimeStamp.h000066400000000000000000000013451456117377200226400ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../common.h" #include namespace rkcommon { namespace utility { struct RKCOMMON_INTERFACE TimeStamp { TimeStamp() = default; TimeStamp(const TimeStamp &); TimeStamp(TimeStamp &&); TimeStamp &operator=(const TimeStamp &); TimeStamp &operator=(TimeStamp &&); operator size_t() const; void renew(); private: static size_t nextValue(); // Data members // std::atomic value{nextValue()}; //! \brief the uint64_t that stores the time value static std::atomic global; }; } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/TransactionalValue.h000066400000000000000000000047251456117377200245410ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include namespace rkcommon { namespace utility { /* This implements a 1-to-1 value fence. One thread can set (or "queue") a * value for another thread to later get. This is conceptually similar to * "doublebuffering" a single value. Note that all values from the producer * thread overwrite the "queued" value, where the consumer thread will * always get the last value set by the producer thread. 
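 *
 * A minimal illustrative sketch (the thread setup and applyExposure() are
 * assumed, not part of this header):
 *
 *   TransactionalValue<float> exposure;
 *
 *   // producer thread:
 *   exposure = 2.f;           // queues the new value under the internal mutex
 *
 *   // consumer thread:
 *   if (exposure.update())    // commits the most recently queued value
 *     applyExposure(exposure.get());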
*/ template class TransactionalValue { public: TransactionalValue() = default; ~TransactionalValue() = default; template TransactionalValue(const OtherType &ot); template TransactionalValue &operator=(const OtherType &ot); TransactionalValue &operator=(const TransactionalValue &fp); T &ref(); T get(); bool update(); private: bool newValue{false}; T queuedValue; T currentValue; std::mutex mutex; }; // Inlined TransactionalValue Members ///////////////////////////////////// template template inline TransactionalValue::TransactionalValue(const OtherType &ot) { currentValue = ot; } template template inline TransactionalValue &TransactionalValue::operator=( const OtherType &ot) { std::lock_guard lock{mutex}; queuedValue = ot; newValue = true; return *this; } template inline TransactionalValue &TransactionalValue::operator=( const TransactionalValue &fp) { std::lock_guard lock{mutex}; queuedValue = fp.ref(); newValue = true; return *this; } template inline T &TransactionalValue::ref() { return currentValue; } template inline T TransactionalValue::get() { return currentValue; } template inline bool TransactionalValue::update() { bool didUpdate = false; if (newValue) { std::lock_guard lock{mutex}; currentValue = std::move(queuedValue); newValue = false; didUpdate = true; } return didUpdate; } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/demangle.cpp000066400000000000000000000011611456117377200230400ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "demangle.h" #ifdef __GNUG__ #include #include #include #endif namespace rkcommon { namespace utility { #ifdef __GNUG__ std::string demangle(const char *name) { int status = 0; std::unique_ptr res{ abi::__cxa_demangle(name, NULL, NULL, &status), std::free}; return (status == 0) ? res.get() : name; } #else std::string demangle(const char *name) { return name; } #endif } // namespace utility } // namespace rkcommonospray-rkcommon-538f8a2/rkcommon/utility/demangle.h000066400000000000000000000006441456117377200225120ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include #include "../common.h" namespace rkcommon { namespace utility { RKCOMMON_INTERFACE std::string demangle(const char *name); template inline std::string nameOf() { return demangle(typeid(T).name()); } } // namespace utility } // namespace rkcommonospray-rkcommon-538f8a2/rkcommon/utility/detail/000077500000000000000000000000001456117377200220235ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/utility/detail/pcg_extras.hpp000066400000000000000000000474261456117377200247100ustar00rootroot00000000000000/* * PCG Random Number Generation for C++ * * Copyright 2014-2017 Melissa O'Neill , * and the PCG Project contributors. * * SPDX-License-Identifier: (Apache-2.0 OR MIT) * * Licensed under the Apache License, Version 2.0 (provided in * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0) * or under the MIT license (provided in LICENSE-MIT.txt and at * http://opensource.org/licenses/MIT), at your option. This file may not * be copied, modified, or distributed except according to those terms. * * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See your chosen license for details. * * For additional information about the PCG random number generation scheme, * visit http://www.pcg-random.org/. 
*/ /* * This file provides support code that is useful for random-number generation * but not specific to the PCG generation scheme, including: * - 128-bit int support for platforms where it isn't available natively * - bit twiddling operations * - I/O of 128-bit and 8-bit integers * - Handling the evilness of SeedSeq * - Support for efficiently producing random numbers less than a given * bound */ #ifndef PCG_EXTRAS_HPP_INCLUDED #define PCG_EXTRAS_HPP_INCLUDED 1 #include #include #include #include #include #include #include #include #include #include #include #ifdef __GNUC__ #include #endif /* * Abstractions for compiler-specific directives */ #ifdef __GNUC__ #define PCG_NOINLINE __attribute__((noinline)) #else #define PCG_NOINLINE #endif /* * Some members of the PCG library use 128-bit math. When compiling on 64-bit * platforms, both GCC and Clang provide 128-bit integer types that are ideal * for the job. * * On 32-bit platforms (or with other compilers), we fall back to a C++ * class that provides 128-bit unsigned integers instead. It may seem * like we're reinventing the wheel here, because libraries already exist * that support large integers, but most existing libraries provide a very * generic multiprecision code, but here we're operating at a fixed size. * Also, most other libraries are fairly heavyweight. So we use a direct * implementation. Sadly, it's much slower than hand-coded assembly or * direct CPU support. * */ #if __SIZEOF_INT128__ namespace pcg_extras { typedef __uint128_t pcg128_t; } #define PCG_128BIT_CONSTANT(high,low) \ ((pcg_extras::pcg128_t(high) << 64) + low) #else #include "pcg_uint128.hpp" namespace pcg_extras { typedef pcg_extras::uint_x4 pcg128_t; } #define PCG_128BIT_CONSTANT(high,low) \ pcg_extras::pcg128_t(high,low) #define PCG_EMULATED_128BIT_MATH 1 #endif namespace pcg_extras { /* * We often need to represent a "number of bits". When used normally, these * numbers are never greater than 128, so an unsigned char is plenty. * If you're using a nonstandard generator of a larger size, you can set * PCG_BITCOUNT_T to have it define it as a larger size. (Some compilers * might produce faster code if you set it to an unsigned int.) */ #ifndef PCG_BITCOUNT_T typedef uint8_t bitcount_t; #else typedef PCG_BITCOUNT_T bitcount_t; #endif /* * C++ requires us to be able to serialize RNG state by printing or reading * it from a stream. Because we use 128-bit ints, we also need to be able * ot print them, so here is code to do so. * * This code provides enough functionality to print 128-bit ints in decimal * and zero-padded in hex. It's not a full-featured implementation. 
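 *
 * For example, a pcg128_t holding 2^64 prints as "18446744073709551616" in
 * the default decimal mode and as "10000000000000000" when std::hex is set.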
*/ template std::basic_ostream& operator<<(std::basic_ostream& out, pcg128_t value) { auto desired_base = out.flags() & out.basefield; bool want_hex = desired_base == out.hex; if (want_hex) { uint64_t highpart = uint64_t(value >> 64); uint64_t lowpart = uint64_t(value); auto desired_width = out.width(); if (desired_width > 16) { out.width(desired_width - 16); } if (highpart != 0 || desired_width > 16) out << highpart; CharT oldfill = '\0'; if (highpart != 0) { out.width(16); oldfill = out.fill('0'); } auto oldflags = out.setf(decltype(desired_base){}, out.showbase); out << lowpart; out.setf(oldflags); if (highpart != 0) { out.fill(oldfill); } return out; } constexpr size_t MAX_CHARS_128BIT = 40; char buffer[MAX_CHARS_128BIT]; char* pos = buffer+sizeof(buffer); *(--pos) = '\0'; constexpr auto BASE = pcg128_t(10ULL); do { auto div = value / BASE; auto mod = uint32_t(value - (div * BASE)); *(--pos) = '0' + char(mod); value = div; } while(value != pcg128_t(0ULL)); return out << pos; } template std::basic_istream& operator>>(std::basic_istream& in, pcg128_t& value) { typename std::basic_istream::sentry s(in); if (!s) return in; constexpr auto BASE = pcg128_t(10ULL); pcg128_t current(0ULL); bool did_nothing = true; bool overflow = false; for(;;) { CharT wide_ch = in.get(); if (!in.good()) break; auto ch = in.narrow(wide_ch, '\0'); if (ch < '0' || ch > '9') { in.unget(); break; } did_nothing = false; pcg128_t digit(uint32_t(ch - '0')); pcg128_t timesbase = current*BASE; overflow = overflow || timesbase < current; current = timesbase + digit; overflow = overflow || current < digit; } if (did_nothing || overflow) { in.setstate(std::ios::failbit); if (overflow) current = ~pcg128_t(0ULL); } value = current; return in; } /* * Likewise, if people use tiny rngs, we'll be serializing uint8_t. * If we just used the provided IO operators, they'd read/write chars, * not ints, so we need to define our own. We *can* redefine this operator * here because we're in our own namespace. */ template std::basic_ostream& operator<<(std::basic_ostream&out, uint8_t value) { return out << uint32_t(value); } template std::basic_istream& operator>>(std::basic_istream& in, uint8_t& target) { uint32_t value = 0xdecea5edU; in >> value; if (!in && value == 0xdecea5edU) return in; if (value > uint8_t(~0)) { in.setstate(std::ios::failbit); value = ~0U; } target = uint8_t(value); return in; } /* Unfortunately, the above functions don't get found in preference to the * built in ones, so we create some more specific overloads that will. * Ugh. */ inline std::ostream& operator<<(std::ostream& out, uint8_t value) { return pcg_extras::operator<< (out, value); } inline std::istream& operator>>(std::istream& in, uint8_t& value) { return pcg_extras::operator>> (in, value); } /* * Useful bitwise operations. */ /* * XorShifts are invertable, but they are someting of a pain to invert. * This function backs them out. It's used by the whacky "inside out" * generator defined later. */ template inline itype unxorshift(itype x, bitcount_t bits, bitcount_t shift) { if (2*shift >= bits) { return x ^ (x >> shift); } itype lowmask1 = (itype(1U) << (bits - shift*2)) - 1; itype highmask1 = ~lowmask1; itype top1 = x; itype bottom1 = x & lowmask1; top1 ^= top1 >> shift; top1 &= highmask1; x = top1 | bottom1; itype lowmask2 = (itype(1U) << (bits - shift)) - 1; itype bottom2 = x & lowmask2; bottom2 = unxorshift(bottom2, bits - shift, shift); bottom2 &= lowmask1; return top1 | bottom2; } /* * Rotate left and right. 
* * In ideal world, compilers would spot idiomatic rotate code and convert it * to a rotate instruction. Of course, opinions vary on what the correct * idiom is and how to spot it. For clang, sometimes it generates better * (but still crappy) code if you define PCG_USE_ZEROCHECK_ROTATE_IDIOM. */ template inline itype rotl(itype value, bitcount_t rot) { constexpr bitcount_t bits = sizeof(itype) * 8; constexpr bitcount_t mask = bits - 1; #if PCG_USE_ZEROCHECK_ROTATE_IDIOM return rot ? (value << rot) | (value >> (bits - rot)) : value; #else return (value << rot) | (value >> ((- rot) & mask)); #endif } template inline itype rotr(itype value, bitcount_t rot) { constexpr bitcount_t bits = sizeof(itype) * 8; constexpr bitcount_t mask = bits - 1; #if PCG_USE_ZEROCHECK_ROTATE_IDIOM return rot ? (value >> rot) | (value << (bits - rot)) : value; #else return (value >> rot) | (value << ((- rot) & mask)); #endif } /* Unfortunately, both Clang and GCC sometimes perform poorly when it comes * to properly recognizing idiomatic rotate code, so for we also provide * assembler directives (enabled with PCG_USE_INLINE_ASM). Boo, hiss. * (I hope that these compilers get better so that this code can die.) * * These overloads will be preferred over the general template code above. */ #if PCG_USE_INLINE_ASM && __GNUC__ && (__x86_64__ || __i386__) inline uint8_t rotr(uint8_t value, bitcount_t rot) { asm ("rorb %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); return value; } inline uint16_t rotr(uint16_t value, bitcount_t rot) { asm ("rorw %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); return value; } inline uint32_t rotr(uint32_t value, bitcount_t rot) { asm ("rorl %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); return value; } #if __x86_64__ inline uint64_t rotr(uint64_t value, bitcount_t rot) { asm ("rorq %%cl, %0" : "=r" (value) : "0" (value), "c" (rot)); return value; } #endif // __x86_64__ #elif defined(_MSC_VER) // Use MSVC++ bit rotation intrinsics #pragma intrinsic(_rotr, _rotr64, _rotr8, _rotr16) inline uint8_t rotr(uint8_t value, bitcount_t rot) { return _rotr8(value, rot); } inline uint16_t rotr(uint16_t value, bitcount_t rot) { return _rotr16(value, rot); } inline uint32_t rotr(uint32_t value, bitcount_t rot) { return _rotr(value, rot); } inline uint64_t rotr(uint64_t value, bitcount_t rot) { return _rotr64(value, rot); } #endif // PCG_USE_INLINE_ASM /* * The C++ SeedSeq concept (modelled by seed_seq) can fill an array of * 32-bit integers with seed data, but sometimes we want to produce * larger or smaller integers. * * The following code handles this annoyance. * * uneven_copy will copy an array of 32-bit ints to an array of larger or * smaller ints (actually, the code is general it only needing forward * iterators). The copy is identical to the one that would be performed if * we just did memcpy on a standard little-endian machine, but works * regardless of the endian of the machine (or the weirdness of the ints * involved). * * generate_to initializes an array of integers using a SeedSeq * object. It is given the size as a static constant at compile time and * tries to avoid memory allocation. If we're filling in 32-bit constants * we just do it directly. If we need a separate buffer and it's small, * we allocate it on the stack. Otherwise, we fall back to heap allocation. * Ugh. * * generate_one produces a single value of some integral type using a * SeedSeq object. */ /* uneven_copy helper, case where destination ints are less than 32 bit. 
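 * Each source word is read once and then shifted down repeatedly, so one
 * wide value is split into several narrower destination values.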
*/ template SrcIter uneven_copy_impl( SrcIter src_first, DestIter dest_first, DestIter dest_last, std::true_type) { typedef typename std::iterator_traits::value_type src_t; typedef typename std::iterator_traits::value_type dest_t; constexpr bitcount_t SRC_SIZE = sizeof(src_t); constexpr bitcount_t DEST_SIZE = sizeof(dest_t); constexpr bitcount_t DEST_BITS = DEST_SIZE * 8; constexpr bitcount_t SCALE = SRC_SIZE / DEST_SIZE; size_t count = 0; src_t value = 0; while (dest_first != dest_last) { if ((count++ % SCALE) == 0) value = *src_first++; // Get more bits else value >>= DEST_BITS; // Move down bits *dest_first++ = dest_t(value); // Truncates, ignores high bits. } return src_first; } /* uneven_copy helper, case where destination ints are more than 32 bit. */ template SrcIter uneven_copy_impl( SrcIter src_first, DestIter dest_first, DestIter dest_last, std::false_type) { typedef typename std::iterator_traits::value_type src_t; typedef typename std::iterator_traits::value_type dest_t; constexpr auto SRC_SIZE = sizeof(src_t); constexpr auto SRC_BITS = SRC_SIZE * 8; constexpr auto DEST_SIZE = sizeof(dest_t); constexpr auto SCALE = (DEST_SIZE+SRC_SIZE-1) / SRC_SIZE; while (dest_first != dest_last) { dest_t value(0UL); unsigned int shift = 0; for (size_t i = 0; i < SCALE; ++i) { value |= dest_t(*src_first++) << shift; shift += SRC_BITS; } *dest_first++ = value; } return src_first; } /* uneven_copy, call the right code for larger vs. smaller */ template inline SrcIter uneven_copy(SrcIter src_first, DestIter dest_first, DestIter dest_last) { typedef typename std::iterator_traits::value_type src_t; typedef typename std::iterator_traits::value_type dest_t; constexpr bool DEST_IS_SMALLER = sizeof(dest_t) < sizeof(src_t); return uneven_copy_impl(src_first, dest_first, dest_last, std::integral_constant{}); } /* generate_to, fill in a fixed-size array of integral type using a SeedSeq * (actually works for any random-access iterator) */ template inline void generate_to_impl(SeedSeq&& generator, DestIter dest, std::true_type) { generator.generate(dest, dest+size); } template void generate_to_impl(SeedSeq&& generator, DestIter dest, std::false_type) { typedef typename std::iterator_traits::value_type dest_t; constexpr auto DEST_SIZE = sizeof(dest_t); constexpr auto GEN_SIZE = sizeof(uint32_t); constexpr bool GEN_IS_SMALLER = GEN_SIZE < DEST_SIZE; constexpr size_t FROM_ELEMS = GEN_IS_SMALLER ? 
size * ((DEST_SIZE+GEN_SIZE-1) / GEN_SIZE) : (size + (GEN_SIZE / DEST_SIZE) - 1) / ((GEN_SIZE / DEST_SIZE) + GEN_IS_SMALLER); // this odd code ^^^^^^^^^^^^^^^^^ is work-around for // a bug: http://llvm.org/bugs/show_bug.cgi?id=21287 if (FROM_ELEMS <= 1024) { uint32_t buffer[FROM_ELEMS]; generator.generate(buffer, buffer+FROM_ELEMS); uneven_copy(buffer, dest, dest+size); } else { uint32_t* buffer = static_cast(malloc(GEN_SIZE * FROM_ELEMS)); generator.generate(buffer, buffer+FROM_ELEMS); uneven_copy(buffer, dest, dest+size); free(static_cast(buffer)); } } template inline void generate_to(SeedSeq&& generator, DestIter dest) { typedef typename std::iterator_traits::value_type dest_t; constexpr bool IS_32BIT = sizeof(dest_t) == sizeof(uint32_t); generate_to_impl(std::forward(generator), dest, std::integral_constant{}); } /* generate_one, produce a value of integral type using a SeedSeq * (optionally, we can have it produce more than one and pick which one * we want) */ template inline UInt generate_one(SeedSeq&& generator) { UInt result[N]; generate_to(std::forward(generator), result); return result[i]; } template auto bounded_rand(RngType& rng, typename RngType::result_type upper_bound) -> typename RngType::result_type { typedef typename RngType::result_type rtype; rtype threshold = (RngType::max() - RngType::min() + rtype(1) - upper_bound) % upper_bound; for (;;) { rtype r = rng() - RngType::min(); if (r >= threshold) return r % upper_bound; } } template void shuffle(Iter from, Iter to, RandType&& rng) { typedef typename std::iterator_traits::difference_type delta_t; typedef typename std::remove_reference::type::result_type result_t; auto count = to - from; while (count > 1) { delta_t chosen = delta_t(bounded_rand(rng, result_t(count))); --count; --to; using std::swap; swap(*(from + chosen), *to); } } /* * Although std::seed_seq is useful, it isn't everything. Often we want to * initialize a random-number generator some other way, such as from a random * device. * * Technically, it does not meet the requirements of a SeedSequence because * it lacks some of the rarely-used member functions (some of which would * be impossible to provide). However the C++ standard is quite specific * that actual engines only called the generate method, so it ought not to be * a problem in practice. */ template class seed_seq_from { private: RngType rng_; typedef uint_least32_t result_type; public: template seed_seq_from(Args&&... args) : rng_(std::forward(args)...) { // Nothing (else) to do... } template void generate(Iter start, Iter finish) { for (auto i = start; i != finish; ++i) *i = result_type(rng_()); } constexpr size_t size() const { return (sizeof(typename RngType::result_type) > sizeof(result_type) && RngType::max() > ~size_t(0UL)) ? ~size_t(0UL) : size_t(RngType::max()); } }; /* * Sometimes you might want a distinct seed based on when the program * was compiled. That way, a particular instance of the program will * behave the same way, but when recompiled it'll produce a different * value. */ template struct static_arbitrary_seed { private: static constexpr IntType fnv(IntType hash, const char* pos) { return *pos == '\0' ? hash : fnv((hash * IntType(16777619U)) ^ *pos, (pos+1)); } public: static constexpr IntType value = fnv(IntType(2166136261U ^ sizeof(IntType)), __DATE__ __TIME__ __FILE__); }; // Sometimes, when debugging or testing, it's handy to be able print the name // of a (in human-readable form). 
This code allows the idiom: // // cout << printable_typename() // // to print out my_foo_type_t (or its concrete type if it is a synonym) #if __cpp_rtti || __GXX_RTTI template struct printable_typename {}; template std::ostream& operator<<(std::ostream& out, printable_typename) { const char *implementation_typename = typeid(T).name(); #ifdef __GNUC__ int status; char* pretty_name = abi::__cxa_demangle(implementation_typename, nullptr, nullptr, &status); if (status == 0) out << pretty_name; free(static_cast(pretty_name)); if (status == 0) return out; #endif out << implementation_typename; return out; } #endif // __cpp_rtti || __GXX_RTTI } // namespace pcg_extras #endif // PCG_EXTRAS_HPP_INCLUDED ospray-rkcommon-538f8a2/rkcommon/utility/detail/pcg_random.hpp000066400000000000000000002172431456117377200246560ustar00rootroot00000000000000/* * PCG Random Number Generation for C++ * * Copyright 2014-2019 Melissa O'Neill , * and the PCG Project contributors. * * SPDX-License-Identifier: (Apache-2.0 OR MIT) * * Licensed under the Apache License, Version 2.0 (provided in * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0) * or under the MIT license (provided in LICENSE-MIT.txt and at * http://opensource.org/licenses/MIT), at your option. This file may not * be copied, modified, or distributed except according to those terms. * * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See your chosen license for details. * * For additional information about the PCG random number generation scheme, * visit http://www.pcg-random.org/. */ /* * This code provides the reference implementation of the PCG family of * random number generators. The code is complex because it implements * * - several members of the PCG family, specifically members corresponding * to the output functions: * - XSH RR (good for 64-bit state, 32-bit output) * - XSH RS (good for 64-bit state, 32-bit output) * - XSL RR (good for 128-bit state, 64-bit output) * - RXS M XS (statistically most powerful generator) * - XSL RR RR (good for 128-bit state, 128-bit output) * - and RXS, RXS M, XSH, XSL (mostly for testing) * - at potentially *arbitrary* bit sizes * - with four different techniques for random streams (MCG, one-stream * LCG, settable-stream LCG, unique-stream LCG) * - and the extended generation schemes allowing arbitrary periods * - with all features of C++11 random number generation (and more), * some of which are somewhat painful, including * - initializing with a SeedSequence which writes 32-bit values * to memory, even though the state of the generator may not * use 32-bit values (it might use smaller or larger integers) * - I/O for RNGs and a prescribed format, which needs to handle * the issue that 8-bit and 128-bit integers don't have working * I/O routines (e.g., normally 8-bit = char, not integer) * - equality and inequality for RNGs * - and a number of convenience typedefs to mask all the complexity * * The code employes a fairly heavy level of abstraction, and has to deal * with various C++ minutia. If you're looking to learn about how the PCG * scheme works, you're probably best of starting with one of the other * codebases (see www.pcg-random.org). But if you're curious about the * constants for the various output functions used in those other, simpler, * codebases, this code shows how they are calculated. 
* * On the positive side, at least there are convenience typedefs so that you * can say * * pcg32 myRNG; * * rather than: * * pcg_detail::engine< * uint32_t, // Output Type * uint64_t, // State Type * pcg_detail::xsh_rr_mixin, true, // Output Func * pcg_detail::specific_stream, // Stream Kind * pcg_detail::default_multiplier // LCG Mult * > myRNG; * */ #ifndef PCG_RAND_HPP_INCLUDED #define PCG_RAND_HPP_INCLUDED 1 #include #include #include #include #include #include #include #include #include #include #include #include #include #include #ifdef _MSC_VER #pragma warning(disable:4146) #endif #ifdef _MSC_VER #define PCG_ALWAYS_INLINE __forceinline #elif __GNUC__ #define PCG_ALWAYS_INLINE __attribute__((always_inline)) #else #define PCG_ALWAYS_INLINE inline #endif /* * The pcg_extras namespace contains some support code that is likley to * be useful for a variety of RNGs, including: * - 128-bit int support for platforms where it isn't available natively * - bit twiddling operations * - I/O of 128-bit and 8-bit integers * - Handling the evilness of SeedSeq * - Support for efficiently producing random numbers less than a given * bound */ #include "pcg_extras.hpp" namespace pcg_detail { using namespace pcg_extras; /* * The LCG generators need some constants to function. This code lets you * look up the constant by *type*. For example * * default_multiplier::multiplier() * * gives you the default multipler for 32-bit integers. We use the name * of the constant and not a generic word like value to allow these classes * to be used as mixins. */ template struct default_multiplier { // Not defined for an arbitrary type }; template struct default_increment { // Not defined for an arbitrary type }; #define PCG_DEFINE_CONSTANT(type, what, kind, constant) \ template <> \ struct what ## _ ## kind { \ static constexpr type kind() { \ return constant; \ } \ }; PCG_DEFINE_CONSTANT(uint8_t, default, multiplier, 141U) PCG_DEFINE_CONSTANT(uint8_t, default, increment, 77U) PCG_DEFINE_CONSTANT(uint16_t, default, multiplier, 12829U) PCG_DEFINE_CONSTANT(uint16_t, default, increment, 47989U) PCG_DEFINE_CONSTANT(uint32_t, default, multiplier, 747796405U) PCG_DEFINE_CONSTANT(uint32_t, default, increment, 2891336453U) PCG_DEFINE_CONSTANT(uint64_t, default, multiplier, 6364136223846793005ULL) PCG_DEFINE_CONSTANT(uint64_t, default, increment, 1442695040888963407ULL) PCG_DEFINE_CONSTANT(pcg128_t, default, multiplier, PCG_128BIT_CONSTANT(2549297995355413924ULL,4865540595714422341ULL)) PCG_DEFINE_CONSTANT(pcg128_t, default, increment, PCG_128BIT_CONSTANT(6364136223846793005ULL,1442695040888963407ULL)) /* Alternative (cheaper) multipliers for 128-bit */ template struct cheap_multiplier : public default_multiplier { // For most types just use the default. 
}; template <> struct cheap_multiplier { static constexpr uint64_t multiplier() { return 0xda942042e4dd58b5ULL; } }; /* * Each PCG generator is available in four variants, based on how it applies * the additive constant for its underlying LCG; the variations are: * * single stream - all instances use the same fixed constant, thus * the RNG always somewhere in same sequence * mcg - adds zero, resulting in a single stream and reduced * period * specific stream - the constant can be changed at any time, selecting * a different random sequence * unique stream - the constant is based on the memory address of the * object, thus every RNG has its own unique sequence * * This variation is provided though mixin classes which define a function * value called increment() that returns the nesessary additive constant. */ /* * unique stream */ template class unique_stream { protected: static constexpr bool is_mcg = false; // Is never called, but is provided for symmetry with specific_stream void set_stream(...) { abort(); } public: typedef itype state_type; constexpr itype increment() const { return itype(reinterpret_cast(this) | 1); } constexpr itype stream() const { return increment() >> 1; } static constexpr bool can_specify_stream = false; static constexpr size_t streams_pow2() { return (sizeof(itype) < sizeof(size_t) ? sizeof(itype) : sizeof(size_t))*8 - 1u; } protected: constexpr unique_stream() = default; }; /* * no stream (mcg) */ template class no_stream { protected: static constexpr bool is_mcg = true; // Is never called, but is provided for symmetry with specific_stream void set_stream(...) { abort(); } public: typedef itype state_type; static constexpr itype increment() { return 0; } static constexpr bool can_specify_stream = false; static constexpr size_t streams_pow2() { return 0u; } protected: constexpr no_stream() = default; }; /* * single stream/sequence (oneseq) */ template class oneseq_stream : public default_increment { protected: static constexpr bool is_mcg = false; // Is never called, but is provided for symmetry with specific_stream void set_stream(...) { abort(); } public: typedef itype state_type; static constexpr itype stream() { return default_increment::increment() >> 1; } static constexpr bool can_specify_stream = false; static constexpr size_t streams_pow2() { return 0u; } protected: constexpr oneseq_stream() = default; }; /* * specific stream */ template class specific_stream { protected: static constexpr bool is_mcg = false; itype inc_ = default_increment::increment(); public: typedef itype state_type; typedef itype stream_state; constexpr itype increment() const { return inc_; } itype stream() { return inc_ >> 1; } void set_stream(itype specific_seq) { inc_ = (specific_seq << 1) | 1; } static constexpr bool can_specify_stream = true; static constexpr size_t streams_pow2() { return (sizeof(itype)*8) - 1u; } protected: specific_stream() = default; specific_stream(itype specific_seq) : inc_(itype(specific_seq << 1) | itype(1U)) { // Nothing (else) to do. } }; /* * This is where it all comes together. This function joins together three * mixin classes which define * - the LCG additive constant (the stream) * - the LCG multiplier * - the output function * in addition, we specify the type of the LCG state, and the result type, * and whether to use the pre-advance version of the state for the output * (increasing instruction-level parallelism) or the post-advance version * (reducing register pressure). 
* * Given the high level of parameterization, the code has to use some * template-metaprogramming tricks to handle some of the suble variations * involved. */ template , typename multiplier_mixin = default_multiplier > class engine : protected output_mixin, public stream_mixin, protected multiplier_mixin { protected: itype state_; struct can_specify_stream_tag {}; struct no_specifiable_stream_tag {}; using stream_mixin::increment; using multiplier_mixin::multiplier; public: typedef xtype result_type; typedef itype state_type; static constexpr size_t period_pow2() { return sizeof(state_type)*8 - 2*stream_mixin::is_mcg; } // It would be nice to use std::numeric_limits for these, but // we can't be sure that it'd be defined for the 128-bit types. static constexpr result_type min() { return result_type(0UL); } static constexpr result_type max() { return result_type(~result_type(0UL)); } protected: itype bump(itype state) { return state * multiplier() + increment(); } itype base_generate() { return state_ = bump(state_); } itype base_generate0() { itype old_state = state_; state_ = bump(state_); return old_state; } public: result_type operator()() { if (output_previous) return this->output(base_generate0()); else return this->output(base_generate()); } result_type operator()(result_type upper_bound) { return bounded_rand(*this, upper_bound); } protected: static itype advance(itype state, itype delta, itype cur_mult, itype cur_plus); static itype distance(itype cur_state, itype newstate, itype cur_mult, itype cur_plus, itype mask = ~itype(0U)); itype distance(itype newstate, itype mask = itype(~itype(0U))) const { return distance(state_, newstate, multiplier(), increment(), mask); } public: void advance(itype delta) { state_ = advance(state_, delta, this->multiplier(), this->increment()); } void backstep(itype delta) { advance(-delta); } void discard(itype delta) { advance(delta); } bool wrapped() { if (stream_mixin::is_mcg) { // For MCGs, the low order two bits never change. In this // implementation, we keep them fixed at 3 to make this test // easier. return state_ == 3; } else { return state_ == 0; } } engine(itype state = itype(0xcafef00dd15ea5e5ULL)) : state_(this->is_mcg ? state|state_type(3U) : bump(state + this->increment())) { // Nothing else to do. } // This function may or may not exist. It thus has to be a template // to use SFINAE; users don't have to worry about its template-ness. template engine(itype state, typename sm::stream_state stream_seed) : stream_mixin(stream_seed), state_(this->is_mcg ? state|state_type(3U) : bump(state + this->increment())) { // Nothing else to do. } template engine(SeedSeq&& seedSeq, typename std::enable_if< !stream_mixin::can_specify_stream && !std::is_convertible::value && !std::is_convertible::value, no_specifiable_stream_tag>::type = {}) : engine(generate_one(std::forward(seedSeq))) { // Nothing else to do. } template engine(SeedSeq&& seedSeq, typename std::enable_if< stream_mixin::can_specify_stream && !std::is_convertible::value && !std::is_convertible::value, can_specify_stream_tag>::type = {}) : engine(generate_one(seedSeq), generate_one(seedSeq)) { // Nothing else to do. } template void seed(Args&&... 
args) { new (this) engine(std::forward(args)...); } template friend bool operator==(const engine&, const engine&); template friend itype1 operator-(const engine&, const engine&); template friend std::basic_ostream& operator<<(std::basic_ostream& out, const engine&); template friend std::basic_istream& operator>>(std::basic_istream& in, engine& rng); }; template std::basic_ostream& operator<<(std::basic_ostream& out, const engine& rng) { using pcg_extras::operator<<; auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left); auto space = out.widen(' '); auto orig_fill = out.fill(); out << rng.multiplier() << space << rng.increment() << space << rng.state_; out.flags(orig_flags); out.fill(orig_fill); return out; } template std::basic_istream& operator>>(std::basic_istream& in, engine& rng) { using pcg_extras::operator>>; auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws); itype multiplier, increment, state; in >> multiplier >> increment >> state; if (!in.fail()) { bool good = true; if (multiplier != rng.multiplier()) { good = false; } else if (rng.can_specify_stream) { rng.set_stream(increment >> 1); } else if (increment != rng.increment()) { good = false; } if (good) { rng.state_ = state; } else { in.clear(std::ios::failbit); } } in.flags(orig_flags); return in; } template itype engine::advance( itype state, itype delta, itype cur_mult, itype cur_plus) { // The method used here is based on Brown, "Random Number Generation // with Arbitrary Stride,", Transactions of the American Nuclear // Society (Nov. 1994). The algorithm is very similar to fast // exponentiation. // // Even though delta is an unsigned integer, we can pass a // signed integer to go backwards, it just goes "the long way round". constexpr itype ZERO = 0u; // itype may be a non-trivial types, so constexpr itype ONE = 1u; // we define some ugly constants. itype acc_mult = 1; itype acc_plus = 0; while (delta > ZERO) { if (delta & ONE) { acc_mult *= cur_mult; acc_plus = acc_plus*cur_mult + cur_plus; } cur_plus = (cur_mult+ONE)*cur_plus; cur_mult *= cur_mult; delta >>= 1; } return acc_mult * state + acc_plus; } template itype engine::distance( itype cur_state, itype newstate, itype cur_mult, itype cur_plus, itype mask) { constexpr itype ONE = 1u; // itype could be weird, so use constant bool is_mcg = cur_plus == itype(0); itype the_bit = is_mcg ? itype(4u) : itype(1u); itype distance = 0u; while ((cur_state & mask) != (newstate & mask)) { if ((cur_state & the_bit) != (newstate & the_bit)) { cur_state = cur_state * cur_mult + cur_plus; distance |= the_bit; } assert((cur_state & the_bit) == (newstate & the_bit)); the_bit <<= 1; cur_plus = (cur_mult+ONE)*cur_plus; cur_mult *= cur_mult; } return is_mcg ? 
distance >> 2 : distance; } template itype operator-(const engine& lhs, const engine& rhs) { static_assert( std::is_same::value && std::is_same::value, "Incomparable generators"); if (lhs.increment() == rhs.increment()) { return rhs.distance(lhs.state_); } else { constexpr itype ONE = 1u; itype lhs_diff = lhs.increment() + (lhs.multiplier()-ONE) * lhs.state_; itype rhs_diff = rhs.increment() + (rhs.multiplier()-ONE) * rhs.state_; if ((lhs_diff & itype(3u)) != (rhs_diff & itype(3u))) { rhs_diff = -rhs_diff; } return rhs.distance(rhs_diff, lhs_diff, rhs.multiplier(), itype(0u)); } } template bool operator==(const engine& lhs, const engine& rhs) { return (lhs.multiplier() == rhs.multiplier()) && (lhs.increment() == rhs.increment()) && (lhs.state_ == rhs.state_); } template inline bool operator!=(const engine& lhs, const engine& rhs) { return !operator==(lhs,rhs); } template class output_mixin, bool output_previous = (sizeof(itype) <= 8), template class multiplier_mixin = default_multiplier> using oneseq_base = engine, output_previous, oneseq_stream, multiplier_mixin >; template class output_mixin, bool output_previous = (sizeof(itype) <= 8), template class multiplier_mixin = default_multiplier> using unique_base = engine, output_previous, unique_stream, multiplier_mixin >; template class output_mixin, bool output_previous = (sizeof(itype) <= 8), template class multiplier_mixin = default_multiplier> using setseq_base = engine, output_previous, specific_stream, multiplier_mixin >; template class output_mixin, bool output_previous = (sizeof(itype) <= 8), template class multiplier_mixin = default_multiplier> using mcg_base = engine, output_previous, no_stream, multiplier_mixin >; /* * OUTPUT FUNCTIONS. * * These are the core of the PCG generation scheme. They specify how to * turn the base LCG's internal state into the output value of the final * generator. * * They're implemented as mixin classes. * * All of the classes have code that is written to allow it to be applied * at *arbitrary* bit sizes, although in practice they'll only be used at * standard sizes supported by C++. */ /* * XSH RS -- high xorshift, followed by a random shift * * Fast. A good performer. */ template struct xsh_rs_mixin { static xtype output(itype internal) { constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t opbits = sparebits-5 >= 64 ? 5 : sparebits-4 >= 32 ? 4 : sparebits-3 >= 16 ? 3 : sparebits-2 >= 4 ? 2 : sparebits-1 >= 1 ? 1 : 0; constexpr bitcount_t mask = (1 << opbits) - 1; constexpr bitcount_t maxrandshift = mask; constexpr bitcount_t topspare = opbits; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = topspare + (xtypebits+maxrandshift)/2; bitcount_t rshift = opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0; internal ^= internal >> xshift; xtype result = xtype(internal >> (bottomspare - maxrandshift + rshift)); return result; } }; /* * XSH RR -- high xorshift, followed by a random rotate * * Fast. A good performer. Slightly better statistically than XSH RS. */ template struct xsh_rr_mixin { static xtype output(itype internal) { constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype)*8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t wantedopbits = xtypebits >= 128 ? 7 : xtypebits >= 64 ? 6 : xtypebits >= 32 ? 5 : xtypebits >= 16 ? 
4 : 3; constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits : sparebits; constexpr bitcount_t amplifier = wantedopbits - opbits; constexpr bitcount_t mask = (1 << opbits) - 1; constexpr bitcount_t topspare = opbits; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = (topspare + xtypebits)/2; bitcount_t rot = opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0; bitcount_t amprot = (rot << amplifier) & mask; internal ^= internal >> xshift; xtype result = xtype(internal >> bottomspare); result = rotr(result, amprot); return result; } }; /* * RXS -- random xorshift */ template struct rxs_mixin { static xtype output_rxs(itype internal) { constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype)*8); constexpr bitcount_t shift = bits - xtypebits; constexpr bitcount_t extrashift = (xtypebits - shift)/2; bitcount_t rshift = shift > 64+8 ? (internal >> (bits - 6)) & 63 : shift > 32+4 ? (internal >> (bits - 5)) & 31 : shift > 16+2 ? (internal >> (bits - 4)) & 15 : shift > 8+1 ? (internal >> (bits - 3)) & 7 : shift > 4+1 ? (internal >> (bits - 2)) & 3 : shift > 2+1 ? (internal >> (bits - 1)) & 1 : 0; internal ^= internal >> (shift + extrashift - rshift); xtype result = internal >> rshift; return result; } }; /* * RXS M XS -- random xorshift, mcg multiply, fixed xorshift * * The most statistically powerful generator, but all those steps * make it slower than some of the others. We give it the rottenest jobs. * * Because it's usually used in contexts where the state type and the * result type are the same, it is a permutation and is thus invertable. * We thus provide a function to invert it. This function is used to * for the "inside out" generator used by the extended generator. */ /* Defined type-based concepts for the multiplication step. They're actually * all derived by truncating the 128-bit, which was computed to be a good * "universal" constant. */ template struct mcg_multiplier { // Not defined for an arbitrary type }; template struct mcg_unmultiplier { // Not defined for an arbitrary type }; PCG_DEFINE_CONSTANT(uint8_t, mcg, multiplier, 217U) PCG_DEFINE_CONSTANT(uint8_t, mcg, unmultiplier, 105U) PCG_DEFINE_CONSTANT(uint16_t, mcg, multiplier, 62169U) PCG_DEFINE_CONSTANT(uint16_t, mcg, unmultiplier, 28009U) PCG_DEFINE_CONSTANT(uint32_t, mcg, multiplier, 277803737U) PCG_DEFINE_CONSTANT(uint32_t, mcg, unmultiplier, 2897767785U) PCG_DEFINE_CONSTANT(uint64_t, mcg, multiplier, 12605985483714917081ULL) PCG_DEFINE_CONSTANT(uint64_t, mcg, unmultiplier, 15009553638781119849ULL) PCG_DEFINE_CONSTANT(pcg128_t, mcg, multiplier, PCG_128BIT_CONSTANT(17766728186571221404ULL, 12605985483714917081ULL)) PCG_DEFINE_CONSTANT(pcg128_t, mcg, unmultiplier, PCG_128BIT_CONSTANT(14422606686972528997ULL, 15009553638781119849ULL)) template struct rxs_m_xs_mixin { static xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t opbits = xtypebits >= 128 ? 6 : xtypebits >= 64 ? 5 : xtypebits >= 32 ? 4 : xtypebits >= 16 ? 3 : 2; constexpr bitcount_t shift = bits - xtypebits; constexpr bitcount_t mask = (1 << opbits) - 1; bitcount_t rshift = opbits ? 
bitcount_t(internal >> (bits - opbits)) & mask : 0; internal ^= internal >> (opbits + rshift); internal *= mcg_multiplier::multiplier(); xtype result = internal >> shift; result ^= result >> ((2U*xtypebits+2U)/3U); return result; } static itype unoutput(itype internal) { constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t opbits = bits >= 128 ? 6 : bits >= 64 ? 5 : bits >= 32 ? 4 : bits >= 16 ? 3 : 2; constexpr bitcount_t mask = (1 << opbits) - 1; internal = unxorshift(internal, bits, (2U*bits+2U)/3U); internal *= mcg_unmultiplier::unmultiplier(); bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0; internal = unxorshift(internal, bits, opbits + rshift); return internal; } }; /* * RXS M -- random xorshift, mcg multiply */ template struct rxs_m_mixin { static xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t opbits = xtypebits >= 128 ? 6 : xtypebits >= 64 ? 5 : xtypebits >= 32 ? 4 : xtypebits >= 16 ? 3 : 2; constexpr bitcount_t shift = bits - xtypebits; constexpr bitcount_t mask = (1 << opbits) - 1; bitcount_t rshift = opbits ? (internal >> (bits - opbits)) & mask : 0; internal ^= internal >> (opbits + rshift); internal *= mcg_multiplier::multiplier(); xtype result = internal >> shift; return result; } }; /* * DXSM -- double xorshift multiply * * This is a new, more powerful output permutation (added in 2019). It's * a more comprehensive scrambling than RXS M, but runs faster on 128-bit * types. Although primarily intended for use at large sizes, also works * at smaller sizes as well. * * This permutation is similar to xorshift multiply hash functions, except * that one of the multipliers is the LCG multiplier (to avoid needing to * have a second constant) and the other is based on the low-order bits. * This latter aspect means that the scrambling applied to the high bits * depends on the low bits, and makes it (to my eye) impractical to back * out the permutation without having the low-order bits. */ template struct dxsm_mixin { inline xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t itypebits = bitcount_t(sizeof(itype) * 8); static_assert(xtypebits <= itypebits/2, "Output type must be half the size of the state type."); xtype hi = xtype(internal >> (itypebits - xtypebits)); xtype lo = xtype(internal); lo |= 1; hi ^= hi >> (xtypebits/2); hi *= xtype(cheap_multiplier::multiplier()); hi ^= hi >> (3*(xtypebits/4)); hi *= lo; return hi; } }; /* * XSL RR -- fixed xorshift (to low bits), random rotate * * Useful for 128-bit types that are split across two CPU registers. */ template struct xsl_rr_mixin { static xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t wantedopbits = xtypebits >= 128 ? 7 : xtypebits >= 64 ? 6 : xtypebits >= 32 ? 5 : xtypebits >= 16 ? 4 : 3; constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits : sparebits; constexpr bitcount_t amplifier = wantedopbits - opbits; constexpr bitcount_t mask = (1 << opbits) - 1; constexpr bitcount_t topspare = sparebits; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = (topspare + xtypebits) / 2; bitcount_t rot = opbits ? 
bitcount_t(internal >> (bits - opbits)) & mask : 0; bitcount_t amprot = (rot << amplifier) & mask; internal ^= internal >> xshift; xtype result = xtype(internal >> bottomspare); result = rotr(result, amprot); return result; } }; /* * XSL RR RR -- fixed xorshift (to low bits), random rotate (both parts) * * Useful for 128-bit types that are split across two CPU registers. * If you really want an invertable 128-bit RNG, I guess this is the one. */ template struct halfsize_trait {}; template <> struct halfsize_trait { typedef uint64_t type; }; template <> struct halfsize_trait { typedef uint32_t type; }; template <> struct halfsize_trait { typedef uint16_t type; }; template <> struct halfsize_trait { typedef uint8_t type; }; template struct xsl_rr_rr_mixin { typedef typename halfsize_trait::type htype; static itype output(itype internal) { constexpr bitcount_t htypebits = bitcount_t(sizeof(htype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t sparebits = bits - htypebits; constexpr bitcount_t wantedopbits = htypebits >= 128 ? 7 : htypebits >= 64 ? 6 : htypebits >= 32 ? 5 : htypebits >= 16 ? 4 : 3; constexpr bitcount_t opbits = sparebits >= wantedopbits ? wantedopbits : sparebits; constexpr bitcount_t amplifier = wantedopbits - opbits; constexpr bitcount_t mask = (1 << opbits) - 1; constexpr bitcount_t topspare = sparebits; constexpr bitcount_t xshift = (topspare + htypebits) / 2; bitcount_t rot = opbits ? bitcount_t(internal >> (bits - opbits)) & mask : 0; bitcount_t amprot = (rot << amplifier) & mask; internal ^= internal >> xshift; htype lowbits = htype(internal); lowbits = rotr(lowbits, amprot); htype highbits = htype(internal >> topspare); bitcount_t rot2 = lowbits & mask; bitcount_t amprot2 = (rot2 << amplifier) & mask; highbits = rotr(highbits, amprot2); return (itype(highbits) << topspare) ^ itype(lowbits); } }; /* * XSH -- fixed xorshift (to high bits) * * You shouldn't use this at 64-bits or less. */ template struct xsh_mixin { static xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t topspare = 0; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = (topspare + xtypebits) / 2; internal ^= internal >> xshift; xtype result = internal >> bottomspare; return result; } }; /* * XSL -- fixed xorshift (to low bits) * * You shouldn't use this at 64-bits or less. 
*/ template struct xsl_mixin { inline xtype output(itype internal) { constexpr bitcount_t xtypebits = bitcount_t(sizeof(xtype) * 8); constexpr bitcount_t bits = bitcount_t(sizeof(itype) * 8); constexpr bitcount_t sparebits = bits - xtypebits; constexpr bitcount_t topspare = sparebits; constexpr bitcount_t bottomspare = sparebits - topspare; constexpr bitcount_t xshift = (topspare + xtypebits) / 2; internal ^= internal >> xshift; xtype result = internal >> bottomspare; return result; } }; /* ---- End of Output Functions ---- */ template struct inside_out : private baseclass { inside_out() = delete; typedef typename baseclass::result_type result_type; typedef typename baseclass::state_type state_type; static_assert(sizeof(result_type) == sizeof(state_type), "Require a RNG whose output function is a permutation"); static bool external_step(result_type& randval, size_t i) { state_type state = baseclass::unoutput(randval); state = state * baseclass::multiplier() + baseclass::increment() + state_type(i*2); result_type result = baseclass::output(state); randval = result; state_type zero = baseclass::is_mcg ? state & state_type(3U) : state_type(0U); return result == zero; } static bool external_advance(result_type& randval, size_t i, result_type delta, bool forwards = true) { state_type state = baseclass::unoutput(randval); state_type mult = baseclass::multiplier(); state_type inc = baseclass::increment() + state_type(i*2); state_type zero = baseclass::is_mcg ? state & state_type(3U) : state_type(0U); state_type dist_to_zero = baseclass::distance(state, zero, mult, inc); bool crosses_zero = forwards ? dist_to_zero <= delta : (-dist_to_zero) <= delta; if (!forwards) delta = -delta; state = baseclass::advance(state, delta, mult, inc); randval = baseclass::output(state); return crosses_zero; } }; template class extended : public baseclass { public: typedef typename baseclass::state_type state_type; typedef typename baseclass::result_type result_type; typedef inside_out insideout; private: static constexpr bitcount_t rtypebits = sizeof(result_type)*8; static constexpr bitcount_t stypebits = sizeof(state_type)*8; static constexpr bitcount_t tick_limit_pow2 = 64U; static constexpr size_t table_size = 1UL << table_pow2; static constexpr size_t table_shift = stypebits - table_pow2; static constexpr state_type table_mask = (state_type(1U) << table_pow2) - state_type(1U); static constexpr bool may_tick = (advance_pow2 < stypebits) && (advance_pow2 < tick_limit_pow2); static constexpr size_t tick_shift = stypebits - advance_pow2; static constexpr state_type tick_mask = may_tick ? state_type( (uint64_t(1) << (advance_pow2*may_tick)) - 1) // ^-- stupidity to appease GCC warnings : ~state_type(0U); static constexpr bool may_tock = stypebits < tick_limit_pow2; result_type data_[table_size]; PCG_NOINLINE void advance_table(); PCG_NOINLINE void advance_table(state_type delta, bool isForwards = true); result_type& get_extended_value() { state_type state = this->state_; if (kdd && baseclass::is_mcg) { // The low order bits of an MCG are constant, so drop them. state >>= 2; } size_t index = kdd ? state & table_mask : state >> table_shift; if (may_tick) { bool tick = kdd ? 
(state & tick_mask) == state_type(0u) : (state >> tick_shift) == state_type(0u); if (tick) advance_table(); } if (may_tock) { bool tock = state == state_type(0u); if (tock) advance_table(); } return data_[index]; } public: static constexpr size_t period_pow2() { return baseclass::period_pow2() + table_size*extvalclass::period_pow2(); } PCG_ALWAYS_INLINE result_type operator()() { result_type rhs = get_extended_value(); result_type lhs = this->baseclass::operator()(); return lhs ^ rhs; } result_type operator()(result_type upper_bound) { return bounded_rand(*this, upper_bound); } void set(result_type wanted) { result_type& rhs = get_extended_value(); result_type lhs = this->baseclass::operator()(); rhs = lhs ^ wanted; } void advance(state_type distance, bool forwards = true); void backstep(state_type distance) { advance(distance, false); } extended(const result_type* data) : baseclass() { datainit(data); } extended(const result_type* data, state_type seed) : baseclass(seed) { datainit(data); } // This function may or may not exist. It thus has to be a template // to use SFINAE; users don't have to worry about its template-ness. template extended(const result_type* data, state_type seed, typename bc::stream_state stream_seed) : baseclass(seed, stream_seed) { datainit(data); } extended() : baseclass() { selfinit(); } extended(state_type seed) : baseclass(seed) { selfinit(); } // This function may or may not exist. It thus has to be a template // to use SFINAE; users don't have to worry about its template-ness. template extended(state_type seed, typename bc::stream_state stream_seed) : baseclass(seed, stream_seed) { selfinit(); } private: void selfinit(); void datainit(const result_type* data); public: template::value && !std::is_convertible::value>::type> extended(SeedSeq&& seedSeq) : baseclass(seedSeq) { generate_to(seedSeq, data_); } template void seed(Args&&... args) { new (this) extended(std::forward(args)...); } template friend bool operator==(const extended&, const extended&); template friend std::basic_ostream& operator<<(std::basic_ostream& out, const extended&); template friend std::basic_istream& operator>>(std::basic_istream& in, extended&); }; template void extended::datainit( const result_type* data) { for (size_t i = 0; i < table_size; ++i) data_[i] = data[i]; } template void extended::selfinit() { // We need to fill the extended table with something, and we have // very little provided data, so we use the base generator to // produce values. Although not ideal (use a seed sequence, folks!), // unexpected correlations are mitigated by // - using XOR differences rather than the number directly // - the way the table is accessed, its values *won't* be accessed // in the same order the were written. 
// - any strange correlations would only be apparent if we // were to backstep the generator so that the base generator // was generating the same values again result_type lhs = baseclass::operator()(); result_type rhs = baseclass::operator()(); result_type xdiff = lhs - rhs; for (size_t i = 0; i < table_size; ++i) { data_[i] = baseclass::operator()() ^ xdiff; } } template bool operator==(const extended& lhs, const extended& rhs) { auto& base_lhs = static_cast(lhs); auto& base_rhs = static_cast(rhs); return base_lhs == base_rhs && std::equal( std::begin(lhs.data_), std::end(lhs.data_), std::begin(rhs.data_) ); } template inline bool operator!=(const extended& lhs, const extended& rhs) { return !operator==(lhs, rhs); } template std::basic_ostream& operator<<(std::basic_ostream& out, const extended& rng) { auto orig_flags = out.flags(std::ios_base::dec | std::ios_base::left); auto space = out.widen(' '); auto orig_fill = out.fill(); out << rng.multiplier() << space << rng.increment() << space << rng.state_; for (const auto& datum : rng.data_) out << space << datum; out.flags(orig_flags); out.fill(orig_fill); return out; } template std::basic_istream& operator>>(std::basic_istream& in, extended& rng) { extended new_rng; auto& base_rng = static_cast(new_rng); in >> base_rng; if (in.fail()) return in; auto orig_flags = in.flags(std::ios_base::dec | std::ios_base::skipws); for (auto& datum : new_rng.data_) { in >> datum; if (in.fail()) goto bail; } rng = new_rng; bail: in.flags(orig_flags); return in; } template void extended::advance_table() { bool carry = false; for (size_t i = 0; i < table_size; ++i) { if (carry) { carry = insideout::external_step(data_[i],i+1); } bool carry2 = insideout::external_step(data_[i],i+1); carry = carry || carry2; } } template void extended::advance_table( state_type delta, bool isForwards) { typedef typename baseclass::state_type base_state_t; typedef typename extvalclass::state_type ext_state_t; constexpr bitcount_t basebits = sizeof(base_state_t)*8; constexpr bitcount_t extbits = sizeof(ext_state_t)*8; static_assert(basebits <= extbits || advance_pow2 > 0, "Current implementation might overflow its carry"); base_state_t carry = 0; for (size_t i = 0; i < table_size; ++i) { base_state_t total_delta = carry + delta; ext_state_t trunc_delta = ext_state_t(total_delta); if (basebits > extbits) { carry = total_delta >> extbits; } else { carry = 0; } carry += insideout::external_advance(data_[i],i+1, trunc_delta, isForwards); } } template void extended::advance( state_type distance, bool forwards) { static_assert(kdd, "Efficient advance is too hard for non-kdd extension. " "For a weak advance, cast to base class"); state_type zero = baseclass::is_mcg ? this->state_ & state_type(3U) : state_type(0U); if (may_tick) { state_type ticks = distance >> (advance_pow2*may_tick); // ^-- stupidity to appease GCC // warnings state_type adv_mask = baseclass::is_mcg ? 
tick_mask << 2 : tick_mask; state_type next_advance_distance = this->distance(zero, adv_mask); if (!forwards) next_advance_distance = (-next_advance_distance) & tick_mask; if (next_advance_distance < (distance & tick_mask)) { ++ticks; } if (ticks) advance_table(ticks, forwards); } if (forwards) { if (may_tock && this->distance(zero) <= distance) advance_table(); baseclass::advance(distance); } else { if (may_tock && -(this->distance(zero)) <= distance) advance_table(state_type(1U), false); baseclass::advance(-distance); } } } // namespace pcg_detail namespace pcg_engines { using namespace pcg_detail; /* Predefined types for XSH RS */ typedef oneseq_base oneseq_xsh_rs_16_8; typedef oneseq_base oneseq_xsh_rs_32_16; typedef oneseq_base oneseq_xsh_rs_64_32; typedef oneseq_base oneseq_xsh_rs_128_64; typedef oneseq_base cm_oneseq_xsh_rs_128_64; typedef unique_base unique_xsh_rs_16_8; typedef unique_base unique_xsh_rs_32_16; typedef unique_base unique_xsh_rs_64_32; typedef unique_base unique_xsh_rs_128_64; typedef unique_base cm_unique_xsh_rs_128_64; typedef setseq_base setseq_xsh_rs_16_8; typedef setseq_base setseq_xsh_rs_32_16; typedef setseq_base setseq_xsh_rs_64_32; typedef setseq_base setseq_xsh_rs_128_64; typedef setseq_base cm_setseq_xsh_rs_128_64; typedef mcg_base mcg_xsh_rs_16_8; typedef mcg_base mcg_xsh_rs_32_16; typedef mcg_base mcg_xsh_rs_64_32; typedef mcg_base mcg_xsh_rs_128_64; typedef mcg_base cm_mcg_xsh_rs_128_64; /* Predefined types for XSH RR */ typedef oneseq_base oneseq_xsh_rr_16_8; typedef oneseq_base oneseq_xsh_rr_32_16; typedef oneseq_base oneseq_xsh_rr_64_32; typedef oneseq_base oneseq_xsh_rr_128_64; typedef oneseq_base cm_oneseq_xsh_rr_128_64; typedef unique_base unique_xsh_rr_16_8; typedef unique_base unique_xsh_rr_32_16; typedef unique_base unique_xsh_rr_64_32; typedef unique_base unique_xsh_rr_128_64; typedef unique_base cm_unique_xsh_rr_128_64; typedef setseq_base setseq_xsh_rr_16_8; typedef setseq_base setseq_xsh_rr_32_16; typedef setseq_base setseq_xsh_rr_64_32; typedef setseq_base setseq_xsh_rr_128_64; typedef setseq_base cm_setseq_xsh_rr_128_64; typedef mcg_base mcg_xsh_rr_16_8; typedef mcg_base mcg_xsh_rr_32_16; typedef mcg_base mcg_xsh_rr_64_32; typedef mcg_base mcg_xsh_rr_128_64; typedef mcg_base cm_mcg_xsh_rr_128_64; /* Predefined types for RXS M XS */ typedef oneseq_base oneseq_rxs_m_xs_8_8; typedef oneseq_base oneseq_rxs_m_xs_16_16; typedef oneseq_base oneseq_rxs_m_xs_32_32; typedef oneseq_base oneseq_rxs_m_xs_64_64; typedef oneseq_base oneseq_rxs_m_xs_128_128; typedef oneseq_base cm_oneseq_rxs_m_xs_128_128; typedef unique_base unique_rxs_m_xs_8_8; typedef unique_base unique_rxs_m_xs_16_16; typedef unique_base unique_rxs_m_xs_32_32; typedef unique_base unique_rxs_m_xs_64_64; typedef unique_base unique_rxs_m_xs_128_128; typedef unique_base cm_unique_rxs_m_xs_128_128; typedef setseq_base setseq_rxs_m_xs_8_8; typedef setseq_base setseq_rxs_m_xs_16_16; typedef setseq_base setseq_rxs_m_xs_32_32; typedef setseq_base setseq_rxs_m_xs_64_64; typedef setseq_base setseq_rxs_m_xs_128_128; typedef setseq_base cm_setseq_rxs_m_xs_128_128; // MCG versions don't make sense here, so aren't defined. 
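// ---------------------------------------------------------------------------
// Illustrative usage sketch -- NOT part of the original pcg_random.hpp.
// It only shows how the predefined engine typedefs above are meant to be
// driven through the engine interface defined earlier in this header
// (operator(), bounded operator(), advance, backstep). Guarded with #if 0
// so it cannot affect compilation of the real header.
// ---------------------------------------------------------------------------
#if 0
inline void pcg_engine_usage_sketch()
{
  // setseq engines take a (state seed, stream selector) pair.
  setseq_rxs_m_xs_32_32 rng(42u, 54u);
  auto a = rng();       // next 32-bit output
  auto b = rng(100u);   // bounded output in [0, 100), via bounded_rand
  rng.advance(1000u);   // jump ahead 1000 steps (O(log n), Brown's method)
  rng.backstep(1000u);  // ...and step back to where we were
  (void)a;
  (void)b;
}
#endif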
/* Predefined types for RXS M */ typedef oneseq_base oneseq_rxs_m_16_8; typedef oneseq_base oneseq_rxs_m_32_16; typedef oneseq_base oneseq_rxs_m_64_32; typedef oneseq_base oneseq_rxs_m_128_64; typedef oneseq_base cm_oneseq_rxs_m_128_64; typedef unique_base unique_rxs_m_16_8; typedef unique_base unique_rxs_m_32_16; typedef unique_base unique_rxs_m_64_32; typedef unique_base unique_rxs_m_128_64; typedef unique_base cm_unique_rxs_m_128_64; typedef setseq_base setseq_rxs_m_16_8; typedef setseq_base setseq_rxs_m_32_16; typedef setseq_base setseq_rxs_m_64_32; typedef setseq_base setseq_rxs_m_128_64; typedef setseq_base cm_setseq_rxs_m_128_64; typedef mcg_base mcg_rxs_m_16_8; typedef mcg_base mcg_rxs_m_32_16; typedef mcg_base mcg_rxs_m_64_32; typedef mcg_base mcg_rxs_m_128_64; typedef mcg_base cm_mcg_rxs_m_128_64; /* Predefined types for DXSM */ typedef oneseq_base oneseq_dxsm_16_8; typedef oneseq_base oneseq_dxsm_32_16; typedef oneseq_base oneseq_dxsm_64_32; typedef oneseq_base oneseq_dxsm_128_64; typedef oneseq_base cm_oneseq_dxsm_128_64; typedef unique_base unique_dxsm_16_8; typedef unique_base unique_dxsm_32_16; typedef unique_base unique_dxsm_64_32; typedef unique_base unique_dxsm_128_64; typedef unique_base cm_unique_dxsm_128_64; typedef setseq_base setseq_dxsm_16_8; typedef setseq_base setseq_dxsm_32_16; typedef setseq_base setseq_dxsm_64_32; typedef setseq_base setseq_dxsm_128_64; typedef setseq_base cm_setseq_dxsm_128_64; typedef mcg_base mcg_dxsm_16_8; typedef mcg_base mcg_dxsm_32_16; typedef mcg_base mcg_dxsm_64_32; typedef mcg_base mcg_dxsm_128_64; typedef mcg_base cm_mcg_dxsm_128_64; /* Predefined types for XSL RR (only defined for "large" types) */ typedef oneseq_base oneseq_xsl_rr_64_32; typedef oneseq_base oneseq_xsl_rr_128_64; typedef oneseq_base cm_oneseq_xsl_rr_128_64; typedef unique_base unique_xsl_rr_64_32; typedef unique_base unique_xsl_rr_128_64; typedef unique_base cm_unique_xsl_rr_128_64; typedef setseq_base setseq_xsl_rr_64_32; typedef setseq_base setseq_xsl_rr_128_64; typedef setseq_base cm_setseq_xsl_rr_128_64; typedef mcg_base mcg_xsl_rr_64_32; typedef mcg_base mcg_xsl_rr_128_64; typedef mcg_base cm_mcg_xsl_rr_128_64; /* Predefined types for XSL RR RR (only defined for "large" types) */ typedef oneseq_base oneseq_xsl_rr_rr_64_64; typedef oneseq_base oneseq_xsl_rr_rr_128_128; typedef oneseq_base cm_oneseq_xsl_rr_rr_128_128; typedef unique_base unique_xsl_rr_rr_64_64; typedef unique_base unique_xsl_rr_rr_128_128; typedef unique_base cm_unique_xsl_rr_rr_128_128; typedef setseq_base setseq_xsl_rr_rr_64_64; typedef setseq_base setseq_xsl_rr_rr_128_128; typedef setseq_base cm_setseq_xsl_rr_rr_128_128; // MCG versions don't make sense here, so aren't defined. 
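// ---------------------------------------------------------------------------
// Illustrative sketch -- NOT part of the original source. It demonstrates two
// engine facilities defined earlier in this header: measuring the distance
// between two generators on the same stream with operator-, and saving /
// restoring generator state with the stream inserter and extractor. Assumes
// <sstream> is available; guarded with #if 0 so it has no effect on builds.
// ---------------------------------------------------------------------------
#if 0
#include <sstream>

inline void pcg_state_distance_and_io_sketch()
{
  setseq_xsh_rr_64_32 a(1u, 7u), b(1u, 7u);  // same seed, same stream
  b.discard(10);                             // b is now 10 steps ahead of a
  auto steps = b - a;                        // operator-: steps from a to b

  std::stringstream ss;
  ss << a;                   // textual form: multiplier, increment, state
  setseq_xsh_rr_64_32 c;
  ss >> c;                   // restore the saved state into c

  (void)steps;
  (void)c;
}
#endif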
/* Extended generators */ template using ext_std8 = extended; template using ext_std16 = extended; template using ext_std32 = extended; template using ext_std64 = extended; template using ext_oneseq_rxs_m_xs_32_32 = ext_std32; template using ext_mcg_xsh_rs_64_32 = ext_std32; template using ext_oneseq_xsh_rs_64_32 = ext_std32; template using ext_setseq_xsh_rr_64_32 = ext_std32; template using ext_mcg_xsl_rr_128_64 = ext_std64; template using ext_oneseq_xsl_rr_128_64 = ext_std64; template using ext_setseq_xsl_rr_128_64 = ext_std64; } // namespace pcg_engines typedef pcg_engines::setseq_xsh_rr_64_32 pcg32; typedef pcg_engines::oneseq_xsh_rr_64_32 pcg32_oneseq; typedef pcg_engines::unique_xsh_rr_64_32 pcg32_unique; typedef pcg_engines::mcg_xsh_rs_64_32 pcg32_fast; typedef pcg_engines::setseq_xsl_rr_128_64 pcg64; typedef pcg_engines::oneseq_xsl_rr_128_64 pcg64_oneseq; typedef pcg_engines::unique_xsl_rr_128_64 pcg64_unique; typedef pcg_engines::mcg_xsl_rr_128_64 pcg64_fast; typedef pcg_engines::setseq_rxs_m_xs_8_8 pcg8_once_insecure; typedef pcg_engines::setseq_rxs_m_xs_16_16 pcg16_once_insecure; typedef pcg_engines::setseq_rxs_m_xs_32_32 pcg32_once_insecure; typedef pcg_engines::setseq_rxs_m_xs_64_64 pcg64_once_insecure; typedef pcg_engines::setseq_xsl_rr_rr_128_128 pcg128_once_insecure; typedef pcg_engines::oneseq_rxs_m_xs_8_8 pcg8_oneseq_once_insecure; typedef pcg_engines::oneseq_rxs_m_xs_16_16 pcg16_oneseq_once_insecure; typedef pcg_engines::oneseq_rxs_m_xs_32_32 pcg32_oneseq_once_insecure; typedef pcg_engines::oneseq_rxs_m_xs_64_64 pcg64_oneseq_once_insecure; typedef pcg_engines::oneseq_xsl_rr_rr_128_128 pcg128_oneseq_once_insecure; // These two extended RNGs provide two-dimensionally equidistributed // 32-bit generators. pcg32_k2_fast occupies the same space as pcg64, // and can be called twice to generate 64 bits, but does not required // 128-bit math; on 32-bit systems, it's faster than pcg64 as well. 
typedef pcg_engines::ext_setseq_xsh_rr_64_32<1,16,true> pcg32_k2; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<1,32,true> pcg32_k2_fast; // These eight extended RNGs have about as much state as arc4random // // - the k variants are k-dimensionally equidistributed // - the c variants offer better crypographic security // // (just how good the cryptographic security is is an open question) typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,true> pcg32_k64; typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,true> pcg32_k64_oneseq; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,true> pcg32_k64_fast; typedef pcg_engines::ext_setseq_xsh_rr_64_32<6,16,false> pcg32_c64; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<6,32,false> pcg32_c64_oneseq; typedef pcg_engines::ext_mcg_xsh_rs_64_32<6,32,false> pcg32_c64_fast; typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,true> pcg64_k32; typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,true> pcg64_k32_oneseq; typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,true> pcg64_k32_fast; typedef pcg_engines::ext_setseq_xsl_rr_128_64<5,16,false> pcg64_c32; typedef pcg_engines::ext_oneseq_xsl_rr_128_64<5,128,false> pcg64_c32_oneseq; typedef pcg_engines::ext_mcg_xsl_rr_128_64<5,128,false> pcg64_c32_fast; // These eight extended RNGs have more state than the Mersenne twister // // - the k variants are k-dimensionally equidistributed // - the c variants offer better crypographic security // // (just how good the cryptographic security is is an open question) typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,true> pcg32_k1024; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,true> pcg32_k1024_fast; typedef pcg_engines::ext_setseq_xsh_rr_64_32<10,16,false> pcg32_c1024; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<10,32,false> pcg32_c1024_fast; typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,true> pcg64_k1024; typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,true> pcg64_k1024_fast; typedef pcg_engines::ext_setseq_xsl_rr_128_64<10,16,false> pcg64_c1024; typedef pcg_engines::ext_oneseq_xsl_rr_128_64<10,128,false> pcg64_c1024_fast; // These generators have an insanely huge period (2^524352), and is suitable // for silly party tricks, such as dumping out 64 KB ZIP files at an arbitrary // point in the future. [Actually, over the full period of the generator, it // will produce every 64 KB ZIP file 2^64 times!] typedef pcg_engines::ext_setseq_xsh_rr_64_32<14,16,true> pcg32_k16384; typedef pcg_engines::ext_oneseq_xsh_rs_64_32<14,32,true> pcg32_k16384_fast; #ifdef _MSC_VER #pragma warning(default:4146) #endif #endif // PCG_RAND_HPP_INCLUDED ospray-rkcommon-538f8a2/rkcommon/utility/detail/pcg_uint128.hpp000066400000000000000000000603751456117377200246120ustar00rootroot00000000000000/* * PCG Random Number Generation for C++ * * Copyright 2014-2017 Melissa O'Neill , * and the PCG Project contributors. * * SPDX-License-Identifier: (Apache-2.0 OR MIT) * * Licensed under the Apache License, Version 2.0 (provided in * LICENSE-APACHE.txt and at http://www.apache.org/licenses/LICENSE-2.0) * or under the MIT license (provided in LICENSE-MIT.txt and at * http://opensource.org/licenses/MIT), at your option. This file may not * be copied, modified, or distributed except according to those terms. * * Distributed on an "AS IS" BASIS, WITHOUT WARRANTY OF ANY KIND, either * express or implied. See your chosen license for details. * * For additional information about the PCG random number generation scheme, * visit http://www.pcg-random.org/. 
*/ /* * This code provides a a C++ class that can provide 128-bit (or higher) * integers. To produce 2K-bit integers, it uses two K-bit integers, * placed in a union that allowes the code to also see them as four K/2 bit * integers (and access them either directly name, or by index). * * It may seem like we're reinventing the wheel here, because several * libraries already exist that support large integers, but most existing * libraries provide a very generic multiprecision code, but here we're * operating at a fixed size. Also, most other libraries are fairly * heavyweight. So we use a direct implementation. Sadly, it's much slower * than hand-coded assembly or direct CPU support. */ #ifndef PCG_UINT128_HPP_INCLUDED #define PCG_UINT128_HPP_INCLUDED 1 #include #include #include #include #include #include #include #if defined(_MSC_VER) // Use MSVC++ intrinsics #include #endif /* * We want to lay the type out the same way that a native type would be laid * out, which means we must know the machine's endian, at compile time. * This ugliness attempts to do so. */ #ifndef PCG_LITTLE_ENDIAN #if defined(__BYTE_ORDER__) #if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ #define PCG_LITTLE_ENDIAN 1 #elif __BYTE_ORDER__ == __ORDER_BIG_ENDIAN__ #define PCG_LITTLE_ENDIAN 0 #else #error __BYTE_ORDER__ does not match a standard endian, pick a side #endif #elif __LITTLE_ENDIAN__ || _LITTLE_ENDIAN #define PCG_LITTLE_ENDIAN 1 #elif __BIG_ENDIAN__ || _BIG_ENDIAN #define PCG_LITTLE_ENDIAN 0 #elif __x86_64 || __x86_64__ || _M_X64 || __i386 || __i386__ || _M_IX86 #define PCG_LITTLE_ENDIAN 1 #elif __powerpc__ || __POWERPC__ || __ppc__ || __PPC__ \ || __m68k__ || __mc68000__ #define PCG_LITTLE_ENDIAN 0 #else #error Unable to determine target endianness #endif #endif namespace pcg_extras { // Recent versions of GCC have intrinsics we can use to quickly calculate // the number of leading and trailing zeros in a number. If possible, we // use them, otherwise we fall back to old-fashioned bit twiddling to figure // them out. #ifndef PCG_BITCOUNT_T typedef uint8_t bitcount_t; #else typedef PCG_BITCOUNT_T bitcount_t; #endif /* * Provide some useful helper functions * * flog2 floor(log2(x)) * * trailingzeros number of trailing zero bits */ #if defined(__GNUC__) // Any GNU-compatible compiler supporting C++11 has // some useful intrinsics we can use. 
inline bitcount_t flog2(uint32_t v) { return 31 - __builtin_clz(v); } inline bitcount_t trailingzeros(uint32_t v) { return __builtin_ctz(v); } inline bitcount_t flog2(uint64_t v) { #if UINT64_MAX == ULONG_MAX return 63 - __builtin_clzl(v); #elif UINT64_MAX == ULLONG_MAX return 63 - __builtin_clzll(v); #else #error Cannot find a function for uint64_t #endif } inline bitcount_t trailingzeros(uint64_t v) { #if UINT64_MAX == ULONG_MAX return __builtin_ctzl(v); #elif UINT64_MAX == ULLONG_MAX return __builtin_ctzll(v); #else #error Cannot find a function for uint64_t #endif } #elif defined(_MSC_VER) // Use MSVC++ intrinsics #pragma intrinsic(_BitScanReverse, _BitScanForward) #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) #pragma intrinsic(_BitScanReverse64, _BitScanForward64) #endif inline bitcount_t flog2(uint32_t v) { unsigned long i; _BitScanReverse(&i, v); return i; } inline bitcount_t trailingzeros(uint32_t v) { unsigned long i; _BitScanForward(&i, v); return i; } inline bitcount_t flog2(uint64_t v) { #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) unsigned long i; _BitScanReverse64(&i, v); return i; #else // 32-bit x86 uint32_t high = v >> 32; uint32_t low = uint32_t(v); return high ? 32+flog2(high) : flog2(low); #endif } inline bitcount_t trailingzeros(uint64_t v) { #if defined(_M_X64) || defined(_M_ARM) || defined(_M_ARM64) unsigned long i; _BitScanForward64(&i, v); return i; #else // 32-bit x86 uint32_t high = v >> 32; uint32_t low = uint32_t(v); return low ? trailingzeros(low) : trailingzeros(high)+32; #endif } #else // Otherwise, we fall back to bit twiddling // implementations inline bitcount_t flog2(uint32_t v) { // Based on code by Eric Cole and Mark Dickinson, which appears at // https://graphics.stanford.edu/~seander/bithacks.html#IntegerLogDeBruijn static const uint8_t multiplyDeBruijnBitPos[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; v |= v >> 1; // first round down to one less than a power of 2 v |= v >> 2; v |= v >> 4; v |= v >> 8; v |= v >> 16; return multiplyDeBruijnBitPos[(uint32_t)(v * 0x07C4ACDDU) >> 27]; } inline bitcount_t trailingzeros(uint32_t v) { static const uint8_t multiplyDeBruijnBitPos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; return multiplyDeBruijnBitPos[((uint32_t)((v & -v) * 0x077CB531U)) >> 27]; } inline bitcount_t flog2(uint64_t v) { uint32_t high = v >> 32; uint32_t low = uint32_t(v); return high ? 32+flog2(high) : flog2(low); } inline bitcount_t trailingzeros(uint64_t v) { uint32_t high = v >> 32; uint32_t low = uint32_t(v); return low ? trailingzeros(low) : trailingzeros(high)+32; } #endif inline bitcount_t flog2(uint8_t v) { return flog2(uint32_t(v)); } inline bitcount_t flog2(uint16_t v) { return flog2(uint32_t(v)); } #if __SIZEOF_INT128__ inline bitcount_t flog2(__uint128_t v) { uint64_t high = uint64_t(v >> 64); uint64_t low = uint64_t(v); return high ? 64+flog2(high) : flog2(low); } #endif inline bitcount_t trailingzeros(uint8_t v) { return trailingzeros(uint32_t(v)); } inline bitcount_t trailingzeros(uint16_t v) { return trailingzeros(uint32_t(v)); } #if __SIZEOF_INT128__ inline bitcount_t trailingzeros(__uint128_t v) { uint64_t high = uint64_t(v >> 64); uint64_t low = uint64_t(v); return low ? 
trailingzeros(low) : trailingzeros(high)+64; } #endif template inline bitcount_t clog2(UInt v) { return flog2(v) + ((v & (-v)) != v); } template inline UInt addwithcarry(UInt x, UInt y, bool carryin, bool* carryout) { UInt half_result = y + carryin; UInt result = x + half_result; *carryout = (half_result < y) || (result < x); return result; } template inline UInt subwithcarry(UInt x, UInt y, bool carryin, bool* carryout) { UInt half_result = y + carryin; UInt result = x - half_result; *carryout = (half_result < y) || (result > x); return result; } template class uint_x4 { // private: static constexpr unsigned int UINT_BITS = sizeof(UInt) * CHAR_BIT; public: union { #if PCG_LITTLE_ENDIAN struct { UInt v0, v1, v2, v3; } w; struct { UIntX2 v01, v23; } d; #else struct { UInt v3, v2, v1, v0; } w; struct { UIntX2 v23, v01; } d; #endif // For the array access versions, the code that uses the array // must handle endian itself. Yuck. UInt wa[4]; UIntX2 da[2]; }; public: uint_x4() = default; constexpr uint_x4(UInt v3, UInt v2, UInt v1, UInt v0) #if PCG_LITTLE_ENDIAN : w{v0, v1, v2, v3} #else : w{v3, v2, v1, v0} #endif { // Nothing (else) to do } constexpr uint_x4(UIntX2 v23, UIntX2 v01) #if PCG_LITTLE_ENDIAN : d{v01,v23} #else : d{v23,v01} #endif { // Nothing (else) to do } constexpr uint_x4(UIntX2 v01) #if PCG_LITTLE_ENDIAN : d{v01, UIntX2(0)} #else : d{UIntX2(0),v01} #endif { // Nothing (else) to do } template::value && sizeof(Integral) <= sizeof(UIntX2)) >::type* = nullptr> constexpr uint_x4(Integral v01) #if PCG_LITTLE_ENDIAN : d{UIntX2(v01), UIntX2(0)} #else : d{UIntX2(0), UIntX2(v01)} #endif { // Nothing (else) to do } explicit constexpr operator UIntX2() const { return d.v01; } template::value && sizeof(Integral) <= sizeof(UIntX2)) >::type* = nullptr> explicit constexpr operator Integral() const { return Integral(d.v01); } explicit constexpr operator bool() const { return d.v01 || d.v23; } template friend uint_x4 operator*(const uint_x4&, const uint_x4&); template friend uint_x4 operator*(const uint_x4&, V); template friend std::pair< uint_x4,uint_x4 > divmod(const uint_x4&, const uint_x4&); template friend uint_x4 operator+(const uint_x4&, const uint_x4&); template friend uint_x4 operator-(const uint_x4&, const uint_x4&); template friend uint_x4 operator<<(const uint_x4&, const bitcount_t shift); template friend uint_x4 operator>>(const uint_x4&, const bitcount_t shift); template friend uint_x4 operator&(const uint_x4&, const uint_x4&); template friend uint_x4 operator|(const uint_x4&, const uint_x4&); template friend uint_x4 operator^(const uint_x4&, const uint_x4&); template friend bool operator==(const uint_x4&, const uint_x4&); template friend bool operator!=(const uint_x4&, const uint_x4&); template friend bool operator<(const uint_x4&, const uint_x4&); template friend bool operator<=(const uint_x4&, const uint_x4&); template friend bool operator>(const uint_x4&, const uint_x4&); template friend bool operator>=(const uint_x4&, const uint_x4&); template friend uint_x4 operator~(const uint_x4&); template friend uint_x4 operator-(const uint_x4&); template friend bitcount_t flog2(const uint_x4&); template friend bitcount_t trailingzeros(const uint_x4&); uint_x4& operator*=(const uint_x4& rhs) { uint_x4 result = *this * rhs; return *this = result; } uint_x4& operator*=(UIntX2 rhs) { uint_x4 result = *this * rhs; return *this = result; } uint_x4& operator/=(const uint_x4& rhs) { uint_x4 result = *this / rhs; return *this = result; } uint_x4& operator%=(const uint_x4& rhs) { uint_x4 result = 
*this % rhs; return *this = result; } uint_x4& operator+=(const uint_x4& rhs) { uint_x4 result = *this + rhs; return *this = result; } uint_x4& operator-=(const uint_x4& rhs) { uint_x4 result = *this - rhs; return *this = result; } uint_x4& operator&=(const uint_x4& rhs) { uint_x4 result = *this & rhs; return *this = result; } uint_x4& operator|=(const uint_x4& rhs) { uint_x4 result = *this | rhs; return *this = result; } uint_x4& operator^=(const uint_x4& rhs) { uint_x4 result = *this ^ rhs; return *this = result; } uint_x4& operator>>=(bitcount_t shift) { uint_x4 result = *this >> shift; return *this = result; } uint_x4& operator<<=(bitcount_t shift) { uint_x4 result = *this << shift; return *this = result; } }; template bitcount_t flog2(const uint_x4& v) { #if PCG_LITTLE_ENDIAN for (uint8_t i = 4; i !=0; /* dec in loop */) { --i; #else for (uint8_t i = 0; i < 4; ++i) { #endif if (v.wa[i] == 0) continue; return flog2(v.wa[i]) + uint_x4::UINT_BITS*i; } abort(); } template bitcount_t trailingzeros(const uint_x4& v) { #if PCG_LITTLE_ENDIAN for (uint8_t i = 0; i < 4; ++i) { #else for (uint8_t i = 4; i !=0; /* dec in loop */) { --i; #endif if (v.wa[i] != 0) return trailingzeros(v.wa[i]) + uint_x4::UINT_BITS*i; } return uint_x4::UINT_BITS*4; } template std::pair< uint_x4, uint_x4 > divmod(const uint_x4& orig_dividend, const uint_x4& divisor) { // If the dividend is less than the divisor, the answer is always zero. // This takes care of boundary cases like 0/x (which would otherwise be // problematic because we can't take the log of zero. (The boundary case // of division by zero is undefined.) if (orig_dividend < divisor) return { uint_x4(UIntX2(0)), orig_dividend }; auto dividend = orig_dividend; auto log2_divisor = flog2(divisor); auto log2_dividend = flog2(dividend); // assert(log2_dividend >= log2_divisor); bitcount_t logdiff = log2_dividend - log2_divisor; constexpr uint_x4 ONE(UIntX2(1)); if (logdiff == 0) return { ONE, dividend - divisor }; // Now we change the log difference to // floor(log2(divisor)) - ceil(log2(dividend)) // to ensure that we *underestimate* the result. 
logdiff -= 1; uint_x4 quotient(UIntX2(0)); auto qfactor = ONE << logdiff; auto factor = divisor << logdiff; do { dividend -= factor; quotient += qfactor; while (dividend < factor) { factor >>= 1; qfactor >>= 1; } } while (dividend >= divisor); return { quotient, dividend }; } template uint_x4 operator/(const uint_x4& dividend, const uint_x4& divisor) { return divmod(dividend, divisor).first; } template uint_x4 operator%(const uint_x4& dividend, const uint_x4& divisor) { return divmod(dividend, divisor).second; } template uint_x4 operator*(const uint_x4& a, const uint_x4& b) { constexpr auto UINT_BITS = uint_x4::UINT_BITS; uint_x4 r = {0U, 0U, 0U, 0U}; bool carryin = false; bool carryout; UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(b.w.v0); r.w.v0 = UInt(a0b0); r.w.v1 = UInt(a0b0 >> UINT_BITS); UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(b.w.v0); r.w.v2 = UInt(a1b0 >> UINT_BITS); r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b.w.v1); carryin = false; r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); carryin = false; r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b.w.v1); carryin = false; r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout); r.d.v23 += a.d.v01 * b.d.v23 + a.d.v23 * b.d.v01; return r; } template uint_x4 operator*(const uint_x4& a, UIntX2 b01) { constexpr auto UINT_BITS = uint_x4::UINT_BITS; uint_x4 r = {0U, 0U, 0U, 0U}; bool carryin = false; bool carryout; UIntX2 a0b0 = UIntX2(a.w.v0) * UIntX2(UInt(b01)); r.w.v0 = UInt(a0b0); r.w.v1 = UInt(a0b0 >> UINT_BITS); UIntX2 a1b0 = UIntX2(a.w.v1) * UIntX2(UInt(b01)); r.w.v2 = UInt(a1b0 >> UINT_BITS); r.w.v1 = addwithcarry(r.w.v1, UInt(a1b0), carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); UIntX2 a0b1 = UIntX2(a.w.v0) * UIntX2(b01 >> UINT_BITS); carryin = false; r.w.v2 = addwithcarry(r.w.v2, UInt(a0b1 >> UINT_BITS), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); carryin = false; r.w.v1 = addwithcarry(r.w.v1, UInt(a0b1), carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(r.w.v2, UInt(0U), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(0U), carryin, &carryout); UIntX2 a1b1 = UIntX2(a.w.v1) * UIntX2(b01 >> UINT_BITS); carryin = false; r.w.v2 = addwithcarry(r.w.v2, UInt(a1b1), carryin, &carryout); carryin = carryout; r.w.v3 = addwithcarry(r.w.v3, UInt(a1b1 >> UINT_BITS), carryin, &carryout); r.d.v23 += a.d.v23 * b01; return r; } template uint_x4 operator+(const uint_x4& a, const uint_x4& b) { uint_x4 r = {0U, 0U, 0U, 0U}; bool carryin = false; bool carryout; r.w.v0 = addwithcarry(a.w.v0, b.w.v0, carryin, &carryout); carryin = carryout; r.w.v1 = addwithcarry(a.w.v1, b.w.v1, carryin, &carryout); carryin = carryout; r.w.v2 = addwithcarry(a.w.v2, b.w.v2, carryin, &carryout); carryin = carryout; 
r.w.v3 = addwithcarry(a.w.v3, b.w.v3, carryin, &carryout); return r; } template uint_x4 operator-(const uint_x4& a, const uint_x4& b) { uint_x4 r = {0U, 0U, 0U, 0U}; bool carryin = false; bool carryout; r.w.v0 = subwithcarry(a.w.v0, b.w.v0, carryin, &carryout); carryin = carryout; r.w.v1 = subwithcarry(a.w.v1, b.w.v1, carryin, &carryout); carryin = carryout; r.w.v2 = subwithcarry(a.w.v2, b.w.v2, carryin, &carryout); carryin = carryout; r.w.v3 = subwithcarry(a.w.v3, b.w.v3, carryin, &carryout); return r; } template uint_x4 operator&(const uint_x4& a, const uint_x4& b) { return uint_x4(a.d.v23 & b.d.v23, a.d.v01 & b.d.v01); } template uint_x4 operator|(const uint_x4& a, const uint_x4& b) { return uint_x4(a.d.v23 | b.d.v23, a.d.v01 | b.d.v01); } template uint_x4 operator^(const uint_x4& a, const uint_x4& b) { return uint_x4(a.d.v23 ^ b.d.v23, a.d.v01 ^ b.d.v01); } template uint_x4 operator~(const uint_x4& v) { return uint_x4(~v.d.v23, ~v.d.v01); } template uint_x4 operator-(const uint_x4& v) { return uint_x4(0UL,0UL) - v; } template bool operator==(const uint_x4& a, const uint_x4& b) { return (a.d.v01 == b.d.v01) && (a.d.v23 == b.d.v23); } template bool operator!=(const uint_x4& a, const uint_x4& b) { return !operator==(a,b); } template bool operator<(const uint_x4& a, const uint_x4& b) { return (a.d.v23 < b.d.v23) || ((a.d.v23 == b.d.v23) && (a.d.v01 < b.d.v01)); } template bool operator>(const uint_x4& a, const uint_x4& b) { return operator<(b,a); } template bool operator<=(const uint_x4& a, const uint_x4& b) { return !(operator<(b,a)); } template bool operator>=(const uint_x4& a, const uint_x4& b) { return !(operator<(a,b)); } template uint_x4 operator<<(const uint_x4& v, const bitcount_t shift) { uint_x4 r = {0U, 0U, 0U, 0U}; const bitcount_t bits = uint_x4::UINT_BITS; const bitcount_t bitmask = bits - 1; const bitcount_t shiftdiv = shift / bits; const bitcount_t shiftmod = shift & bitmask; if (shiftmod) { UInt carryover = 0; #if PCG_LITTLE_ENDIAN for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { #else for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { --out, --in; #endif r.wa[out] = (v.wa[in] << shiftmod) | carryover; carryover = (v.wa[in] >> (bits - shiftmod)); } } else { #if PCG_LITTLE_ENDIAN for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { #else for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { --out, --in; #endif r.wa[out] = v.wa[in]; } } return r; } template uint_x4 operator>>(const uint_x4& v, const bitcount_t shift) { uint_x4 r = {0U, 0U, 0U, 0U}; const bitcount_t bits = uint_x4::UINT_BITS; const bitcount_t bitmask = bits - 1; const bitcount_t shiftdiv = shift / bits; const bitcount_t shiftmod = shift & bitmask; if (shiftmod) { UInt carryover = 0; #if PCG_LITTLE_ENDIAN for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { --out, --in; #else for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { #endif r.wa[out] = (v.wa[in] >> shiftmod) | carryover; carryover = (v.wa[in] << (bits - shiftmod)); } } else { #if PCG_LITTLE_ENDIAN for (uint8_t out = 4-shiftdiv, in = 4; out != 0; /* dec in loop */) { --out, --in; #else for (uint8_t out = shiftdiv, in = 0; out < 4; ++out, ++in) { #endif r.wa[out] = v.wa[in]; } } return r; } } // namespace pcg_extras #endif // PCG_UINT128_HPP_INCLUDED ospray-rkcommon-538f8a2/rkcommon/utility/getEnvVar.h000066400000000000000000000025751456117377200226440ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once 
#include "Optional.h" namespace rkcommon { namespace utility { template inline Optional getEnvVar(const std::string & /*var*/) { static_assert(!std::is_same::value && !std::is_same::value && !std::is_same::value, "You can only get an int, float, or std::string " "when using ospray::getEnvVar()!"); return {}; } template <> inline Optional getEnvVar(const std::string &var) { auto *str = getenv(var.c_str()); bool found = (str != nullptr); return found ? Optional((float)atof(str)) : Optional(); } template <> inline Optional getEnvVar(const std::string &var) { auto *str = getenv(var.c_str()); bool found = (str != nullptr); return found ? Optional(atoi(str)) : Optional(); } template <> inline Optional getEnvVar(const std::string &var) { auto *str = getenv(var.c_str()); bool found = (str != nullptr); return found ? Optional(std::string(str)) : Optional(); } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/multidim_index_sequence.h000066400000000000000000000141151456117377200256370ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../math/vec.h" namespace rkcommon { using namespace math; template struct multidim_index_iterator; template struct multidim_index_sequence { static_assert(NDIMS == 2 || NDIMS == 3, "rkcommon::multidim_index_sequence is currently limited to" " only 2 or 3 dimensions. (NDIMS == 2 || NDIMS == 3)"); multidim_index_sequence(const vec_t &_dims); size_t flatten(const vec_t &coords) const; vec_t reshape(size_t i) const; vec_t dimensions() const; size_t total_indices() const; multidim_index_iterator begin() const; multidim_index_iterator end() const; private: vec_t dims{0}; }; using index_sequence_2D = multidim_index_sequence<2>; using index_sequence_3D = multidim_index_sequence<3>; template struct multidim_index_iterator { multidim_index_iterator(const vec_t &_dims) : dims(_dims) {} multidim_index_iterator(const vec_t &_dims, size_t start) : multidim_index_iterator(_dims) { current_index = start; } // Traditional iterator interface methods // vec_t operator*() const; multidim_index_iterator operator++(); multidim_index_iterator &operator++(int); multidim_index_iterator operator--(); multidim_index_iterator &operator--(int); multidim_index_iterator &operator+(const multidim_index_iterator &other); multidim_index_iterator &operator-(const multidim_index_iterator &other); multidim_index_iterator &operator+(size_t other); multidim_index_iterator &operator-(size_t other); bool operator==(const multidim_index_iterator &other) const; bool operator!=(const multidim_index_iterator &other) const; // Extra helper methods // void jump_to(size_t index); size_t current() const; private: multidim_index_sequence dims; size_t current_index{0}; }; // Inlined multidim_index_sequence definitions ////////////////////////////// template inline multidim_index_sequence::multidim_index_sequence( const vec_t &_dims) : dims(_dims) { } template <> inline size_t index_sequence_2D::flatten(const vec_t &coords) const { return coords.x + dims.x * coords.y; } template <> inline size_t index_sequence_3D::flatten(const vec_t &coords) const { return coords.x + dims.x * (coords.y + dims.y * coords.z); } template <> inline vec_t index_sequence_2D::reshape(size_t i) const { size_t y = i / dims.x; size_t x = i % dims.x; return vec_t(x, y); } template <> inline vec_t index_sequence_3D::reshape(size_t i) const { size_t z = i / (dims.x * dims.y); i -= (z * dims.x * dims.y); size_t y = i / dims.x; size_t x = 
i % dims.x; return vec_t(x, y, z); } template inline vec_t multidim_index_sequence::dimensions() const { return dims; } template inline size_t multidim_index_sequence::total_indices() const { return dims.long_product(); } template multidim_index_iterator multidim_index_sequence::begin() const { return multidim_index_iterator(dims, 0); } template multidim_index_iterator multidim_index_sequence::end() const { return multidim_index_iterator(dims, total_indices()); } // Inlined multidim_index_iterator definitions ////////////////////////////// template inline vec_t multidim_index_iterator::operator*() const { return dims.reshape(current_index); } template inline multidim_index_iterator multidim_index_iterator::operator++() { return multidim_index_iterator(dims.dimensions(), ++current_index); } template inline multidim_index_iterator &multidim_index_iterator::operator++(int) { current_index++; return *this; } template inline multidim_index_iterator multidim_index_iterator::operator--() { return multidim_index_iterator(dims.dimensions(), --current_index); } template inline multidim_index_iterator &multidim_index_iterator::operator--(int) { current_index--; return *this; } template inline multidim_index_iterator &multidim_index_iterator::operator+( const multidim_index_iterator &other) { current_index += other.current_index; return *this; } template inline multidim_index_iterator &multidim_index_iterator::operator-( const multidim_index_iterator &other) { current_index -= other.current_index; return *this; } template inline multidim_index_iterator &multidim_index_iterator::operator+(size_t offset) { current_index += offset; return *this; } template inline multidim_index_iterator &multidim_index_iterator::operator-(size_t offset) { current_index -= offset; return *this; } template inline bool multidim_index_iterator::operator==( const multidim_index_iterator &other) const { return dims.dimensions() == other.dims.dimensions() && current_index == other.current_index; } template inline bool multidim_index_iterator::operator!=( const multidim_index_iterator &other) const { return !(*this == other); } template inline void multidim_index_iterator::jump_to(size_t index) { current_index = index; } template inline size_t multidim_index_iterator::current() const { return current_index; } } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/random.h000066400000000000000000000042211456117377200222110ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include #include "../common.h" #include "../math/vec.h" #include "detail/pcg_random.hpp" namespace rkcommon { namespace utility { class pcg32_biased_float_distribution { public: pcg32_biased_float_distribution(int seed, int sequence, float lower, float upper); float operator()(); private: pcg32 rng; float lower, upper, diff; }; // Inlined pcg32_biased_float_distribution definitions /////////////////// inline pcg32_biased_float_distribution::pcg32_biased_float_distribution( int seed, int sequence, float lower, float upper) : lower(lower), upper(upper) { diff = upper - lower; rng.seed(seed, sequence); } inline float pcg32_biased_float_distribution::operator()() { const unsigned scaleBits = 0x2F800000; // 2^(-32) const float scale = *(float *)&scaleBits; return (scale * rng()) * diff + lower; } // The std::uniform_real_distribution from is not portable and may give // different results on different plaforms/compilers, we have to use our own // implementation for consistency template struct 
uniform_real_distribution { uniform_real_distribution(T lowerValue = 0, T upperValue = 1) : l(lowerValue), u(upperValue) {} template T operator()(G &g) { const T scale = (u - l) / T(g.max() - g.min()); return l + (g() - g.min()) * scale; } private: T l, u; }; inline math::vec3f makeRandomColor(const unsigned int i) { const unsigned int mx = 13 * 17 * 43; const unsigned int my = 11 * 29; const unsigned int mz = 7 * 23 * 63; const unsigned int g = (i * (3 * 5 * 127) + 12312314); return math::vec3f((g % mx) * (1.f / (mx - 1)), (g % my) * (1.f / (my - 1)), (g % mz) * (1.f / (mz - 1))); } } // namespace utility } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/utility/random.ih000066400000000000000000000010061456117377200223600ustar00rootroot00000000000000// Copyright 2022 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once #include "../math/vec.ih" #ifndef ISPC namespace ispc { #endif inline vec3f makeRandomColor(const uint32 i) { const uniform uint32 mx = 13 * 17 * 43; const uniform uint32 my = 11 * 29; const uniform uint32 mz = 7 * 23 * 63; const uint32 g = (i * (3 * 5 * 127) + 12312314); return make_vec3f((g % mx) * (1.f / (mx - 1)), (g % my) * (1.f / (my - 1)), (g % mz) * (1.f / (mz - 1))); } #ifndef ISPC } #endif ospray-rkcommon-538f8a2/rkcommon/version.h.in000066400000000000000000000004361456117377200213240ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #define RKCOMMON_VERSION_MAJOR @PROJECT_VERSION_MAJOR@ #define RKCOMMON_VERSION_MINOR @PROJECT_VERSION_MINOR@ #define RKCOMMON_VERSION_PATCH @PROJECT_VERSION_PATCH@ #define RKCOMMON_VERSION "@PROJECT_VERSION@" ospray-rkcommon-538f8a2/rkcommon/xml/000077500000000000000000000000001456117377200176565ustar00rootroot00000000000000ospray-rkcommon-538f8a2/rkcommon/xml/XML.cpp000066400000000000000000000225241456117377200210270ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #include "XML.h" #include namespace rkcommon { namespace xml { std::string toString(const float f) { std::stringstream ss; ss << f; return ss.str(); } std::string toString(const math::vec3f &v) { std::stringstream ss; ss << v.x << " " << v.y << " " << v.z; return ss.str(); } /*! checks if given node has given property */ bool Node::hasProp(const std::string &propName) const { return (properties.find(propName) != properties.end()); } /*! return value of property with given name if present, else return * 'fallbackValue' */ std::string Node::getProp(const std::string &propName, const std::string &fallbackValue) const { const auto it = properties.find(propName); return (it != properties.end()) ? it->second : fallbackValue; } /*! 
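// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of the original sources: the
// three helpers from rkcommon/utility/random.h shown above. The portable
// uniform_real_distribution works with any generator exposing min(), max()
// and operator() (std::mt19937 is used here purely for the example), and
// makeRandomColor() hashes an integer id to a repeatable debug color.
#include <iostream>
#include <random>
#include "rkcommon/utility/random.h"

int main()
{
  // PCG-backed floats in [10, 20), seeded with (seed, sequence) = (42, 0).
  rkcommon::utility::pcg32_biased_float_distribution pcgDist(42, 0, 10.f, 20.f);
  std::cout << pcgDist() << " " << pcgDist() << "\n";

  // Portable uniform distribution over [0, 1] driven by an external generator.
  std::mt19937 gen(42);
  rkcommon::utility::uniform_real_distribution<float> dist;
  std::cout << dist(gen) << " " << dist(gen) << "\n";

  // Deterministic per-id colors, e.g. for visualizing primitive ids.
  const auto c = rkcommon::utility::makeRandomColor(7u);
  std::cout << c.x << " " << c.y << " " << c.z << "\n";
  return 0;
}
// ---------------------------------------------------------------------------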
return value of property with given name if present; and throw an * exception if not */ std::string Node::getProp(const std::string &propName) const { return getProp(propName, std::string()); } static bool isWhite(char s) { return s == ' ' || s == '\t' || s == '\n' || s == '\r'; } static void expect(char *&s, const char w) { if (*s != w) { std::stringstream err; err << "error reading XML file: expecting '" << w << "', but found '" << *s << "'"; throw std::runtime_error(err.str()); } } static void expect(char *&s, const char w0, const char w1) { if (*s != w0 && *s != w1) { std::stringstream err; err << "error reading XML file: expecting '" << w0 << "' or '" << w1 << "', but found '" << *s << "'"; throw std::runtime_error(err.str()); } } static void consume(char *&s, const char w) { expect(s, w); ++s; } static void consumeComment(char *&s) { consume(s, '<'); consume(s, '!'); while (!((s[0] == 0) || (s[0] == '-' && s[1] == '-' && s[2] == '>'))) ++s; consume(s, '-'); consume(s, '-'); consume(s, '>'); } static void consume(char *&s, const char *word) { const char *in = word; while (*word) { try { consume(s, *word); ++word; } catch (...) { std::stringstream err; err << "error reading XML file: expecting '" << in << "', but could not find it"; throw std::runtime_error(err.str()); } } } static std::string makeString(const char *begin, const char *end) { if (!begin || !end || begin > end) throw std::runtime_error("invalid substring in osp::xml::makeString"); if (begin == end) return ""; char *mem = new char[end - begin + 1]; memcpy(mem, begin, end - begin); mem[end - begin] = 0; std::string s = mem; delete[] mem; return s; } static void parseString(char *&s, std::string &value) { if (*s == '"') { consume(s, '"'); char *begin = s; while (*s != '"') { if (*s == '\\') ++s; ++s; } char *end = s; value = makeString(begin, end); consume(s, '"'); } else { consume(s, '\''); char *begin = s; while (*s != '\'') { if (*s == '\\') ++s; ++s; } char *end = s; value = makeString(begin, end); consume(s, '\''); } } static bool parseIdentifier(char *&s, std::string &identifier) { if (isalpha(*s) || *s == '_') { char *begin = s; ++s; while (isalpha(*s) || isdigit(*s) || *s == '_' || *s == '.') { ++s; } char *end = s; identifier = makeString(begin, end); return true; } return false; } static void skipWhites(char *&s) { while (isWhite(*s)) ++s; } static bool parseProp(char *&s, std::string &name, std::string &value) { if (!parseIdentifier(s, name)) return false; skipWhites(s); consume(s, '='); skipWhites(s); expect(s, '"', '\''); parseString(s, value); return true; } static bool skipComment(char *&s) { if (*s == '<' && s[1] == '!') { consumeComment(s); return true; } return false; } static Node parseNode(char *&s) { consume(s, '<'); Node node; if (!parseIdentifier(s, node.name)) throw std::runtime_error("XML error: could not parse node name"); skipWhites(s); std::string name, value; while (parseProp(s, name, value)) { node.properties[name] = value; skipWhites(s); } if (*s == '/') { consume(s, "/>"); return node; } consume(s, ">"); while (1) { skipWhites(s); if (skipComment(s)) continue; if (*s == '<' && s[1] == '/') { consume(s, ", but ended with '"); } consume(s, ">"); break; // either end of current node } else if (*s == '<') { // child node node.child.push_back(parseNode(s)); } else if (*s == 0) { std::cout << "#osp:xml: warning: xml file ended with still-open" " nodes (this typically indicates a partial xml file)" << std::endl; return node; } else { if (node.content != "") { throw std::runtime_error( "invalid XML 
node - two different" " contents!?"); } // content char *begin = s; while (*s != '<' && *s != 0) ++s; char *end = s; while (isspace(end[-1])) --end; node.content = makeString(begin, end); } } return node; } static bool parseHeader(char *&s) { consume(s, "') { consume(s, "?>"); return true; } if (!isWhite(*s)) return false; ++s; skipWhites(s); std::string name, value; while (parseProp(s, name, value)) { // ignore header prop skipWhites(s); } consume(s, "?>"); return true; } void parseXML(XMLDoc &doc, char *s) { if (s[0] == '<' && s[1] == '?') { if (!parseHeader(s)) throw std::runtime_error("could not parse XML header"); } skipWhites(s); while (*s != 0) { if (skipComment(s)) { skipWhites(s); continue; } doc.child.push_back(parseNode(s)); skipWhites(s); } if (*s != 0) throw std::runtime_error("un-parsed junk at end of file"); } void Writer::spaces() { for (size_t i = 0; i < state.size(); i++) fprintf(xml, " "); } void Writer::writeProperty(const std::string &name, const std::string &value) { assert(xml); assert(!state.empty()); State *s = state.top(); (void)s; assert(s); assert(!s->hasContent); // content may not be written before properties fprintf(xml, " %s=\"%s\"", name.c_str(), value.c_str()); } void Writer::openNode(const std::string &type) { assert(xml); spaces(); fprintf(xml, "<%s", type.c_str()); State *s = new State; s->type = type; state.push(s); } void Writer::closeNode() { assert(xml); assert(!state.empty()); State *s = state.top(); assert(s); if (s->hasContent) fprintf(xml, "", s->type.c_str()); else fprintf(xml, "/>\n"); delete s; state.pop(); } XMLDoc readXML(const std::string &fn) { FILE *file = fopen(fn.c_str(), "r"); if (!file) { throw std::runtime_error("ospray::XML error: could not open file '" + fn + "'"); } fseek(file, 0, SEEK_END); ssize_t numBytes = #ifdef _WIN32 _ftelli64(file); #else ftell(file); #endif fseek(file, 0, SEEK_SET); std::vector mem(numBytes + 1, 0); try { auto rc = fread(mem.data(), 1, numBytes, file); (void)rc; XMLDoc doc; doc.fileName = fn; parseXML(doc, mem.data()); fclose(file); return doc; } catch (const std::runtime_error &e) { fclose(file); throw e; } } Writer::Writer(FILE *xml, FILE *bin) : xml(xml), bin(bin) {} /*! write document header, may only be called once */ void Writer::writeHeader(const std::string &version) { assert(xml); fprintf(xml, "\n", version.c_str()); } /*! write document footer. may only be called once, at end of write */ void Writer::writeFooter() { assert(xml); } } // namespace xml } // namespace rkcommon ospray-rkcommon-538f8a2/rkcommon/xml/XML.h000066400000000000000000000062151456117377200204730ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 #pragma once // ospcomon #include "../common.h" #include "../math/vec.h" #include "../os/FileName.h" // stl #include #include #include #include namespace rkcommon { namespace xml { struct Node; struct XMLDoc; /*! a XML node, consisting of a name, a list of properties, and a set of child nodes */ struct RKCOMMON_INTERFACE Node { Node() = default; ~Node() = default; /*! checks if given node has given property */ bool hasProp(const std::string &name) const; /*! return value of property with given name if present; and throw an * exception if not */ std::string getProp(const std::string &name) const; /*! return value of property with given name if present, else return * 'fallbackValue' */ std::string getProp(const std::string &name, const std::string &fallbackValue) const; /*! 
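// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of the original sources: the
// public entry points implemented above are readXML() for parsing and the
// Writer helper for emitting documents. This sketch writes a tiny file with
// Writer (no binary payload, so the second FILE* is left null) and parses it
// back. Note that the single-argument Node::getProp() overload returns an
// empty string when the property is absent (see its implementation above),
// so hasProp() is used to distinguish missing from empty.
#include <cstdio>
#include <iostream>
#include <stdexcept>
#include "rkcommon/xml/XML.h"

int main()
{
  // Emit <camera type="perspective" fovy="60"/> preceded by an XML header.
  FILE *out = std::fopen("example.xml", "w");
  if (!out)
    return 1;
  {
    rkcommon::xml::Writer writer(out, /*bin=*/nullptr);
    writer.writeHeader("1.0");
    writer.openNode("camera");
    writer.writeProperty("type", "perspective");
    writer.writeProperty("fovy", "60");
    writer.closeNode();  // no content was written, so the element self-closes
    writer.writeFooter();
  }
  std::fclose(out);

  // Parse the file back and walk the top-level nodes.
  try {
    const auto doc = rkcommon::xml::readXML("example.xml");
    for (const auto &node : doc.child) {
      std::cout << node.name << "\n";
      if (node.hasProp("fovy"))
        std::cout << "  fovy = " << node.getProp("fovy") << "\n";
      std::cout << "  near = " << node.getProp("near", "0.1") << "\n";
    }
  } catch (const std::runtime_error &e) {
    std::cerr << "parse failed: " << e.what() << "\n";
    return 1;
  }
  return 0;
}
// ---------------------------------------------------------------------------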
name of the xml node (i.e., the thing that's in "....") */ std::string name; /*! the content string, i.e., the thing that's between "..." and "..." */ std::string content; /*! \brief list of xml node properties properties. ' \detailed prop'erties in xml nodes are the 'name="value"' inside the ... description */ std::map properties; /*! list of child nodes */ std::vector child; }; /*! a entire xml document */ struct RKCOMMON_INTERFACE XMLDoc : public Node { XMLDoc() = default; ~XMLDoc() = default; FileName fileName; }; /*! parse an XML file with given file name, and return a pointer to it. In case of any error, this function will free all already-allocated data, and throw a std::runtime_error exception */ RKCOMMON_INTERFACE XMLDoc readXML(const std::string &fn); /*! helper class for writing sg nodes in XML format */ struct Writer { Writer(FILE *xml, FILE *bin); /*! write document header, may only be called once */ void writeHeader(const std::string &version); /*! write document footer. may only be called once, at end of write */ void writeFooter(); //! open a new xml node with given node type */ void openNode(const std::string &type); void writeProperty(const std::string &name, const std::string &value); void writeContent(const std::string &name, const std::string &value); //! close last open node type */ void closeNode(); /*! align output pos on binary file to given alignment */ void alignData(size_t alignment); /*! write given data into data file, and return offset value at which it was written */ size_t writeData(const void *ptr, size_t size); FILE *xml, *bin; private: struct State { bool hasContent{false}; std::string type; }; void spaces(); std::stack state; }; } // namespace xml } // namespace rkcommon ospray-rkcommon-538f8a2/tests/000077500000000000000000000000001456117377200163735ustar00rootroot00000000000000ospray-rkcommon-538f8a2/tests/CMakeLists.txt000066400000000000000000000057251456117377200211440ustar00rootroot00000000000000## Copyright 2009 Intel Corporation ## SPDX-License-Identifier: Apache-2.0 add_executable(rkcommon_test_suite ${RKCOMMON_RESOURCE} catch_main.cpp array3D/test_Array3D.cpp array3D/test_for_each.cpp math/test_AffineSpace.cpp math/test_box.cpp math/test_constants.cpp math/test_LinearSpace.cpp math/test_rkmath.cpp math/test_Quaternion.cpp math/test_range.cpp math/test_vec.cpp memory/test_DeletedUniquePtr.cpp memory/test_malloc.cpp memory/test_RefCount.cpp os/test_FileName.cpp os/test_library.cpp containers/test_AlignedVector.cpp containers/test_FlatMap.cpp containers/test_TransactionalBuffer.cpp tasking/test_async.cpp tasking/test_AsyncLoop.cpp tasking/test_AsyncTask.cpp tasking/test_parallel_for.cpp tasking/test_parallel_foreach.cpp tasking/test_schedule.cpp traits/test_traits.cpp utility/test_AbstractArray.cpp utility/test_Any.cpp utility/test_ArgumentList.cpp utility/test_ArrayView.cpp utility/test_CodeTimer.cpp utility/test_DataView.cpp utility/test_demangle.cpp utility/test_DoubleBufferedValue.cpp utility/test_getEnvVar.cpp utility/test_multidim_index_sequence.cpp utility/test_Observers.cpp utility/test_OnScopeExit.cpp utility/test_Optional.cpp utility/test_OwnedArray.cpp utility/test_ParameterizedObject.cpp utility/test_PseudoURL.cpp utility/test_random.cpp utility/test_SaveImage.cpp utility/test_StringManip.cpp utility/test_TimeStamp.cpp utility/test_TransactionalValue.cpp ) target_link_libraries(rkcommon_test_suite PRIVATE rkcommon) add_test(NAME ArgumentList COMMAND rkcommon_test_suite "[ArgumentList]") add_test(NAME ArrayView COMMAND 
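# -----------------------------------------------------------------------------
# [editor's note] Illustrative sketch, not part of the original file: every
# add_test() entry in this CMakeLists runs the single rkcommon_test_suite
# binary and passes a Catch tag, so ctest can execute one group of cases at a
# time. A new test would follow the same pattern ("MyFeature" is a
# placeholder): list its source in add_executable() above, tag its TEST_CASEs
# with "[MyFeature]", and register
#   add_test(NAME MyFeature COMMAND rkcommon_test_suite "[MyFeature]")
# -----------------------------------------------------------------------------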
rkcommon_test_suite "[ArrayView]") add_test(NAME OnScopeExit COMMAND rkcommon_test_suite "[OnScopeExit]") add_test(NAME Optional COMMAND rkcommon_test_suite "[Optional]") add_test(NAME FlatMap COMMAND rkcommon_test_suite "[FlatMap]") add_test(NAME TransactionalBuffer COMMAND rkcommon_test_suite "[TransactionalBuffer]") add_test(NAME StringManip COMMAND rkcommon_test_suite "[StringManip]") add_test(NAME random COMMAND rkcommon_test_suite "[random]") if(NOT WIN32) # Tests which are broken on Windows with unknown fixes (for now) add_test(NAME Any COMMAND rkcommon_test_suite "[Any]") add_test(NAME AlignedVector COMMAND rkcommon_test_suite "[AlignedVector]") add_test(NAME Observers COMMAND rkcommon_test_suite "[Observers]") add_test(NAME ParameterizedObject COMMAND rkcommon_test_suite "[ParameterizedObject]") add_test(NAME async COMMAND rkcommon_test_suite "[async]") add_test(NAME parallel_for COMMAND rkcommon_test_suite "[parallel_for]") add_test(NAME parallel_foreach COMMAND rkcommon_test_suite "[parallel_foreach]") add_test(NAME schedule COMMAND rkcommon_test_suite "[schedule]") endif() install(TARGETS rkcommon_test_suite RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} ) ospray-rkcommon-538f8a2/tests/array3D/000077500000000000000000000000001456117377200177005ustar00rootroot00000000000000ospray-rkcommon-538f8a2/tests/array3D/test_Array3D.cpp000066400000000000000000000002461456117377200227120ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // only test compliation, no functional tests (yet) #include "rkcommon/array3D/Array3D.h" ospray-rkcommon-538f8a2/tests/array3D/test_for_each.cpp000066400000000000000000000002471456117377200232140ustar00rootroot00000000000000// Copyright 2009 Intel Corporation // SPDX-License-Identifier: Apache-2.0 // only test compliation, no functional tests (yet) #include "rkcommon/array3D/for_each.h" ospray-rkcommon-538f8a2/tests/catch.hpp000066400000000000000000024040031456117377200201710ustar00rootroot00000000000000/* * Catch v2.13.10 * Generated: 2022-10-16 11:01:23.452308 * ---------------------------------------------------------- * This file has been merged from multiple headers. Please don't edit it directly * Copyright (c) 2022 Two Blue Cubes Ltd. All rights reserved. * * Distributed under the Boost Software License, Version 1.0. (See accompanying * file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt) */ #ifndef TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED #define TWOBLUECUBES_SINGLE_INCLUDE_CATCH_HPP_INCLUDED // start catch.hpp #define CATCH_VERSION_MAJOR 2 #define CATCH_VERSION_MINOR 13 #define CATCH_VERSION_PATCH 10 #ifdef __clang__ # pragma clang system_header #elif defined __GNUC__ # pragma GCC system_header #endif // start catch_suppress_warnings.h #ifdef __clang__ # ifdef __ICC // icpc defines the __clang__ macro # pragma warning(push) # pragma warning(disable: 161 1682) # else // __ICC # pragma clang diagnostic push # pragma clang diagnostic ignored "-Wpadded" # pragma clang diagnostic ignored "-Wswitch-enum" # pragma clang diagnostic ignored "-Wcovered-switch-default" # endif #elif defined __GNUC__ // Because REQUIREs trigger GCC's -Wparentheses, and because still // supported version of g++ have only buggy support for _Pragmas, // Wparentheses have to be suppressed globally. 
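// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of catch.hpp: this vendored
// single-header Catch2 (v2.13.10) is what the rkcommon test sources include.
// A test file in the suite looks roughly like the following ("MyFeature" and
// the relative include path are placeholders; the real files live under
// tests/<component>/ and use the tags referenced by add_test() in
// tests/CMakeLists.txt):
//
//   #include "../catch.hpp"
//
//   TEST_CASE("my feature behaves as expected", "[MyFeature]")
//   {
//     REQUIRE(1 + 1 == 2);
//     CHECK(4 % 2 == 0);
//   }
// ---------------------------------------------------------------------------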
# pragma GCC diagnostic ignored "-Wparentheses" // See #674 for details # pragma GCC diagnostic push # pragma GCC diagnostic ignored "-Wunused-variable" # pragma GCC diagnostic ignored "-Wpadded" #endif // end catch_suppress_warnings.h #if defined(CATCH_CONFIG_MAIN) || defined(CATCH_CONFIG_RUNNER) # define CATCH_IMPL # define CATCH_CONFIG_ALL_PARTS #endif // In the impl file, we want to have access to all parts of the headers // Can also be used to sanely support PCHs #if defined(CATCH_CONFIG_ALL_PARTS) # define CATCH_CONFIG_EXTERNAL_INTERFACES # if defined(CATCH_CONFIG_DISABLE_MATCHERS) # undef CATCH_CONFIG_DISABLE_MATCHERS # endif # if !defined(CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER) # define CATCH_CONFIG_ENABLE_CHRONO_STRINGMAKER # endif #endif #if !defined(CATCH_CONFIG_IMPL_ONLY) // start catch_platform.h // See e.g.: // https://opensource.apple.com/source/CarbonHeaders/CarbonHeaders-18.1/TargetConditionals.h.auto.html #ifdef __APPLE__ # include # if (defined(TARGET_OS_OSX) && TARGET_OS_OSX == 1) || \ (defined(TARGET_OS_MAC) && TARGET_OS_MAC == 1) # define CATCH_PLATFORM_MAC # elif (defined(TARGET_OS_IPHONE) && TARGET_OS_IPHONE == 1) # define CATCH_PLATFORM_IPHONE # endif #elif defined(linux) || defined(__linux) || defined(__linux__) # define CATCH_PLATFORM_LINUX #elif defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || defined(_MSC_VER) || defined(__MINGW32__) # define CATCH_PLATFORM_WINDOWS #endif // end catch_platform.h #ifdef CATCH_IMPL # ifndef CLARA_CONFIG_MAIN # define CLARA_CONFIG_MAIN_NOT_DEFINED # define CLARA_CONFIG_MAIN # endif #endif // start catch_user_interfaces.h namespace Catch { unsigned int rngSeed(); } // end catch_user_interfaces.h // start catch_tag_alias_autoregistrar.h // start catch_common.h // start catch_compiler_capabilities.h // Detect a number of compiler features - by compiler // The following features are defined: // // CATCH_CONFIG_COUNTER : is the __COUNTER__ macro supported? // CATCH_CONFIG_WINDOWS_SEH : is Windows SEH supported? // CATCH_CONFIG_POSIX_SIGNALS : are POSIX signals supported? // CATCH_CONFIG_DISABLE_EXCEPTIONS : Are exceptions enabled? // **************** // Note to maintainers: if new toggles are added please document them // in configuration.md, too // **************** // In general each macro has a _NO_ form // (e.g. CATCH_CONFIG_NO_POSIX_SIGNALS) which disables the feature. // Many features, at point of detection, define an _INTERNAL_ macro, so they // can be combined, en-mass, with the _NO_ forms later. #ifdef __cplusplus # if (__cplusplus >= 201402L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201402L) # define CATCH_CPP14_OR_GREATER # endif # if (__cplusplus >= 201703L) || (defined(_MSVC_LANG) && _MSVC_LANG >= 201703L) # define CATCH_CPP17_OR_GREATER # endif #endif // Only GCC compiler should be used in this block, so other compilers trying to // mask themselves as GCC should be ignored. #if defined(__GNUC__) && !defined(__clang__) && !defined(__ICC) && !defined(__CUDACC__) && !defined(__LCC__) # define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic push" ) # define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "GCC diagnostic pop" ) # define CATCH_INTERNAL_IGNORE_BUT_WARN(...) 
(void)__builtin_constant_p(__VA_ARGS__) #endif #if defined(__clang__) # define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic push" ) # define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION _Pragma( "clang diagnostic pop" ) // As of this writing, IBM XL's implementation of __builtin_constant_p has a bug // which results in calls to destructors being emitted for each temporary, // without a matching initialization. In practice, this can result in something // like `std::string::~string` being called on an uninitialized value. // // For example, this code will likely segfault under IBM XL: // ``` // REQUIRE(std::string("12") + "34" == "1234") // ``` // // Therefore, `CATCH_INTERNAL_IGNORE_BUT_WARN` is not implemented. # if !defined(__ibmxl__) && !defined(__CUDACC__) # define CATCH_INTERNAL_IGNORE_BUT_WARN(...) (void)__builtin_constant_p(__VA_ARGS__) /* NOLINT(cppcoreguidelines-pro-type-vararg, hicpp-vararg) */ # endif # define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wexit-time-destructors\"" ) \ _Pragma( "clang diagnostic ignored \"-Wglobal-constructors\"") # define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wparentheses\"" ) # define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wunused-variable\"" ) # define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wgnu-zero-variadic-macro-arguments\"" ) # define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS \ _Pragma( "clang diagnostic ignored \"-Wunused-template\"" ) #endif // __clang__ //////////////////////////////////////////////////////////////////////////////// // Assume that non-Windows platforms support posix signals by default #if !defined(CATCH_PLATFORM_WINDOWS) #define CATCH_INTERNAL_CONFIG_POSIX_SIGNALS #endif //////////////////////////////////////////////////////////////////////////////// // We know some environments not to support full POSIX signals #if defined(__CYGWIN__) || defined(__QNX__) || defined(__EMSCRIPTEN__) || defined(__DJGPP__) #define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS #endif #ifdef __OS400__ # define CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS # define CATCH_CONFIG_COLOUR_NONE #endif //////////////////////////////////////////////////////////////////////////////// // Android somehow still does not support std::to_string #if defined(__ANDROID__) # define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING # define CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE #endif //////////////////////////////////////////////////////////////////////////////// // Not all Windows environments support SEH properly #if defined(__MINGW32__) # define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH #endif //////////////////////////////////////////////////////////////////////////////// // PS4 #if defined(__ORBIS__) # define CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE #endif //////////////////////////////////////////////////////////////////////////////// // Cygwin #ifdef __CYGWIN__ // Required for some versions of Cygwin to declare gettimeofday // see: http://stackoverflow.com/questions/36901803/gettimeofday-not-declared-in-this-scope-cygwin # define _BSD_SOURCE // some versions of cygwin (most) do not support std::to_string. Use the libstd check. 
// https://gcc.gnu.org/onlinedocs/gcc-4.8.2/libstdc++/api/a01053_source.html line 2812-2813 # if !((__cplusplus >= 201103L) && defined(_GLIBCXX_USE_C99) \ && !defined(_GLIBCXX_HAVE_BROKEN_VSWPRINTF)) # define CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING # endif #endif // __CYGWIN__ //////////////////////////////////////////////////////////////////////////////// // Visual C++ #if defined(_MSC_VER) // Universal Windows platform does not support SEH // Or console colours (or console at all...) # if defined(WINAPI_FAMILY) && (WINAPI_FAMILY == WINAPI_FAMILY_APP) # define CATCH_CONFIG_COLOUR_NONE # else # define CATCH_INTERNAL_CONFIG_WINDOWS_SEH # endif # if !defined(__clang__) // Handle Clang masquerading for msvc // MSVC traditional preprocessor needs some workaround for __VA_ARGS__ // _MSVC_TRADITIONAL == 0 means new conformant preprocessor // _MSVC_TRADITIONAL == 1 means old traditional non-conformant preprocessor # if !defined(_MSVC_TRADITIONAL) || (defined(_MSVC_TRADITIONAL) && _MSVC_TRADITIONAL) # define CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR # endif // MSVC_TRADITIONAL // Only do this if we're not using clang on Windows, which uses `diagnostic push` & `diagnostic pop` # define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION __pragma( warning(push) ) # define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION __pragma( warning(pop) ) # endif // __clang__ #endif // _MSC_VER #if defined(_REENTRANT) || defined(_MSC_VER) // Enable async processing, as -pthread is specified or no additional linking is required # define CATCH_INTERNAL_CONFIG_USE_ASYNC #endif // _MSC_VER //////////////////////////////////////////////////////////////////////////////// // Check if we are compiled with -fno-exceptions or equivalent #if defined(__EXCEPTIONS) || defined(__cpp_exceptions) || defined(_CPPUNWIND) # define CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED #endif //////////////////////////////////////////////////////////////////////////////// // DJGPP #ifdef __DJGPP__ # define CATCH_INTERNAL_CONFIG_NO_WCHAR #endif // __DJGPP__ //////////////////////////////////////////////////////////////////////////////// // Embarcadero C++Build #if defined(__BORLANDC__) #define CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN #endif //////////////////////////////////////////////////////////////////////////////// // Use of __COUNTER__ is suppressed during code analysis in // CLion/AppCode 2017.2.x and former, because __COUNTER__ is not properly // handled by it. // Otherwise all supported compilers support COUNTER macro, // but user still might want to turn it off #if ( !defined(__JETBRAINS_IDE__) || __JETBRAINS_IDE__ >= 20170300L ) #define CATCH_INTERNAL_CONFIG_COUNTER #endif //////////////////////////////////////////////////////////////////////////////// // RTX is a special version of Windows that is real time. // This means that it is detected as Windows, but does not provide // the same set of capabilities as real Windows does. 
#if defined(UNDER_RTSS) || defined(RTX64_BUILD) #define CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH #define CATCH_INTERNAL_CONFIG_NO_ASYNC #define CATCH_CONFIG_COLOUR_NONE #endif #if !defined(_GLIBCXX_USE_C99_MATH_TR1) #define CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER #endif // Various stdlib support checks that require __has_include #if defined(__has_include) // Check if string_view is available and usable #if __has_include() && defined(CATCH_CPP17_OR_GREATER) # define CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW #endif // Check if optional is available and usable # if __has_include() && defined(CATCH_CPP17_OR_GREATER) # define CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) // Check if byte is available and usable # if __has_include() && defined(CATCH_CPP17_OR_GREATER) # include # if defined(__cpp_lib_byte) && (__cpp_lib_byte > 0) # define CATCH_INTERNAL_CONFIG_CPP17_BYTE # endif # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) // Check if variant is available and usable # if __has_include() && defined(CATCH_CPP17_OR_GREATER) # if defined(__clang__) && (__clang_major__ < 8) // work around clang bug with libstdc++ https://bugs.llvm.org/show_bug.cgi?id=31852 // fix should be in clang 8, workaround in libstdc++ 8.2 # include # if defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE < 9) # define CATCH_CONFIG_NO_CPP17_VARIANT # else # define CATCH_INTERNAL_CONFIG_CPP17_VARIANT # endif // defined(__GLIBCXX__) && defined(_GLIBCXX_RELEASE) && (_GLIBCXX_RELEASE < 9) # else # define CATCH_INTERNAL_CONFIG_CPP17_VARIANT # endif // defined(__clang__) && (__clang_major__ < 8) # endif // __has_include() && defined(CATCH_CPP17_OR_GREATER) #endif // defined(__has_include) #if defined(CATCH_INTERNAL_CONFIG_COUNTER) && !defined(CATCH_CONFIG_NO_COUNTER) && !defined(CATCH_CONFIG_COUNTER) # define CATCH_CONFIG_COUNTER #endif #if defined(CATCH_INTERNAL_CONFIG_WINDOWS_SEH) && !defined(CATCH_CONFIG_NO_WINDOWS_SEH) && !defined(CATCH_CONFIG_WINDOWS_SEH) && !defined(CATCH_INTERNAL_CONFIG_NO_WINDOWS_SEH) # define CATCH_CONFIG_WINDOWS_SEH #endif // This is set by default, because we assume that unix compilers are posix-signal-compatible by default. #if defined(CATCH_INTERNAL_CONFIG_POSIX_SIGNALS) && !defined(CATCH_INTERNAL_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_NO_POSIX_SIGNALS) && !defined(CATCH_CONFIG_POSIX_SIGNALS) # define CATCH_CONFIG_POSIX_SIGNALS #endif // This is set by default, because we assume that compilers with no wchar_t support are just rare exceptions. 
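// ---------------------------------------------------------------------------
// [editor's note] Illustrative sketch, not part of catch.hpp: the detection
// macros in this section pair each CATCH_INTERNAL_CONFIG_* probe with a
// user-facing CATCH_CONFIG_NO_* override, so a build that must opt out of an
// auto-detected feature defines the _NO_ form before including the header.
// For example, in the translation unit that provides the test main:
//
//   #define CATCH_CONFIG_NO_POSIX_SIGNALS  // do not install signal handlers
//   #define CATCH_CONFIG_NO_COUNTER        // fall back to __LINE__ for names
//   #define CATCH_CONFIG_MAIN
//   #include "catch.hpp"
// ---------------------------------------------------------------------------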
#if !defined(CATCH_INTERNAL_CONFIG_NO_WCHAR) && !defined(CATCH_CONFIG_NO_WCHAR) && !defined(CATCH_CONFIG_WCHAR) # define CATCH_CONFIG_WCHAR #endif #if !defined(CATCH_INTERNAL_CONFIG_NO_CPP11_TO_STRING) && !defined(CATCH_CONFIG_NO_CPP11_TO_STRING) && !defined(CATCH_CONFIG_CPP11_TO_STRING) # define CATCH_CONFIG_CPP11_TO_STRING #endif #if defined(CATCH_INTERNAL_CONFIG_CPP17_OPTIONAL) && !defined(CATCH_CONFIG_NO_CPP17_OPTIONAL) && !defined(CATCH_CONFIG_CPP17_OPTIONAL) # define CATCH_CONFIG_CPP17_OPTIONAL #endif #if defined(CATCH_INTERNAL_CONFIG_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_NO_CPP17_STRING_VIEW) && !defined(CATCH_CONFIG_CPP17_STRING_VIEW) # define CATCH_CONFIG_CPP17_STRING_VIEW #endif #if defined(CATCH_INTERNAL_CONFIG_CPP17_VARIANT) && !defined(CATCH_CONFIG_NO_CPP17_VARIANT) && !defined(CATCH_CONFIG_CPP17_VARIANT) # define CATCH_CONFIG_CPP17_VARIANT #endif #if defined(CATCH_INTERNAL_CONFIG_CPP17_BYTE) && !defined(CATCH_CONFIG_NO_CPP17_BYTE) && !defined(CATCH_CONFIG_CPP17_BYTE) # define CATCH_CONFIG_CPP17_BYTE #endif #if defined(CATCH_CONFIG_EXPERIMENTAL_REDIRECT) # define CATCH_INTERNAL_CONFIG_NEW_CAPTURE #endif #if defined(CATCH_INTERNAL_CONFIG_NEW_CAPTURE) && !defined(CATCH_INTERNAL_CONFIG_NO_NEW_CAPTURE) && !defined(CATCH_CONFIG_NO_NEW_CAPTURE) && !defined(CATCH_CONFIG_NEW_CAPTURE) # define CATCH_CONFIG_NEW_CAPTURE #endif #if !defined(CATCH_INTERNAL_CONFIG_EXCEPTIONS_ENABLED) && !defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) # define CATCH_CONFIG_DISABLE_EXCEPTIONS #endif #if defined(CATCH_INTERNAL_CONFIG_POLYFILL_ISNAN) && !defined(CATCH_CONFIG_NO_POLYFILL_ISNAN) && !defined(CATCH_CONFIG_POLYFILL_ISNAN) # define CATCH_CONFIG_POLYFILL_ISNAN #endif #if defined(CATCH_INTERNAL_CONFIG_USE_ASYNC) && !defined(CATCH_INTERNAL_CONFIG_NO_ASYNC) && !defined(CATCH_CONFIG_NO_USE_ASYNC) && !defined(CATCH_CONFIG_USE_ASYNC) # define CATCH_CONFIG_USE_ASYNC #endif #if defined(CATCH_INTERNAL_CONFIG_ANDROID_LOGWRITE) && !defined(CATCH_CONFIG_NO_ANDROID_LOGWRITE) && !defined(CATCH_CONFIG_ANDROID_LOGWRITE) # define CATCH_CONFIG_ANDROID_LOGWRITE #endif #if defined(CATCH_INTERNAL_CONFIG_GLOBAL_NEXTAFTER) && !defined(CATCH_CONFIG_NO_GLOBAL_NEXTAFTER) && !defined(CATCH_CONFIG_GLOBAL_NEXTAFTER) # define CATCH_CONFIG_GLOBAL_NEXTAFTER #endif // Even if we do not think the compiler has that warning, we still have // to provide a macro that can be used by the code. #if !defined(CATCH_INTERNAL_START_WARNINGS_SUPPRESSION) # define CATCH_INTERNAL_START_WARNINGS_SUPPRESSION #endif #if !defined(CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION) # define CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION #endif #if !defined(CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_PARENTHESES_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_UNUSED_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_ZERO_VARIADIC_WARNINGS #endif // The goal of this macro is to avoid evaluation of the arguments, but // still have the compiler warn on problems inside... #if !defined(CATCH_INTERNAL_IGNORE_BUT_WARN) # define CATCH_INTERNAL_IGNORE_BUT_WARN(...) 
#endif #if defined(__APPLE__) && defined(__apple_build_version__) && (__clang_major__ < 10) # undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS #elif defined(__clang__) && (__clang_major__ < 5) # undef CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif #if !defined(CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS) # define CATCH_INTERNAL_SUPPRESS_UNUSED_TEMPLATE_WARNINGS #endif #if defined(CATCH_CONFIG_DISABLE_EXCEPTIONS) #define CATCH_TRY if ((true)) #define CATCH_CATCH_ALL if ((false)) #define CATCH_CATCH_ANON(type) if ((false)) #else #define CATCH_TRY try #define CATCH_CATCH_ALL catch (...) #define CATCH_CATCH_ANON(type) catch (type) #endif #if defined(CATCH_INTERNAL_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR) && !defined(CATCH_CONFIG_NO_TRADITIONAL_MSVC_PREPROCESSOR) && !defined(CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR) #define CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #endif // end catch_compiler_capabilities.h #define INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) name##line #define INTERNAL_CATCH_UNIQUE_NAME_LINE( name, line ) INTERNAL_CATCH_UNIQUE_NAME_LINE2( name, line ) #ifdef CATCH_CONFIG_COUNTER # define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __COUNTER__ ) #else # define INTERNAL_CATCH_UNIQUE_NAME( name ) INTERNAL_CATCH_UNIQUE_NAME_LINE( name, __LINE__ ) #endif #include #include #include // We need a dummy global operator<< so we can bring it into Catch namespace later struct Catch_global_namespace_dummy {}; std::ostream& operator<<(std::ostream&, Catch_global_namespace_dummy); namespace Catch { struct CaseSensitive { enum Choice { Yes, No }; }; class NonCopyable { NonCopyable( NonCopyable const& ) = delete; NonCopyable( NonCopyable && ) = delete; NonCopyable& operator = ( NonCopyable const& ) = delete; NonCopyable& operator = ( NonCopyable && ) = delete; protected: NonCopyable(); virtual ~NonCopyable(); }; struct SourceLineInfo { SourceLineInfo() = delete; SourceLineInfo( char const* _file, std::size_t _line ) noexcept : file( _file ), line( _line ) {} SourceLineInfo( SourceLineInfo const& other ) = default; SourceLineInfo& operator = ( SourceLineInfo const& ) = default; SourceLineInfo( SourceLineInfo&& ) noexcept = default; SourceLineInfo& operator = ( SourceLineInfo&& ) noexcept = default; bool empty() const noexcept { return file[0] == '\0'; } bool operator == ( SourceLineInfo const& other ) const noexcept; bool operator < ( SourceLineInfo const& other ) const noexcept; char const* file; std::size_t line; }; std::ostream& operator << ( std::ostream& os, SourceLineInfo const& info ); // Bring in operator<< from global namespace into Catch namespace // This is necessary because the overload of operator<< above makes // lookup stop at namespace Catch using ::operator<<; // Use this in variadic streaming macros to allow // >> +StreamEndStop // as well as // >> stuff +StreamEndStop struct StreamEndStop { std::string operator+() const; }; template T const& operator + ( T const& value, StreamEndStop ) { return value; } } #define CATCH_INTERNAL_LINEINFO \ ::Catch::SourceLineInfo( __FILE__, static_cast( __LINE__ ) ) // end catch_common.h namespace Catch { struct RegistrarForTagAliases { RegistrarForTagAliases( char const* alias, char const* tag, SourceLineInfo const& lineInfo ); }; } // end namespace Catch #define CATCH_REGISTER_TAG_ALIAS( alias, spec ) \ CATCH_INTERNAL_START_WARNINGS_SUPPRESSION \ CATCH_INTERNAL_SUPPRESS_GLOBALS_WARNINGS \ namespace{ Catch::RegistrarForTagAliases INTERNAL_CATCH_UNIQUE_NAME( AutoRegisterTagAlias )( alias, 
spec, CATCH_INTERNAL_LINEINFO ); } \ CATCH_INTERNAL_STOP_WARNINGS_SUPPRESSION // end catch_tag_alias_autoregistrar.h // start catch_test_registry.h // start catch_interfaces_testcase.h #include namespace Catch { class TestSpec; struct ITestInvoker { virtual void invoke () const = 0; virtual ~ITestInvoker(); }; class TestCase; struct IConfig; struct ITestCaseRegistry { virtual ~ITestCaseRegistry(); virtual std::vector const& getAllTests() const = 0; virtual std::vector const& getAllTestsSorted( IConfig const& config ) const = 0; }; bool isThrowSafe( TestCase const& testCase, IConfig const& config ); bool matchTest( TestCase const& testCase, TestSpec const& testSpec, IConfig const& config ); std::vector filterTests( std::vector const& testCases, TestSpec const& testSpec, IConfig const& config ); std::vector const& getAllTestCasesSorted( IConfig const& config ); } // end catch_interfaces_testcase.h // start catch_stringref.h #include #include #include #include namespace Catch { /// A non-owning string class (similar to the forthcoming std::string_view) /// Note that, because a StringRef may be a substring of another string, /// it may not be null terminated. class StringRef { public: using size_type = std::size_t; using const_iterator = const char*; private: static constexpr char const* const s_empty = ""; char const* m_start = s_empty; size_type m_size = 0; public: // construction constexpr StringRef() noexcept = default; StringRef( char const* rawChars ) noexcept; constexpr StringRef( char const* rawChars, size_type size ) noexcept : m_start( rawChars ), m_size( size ) {} StringRef( std::string const& stdString ) noexcept : m_start( stdString.c_str() ), m_size( stdString.size() ) {} explicit operator std::string() const { return std::string(m_start, m_size); } public: // operators auto operator == ( StringRef const& other ) const noexcept -> bool; auto operator != (StringRef const& other) const noexcept -> bool { return !(*this == other); } auto operator[] ( size_type index ) const noexcept -> char { assert(index < m_size); return m_start[index]; } public: // named queries constexpr auto empty() const noexcept -> bool { return m_size == 0; } constexpr auto size() const noexcept -> size_type { return m_size; } // Returns the current start pointer. If the StringRef is not // null-terminated, throws std::domain_exception auto c_str() const -> char const*; public: // substrings and searches // Returns a substring of [start, start + length). // If start + length > size(), then the substring is [start, size()). // If start > size(), then the substring is empty. auto substr( size_type start, size_type length ) const noexcept -> StringRef; // Returns the current start pointer. May not be null-terminated. 
auto data() const noexcept -> char const*; constexpr auto isNullTerminated() const noexcept -> bool { return m_start[m_size] == '\0'; } public: // iterators constexpr const_iterator begin() const { return m_start; } constexpr const_iterator end() const { return m_start + m_size; } }; auto operator += ( std::string& lhs, StringRef const& sr ) -> std::string&; auto operator << ( std::ostream& os, StringRef const& sr ) -> std::ostream&; constexpr auto operator "" _sr( char const* rawChars, std::size_t size ) noexcept -> StringRef { return StringRef( rawChars, size ); } } // namespace Catch constexpr auto operator "" _catch_sr( char const* rawChars, std::size_t size ) noexcept -> Catch::StringRef { return Catch::StringRef( rawChars, size ); } // end catch_stringref.h // start catch_preprocessor.hpp #define CATCH_RECURSION_LEVEL0(...) __VA_ARGS__ #define CATCH_RECURSION_LEVEL1(...) CATCH_RECURSION_LEVEL0(CATCH_RECURSION_LEVEL0(CATCH_RECURSION_LEVEL0(__VA_ARGS__))) #define CATCH_RECURSION_LEVEL2(...) CATCH_RECURSION_LEVEL1(CATCH_RECURSION_LEVEL1(CATCH_RECURSION_LEVEL1(__VA_ARGS__))) #define CATCH_RECURSION_LEVEL3(...) CATCH_RECURSION_LEVEL2(CATCH_RECURSION_LEVEL2(CATCH_RECURSION_LEVEL2(__VA_ARGS__))) #define CATCH_RECURSION_LEVEL4(...) CATCH_RECURSION_LEVEL3(CATCH_RECURSION_LEVEL3(CATCH_RECURSION_LEVEL3(__VA_ARGS__))) #define CATCH_RECURSION_LEVEL5(...) CATCH_RECURSION_LEVEL4(CATCH_RECURSION_LEVEL4(CATCH_RECURSION_LEVEL4(__VA_ARGS__))) #ifdef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_EXPAND_VARGS(...) __VA_ARGS__ // MSVC needs more evaluations #define CATCH_RECURSION_LEVEL6(...) CATCH_RECURSION_LEVEL5(CATCH_RECURSION_LEVEL5(CATCH_RECURSION_LEVEL5(__VA_ARGS__))) #define CATCH_RECURSE(...) CATCH_RECURSION_LEVEL6(CATCH_RECURSION_LEVEL6(__VA_ARGS__)) #else #define CATCH_RECURSE(...) CATCH_RECURSION_LEVEL5(__VA_ARGS__) #endif #define CATCH_REC_END(...) #define CATCH_REC_OUT #define CATCH_EMPTY() #define CATCH_DEFER(id) id CATCH_EMPTY() #define CATCH_REC_GET_END2() 0, CATCH_REC_END #define CATCH_REC_GET_END1(...) CATCH_REC_GET_END2 #define CATCH_REC_GET_END(...) CATCH_REC_GET_END1 #define CATCH_REC_NEXT0(test, next, ...) next CATCH_REC_OUT #define CATCH_REC_NEXT1(test, next) CATCH_DEFER ( CATCH_REC_NEXT0 ) ( test, next, 0) #define CATCH_REC_NEXT(test, next) CATCH_REC_NEXT1(CATCH_REC_GET_END test, next) #define CATCH_REC_LIST0(f, x, peek, ...) , f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1) ) ( f, peek, __VA_ARGS__ ) #define CATCH_REC_LIST1(f, x, peek, ...) , f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST0) ) ( f, peek, __VA_ARGS__ ) #define CATCH_REC_LIST2(f, x, peek, ...) f(x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1) ) ( f, peek, __VA_ARGS__ ) #define CATCH_REC_LIST0_UD(f, userdata, x, peek, ...) , f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD) ) ( f, userdata, peek, __VA_ARGS__ ) #define CATCH_REC_LIST1_UD(f, userdata, x, peek, ...) , f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST0_UD) ) ( f, userdata, peek, __VA_ARGS__ ) #define CATCH_REC_LIST2_UD(f, userdata, x, peek, ...) f(userdata, x) CATCH_DEFER ( CATCH_REC_NEXT(peek, CATCH_REC_LIST1_UD) ) ( f, userdata, peek, __VA_ARGS__ ) // Applies the function macro `f` to each of the remaining parameters, inserts commas between the results, // and passes userdata as the first parameter to each invocation, // e.g. CATCH_REC_LIST_UD(f, x, a, b, c) evaluates to f(x, a), f(x, b), f(x, c) #define CATCH_REC_LIST_UD(f, userdata, ...) 
CATCH_RECURSE(CATCH_REC_LIST2_UD(f, userdata, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) #define CATCH_REC_LIST(f, ...) CATCH_RECURSE(CATCH_REC_LIST2(f, __VA_ARGS__, ()()(), ()()(), ()()(), 0)) #define INTERNAL_CATCH_EXPAND1(param) INTERNAL_CATCH_EXPAND2(param) #define INTERNAL_CATCH_EXPAND2(...) INTERNAL_CATCH_NO## __VA_ARGS__ #define INTERNAL_CATCH_DEF(...) INTERNAL_CATCH_DEF __VA_ARGS__ #define INTERNAL_CATCH_NOINTERNAL_CATCH_DEF #define INTERNAL_CATCH_STRINGIZE(...) INTERNAL_CATCH_STRINGIZE2(__VA_ARGS__) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_STRINGIZE2(...) #__VA_ARGS__ #define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param) INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param)) #else // MSVC is adding extra space and needs another indirection to expand INTERNAL_CATCH_NOINTERNAL_CATCH_DEF #define INTERNAL_CATCH_STRINGIZE2(...) INTERNAL_CATCH_STRINGIZE3(__VA_ARGS__) #define INTERNAL_CATCH_STRINGIZE3(...) #__VA_ARGS__ #define INTERNAL_CATCH_STRINGIZE_WITHOUT_PARENS(param) (INTERNAL_CATCH_STRINGIZE(INTERNAL_CATCH_REMOVE_PARENS(param)) + 1) #endif #define INTERNAL_CATCH_MAKE_NAMESPACE2(...) ns_##__VA_ARGS__ #define INTERNAL_CATCH_MAKE_NAMESPACE(name) INTERNAL_CATCH_MAKE_NAMESPACE2(name) #define INTERNAL_CATCH_REMOVE_PARENS(...) INTERNAL_CATCH_EXPAND1(INTERNAL_CATCH_DEF __VA_ARGS__) #ifndef CATCH_CONFIG_TRADITIONAL_MSVC_PREPROCESSOR #define INTERNAL_CATCH_MAKE_TYPE_LIST2(...) decltype(get_wrapper()) #define INTERNAL_CATCH_MAKE_TYPE_LIST(...) INTERNAL_CATCH_MAKE_TYPE_LIST2(INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__)) #else #define INTERNAL_CATCH_MAKE_TYPE_LIST2(...) INTERNAL_CATCH_EXPAND_VARGS(decltype(get_wrapper())) #define INTERNAL_CATCH_MAKE_TYPE_LIST(...) INTERNAL_CATCH_EXPAND_VARGS(INTERNAL_CATCH_MAKE_TYPE_LIST2(INTERNAL_CATCH_REMOVE_PARENS(__VA_ARGS__))) #endif #define INTERNAL_CATCH_MAKE_TYPE_LISTS_FROM_TYPES(...)\ CATCH_REC_LIST(INTERNAL_CATCH_MAKE_TYPE_LIST,__VA_ARGS__) #define INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_0) INTERNAL_CATCH_REMOVE_PARENS(_0) #define INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_0, _1) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_1_ARG(_1) #define INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_0, _1, _2) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_2_ARG(_1, _2) #define INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_0, _1, _2, _3) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_3_ARG(_1, _2, _3) #define INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_0, _1, _2, _3, _4) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_4_ARG(_1, _2, _3, _4) #define INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_0, _1, _2, _3, _4, _5) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_5_ARG(_1, _2, _3, _4, _5) #define INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_0, _1, _2, _3, _4, _5, _6) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_6_ARG(_1, _2, _3, _4, _5, _6) #define INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_0, _1, _2, _3, _4, _5, _6, _7) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_7_ARG(_1, _2, _3, _4, _5, _6, _7) #define INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_8_ARG(_1, _2, _3, _4, _5, _6, _7, _8) #define INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9) INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_9_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9) #define INTERNAL_CATCH_REMOVE_PARENS_11_ARG(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10) 
INTERNAL_CATCH_REMOVE_PARENS(_0), INTERNAL_CATCH_REMOVE_PARENS_10_ARG(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10) #define INTERNAL_CATCH_VA_NARGS_IMPL(_0, _1, _2, _3, _4, _5, _6, _7, _8, _9, _10, N, ...) N #define INTERNAL_CATCH_TYPE_GEN\ template struct TypeList {};\ template\ constexpr auto get_wrapper() noexcept -> TypeList { return {}; }\ template class...> struct TemplateTypeList{};\ template class...Cs>\ constexpr auto get_wrapper() noexcept -> TemplateTypeList { return {}; }\ template\ struct append;\ template\ struct rewrap;\ template class, typename...>\ struct create;\ template class, typename>\ struct convert;\ \ template \ struct append { using type = T; };\ template< template class L1, typename...E1, template class L2, typename...E2, typename...Rest>\ struct append, L2, Rest...> { using type = typename append, Rest...>::type; };\ template< template class L1, typename...E1, typename...Rest>\ struct append, TypeList, Rest...> { using type = L1; };\ \ template< template class Container, template class List, typename...elems>\ struct rewrap, List> { using type = TypeList>; };\ template< template class Container, template class List, class...Elems, typename...Elements>\ struct rewrap, List, Elements...> { using type = typename append>, typename rewrap, Elements...>::type>::type; };\ \ template