pax_global_header00006660000000000000000000000064150650721010014507gustar00rootroot0000000000000052 comment=ca2b5ff89c83c8eece9d991f9c5d61ce5b168ea6 rocPRIM-rocm-7.1.0/000077500000000000000000000000001506507210100137255ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/.azuredevops/000077500000000000000000000000001506507210100163525ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/.azuredevops/rocm-ci.yml000066400000000000000000000014051506507210100204260ustar00rootroot00000000000000resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo trigger: batch: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .gitlab - .jenkins - docs - '.*.y*ml' - '*.md' - LICENSE.txt - NOTICES.txt pr: autoCancel: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .gitlab - .jenkins - docs - '.*.y*ml' - '*.md' - LICENSE.txt - NOTICES.txt drafts: false jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/rocPRIM.yml@pipelines_repo rocPRIM-rocm-7.1.0/.clang-format000066400000000000000000000114351506507210100163040ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 UseCRLF: false # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: true AlignArrayOfStructures: Right AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: 
false AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes BinPackArguments: false BinPackParameters: false BitFieldColonSpacing: Both # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: AfterCaseLabel: true AfterClass: true AfterControlStatement: Always AfterEnum: true AfterFunction: true AfterNamespace: true AfterStruct: true AfterUnion: true AfterExternBlock: false BeforeCatch: true BeforeElse: true BeforeLambdaBody: true BeforeWhile: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false BreakBeforeBinaryOperators: All BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeComma BreakInheritanceList: BeforeComma BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DeriveLineEnding: false DerivePointerAlignment: false EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: Always ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IfMacros: [] IncludeBlocks: Preserve IndentAccessModifiers: false IndentCaseBlocks: true IndentCaseLabels: true IndentExternBlock: NoIndent IndentPPDirectives: BeforeHash IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true LambdaBodyIndentation: Signature MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PPIndentWidth: -1 PackConstructorInitializers: NextLine PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 
PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left QualifierAlignment: Leave ReferenceAlignment: Pointer ReflowComments: false ShortNamespaceLines: 0 SortIncludes: CaseSensitive SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false SpaceAroundPointerQualifiers: Default SpaceBeforeAssignmentOperators: true SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: Never SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: Never SpacesInCStyleCastParentheses: false SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInParentheses: false SpacesInSquareBrackets: false AttributeMacros: - __host__ - __device__ - __global__ - __forceinline__ - __shared__ - __launch_bounds__ - ROCPRIM_DEVICE - ROCPRIM_HOST - ROCPRIM_HOST_DEVICE - ROCPRIM_SHARED_MEMORY - ROCPRIM_KERNEL - ROCPRIM_INLINE - ROCPRIM_FORCE_INLINE - ROCPRIM_LAUNCH_BOUNDS # Trick clang into thinking that our C-style attributes are C++-style attributes # Make sure that the sizes line up for linebreaks etc Macros: - __host__=[[host]] - __device__=[[device]] - __global__=[[global]] - __forceinline__=[[forceinline]] - __shared__=[[shared]] - __launch_bounds__(x)=[[launch_bounds(x)]] - __attribute__(x)=[[attribute(x)]] - ROCPRIM_DEVICE=[[DEVICE____]] - ROCPRIM_HOST=[[HOST____]] - ROCPRIM_HOST_DEVICE=[[HOST_DEVICE____]] - ROCPRIM_SHARED_MEMORY=[[SHARED_MEMORY____]] - ROCPRIM_KERNEL=[[KERNEL____]] - ROCPRIM_INLINE=[[INLINE____]] - ROCPRIM_FORCE_INLINE=[FORCE_INLINE____]] - ROCPRIM_LAUNCH_BOUNDS(x)=[[launch_bounds(x)____]] BreakAfterAttributes: Always --- 
rocPRIM-rocm-7.1.0/.githooks/000077500000000000000000000000001506507210100156325ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/.githooks/install000077500000000000000000000002121506507210100172210ustar00rootroot00000000000000#!/bin/sh cd "$(git rev-parse --git-dir)" cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" rocPRIM-rocm-7.1.0/.githooks/pre-commit000077500000000000000000000015531506507210100176400ustar00rootroot00000000000000#!/bin/sh # Redirect output to stderr. exec 1>&2 check_failed=false # Do the code format check if ! "$(git rev-parse --show-toplevel)/scripts/code-format/check-format.sh" HEAD --cached 1>&2; then printf "\n\033[31mFailed\033[0m: code format check.\n" check_failed=true fi # Do the copyright check # update & apply copyright when hook config is set, otherwise just verify opts="-qc" if [ "$(git config --get --type bool --default false hooks.updateCopyright)" = "true" ]; then opts="-qca" fi if ! "$(git rev-parse --show-toplevel)/scripts/copyright-date/check-copyright.sh" "$opts" 1>&2; then printf "\n\033[31mFailed\033[0m: copyright date check.\n" check_failed=true fi if $check_failed; then printf " Pre-commit check failed, please fix the reported errors. 
Note: Use '\033[33mgit commit --no-verify\033[0m' to bypass checks.\n" exit 1 fi rocPRIM-rocm-7.1.0/.github/000077500000000000000000000000001506507210100152655ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/.github/CODEOWNERS000077500000000000000000000003121506507210100166570ustar00rootroot00000000000000* @stanleytsang-amd @umfranzw @RobsonRLemos # Documentation files docs/ @ROCm/rocm-documentation *.md @ROCm/rocm-documentation *.rst @ROCm/rocm-documentation .readthedocs.yaml @ROCm/rocm-documentation rocPRIM-rocm-7.1.0/.github/ISSUE_TEMPLATE/000077500000000000000000000000001506507210100174505ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/.github/ISSUE_TEMPLATE/bug_report.md000066400000000000000000000022571506507210100221500ustar00rootroot00000000000000--- name: Bug report about: Create a report to help us improve title: '' labels: '' assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Install '...' version '...' 2. Run '...' with data '...' 3. See error on logfile '...' **Expected behavior** A clear and concise description of what you expected to happen. **Log-files** Add *full* logfiles to help explain your problem. **Environment** Make sure that ROCm is correctly installed and run the following command: ``` printf '=== environment\n' > environment.txt && printf '\n\n=== date\n' >> environment.txt && date >> environment.txt && printf '\n\n=== Linux Kernel\n' >> environment.txt && uname -a >> environment.txt && printf '\n\n=== rocm-smi' >> environment.txt && rocm-smi >> environment.txt && printf '\n\n' >> environment.txt && hipconfig >> environment.txt && printf '\n\n=== rocminfo\n' >> environment.txt && rocminfo >> environment.txt && printf '\n\n=== lspci VGA\n' >> environment.txt && lspci | grep -i vga >> environment.txt ``` Attach `environment.txt` **Additional context** Add any other context about the problem here. 
rocPRIM-rocm-7.1.0/.github/ISSUE_TEMPLATE/feature_request.md000066400000000000000000000011231506507210100231720ustar00rootroot00000000000000--- name: Feature request about: Suggest an idea for this project title: '' labels: '' assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. rocPRIM-rocm-7.1.0/.github/dependabot.yml000066400000000000000000000012231506507210100201130ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. 
# Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" labels: - "documentation" - "dependencies" - "ci:docs-only" reviewers: - "samjwu" rocPRIM-rocm-7.1.0/.github/workflows/000077500000000000000000000000001506507210100173225ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/.github/workflows/docs.yaml000066400000000000000000000045551506507210100211470ustar00rootroot00000000000000name: Upload to the upload server # Controls when the workflow will run on: push: branches: [develop, master] tags: - rocm-5.* release: types: [published] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: # This workflow contains a single job called "build" build: # The type of runner that the job will run on runs-on: ubuntu-latest # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 - name: getting branch name shell: bash run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" id: branch_name - name: getting tag name shell: bash run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF_NAME})" id: tag_name - name: zipping files run: zip -r ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip . 
-x '*.git*' '*.idea*' - name: echo-step run: echo "${{ github.event.release.target_commitish }}" - name: uploading archive to prod if: ${{ steps.branch_name.outputs.branch == 'master' || github.event.release.target_commitish == 'master'}} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.PROD_UPLOAD_URL }}' args: '-o ConnectTimeout=5' - name: uploading archive to staging if: ${{ steps.branch_name.outputs.branch == 'develop' || github.event.release.target_commitish == 'develop' }} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.STG_UPLOAD_URL }}' args: '-o ConnectTimeout=5' rocPRIM-rocm-7.1.0/.gitignore000066400000000000000000000012211506507210100157110ustar00rootroot00000000000000### Build dirs ### build*/ ### clangd. 
### /.cache # Created by https://www.gitignore.io/api/c++,cmake ### C++ ### # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app ### CMake ### CMakeCache.txt CMakeFiles CMakeScripts Testing Makefile cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake CMakeUserPresets.json # End of https://www.gitignore.io/api/c++,cmake # VS Code # .vscode # Python __pycache__ rocPRIM-rocm-7.1.0/.gitignore.develop000066400000000000000000000012401506507210100173470ustar00rootroot00000000000000### Build dirs ### build/ ### Docs dirs ### doc/html/ doc/xml/ doc/latex/ doc/*.tag # Created by https://www.gitignore.io/api/c++,cmake ### C++ ### # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app ### CMake ### CMakeCache.txt CMakeFiles CMakeScripts Testing Makefile cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake build ### Gtilab CI ### .gitlab-ci-gputest.yml # End of https://www.gitignore.io/api/c++,cmake rocPRIM-rocm-7.1.0/.gitlab-ci.yml000066400000000000000000000535451506507210100163750ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
include: - project: 'amd/ci-templates' ref: main file: - /defaults.yaml - /deps-cmake.yaml - /deps-docs.yaml - /deps-format.yaml - /deps-rocm.yaml - /deps-vcpkg.yaml - /deps-windows.yaml - /deps-compiler-acceleration.yaml - /gpus-rocm.yaml - /rules.yaml stages: - lint - autotune - build - test - benchmark workflow: rules: - if: $CI_MERGE_REQUEST_LABELS =~ /CI Skip/ when: never - if: $CI_MERGE_REQUEST_TITLE !~ /Draft:/ variables: ROCPRIM_TEST_RUNS: 1 - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ variables: ROCPRIM_TEST_RUNS: 1 variables: PACKAGE_DIR: $BUILD_DIR/package AUTOTUNE_CONFIG_DIR: ${CI_PROJECT_DIR}/autotune_config clang-format: extends: - .lint:clang-format copyright-date: extends: - .deps:rocm stage: lint needs: [] tags: - build rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: - cd $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR - scripts/copyright-date/check-copyright.sh -v -d $CI_MERGE_REQUEST_DIFF_BASE_SHA .cmake-minimum-vcpkg: extends: - .deps:rocm - .deps:cmake-minimum - .deps:vcpkg - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] - !reference [".deps:vcpkg", before_script] - !reference [".deps:compiler-acceleration", before_script] - $VCPKG_DIR/vcpkg install gtest benchmark .cmake-minimum-apt: extends: - .deps:rocm - .deps:cmake-minimum - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] - !reference [".deps:compiler-acceleration", before_script] - $SUDO_CMD apt-get install -y -qq libgtest-dev libbenchmark-dev .build:vcpkg-apt: stage: build tags: - build extends: - .gpus:rocm-gpus - .rules:build # Missing -Werror and other diagnostic flags due to rocm-terminal sporting an old googletest APT package (Ubuntu 18.04). 
# Here we're only testing the consumption logic, and we want to avoid new errors breaking logic testing script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release "$(if [ -n "$VCPKG_DIR" ]; then echo "-DCMAKE_TOOLCHAIN_FILE=$VCPKG_DIR/scripts/buildsystems/vcpkg.cmake"; fi)" -D BUILD_TEST=ON -D BUILD_EXAMPLE=ON -D BUILD_BENCHMARK=ON -D AMDGPU_TARGETS=$GPU_TARGETS -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CXX_STANDARD=17 -S $CI_PROJECT_DIR -B $BUILD_DIR - cmake --build $BUILD_DIR --target test_basic build:cmake-minimum-vcpkg: stage: build needs: [] extends: - .cmake-minimum-vcpkg - .build:vcpkg-apt build:cmake-minimum-apt: stage: build needs: [] extends: - .cmake-minimum-apt - .build:vcpkg-apt .cmake-latest: extends: - .deps:rocm - .deps:cmake-latest - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-latest", before_script] - !reference [".deps:compiler-acceleration", before_script] .cmake-minimum: extends: - .deps:rocm - .deps:cmake-minimum - .deps:compiler-acceleration before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] - !reference [".deps:compiler-acceleration", before_script] .build:common: stage: build tags: - build extends: - .gpus:rocm-gpus - .rules:build variables: EXTRA_CMAKE_CXX_FLAGS: "" BUILD_TOOL_ARGS: "" script: - mkdir -p $BUILD_DIR - cd $BUILD_DIR - | # Add hardened libc++ assertions for tests only if [[ $BUILD_TARGET == "TEST" ]]; then echo "Configuring with hardened libc++!" 
EXTRA_CMAKE_CXX_FLAGS+=" -D_GLIBCXX_ASSERTIONS=ON -D ROCPRIM_ENABLE_ASSERTS=ON" fi - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror $EXTRA_CMAKE_CXX_FLAGS" -D CMAKE_BUILD_TYPE="$BUILD_TYPE" -D BUILD_$BUILD_TARGET=ON -D WITH_ROCRAND=ON -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D BUILD_EXAMPLE=ON -D AMDGPU_TARGETS=$GPU_TARGETS -D CMAKE_CXX_STANDARD="$BUILD_VERSION" -S $CI_PROJECT_DIR -B $BUILD_DIR - cmake --build $BUILD_DIR -- ${BUILD_TOOL_ARGS} artifacts: paths: - $BUILD_DIR/.ninja_log - $BUILD_DIR/benchmark/* - $BUILD_DIR/CMakeCache.txt - $BUILD_DIR/CTestTestfile.cmake - $BUILD_DIR/deps/googlebenchmark/ - $BUILD_DIR/gtest/ - $BUILD_DIR/test/CTestTestfile.cmake - $BUILD_DIR/test/rocprim/CTestTestfile.cmake - $BUILD_DIR/deps/rocrand/ - $BUILD_DIR/test/rocprim/test_* - $BUILD_DIR/test/rocprim/libtest_* - $BUILD_DIR/test/test_* expire_in: 1 day build:spirv: stage: build needs: [] extends: - .cmake-minimum - .build:common variables: # For unknown reasons spir-v builds ignore 'clang diagnostic' pragmas that # we use to ignore internal deprecations. EXTRA_CMAKE_CXX_FLAGS: "-Wno-deprecated-declarations -mf16c -DROCPRIM_EXPERIMENTAL_SPIRV" # Since not all targets are expected to build, do not stop building other # targets when any target fails. 
BUILD_TOOL_ARGS: "-k 0" GPU_TARGETS: "amdgcnspirv" image: "registry.streamhpc.internal/unstable-rocm:main" allow_failure: true parallel: # Debug builds disabled due to excessive build times for debug test builds matrix: - BUILD_TYPE: Release BUILD_TARGET: [BENCHMARK, TEST] BUILD_VERSION: 17 artifacts: when: always build:cmake-latest: stage: build needs: [] extends: - .cmake-latest - .build:common parallel: # Debug builds disabled due to excessive build times for debug test builds matrix: - BUILD_TYPE: Release BUILD_TARGET: [BENCHMARK, TEST] BUILD_VERSION: 17 build:cmake-minimum: needs: [] extends: - .cmake-minimum - .build:common parallel: matrix: - BUILD_TYPE: [Debug, Release] BUILD_TARGET: [BENCHMARK, TEST] BUILD_VERSION: 17 build:package: stage: build needs: [] tags: - build extends: - .cmake-minimum - .gpus:rocm-gpus - .rules:build script: - mkdir -p $PACKAGE_DIR - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -D CMAKE_CXX_STANDARD=17 -B $PACKAGE_DIR -S $CI_PROJECT_DIR - cd $PACKAGE_DIR - cpack -G "DEB;ZIP" artifacts: paths: - $PACKAGE_DIR/rocprim*.deb - $PACKAGE_DIR/rocprim*.zip expire_in: 1 day build:windows: stage: build needs: [] extends: - .rules:build - .gpus:rocm-windows - .deps:rocm-windows - .deps:visual-studio-devshell parallel: matrix: - BUILD_TYPE: Release script: - mkdir -p $CI_PROJECT_DIR/build - cmake -G Ninja -S $CI_PROJECT_DIR -B $CI_PROJECT_DIR/build -D BUILD_TEST=ON -D BUILD_BENCHMARK=ON -D AMDGPU_TARGETS=$GPU_TARGET -D CMAKE_CXX_COMPILER:PATH="${env:HIP_PATH}\bin\clang++.exe" -D CMAKE_PREFIX_PATH:PATH="${env:HIP_PATH}" -D CMAKE_BUILD_TYPE="$BUILD_TYPE" -D CMAKE_CXX_STANDARD=17 - cmake --build "$CI_PROJECT_DIR/build" artifacts: paths: - $CI_PROJECT_DIR/build/test/test_* - $CI_PROJECT_DIR/build/test/rocprim/test_* - $CI_PROJECT_DIR/build/test/CTestTestfile.cmake - $CI_PROJECT_DIR/build/test/rocprim/CTestTestfile.cmake - $CI_PROJECT_DIR/build/gtest/ - $CI_PROJECT_DIR/build/CMakeCache.txt - 
$CI_PROJECT_DIR/build/.ninja_log - $CI_PROJECT_DIR/build/CTestTestfile.cmake expire_in: 1 day autotune:build: stage: autotune needs: [] tags: - build extends: - .cmake-minimum - .gpus:rocm-gpus - .rules:benchmark before_script: - !reference [".cmake-minimum", before_script] - $SUDO_CMD apt-get update -qq - $SUDO_CMD apt-get install -qq -y zstd variables: BENCHMARK_TARGETS: benchmark_config_tuning script: - mkdir -p $BUILD_DIR - cd $BUILD_DIR - 'printf "Building benchmark targets: %s\n" "$BENCHMARK_TARGETS"' - cmake -B $BUILD_DIR -S $CI_PROJECT_DIR -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wno-#pragma-messages" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=OFF -D BUILD_EXAMPLE=OFF -D BUILD_BENCHMARK=ON -D BENCHMARK_CONFIG_TUNING=ON -D AMDGPU_TARGETS=$GPU_TARGETS -D CMAKE_C_COMPILER_LAUNCHER=phc_sccache_c -D CMAKE_CXX_COMPILER_LAUNCHER=phc_sccache_cxx -D CMAKE_CXX_STANDARD=17 - cmake --build . --target $BENCHMARK_TARGETS - 'rm -rf $BUILD_DIR/benchmark/benchmark*.parallel' # The autotune benchmarks get very large, above GitLabs upload limit. Fortunately they compress well. # We'll put them all in a single archive to compress them to a few hundred MB. - find benchmark -type f -executable -print0 | tar -I zstd -cvf benchmarks.tar.zstd --null -T - artifacts: paths: - $BUILD_DIR/benchmarks.tar.zstd - $BUILD_DIR/.ninja_log - $BUILD_DIR/deps/googlebenchmark/ expire_in: 1 week .test:common: stage: test tags: - rocm - $GPU extends: - .cmake-minimum needs: - job: build:cmake-minimum parallel: matrix: - BUILD_TYPE: Release BUILD_TARGET: TEST BUILD_VERSION: 17 script: - cd $BUILD_DIR - cmake -D CMAKE_PREFIX_PATH=/opt/rocm -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake - cat ./resources.json # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue. # This causes the hipMemcpy to fail, which is not reported as an error by HIP. # As a temporary workaround, disable the SDMA for test stability. 
- HSA_ENABLE_SDMA=0 ctest --output-on-failure --repeat-until-fail 2 --resource-spec-file ./resources.json --parallel $PARALLEL_JOBS test:any-gpu: variables: GPU: "" PARALLEL_JOBS: 1 extends: - .test:common rules: - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/ test:label-arch: extends: - .gpus:rocm - .test:common - .rules:arch-labels test:all-gpus: variables: SHOULD_BE_UNDRAFTED: "true" extends: - .gpus:rocm - .test:common - .rules:test .test:common-spirv: stage: test tags: - rocm - $GPU extends: - .cmake-minimum allow_failure: true timeout: 3h needs: - job: build:spirv parallel: matrix: - BUILD_TYPE: Release BUILD_TARGET: TEST BUILD_VERSION: 17 image: "registry.streamhpc.internal/unstable-rocm:main" script: - cd $BUILD_DIR - cmake -D CMAKE_PREFIX_PATH=/opt/rocm -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake - cat ./resources.json # Parallel execution (with other AMDGPU processes) can oversubscribe the SDMA queue. # This causes the hipMemcpy to fail, which is not reported as an error by HIP. # As a temporary workaround, disable the SDMA for test stability. - HSA_ENABLE_SDMA=0 ctest --output-on-failure --repeat-until-fail 2 --resource-spec-file ./resources.json --parallel $PARALLEL_JOBS --exclude-regex rocprim.device_partition test:any-gpu-spirv: variables: GPU: "" PARALLEL_JOBS: 1 extends: - .test:common-spirv rules: - if: $CI_MERGE_REQUEST_TITLE =~ /Draft:/ && $CI_MERGE_REQUEST_LABELS !~ /Arch::/ test:label-arch-spirv: extends: - .gpus:rocm - .test:common-spirv - .rules:arch-labels test:all-gpus-spirv: variables: SHOULD_BE_UNDRAFTED: "true" extends: - .gpus:rocm - .test:common-spirv - .rules:test .test-windows-base: stage: test extends: - .deps:rocm-windows - .gpus:rocm-gpus-windows - .deps:visual-studio-devshell - .rules:test script: - cd $CI_PROJECT_DIR/build - ctest --output-on-failure # Disabled due to extensive link times. 
# This is tracked in issue 679 # test-windows-debug: # extends: # - .test-windows-base # needs: # - job: build:windows # parallel: # matrix: # - BUILD_TYPE: Debug # BUILD_TARGET: TEST test-windows-release: extends: - .test-windows-base needs: - job: build:windows parallel: matrix: - BUILD_TYPE: Release .test-package: script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -D AMDGPU_TARGETS=$GPU_TARGETS -D CMAKE_CXX_STANDARD=17 -S "$CI_PROJECT_DIR/test/extra" -B "$CI_PROJECT_DIR/package_test" - cmake --build "$CI_PROJECT_DIR/package_test" - "$CI_PROJECT_DIR/package_test/test_rocprim_package" - cd "$CI_PROJECT_DIR/package_test" - ctest --output-on-failure --repeat-until-fail 2 test:install: stage: test needs: [] tags: - rocm extends: - .cmake-minimum - .rules:test - .gpus:rocm-gpus script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -D CMAKE_CXX_STANDARD=17 -B build -S $CI_PROJECT_DIR # Preserve $PATH when sudoing - $SUDO_CMD env PATH="$PATH" cmake --build build --target install - !reference [.test-package, script] test:deb: stage: test needs: - build:package tags: - rocm extends: - .cmake-minimum - .rules:test - .gpus:rocm-gpus script: - $SUDO_CMD dpkg -i $PACKAGE_DIR/rocprim*.deb - !reference [.test-package, script] test:docs: stage: test variables: SPHINX_DIR: $DOCS_DIR/sphinx extends: - .rules:test - .build:docs artifacts: paths: - $DOCS_DIR/_build/html/ expire_in: 2 weeks .benchmark-base: stage: benchmark extends: - .rules:benchmark variables: BENCHMARK_RESULT_DIR: ${CI_PROJECT_DIR}/benchmark_results BENCHMARK_RESULT_CACHE_DIR: ${BENCHMARK_RESULT_DIR}_cache benchmark: needs: - job: build:cmake-minimum parallel: matrix: - BUILD_TYPE: Release BUILD_TARGET: BENCHMARK BUILD_VERSION: 17 extends: - .cmake-minimum - .gpus:rocm - .benchmark-base variables: BENCHMARK_FILENAME_REGEX: ^benchmark BENCHMARK_ALGORITHM_REGEX: "" BENCHMARK_SEED: random script: - 'printf "CI Variables used in 
benchmarks:\nBENCHMARK_RESULT_DIR: %s\nBENCHMARK_FILENAME_REGEX: %s\nBENCHMARK_ALGORITHM_REGEX: %s \n" "$BENCHMARK_RESULT_DIR" "$BENCHMARK_FILENAME_REGEX" "$BENCHMARK_ALGORITHM_REGEX"' - cd "${CI_PROJECT_DIR}" - mkdir -p "${BENCHMARK_RESULT_DIR}" - python3 .gitlab/run_benchmarks.py --benchmark_dir "${BUILD_DIR}/benchmark" --benchmark_gpu_architecture "${GPU_TARGET}" --benchmark_output_dir "${BENCHMARK_RESULT_DIR}" --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}" --benchmark_filter_regex "${BENCHMARK_ALGORITHM_REGEX}" --seed "${BENCHMARK_SEED}" - python3 .gitlab/report_noise.py --benchmark_json_dir "${BENCHMARK_RESULT_DIR}" --noise_threshold_percentage 1.0 --accept_high_noise artifacts: paths: - ${BENCHMARK_RESULT_DIR} expire_in: 1 week benchmark:cache-or-report: needs: - benchmark extends: - .benchmark-base tags: - single-cache cache: key: benchmark-cache paths: - ${BENCHMARK_RESULT_CACHE_DIR} script: # If on MR branch, generate report, else cache results - > if [ ! -z "${CI_MERGE_REQUEST_SOURCE_BRANCH_NAME}" ]; then if [ ! -d "${BENCHMARK_RESULT_CACHE_DIR}" ]; then echo 'ERROR: Cache directory does not exist' exit 1 elif [ ! -d "${BENCHMARK_RESULT_DIR}" ]; then echo 'ERROR: Benchmark results directory does not exist' exit 1 else echo 'INFO: Files in cache (reference benchmarks):' ls -al ${BENCHMARK_RESULT_CACHE_DIR} echo 'INFO: Generating report...' python3 .gitlab/generate_report.py --old ${BENCHMARK_RESULT_CACHE_DIR} --new ${BENCHMARK_RESULT_DIR} fi elif [ "${CI_COMMIT_BRANCH}" == "${CI_DEFAULT_BRANCH}" ]; then echo 'INFO: Caching benchmark results...' 
mkdir -p ${BENCHMARK_RESULT_CACHE_DIR} cp -R ${BENCHMARK_RESULT_DIR}/*.json ${BENCHMARK_RESULT_CACHE_DIR} else echo 'ERROR: Neither on a merge-request branch or the default branch' exit 1 fi .autotune-base: stage: autotune extends: - .rules:manual variables: AUTOTUNE_RESULT_DIR: ${CI_PROJECT_DIR}/autotune_results autotune:execute-tuning: needs: - autotune:build extends: - .autotune-base - .cmake-minimum - .gpus:rocm variables: AUTOTUNE_FILENAME_REGEX: ^benchmark AUTOTUNE_ALGORITHM_REGEX: "" AUTOTUNE_SIZE: "" AUTOTUNE_TRIALS: "" timeout: 8h artifacts: paths: - ${AUTOTUNE_RESULT_DIR}/*.json before_script: - !reference [".cmake-minimum", before_script] - $SUDO_CMD apt-get update -qq - $SUDO_CMD apt-get install -qq -y zstd script: - cd "${CI_PROJECT_DIR}" - tar -I zstd -xvf "${BUILD_DIR}/benchmarks.tar.zstd" -C "${BUILD_DIR}/" - | if [ ! -d "${BUILD_DIR}/benchmark" ]; then echo "There are no benchmark executables. Run the build job with a BUILD_TARGET." exit 1 fi - mkdir -p "${AUTOTUNE_RESULT_DIR}" - python3 .gitlab/run_benchmarks.py --benchmark_dir="${BUILD_DIR}/benchmark" --benchmark_gpu_architecture="${GPU_TARGET}" --benchmark_output_dir="${AUTOTUNE_RESULT_DIR}" --benchmark_filename_regex="${AUTOTUNE_FILENAME_REGEX}" --benchmark_filter_regex="${AUTOTUNE_ALGORITHM_REGEX}" --size="${AUTOTUNE_SIZE}" --trials="${AUTOTUNE_TRIALS}" --seed=82589933 - python3 .gitlab/report_noise.py --benchmark_json_dir "${AUTOTUNE_RESULT_DIR}" --noise_threshold_percentage 1.0 --accept_high_noise autotune:generate-config: image: python:3.10.5-buster needs: - job: "autotune:execute-tuning" optional: true extends: - .rules:manual - .autotune-base variables: AUTOTUNE_CONFIG_REPO_PATH: /rocprim/include/rocprim/device/detail/config AUTOTUNE_RESULT_CACHE_DIR: ${AUTOTUNE_RESULT_DIR}_cache tags: - single-cache cache: key: autotune-cache paths: - autotune_results_cache/ script: # Set cache dir variables depending on if this is a MR or not - > if [ ! 
-z "${CI_MERGE_REQUEST_TARGET_BRANCH_NAME}" ]; then AUTOTUNE_RESULT_CACHE_BRANCH_DIR="${AUTOTUNE_RESULT_CACHE_DIR}/${CI_MERGE_REQUEST_SOURCE_BRANCH_NAME}" AUTOTUNE_RESULT_CACHE_TARGET_BRANCH_DIR="${AUTOTUNE_RESULT_CACHE_DIR}/${CI_MERGE_REQUEST_TARGET_BRANCH_NAME}" else AUTOTUNE_RESULT_CACHE_BRANCH_DIR="${AUTOTUNE_RESULT_CACHE_DIR}/${CI_COMMIT_BRANCH}" fi # If the global cache dir does not exist, create it - mkdir -p $AUTOTUNE_RESULT_CACHE_DIR # If there are fresh results in the artifacts, cache them in the branch cache # If there are no fresh results, check branch cache # If there are no branch cache results, check TARGET branch cache # If there are TARGET branch cache results, cache them in the branch cache - > if [ -d "$AUTOTUNE_RESULT_DIR" ]; then mkdir -p $AUTOTUNE_RESULT_CACHE_BRANCH_DIR cp -R -u ${AUTOTUNE_RESULT_DIR}/*.json ${AUTOTUNE_RESULT_CACHE_BRANCH_DIR} elif [ -d "$AUTOTUNE_RESULT_CACHE_BRANCH_DIR" ]; then mkdir -p $AUTOTUNE_RESULT_DIR cp -R -u ${AUTOTUNE_RESULT_CACHE_BRANCH_DIR}/*.json ${AUTOTUNE_RESULT_DIR} elif [ -d "$AUTOTUNE_RESULT_CACHE_TARGET_BRANCH_DIR" ]; then mkdir -p $AUTOTUNE_RESULT_DIR cp -R -u ${AUTOTUNE_RESULT_CACHE_TARGET_BRANCH_DIR}/*.json ${AUTOTUNE_RESULT_DIR} mkdir -p $AUTOTUNE_RESULT_CACHE_BRANCH_DIR cp -R -u ${AUTOTUNE_RESULT_DIR}/*.json ${AUTOTUNE_RESULT_CACHE_BRANCH_DIR} else echo 'ERROR: No autotune results found in previous artifacts, the branch cache or the target branch cache...' 
exit 1 fi # List the final .json files to use for config generation - ls -al ${AUTOTUNE_RESULT_DIR} - cd "${CI_PROJECT_DIR}" - python3 -m pip install jinja2 - mkdir -p ${AUTOTUNE_CONFIG_DIR}${AUTOTUNE_CONFIG_REPO_PATH} - python3 scripts/autotune/create_optimization.py --benchmark_files ${AUTOTUNE_RESULT_DIR}/*.json --out_basedir "${AUTOTUNE_CONFIG_DIR}${AUTOTUNE_CONFIG_REPO_PATH}" artifacts: paths: - ${AUTOTUNE_CONFIG_DIR} scheduled-check-changes: stage: autotune extends: .rules:scheduled-check-changes rocPRIM-rocm-7.1.0/.gitlab/000077500000000000000000000000001506507210100152455ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/.gitlab/generate_report.py000066400000000000000000000122301506507210100210020ustar00rootroot00000000000000#!/usr/bin/env python3 # Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
import json
import argparse
import os
import re
import stat
import sys


class bcolors:
    # ANSI escape sequences used to colorize the report lines.
    OKGREEN = '\033[92m'
    WARNING = '\033[93m'
    FAIL = '\033[91m'
    ENDC = '\033[0m'


def load_benchmarks(benchmark_dir):
    """
    Loads every benchmark JSON file found directly inside benchmark_dir.

    Returns a nested dictionary: {arch : {benchmark name : bytes_per_second}}.
    Files that lack one of the expected keys are reported on stderr; whatever
    was read from them before the missing key was hit is kept.
    """

    def is_benchmark_json(filename):
        if not re.match(r'.*\.json$', filename):
            return False
        # Only regular files are of interest. Masking st_mode with S_IFREG is
        # not a valid file type test (other file types share that bit), so let
        # the standard library decide. Unreadable entries are simply skipped.
        return os.path.isfile(os.path.join(benchmark_dir, filename))

    def add_results(results, file_path: str):
        """
        Adds a single file to the results.
        The file contains the results of benchmarks executed on a single architecture.
        The benchmarks within the file may belong to different algorithms.

        Note: the file is repaired in place when the comma fix below applies.
        """
        with open(file_path, "r+") as file_handle:
            # Fix Google Benchmark comma issue
            contents = file_handle.read()
            fixed = re.sub(r"(\s*\"[^\"]*\"[^,])(^\s*\"[^\"]*\":)", "\\1,\\2", contents,
                           flags=re.MULTILINE)
            # Only touch the file when the fix actually changed something.
            if fixed != contents:
                file_handle.seek(0)
                file_handle.write(fixed)
                file_handle.truncate()

        with open(file_path) as file_handle:
            benchmark_run_data = json.load(file_handle)

        try:
            # The architecture name may carry feature suffixes after a ':'
            arch = benchmark_run_data['context']['hdp_gcn_arch_name'].split(":")[0]
            results.setdefault(arch, {})
            for single_benchmark in benchmark_run_data['benchmarks']:
                name = single_benchmark['name'].replace('/manual_time', '')
                # Drop the trailing "..._config..." template argument, so that results
                # can be matched by name regardless of the config that was used
                name = re.sub(r"(^device.*?)(,\s[A-Za-z_]*_config.*>)$", "\\1>", name,
                              flags=re.MULTILINE)
                results[arch][name] = single_benchmark['bytes_per_second']
        except KeyError as err:
            print(f'KeyError: {err}, while reading file: {file_path}', file=sys.stderr, flush=True)

    benchmark_names = [name for name in os.listdir(benchmark_dir) if is_benchmark_json(name)]
    print('The following benchmark results will be reported:\n{}'.format('\n'.join(benchmark_names)))
    # Results is: {arch : {algorithm : bytes_per_second}, ...}
    results = {}
    for benchmark_name in benchmark_names:
        path = os.path.join(benchmark_dir, benchmark_name)
        add_results(results, path)
    return results


def compare_results(old, new):
    """
    Prints the relative throughput change of every benchmark in new against old.

    Both arguments have the shape returned by load_benchmarks. A benchmark is
    counted as incomparable when it has no counterpart in old (missing
    architecture or missing name) or when its old value is zero or None, since
    no relative difference can be computed then.

    Returns False if any benchmark got more than 2% slower, True otherwise.
    """
    results = []
    incomparable = 0
    for arch, names in new.items():
        old_names = old.get(arch)
        if old_names is None:
            incomparable += len(names)
            continue
        for name, value_new in names.items():
            value_old = old_names.get(name)
            if not value_old:
                # Missing or zero baseline: dividing by it is impossible
                incomparable += 1
                continue
            results.append((f'{name} ({arch})', ((value_new - value_old) / value_old) * 100))

    if incomparable > 0:
        print(f'Could not compare {incomparable} benchmarks.')
        print('----------------------------------------')

    success = True
    results.sort(key=lambda x: x[0])

    for name, difference in results:
        if difference < -10:
            success = False
            print(f'{bcolors.FAIL}X {bcolors.ENDC} {name}: {bcolors.FAIL}{difference:.0f}{bcolors.ENDC}%')
        elif difference < -2:
            success = False
            print(f'{bcolors.WARNING}! {bcolors.ENDC} {name}: {bcolors.WARNING}{difference:.0f}{bcolors.ENDC}%')
        else:
            print(f'{bcolors.OKGREEN}OK{bcolors.ENDC} {name}: {bcolors.OKGREEN}{difference:.0f}{bcolors.ENDC}%')
    return success


def main():
    """Parses the command line, then loads and compares both result directories."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--old',
        help='The local directory that contains the old benchmark json files',
        required=True)
    parser.add_argument('--new',
        help='The local directory that contains the new benchmark json files',
        required=True)
    args = parser.parse_args()

    old_benchmarks = load_benchmarks(args.old)
    new_benchmarks = load_benchmarks(args.new)
    return compare_results(old_benchmarks, new_benchmarks)


if __name__ == '__main__':
    # sys.exit is always available, the exit builtin is only injected by the site module
    sys.exit(0 if main() else 1)
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
import argparse
import json
import os
import re
import stat
import statistics
import sys


class colors:
    # ANSI escape sequences used to color the cells of the report.
    OK = "\033[92m"
    FAIL = "\033[91m"
    END_COLOR = "\033[0m"


def print_results(results):
    """
    Prints the results of get_results as an aligned, colored table.

    Every cell is printed in the FAIL color when its flag in the result is
    set, and in the OK color otherwise. The seed cell is printed in the FAIL
    color when the seed is "random". With an empty list only the header is
    printed.
    """
    # (key of the value, header text, key of the flag that colors the cell)
    colored_columns = (
        ("noisy_permutations", "noisy permutations", "noisy"),
        ("mean", "mean", "bad_mean"),
        ("median", "median", "bad_median"),
        ("max", "max", "bad_max"),
        ("batch", "batch", "bad_batch"),
        ("warmup", "warmup", "bad_warmup"),
        ("bytes", "bytes", "bad_bytes"),
    )

    # Store the length of the longest value in a column. The name of a column
    # can be longer than its values. default=0 keeps this working when there
    # are no results at all, max() raises on an empty sequence otherwise.
    longest = {}
    for key in ("name",) + tuple(column[0] for column in colored_columns):
        longest_value = max((len(result[key]) for result in results), default=0)
        longest[key] = max(longest_value, len(key))

    printed = "name".ljust(longest["name"] + 1)
    for key, title, _ in colored_columns:
        printed += title.ljust(longest[key] + 1)
    printed += "seed"
    print(printed)

    for result in results:
        printed = result["name"].ljust(longest["name"])
        printed += " "
        for key, _, flag in colored_columns:
            printed += colors.FAIL if result[flag] else colors.OK
            printed += result[key].ljust(longest[key])
            printed += colors.END_COLOR
            printed += " "

        # The seed is the last column, so it needs no padding
        printed += colors.FAIL if result["seed"] == "random" else colors.OK
        printed += result["seed"]
        printed += colors.END_COLOR
        print(printed)


def get_results(benchmarks, threshold):
    """
    Summarizes the noise of every loaded benchmark file.

    benchmarks is the list returned by load_benchmarks, threshold is the noise
    threshold as a percentage. Returns a tuple (results, success): results
    holds one dictionary per benchmark file, in the shape print_results
    expects, and success is False when any permutation has a cv above the
    threshold.
    """

    def get_humanized_bytes(size):
        for unit in ["B", "KiB", "MiB", "GiB", "TiB", "PiB"]:
            if size < 1024.0 or unit == "PiB":
                break
            size /= 1024.0
        return f"{size:.1f} {unit}"

    success = True
    results = []

    for benchmark in benchmarks:
        data = benchmark["data"]
        context = data["context"]
        permutations = data["benchmarks"]

        # The cv (coefficient of variation) is a standard way of quantifying noise
        cvs = [permutation["cv"] for permutation in permutations]
        noises = sum(cv * 100 > threshold for cv in cvs)
        noisy = noises > 0
        if noisy:
            success = False

        # A file without permutations has no noise to report. The statistics
        # functions and max() raise on empty data, so fall back to zero.
        mean = statistics.mean(cvs) if cvs else 0.0
        median = statistics.median(cvs) if cvs else 0.0
        max_ = max(cvs, default=0.0)

        batch = context["batch_iterations"]
        warmup = context["warmup_iterations"]
        bytes_ = int(context["size"])

        results.append(
            {
                "name": benchmark["name"],
                "noisy": noisy,
                "noisy_permutations": f"{noises}/{len(permutations)}",
                "bad_mean": mean * 100 > threshold,
                "mean": f"{mean:.1%}",
                "bad_median": median * 100 > threshold,
                "median": f"{median:.1%}",
                "bad_max": max_ * 100 > threshold,
                "max": f"{max_:.1%}",
                "bad_batch": int(batch) < 10,
                # print_results pads these cells, so they have to be strings
                "batch": str(batch),
                "bad_warmup": int(warmup) < 5,
                "warmup": str(warmup),
                "bad_bytes": 0 < bytes_ < 128 * 1024 * 1024,  # 128 MiB
                "bytes": get_humanized_bytes(bytes_),
                "seed": str(context["seed"]),
            }
        )

    return results, success


def load_benchmarks(benchmark_json_dir):
    """
    Loads every benchmark JSON file found directly inside benchmark_json_dir.

    Returns a tuple (benchmarks, success): benchmarks is a list of
    {"name": file name, "data": parsed JSON} dictionaries sorted by file name,
    and success is False when any file could not be parsed. Files that fail to
    parse are reported on stderr and left out of the list.
    """

    def is_benchmark_json(filename):
        if not re.match(r".*\.json$", filename):
            return False
        # Only regular files are of interest. Masking st_mode with S_IFREG is
        # not a valid file type test (other file types share that bit), so let
        # the standard library decide.
        return os.path.isfile(os.path.join(benchmark_json_dir, filename))

    # os.listdir returns the entries in arbitrary order, sort them so that the
    # report is stable between runs
    benchmark_names = sorted(
        name for name in os.listdir(benchmark_json_dir) if is_benchmark_json(name)
    )

    success = True
    benchmarks = []
    for benchmark_name in benchmark_names:
        with open(os.path.join(benchmark_json_dir, benchmark_name)) as f:
            try:
                benchmarks.append({"name": benchmark_name, "data": json.load(f)})
            except json.JSONDecodeError as e:
                print(
                    f"{colors.FAIL}Failed to load {benchmark_name}{colors.END_COLOR}: {e}\n",
                    file=sys.stderr,
                )
                success = False

    return benchmarks, success


def main():
    """
    Parses the command line and prints the noise report.

    Returns False when a file failed to load. Otherwise returns True when
    --accept_high_noise was given, and the outcome of the noise check if not.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--noise_threshold_percentage",
        help="The noise threshold percentage, past which benchmark permutations are considered to be too noisy",
        required=True,
        type=float,
    )
    parser.add_argument(
        "--benchmark_json_dir",
        help="The directory of benchmark JSON files, which to report the noise of",
        required=True,
    )
    parser.add_argument(
        "--accept_high_noise",
        help="Don't call exit(1) when there is a noisy benchmark permutation",
        action=argparse.BooleanOptionalAction,
    )
    args = parser.parse_args()

    print(f"The noise threshold is {args.noise_threshold_percentage:.1f}%\n")

    benchmarks, load_success = load_benchmarks(args.benchmark_json_dir)
    results, results_success = get_results(benchmarks, args.noise_threshold_percentage)

    print_results(results)

    # A file that failed to load is an error, even when high noise is accepted
    if not load_success:
        return False

    if args.accept_high_noise:
        return True

    return results_success


if __name__ == "__main__":
    # sys.exit is always available, the exit builtin is only injected by the site module
    sys.exit(0 if main() else 1)
Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
import argparse
from collections import namedtuple
import json
import os
import re
import stat
import subprocess
import sys

# The settings of a benchmark run, filled in from the command line by main()
BenchmarkContext = namedtuple('BenchmarkContext', ['gpu_architecture', 'benchmark_output_dir', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex', 'size', 'trials', 'seed', 'skip_gathered'])


def run_benchmarks(benchmark_context):
    """
    Runs every benchmark executable found directly inside the benchmark directory.

    An executable is a regular file whose name matches benchmark_filename_regex
    and that has any execution flag set. Each one is asked to write its results
    to '<name>_<gpu_architecture>.json' inside benchmark_output_dir.

    Returns True when every started benchmark exited successfully. A benchmark
    that fails, or that cannot be started at all, is reported on stderr and
    does not stop the remaining benchmarks from running.
    """

    def is_benchmark_executable(filename):
        if not re.match(benchmark_context.benchmark_filename_regex, filename):
            return False
        path = os.path.join(benchmark_context.benchmark_dir, filename)
        try:
            st_mode = os.stat(path).st_mode
        except OSError:
            # For example a dangling symlink, there is nothing to run then
            return False

        # we are not interested in permissions, just whether there is any execution flag set
        # and it is a regular file. Masking st_mode with S_IFREG is not a valid
        # file type test (other file types share that bit), S_ISREG is.
        any_execution_flag = stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH
        return stat.S_ISREG(st_mode) and bool(st_mode & any_execution_flag)

    def should_skip(results_json_path):
        # Results only count as gathered when the file exists and is valid JSON
        if not benchmark_context.skip_gathered:
            return False
        try:
            with open(results_json_path) as f:
                json.load(f)
        except (FileNotFoundError, json.JSONDecodeError):
            return False
        return True

    success = True
    benchmark_names = [name for name in os.listdir(benchmark_context.benchmark_dir) if is_benchmark_executable(name)]
    print('The following benchmarks will be ran:\n{}'.format('\n'.join(benchmark_names)), file=sys.stderr, flush=True)
    for benchmark_name in benchmark_names:
        results_json_name = f'{benchmark_name}_{benchmark_context.gpu_architecture}.json'

        benchmark_path = os.path.join(benchmark_context.benchmark_dir, benchmark_name)
        results_json_path = os.path.join(benchmark_context.benchmark_output_dir, results_json_name)

        if should_skip(results_json_path):
            print(f'Skipping {benchmark_name}, because its results have already been gathered at {results_json_path}', file=sys.stderr, flush=True)
            continue

        args = [
            benchmark_path,
            f'--benchmark_out={results_json_path}',
            f'--benchmark_filter={benchmark_context.benchmark_filter_regex}'
        ]
        # The optional settings default to '', in which case the benchmark's own default is used
        if benchmark_context.size:
            args += ['--size', benchmark_context.size]
        if benchmark_context.trials:
            args += ['--trials', benchmark_context.trials]
        if benchmark_context.seed:
            args += ['--seed', benchmark_context.seed]
        try:
            subprocess.check_call(args)
        except (subprocess.CalledProcessError, OSError) as error:
            # OSError is raised when the executable cannot be started at all
            print(f'Could not run benchmark at {benchmark_path}. Error: "{error}"', file=sys.stderr, flush=True)
            success = False
    return success


def main():
    """Parses the command line and runs the benchmarks. Returns whether all of them succeeded."""
    parser = argparse.ArgumentParser()
    parser.add_argument('--benchmark_dir',
        help='The local directory that contains the benchmark executables',
        required=True)
    parser.add_argument('--benchmark_gpu_architecture',
        help='The architecture of the currently enabled GPU',
        required=True)
    parser.add_argument('--benchmark_output_dir',
        help='The directory to write the benchmarks to',
        required=True)
    parser.add_argument('--benchmark_filename_regex',
        help='Regular expression that controls the list of benchmark executables to run',
        default=r'^benchmark',
        required=False)
    parser.add_argument('--benchmark_filter_regex',
        help='Regular expression that controls the list of benchmarks to run in each benchmark executable',
        default='',
        required=False)
    parser.add_argument('--size',
        help='Controls the number of processed items in each benchmark',
        default='',
        required=False)
    parser.add_argument('--trials',
        help='Controls the number of trial iterations for each benchmark case',
        default='',
        required=False)
    parser.add_argument('--seed',
        help='Controls the seed for random number generation for each benchmark case',
        default='',
        required=False)
    parser.add_argument('--skip_gathered',
        help='Skip running benchmarks whose JSON data has already been gathered',
        default=False,
        action='store_true',
        required=False)

    args = parser.parse_args()

    benchmark_context = BenchmarkContext(
        gpu_architecture=args.benchmark_gpu_architecture,
        benchmark_output_dir=args.benchmark_output_dir,
        benchmark_dir=args.benchmark_dir,
        benchmark_filename_regex=args.benchmark_filename_regex,
        benchmark_filter_regex=args.benchmark_filter_regex,
        size=args.size,
        trials=args.trials,
        seed=args.seed,
        skip_gathered=args.skip_gathered)

    benchmark_run_successful = run_benchmarks(benchmark_context)

    return benchmark_run_successful


if __name__ == '__main__': success
= main() if success: exit(0) else: exit(1) rocPRIM-rocm-7.1.0/.readthedocs.yaml000066400000000000000000000005021506507210100171510ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt build: os: ubuntu-22.04 tools: python: "3.10" rocPRIM-rocm-7.1.0/CHANGELOG.md000066400000000000000000001157321506507210100155470ustar00rootroot00000000000000# Changelog for rocPRIM Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projects/rocPRIM/en/latest/](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/). ## rocPRIM 4.1.0 for ROCm 7.1 ### Added * Added `get_sreg_lanemask_lt`, `get_sreg_lanemask_le`, `get_sreg_lanemask_gt` and `get_sreg_lanemask_ge`. * Added `rocprim::transform_output_iterator` and `rocprim::make_transform_output_iterator`. * Added experimental support for SPIR-V, to use the correct tuned config for part of the applicable algorithms. * Added a new cmake option, `BUILD_OFFLOAD_COMPRESS`. When rocPRIM is built with this option enabled, the `--offload-compress` switch is passed to the compiler. This causes the compiler to compress the binary that it generates. Compression can be useful in cases where you are compiling for a large number of targets, since this often results in a large binary. Without compression, in some cases, the generated binary may become so large symbols are placed out of range, resulting in linking errors. The new `BUILD_OFFLOAD_COMPRESS` option is set to `ON` by default. * Added a new CMake option `-DUSE_SYSTEM_LIB` to allow tests to be built from `ROCm` libraries provided by the system. * Added `rocprim::apply` which applies a function to a `rocprim::tuple`. ### Changed * Changed tests to support `ptr-to-const` output in `/test/rocprim/test_device_batch_memcpy.cpp`.
### Optimizations * Improved performance of many algorithms, by updating their tuned configs. * 891 specializations have been improved. * 399 specializations have been added. ### Upcoming changes * Deprecated the `->` operator for the `zip_iterator`. ### Resolved issues * Fixed `device_select`, `device_merge`, and `device_merge_sort` not allocating the correct amount of virtual shared memory on the host. * Fixed the `->` operator for the `transform_iterator`, the `texture_cache_iterator` and the `arg_index_iterator`, by now returning a proxy pointer. * The `arg_index_iterator` also now only returns the internal iterator for the `->`. ## rocPRIM 4.0.1 for ROCm 7.0.2 ### Resolved issues * Fixed compilation issue when using `rocprim::texture_cache_iterator`. * Fixed a HIP version check used to determine whether hipStreamLegacy is supported. This resolves runtime errors that occur when hipStreamLegacy is used in versions of ROCm later than 6.4. ## rocPRIM 4.0.0 for ROCm 7.0 ### Added * Added `rocprim::accumulator_t` to ensure parity with CCCL. * Added a test for `rocprim::accumulator_t`. * Added `rocprim::invoke_result_r` to ensure parity with CCCL. * Added function `is_build_in` into `rocprim::traits::get`. * Added virtual shared memory as a fallback option in `rocprim::device_merge` when it exceeds shared memory capacity, similar to `rocprim::device_select`, `rocprim::device_partition`, and `rocprim::device_merge_sort`, which already include this feature. * Added initial value support to device level inclusive scans. * Added new optimization to the backend for `device_transform` when the input and output are pointers. * Added `LoadType` to `transform_config`, which is used for the `device_transform` when the input and output are pointers. * Added the `rocprim::device_transform` API for n-ary transform operations, which takes as input `n` iterators inside a `rocprim::tuple`. * Added gfx950 support. * Added `rocprim::key_value_pair::operator==`.
* Added the `rocprim::unrolled_copy` thread function to copy multiple items inside a thread. * Added the `rocprim::unrolled_thread_load` function to load multiple items inside a thread using `rocprim::thread_load`. * Added `rocprim::int128_t` and `rocprim::uint128_t` to benchmarks for improved performance evaluation on 128-bit integers. * Added `rocprim::int128_t` to the supported autotuning types to improve performance for 128-bit integers. * Added the `rocprim::merge_inplace` function for merging in-place. * Added initial value support for warp- and block-level inclusive scan. * Added support for building tests with device-side random data generation, making them finish faster. This requires rocRAND, and is enabled with the `WITH_ROCRAND=ON` build flag. * Added tests and documentation to `lookback_scan_state`. It is still in the `detail` namespace. ### Optimizations * Improved performance of `rocprim::device_select` and `rocprim::device_partition` when using multiple streams on the MI3XX architecture. ### Changed * Changed the parameters `long_radix_bits` and `LongRadixBits` from `segmented_radix_sort` to `radix_bits` and `RadixBits` respectively. * Marked the initialisation constructor of `rocprim::reverse_iterator` `explicit`; use `rocprim::make_reverse_iterator` instead. * Merged `radix_key_codec` into type_traits system. * Renamed `type_traits_interface.hpp` to `type_traits.hpp`, and renamed the original `type_traits.hpp` to `type_traits_functions.hpp`. * The default scan accumulator types for device-level scan algorithms have changed. This is a breaking change. The previous default accumulator types could lead to situations in which unexpected overflow occurred, such as when the input or initial type was smaller than the output type.
* This is a complete list of affected functions and how their default accumulator types are changing: * `rocprim::inclusive_scan` * Previous default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>` * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, typename std::iterator_traits<InputIterator>::value_type>` * `rocprim::deterministic_inclusive_scan` * Previous default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>` * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, typename std::iterator_traits<InputIterator>::value_type>` * `rocprim::exclusive_scan` * Previous default: `class AccType = detail::input_type_t<InitValueType>>` * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, detail::input_type_t<InitValueType>>` * `rocprim::deterministic_exclusive_scan` * Previous default: `class AccType = detail::input_type_t<InitValueType>>` * Current default: `class AccType = rocprim::accumulator_t<BinaryFunction, detail::input_type_t<InitValueType>>` * Undeprecated internal `detail::raw_storage`. * New versions of `rocprim::thread_load` and `rocprim::thread_store` replace the deprecated `rocprim::thread_load` and `rocprim::thread_store` functions. The new versions avoid inline assembly where possible, and don't hinder the optimizer as much as a result. * Renamed `rocprim::load_cs` to `rocprim::load_nontemporal` and `rocprim::store_cs` to `rocprim::store_nontemporal` to express the intent of these load and store methods better. * All kernels now have hidden symbol visibility. All symbols now have inline namespaces that include the library version, for example, `rocprim::ROCPRIM_300400_NS::symbol` instead of `rocprim::symbol`, letting the user link multiple libraries built with different versions of rocPRIM. ### Upcoming changes * `rocprim::invoke_result_binary_op` and `rocprim::invoke_result_binary_op_t` are deprecated. Use `rocprim::accumulator_t` now. ### Removed * Removed `rocprim::detail::float_bit_mask` and relative tests, use `rocprim::traits::float_bit_mask` instead. * Removed `rocprim::traits::is_fundamental`, please use `rocprim::traits::get<T>::is_fundamental()` directly.
* Removed the deprecated parameters `short_radix_bits` and `ShortRadixBits` from the `segmented_radix_sort` config. They were unused, it is only an API change. * Removed the deprecated `operator<<` from the iterators. * Removed the deprecated `TwiddleIn` and `TwiddleOut`. Use `radix_key_codec` instead. * Removed the deprecated flags API of `block_adjacent_difference`. Use `subtract_left()` or `block_discontinuity::flag_heads()` instead. * Removed the deprecated `to_exclusive` functions in the warp scans. * Removed the `rocprim::load_cs` from the `cache_load_modifier` enum. Use `rocprim::load_nontemporal` instead. * Removed the `rocprim::store_cs` from the `cache_store_modifier` enum. Use `rocprim::store_nontemporal` instead. * Removed the deprecated header file `rocprim/detail/match_result_type.hpp`. Include `rocprim/type_traits.hpp` instead. * This header included `rocprim::detail::invoke_result`. Use `rocprim::invoke_result` instead. * This header included `rocprim::detail::invoke_result_binary_op`. Use `rocprim::invoke_result_binary_op` instead. * This header included `rocprim::detail::match_result_type`. Use `rocprim::invoke_result_binary_op_t` instead. * Removed the deprecated `rocprim::detail::radix_key_codec` function. Use `rocprim::radix_key_codec` instead. * Removed `rocprim/detail/radix_sort.hpp`, functionality can now be found in `rocprim/thread/radix_key_codec.hpp`. * Removed C++14 support, only C++17 is supported. * Due to the removal of `__AMDGCN_WAVEFRONT_SIZE` in the compiler, the following deprecated warp size-related symbols have been removed: * `rocprim::device_warp_size()` * For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory. 
* For run-time constants, this is replaced with `rocprim::arch::wavefront::size().` * `rocprim::warp_size()` * Use `rocprim::host_warp_size()`, `rocprim::arch::wavefront::min_size()` or `rocprim::arch::wavefront::max_size()` instead. * `ROCPRIM_WAVEFRONT_SIZE` * Use `rocprim::arch::wavefront::min_size()` or `rocprim::arch::wavefront::max_size()` instead. * `__AMDGCN_WAVEFRONT_SIZE` * This was a fallback define for the compiler's removed symbol, having the same name. * This release removes support for custom builds on gfx940 and gfx941. ### Resolved issues * Fixed an issue where `device_batch_memcpy` reported benchmarking throughput being 2x lower than it was in reality. * Fixed an issue where `device_segmented_reduce` reported autotuning throughput being 5x lower than it was in reality. * Fixed device radix sort not returning the correct required temporary storage when a double buffer contains `nullptr`. * Fixed constness of equality operators (`==` and `!=`) in `rocprim::key_value_pair`. * Fixed an issue for the comparison operators in `arg_index_iterator` and `texture_cache_iterator`, where `<` and `>` comparators were swapped. * Fixed an issue for the `rocprim::thread_reduce` not working correctly with a prefix value. ### Known issues * When using `rocprim::deterministic_inclusive_scan_by_key` and `rocprim::deterministic_exclusive_scan_by_key` the intermediate values can change order on Navi3x * However if a commutative scan operator is used then the final scan value (output array) will still always be consistent between runs ## rocPRIM 3.4.1 for ROCm 6.4.2 ### Upcoming changes * Changes to the template parameters of warp and block algorithms will be made in an upcoming release. 
* Due to an upcoming compiler change the following warp size-related symbols will be removed in the next major release and are thus marked as deprecated: * `rocprim::device_warp_size()` * For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory. * For run-time constants, this is replaced with `rocprim::arch::wavefront::size()`. * `rocprim::warp_size()` * `ROCPRIM_WAVEFRONT_SIZE` * The default scan accumulator types for device-level scan algorithms will be changed in an upcoming release, resulting in a breaking change. Previously, the default accumulator type was set to the input type for the inclusive scans and to the initial value type for the exclusive scans. This could lead to unexpected overflow if the input or initial type was smaller than the output type when the accumulator type wasn't explicitly set using the `AccType` template parameter. The new default accumulator types will be set to the type that results when the input or initial value type is applied to the scan operator.
The following is the complete list of affected functions and how their default accumulator types are changing: * `rocprim::inclusive_scan` * current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>` * future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>` * `rocprim::deterministic_inclusive_scan` * current default: `class AccType = typename std::iterator_traits<InputIterator>::value_type>` * future default: `class AccType = rocprim::invoke_result_binary_op_t<typename std::iterator_traits<InputIterator>::value_type, BinaryFunction>` * `rocprim::exclusive_scan` * current default: `class AccType = detail::input_type_t<InitValueType>>` * future default: `class AccType = rocprim::invoke_result_binary_op_t<detail::input_type_t<InitValueType>, BinaryFunction>` * `rocprim::deterministic_exclusive_scan` * current default: `class AccType = detail::input_type_t<InitValueType>>` * future default: `class AccType = rocprim::invoke_result_binary_op_t<detail::input_type_t<InitValueType>, BinaryFunction>` * `rocprim::load_cs` and `rocprim::store_cs` are deprecated and will be removed in an upcoming release. Use `rocprim::load_nontemporal` and `rocprim::store_nontemporal` instead to load and store values in specific conditions (like bypassing the cache) for `rocprim::thread_load` and `rocprim::thread_store`. ## rocPRIM 3.4.0 for ROCm 6.4.0 ### Added * Added extended tests to `rtest.py`. These tests are extra tests that did not fit the criteria of smoke and regression tests. These tests will take much longer to run relative to smoke and regression tests. * Use `python rtest.py [--emulation|-e|--test|-t]=extended` to run these tests. * Added regression tests to `rtest.py`. Regression tests are a subset of tests that caused hardware problems for past emulation environments. * Can be run with `python rtest.py [--emulation|-e|--test|-t]=regression` * Added the parallel `find_first_of` device function with autotuned configurations, this function is similar to `std::find_first_of`, it searches for the first occurrence of any of the provided elements.
* Added the `--emulation` option to `rtest.py` * Unit tests can be run with `[--emulation|-e|--test|-t]=<test_name>` * Added tuned configurations for segmented radix sort for gfx942 to improve performance on this architecture. * Added a parallel device-level function, `rocprim::adjacent_find`, similar to the C++ Standard Library `std::adjacent_find` algorithm. * Added configuration autotuning to device adjacent find (`rocprim::adjacent_find`) for improved performance on selected architectures. * Added `rocprim::numeric_limits`, which is an extension of `std::numeric_limits` that includes support for 128-bit integers. * Added `rocprim::int128_t` and `rocprim::uint128_t`, which are the `__int128_t` and `__uint128_t` types. * Added the parallel `search` and `find_end` device functions similar to `std::search` and `std::find_end`, these functions search for the first and last occurrence of the sequence respectively. * Added a parallel device-level function, `rocprim::search_n`, similar to the C++ Standard Library `std::search_n` algorithm. * Added new constructors and a `base` function, and added `constexpr` specifier to all functions in `rocprim::reverse_iterator` to improve parity with the C++17 `std::reverse_iterator`. * Added hipGraph support to device run-length-encode for nontrivial runs (`rocprim::run_length_encode_non_trivial_runs`). * Added configuration autotuning to device run-length-encode for nontrivial runs (`rocprim::run_length_encode_non_trivial_runs`) for improved performance on selected architectures. * Added configuration autotuning to device run-length-encode for trivial runs (`rocprim::run_length_encode`) for improved performance on selected architectures. * Added a new type traits interface to enable users to provide additional type trait information to rocPRIM, facilitating better compatibility with custom types.
### Changed * Changed the subset of tests that are run for smoke tests such that the smoke test will complete with faster run-time and to never exceed 2GB of vram usage. Use `python rtest.py [--emulation|-e|--test|-t]=smoke` to run these tests. * The `rtest.py` options have changed. `rtest.py` is now run with at least either `--test|-t` or `--emulation|-e`, but not both options. * Changed the internal algorithm of block radix sort to use rank match to improve performance of various radix sort related algorithms. * Disabled padding in various cases where higher occupancy resulted in better performance despite more bank conflicts. * Removed HIP-CPU support. HIP-CPU support was experimental and broken. * Changed the C++ version from 14 to 17. C++14 will be deprecated in the next major release. * You can use CMake HIP language support with CMake 3.18 and later. To use HIP language support, run `cmake` with `-DUSE_HIPCXX=ON` instead of setting the `CXX` variable to the path to a HIP-aware compiler. ### Resolved issues * Fixed an issue where `rmake.py` would generate wrong CMAKE commands while using Linux environment * Fixed an issue where `rocprim::partial_sort_copy` would yield a compile error if the input iterator is const. * Fixed incorrect 128-bit signed and unsigned integers type traits. * Fixed compilation issue when `rocprim::radix_key_codec<...>` is specialized with a 128-bit integer. * Fixed the warp-level reduction `rocprim::warp_reduce.reduce` DPP implementation to avoid undefined intermediate values during the reduction. * Fixed an issue that caused a segmentation fault when `hipStreamLegacy` was passed to some API functions. ### Upcoming changes * Using the initialisation constructor of `rocprim::reverse_iterator` will throw a deprecation warning. It will be marked as explicit in the next major release. * Using the initialisation constructor of rocprim::reverse_iterator will throw a deprecation warning. 
It will be marked as explicit in the next major release. ## rocPRIM 3.3.0 for ROCm 6.3.0 ### Added * Changed the default value of `rmake.py -a` to `default_gpus`. This is equivalent to `gfx906:xnack-,gfx1030,gfx1100,gfx1101,gfx1102,gfx1151,gfx1200,gfx1201`. * The `--test smoke` option has been added to `rtest.py`. When `rtest.py` is called with this option it runs a subset of tests such that the total test time is 5 minutes. Use `python3 ./rtest.py --test smoke` or `python3 ./rtest.py -t smoke` to run the smoke test. * The `--seed` option has been added to `run_benchmarks.py`. The `--seed` option specifies a seed for the generation of random inputs. When the option is omitted, the default behavior is to use a random seed for each benchmark measurement. * Added configuration autotuning to device partition (`rocprim::partition`, `rocprim::partition_two_way`, and `rocprim::partition_three_way`), to device select (`rocprim::select`, `rocprim::unique`, and `rocprim::unique_by_key`), and to device reduce by key (`rocprim::reduce_by_key`) to improve performance on selected architectures. * Added `rocprim::uninitialized_array` to provide uninitialized storage in local memory for user-defined types. * Added large segment support for `rocprim::segmented_reduce`. * Added a parallel `nth_element` device function similar to `std::nth_element`. `nth_element` places elements that are smaller than the nth element before the nth element, and elements that are bigger than the nth element after the nth element. * Added deterministic (bitwise reproducible) algorithm variants `rocprim::deterministic_inclusive_scan`, `rocprim::deterministic_exclusive_scan`, `rocprim::deterministic_inclusive_scan_by_key`, `rocprim::deterministic_exclusive_scan_by_key`, and `rocprim::deterministic_reduce_by_key`. These provide run-to-run stable results with non-associative operators such as float operations, at the cost of reduced performance. 
* Added a parallel `partial_sort` and `partial_sort_copy` device functions similar to `std::partial_sort` and `std::partial_sort_copy`. `partial_sort` and `partial_sort_copy` arrange elements such that the elements are in the same order as a sorted list up to and including the middle index. ### Changed * Modified the input size in device adjacent difference benchmarks. Observed performance with these benchmarks might be different. * Changed the default seed for `device_benchmark_segmented_reduce`. * Changed `test_utils_hipgraphs.hpp` to be a class `GraphHelper` with internal graph and graph instances ### Removed * `rocprim::thread_load()` and `rocprim::thread_store()` have been deprecated. Use `dereference()` instead. ### Resolved issues * Fixed an issue in `rmake.py` where the list storing cmake options would contain individual characters instead of a full string of options. * Resolved an issue in `rtest.py` where it crashed if the `build` folder was created without `release` or `debug` subdirectories. * Resolved an issue with `rtest.py` on Windows where passing an absolute path to `--install_dir` caused a `FileNotFound` error. * rocPRIM functions are no longer forcefully inlined on Windows. This significantly reduces the build time of debug builds. * `block_load`, `block_store`, `block_shuffle`, `block_exchange`, and `warp_exchange` now use placement `new` instead of copy assignment (`operator=`) when writing to local memory. This fixes the behavior of custom types with non-trivial copy assignments. * Fixed a bug in the generation of input data for benchmarks, which caused incorrect performance to be reported in specific cases. It may affect the reported performance for one-byte types (`uint8_t` and `int8_t`) and instantiations of `custom_type`. Specifically, device binary search, device histogram, device merge and warp sort are affected. * Fixed a bug for `rocprim::merge_path_search` where using `unsigned` offsets would produce incorrect results. 
* Fixed a bug for `rocprim::thread_load` and `rocprim::thread_store` where `float` and `double` were not cast to the correct type, resulting in incorrect results. * Resolved an issue where tests were failing when they were compiled with `-D_GLIBCXX_ASSERTIONS=ON`. * Resolved an issue where algorithms that used an internal serial merge routine caused a memory access fault that resulted in potential performance drops when using block sort, device merge sort (block merge), device merge, device partial sort, and device sort (merge sort). * Fixed memory leaks in unit tests due to missing calls to `hipFree()` and the incorrect use of hipGraphs. * Fixed an issue where certain inputs to `block_sort_merge()`, `device_merge_sort_merge_path()`, `device_merge()`, and `warp_sort_stable()` caused an assertion error during the call to `serial_merge()`. ## rocPRIM 3.2.1 for ROCm 6.2.1 ### Optimizations * Improved performance of `block_reduce_warp_reduce` when warp size equals block size. ## rocPRIM-3.2.0 for ROCm 6.2.0 ### Additions * New overloads for `warp_scan::exclusive_scan` that take no initial value. These new overloads will write an unspecified result to the first value of each warp. * The internal accumulator type of `inclusive_scan(_by_key)` and `exclusive_scan(_by_key)` is now exposed as an optional type parameter. * The default accumulator type is still the value type of the input iterator (inclusive scan) or the initial value's type (exclusive scan). This is the same behaviour as before this change. * New overload for `device_adjacent_difference_inplace` that allows separate input and output iterators, but allows them to point to the same element. * New public API for deriving resulting type on device-only functions: * `rocprim::invoke_result` * `rocprim::invoke_result_t` * `rocprim::invoke_result_binary_op` * `rocprim::invoke_result_binary_op_t` * New `rocprim::batch_copy` function added. Similar to `rocprim::batch_memcpy`, but copies by element, not with memcpy. 
* Added more test cases, to better cover supported data types. * Updated some tests to work with supported data types. * An optional `decomposer` argument for all member functions of `rocprim::block_radix_sort` and all functions of `device_radix_sort`. To sort keys of an user-defined type, a decomposer functor should be passed. The decomposer should produce a `rocprim::tuple` of references to arithmetic types from the key. * New `rocprim::predicate_iterator` which acts as a proxy for an underlying iterator based on a predicate. It iterates over proxies that holds the references to the underlying values, but only allow reading and writing if the predicate is `true`. It can be instantiated with: * `rocprim::make_predicate_iterator` * `rocprim::make_mask_iterator` * Added custom radix sizes as the last parameter for `block_radix_sort`. The default value is 4, it can be a number between 0 and 32. * New `rocprim::radix_key_codec`, which allows the encoding/decoding of keys for radix-based sorts. For user-defined key types, a decomposer functor should be passed. ### Optimizations * Improved the performance of `warp_sort_shuffle` and `block_sort_bitonic`. * Created an optimized version of the `warp_exchange` functions `blocked_to_striped_shuffle` and `striped_to_blocked_shuffle` when the warpsize is equal to the items per thread. * Improved the performance of `device_transform`. ### Fixes * Fixed incorrect results of `warp_exchange::blocked_to_striped_shuffle` and `warp_exchange::striped_to_blocked_shuffle` when the block size is larger than the logical warp size. The test suite has been updated with such cases. * Fixed incorrect results returned when calling device `unique_by_key` with overlapping `values_input` and `values_output`. * Fixed incorrect output type used in `device_adjacent_difference`. 
* Hotfix for incorrect results on the GFX10 (Navi 10/RDNA1, Navi 20/RDNA2) ISA and GFX11 ISA (Navi 30 GPUs) on device scan algorithms `rocprim::inclusive_scan(_by_key)` and `rocprim::exclusive_scan(_by_key)` with large input types. * `device_adjacent_difference` now considers both the input and the output type for selecting the appropriate kernel launch config. Previously only the input type was considered, which could result in compilation errors due to excessive shared memory usage. * Fixed incorrect data being loaded with `rocprim::thread_load` when compiling with `-O0`. * Fixed a compilation failure in the host compiler when instantiating various block and device algorithms with block sizes not divisible by 64. ### Deprecations * The internal header `detail/match_result_type.hpp` has been deprecated. * `TwiddleIn` and `TwiddleOut` have been deprecated in favor of `radix_key_codec`. * The internal `::rocprim::detail::radix_key_codec` has been deprecated in favor of the new public utility with the same name. 
## rocPRIM-3.1.0 for ROCm 6.1.0 ### Additions * New primitive: `block_run_length_decode` * New primitive: `batch_memcpy` ### Changes * Renamed: * `scan_config_v2` to `scan_config` * `scan_by_key_config_v2` to `scan_by_key_config` * `radix_sort_config_v2` to `radix_sort_config` * `reduce_by_key_config_v2` to `reduce_by_key_config` * `radix_sort_config_v2` to `radix_sort_config` * Removed support for custom config types for device algorithms * `host_warp_size()` was moved into `rocprim/device/config_types.hpp`; it now uses either `device_id` or a `stream` parameter to query the proper device and a `device_id` out parameter * The return type is `hipError_t` * Added support for `__int128_t` in `device_radix_sort` and `block_radix_sort` * Improved the performance of `match_any`, and `block_histogram` which uses it ### Deprecations * Removed `reduce_by_key_config`, `MatchAny`, `scan_config`, `scan_by_key_config`, and `radix_sort_config` ### Fixes * Build issues with `rmake.py` on Windows when using VS 2017 15.8 or later (due to a breaking fix with extended aligned storage) * Fix tests for `block_histogram`, `block_exchange`, `device_histogram` and `device_reduce_by_key` for various types ### Known Issues * `device_run_length_encode`, `warp_exchange` and `warp_load` tests fail with `rocprim::half` ## rocPRIM-3.0.0 for ROCm 6.0.0 ### Additions - `block_sort::sort()` overload for keys and values with a dynamic size, for all block sort algorithms. Additionally, all `block_sort::sort()` overloads with a dynamic size are now supported for `block_sort_algorithm::merge_sort` and `block_sort_algorithm::bitonic_sort`. - New two-way partition primitive `partition_two_way` which can write to two separate iterators. ### Optimizations - Improved the performance of `partition`. ### Fixes - Fixed `rocprim::MatchAny` for devices with 64-bit warp size. The function `rocprim::MatchAny` is deprecated and `rocprim::match_any` is preferred instead. 
## rocPRIM-2.13.1 for ROCm 5.7.0 ### Changes - Deprecated configuration `radix_sort_config` for device-level radix sort as it no longer matches the algorithm's parameters. New configuration `radix_sort_config_v2` is preferred instead. - Removed erroneous implementation of device-level `inclusive_scan` and `exclusive_scan`. The prior default implementation using lookback-scan now is the only available implementation. - The benchmark metric indicating the bytes processed for `exclusive_scan_by_key` and `inclusive_scan_by_key` has been changed to incorporate the key type. Furthermore, the benchmark log has been changed such that these algorithms are reported as `scan` and `scan_by_key` instead of `scan_exclusive` and `scan_inclusive`. - Deprecated configurations `scan_config` and `scan_by_key_config` for device-level scans, as they no longer match the algorithm's parameters. New configurations `scan_config_v2` and `scan_by_key_config_v2` are preferred instead. ### Fixes - Fixed build issue caused by missing header in `thread/thread_search.hpp`. 
## rocPRIM-2.13.0 for ROCm 5.5.0 ### Additions * New block level `radix_rank` primitive * New block level `radix_rank_match` primitive * Added a stable block sorting implementation, which can be used with `block_sort` by adding the `block_sort_algorithm::stable_merge_sort` algorithm ### Changes * Improved the performance of: * `block_radix_sort` * `device_radix_sort` * `device_merge_sort` * Updated the `docs` directory structure to match the standard of [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core) ### Known Issues * Disabled GPU error messages relating to incorrect warp operation usage with Navi GPUs on Windows (due to GPU `printf` performance issues on Windows) * When `ROCPRIM_DISABLE_LOOKBACK_SCAN` is set, `device_scan` fails for input sizes larger than `scan_config::size_limit`, which defaults to `std::numeric_limits::max()` ## rocPRIM-2.12.0 for ROCm 5.4.0 ### Changes * `device_partition`, `device_unique`, and `device_reduce_by_key` now support problem sizes larger than 2^32 items * Device algorithms now return `hipErrorInvalidValue` if the amount of passed temporary memory is insufficient * Lists of sizes for tests are unified, restored scan and reduce tests for `half` and `bfloat16` values ### Removals * `block_sort::sort()` overload for keys and values with a dynamic size * This overload was documented but the implementation is missing; to avoid further confusion, the documentation is removed until a decision is made on implementing the function ## rocPRIM-2.11.1 for ROCm 5.3.3 ### Fixes * Fixed the compilation failure in `device_merge` when the two key iterators don't match ## rocPRIM-2.11.0 for ROCm 5.3.2 ### Known Issues * `device_merge` doesn't correctly support different types for `keys_input1` and `keys_input2` (as of the 5.3.0 release) ## rocPRIM-2.11.0 for ROCm 5.3.0 ### Additions * New functions `subtract_left` and `subtract_right` in `block_adjacent_difference` to apply functions on pairs of adjacent items distributed 
between threads in a block * New device-level `adjacent_difference` primitives * Experimental tooling for automatic kernel configuration tuning for various architectures * Benchmarks collect and output more detailed system information * CMake functionality improves build parallelism of the test suite that splits compilation units by function or by parameters * Reverse iterator * Support for problem sizes over `UINT_MAX` in device functions `inclusive_scan_by_key` and `exclusive_scan_by_key` ### Changes * Improved the performance of warp primitives using the swizzle operation on Navi * Improved build parallelism of the test suite by splitting up large compilation units * `device_select` now supports problem sizes larger than 2^32 items * `device_segmented_radix_sort` now partitions segments to group small, medium, and large segments * Each segment group can be sorted by specialized kernels to improve throughput * Improved histogram performance for the case of highly uneven sample distribution ## rocPRIM-2.10.14 for ROCm 5.2.0 ### Additions * Packages for tests and benchmark executables on all supported operating systems using CPack * Added file and folder reorganization changes with backward compatibility support using wrapper headers ## rocPRIM-2.10.13 for ROCm 5.1.0 ### Fixes * Fixed Radix Sort `int64_t` bug introduced in version 2.10.11 ### Additions * Future value * Device `partition_three_way` to partition input to three output iterators based on two predicates ### Changes * The reduce/scan algorithm precision issues in the tests have been resolved for half types * The device Radix Sort algorithm supports indexing with 64-bit unsigned integers * The indexer type is chosen based on the type argument of parameter `size` * If `sizeof(size)` is not larger than 4 bytes, the indexer type is 32-bit unsigned int, otherwise, the indexer type is 64-bit unsigned int * The maximum problem size is based on the compile time configuration of the algorithm according to the 
following formula: * `max_problem_size = (UINT_MAX + 1) * config::scan::block_size * config::scan::items_per_thread` ### Deprecations * Flags API of `block_adjacent_difference` ### Known issues * `device_segmented_radix_sort` unit test is failing for HIP on Windows ## rocPRIM-2.10.12 for ROCm 5.0.0 ### Fixes * Enable bfloat16 tests and reduce threshold for bfloat16 * Fix device scan `limit_size` feature * Non-optimized builds no longer trigger local memory limit errors ### Additions * Scan size limit feature * Reduce size limit feature * Transform size limit feature * `block_load_striped` and `block_store_striped` * `gather_to_blocked` to gather values from other threads into a blocked arrangement * The block sizes for device merge sort's initial block sort and its merge steps are now separate in its kernel config * The block sort step supports multiple items per thread ### Changes * You can now set the `size_limit` for scan, reduce, and transform in the config struct instead of using a parameter * `device_scan` and `device_segmented_scan`: `inclusive_scan` now uses the `input-type` as `accumulator-type`; `exclusive_scan` uses `initial-value-type` * This changes the behavior of small-size input types with large-size output types (e.g., `short` input, `int` output) and low-res input with high-res output (e.g., `float` input, `double` output) * Revert an old Fiji workaround because they solved the issue at the compiler side * Update README CMake minimum version number * Added block sort support for multiple items per thread * Currently only powers of two block sizes, and items per threads are supported and only for full blocks * Bumped the minimum required version of CMake to 3.16 ### Known issues * `device_segmented_radix_sort` and `device_scan` unit tests failing for HIP on Windows * `ReduceEmptyInput` causes random failure with bfloat16 ## rocPRIM-2.10.11 for ROCm 4.5.0 ### Additions * Initial HIP on Windows support * bfloat16 support added ### Changes * Packaging 
has been split into a runtime package (`rocprim`) and a development package (`rocprim-devel`): The development package depends on the runtime package. When installing the runtime package, the package manager will suggest the installation of the development package to aid users transitioning from the previous version's combined package. This suggestion by package manager is for all supported operating systems (except CentOS 7) to aid in the transition. The `suggestion` feature in the runtime package is introduced as a deprecated feature and will be removed in a future ROCm release. * Because rocPRIM is a header-only library, the runtime package is an empty placeholder used to aid in the transition. This package is also a deprecated feature and will be removed in a future ROCm release. ### Known issues * Unit tests may soft hang on MI200 when running in `hipMallocManaged` mode ## rocPRIM-2.10.11 for ROCm 4.4.0 ### Additions * Code coverage tools build option * AddressSanitizer build option * gfx1030 support added * Experimental [HIP-CPU](https://github.com/ROCm-Developer-Tools/HIP-CPU) support; build using GCC/Clang/MSVC on Windows and Linux (this is work in progress and many algorithms are known to fail) ### Optimizations * Added single tile Radix Sort for smaller sizes * Improved performance for Radix Sort for larger element sizes ## rocPRIM-2.10.10 for ROCm 4.3.0 ### Fixes * Bug fix and minor performance improvement for `merge_sort` when input and output storage are the same ### Additions * gfx90a support added ### Deprecations * `warp_size()` function; use `host_warp_size()` and `device_warp_size()` for host and device references, respectively ## rocPRIM-2.10.9 for ROCm 4.2.0 ### Fixes * Size zero inputs are now properly handled with newer ROCm builds that no longer allow zero-size kernel grid and block dimensions ### Changes * Minimum CMake version required is now 3.10.2 ### Known issues * Device scan unit test is currently failing due to an LLVM bug ## 
rocPRIM-2.10.8 for ROCm 4.1.0 ### Fixes * Texture cache iteration support has been re-enabled * Benchmark builds have been re-enabled * Unique operator is no longer called on invalid elements ### Known issues * Device scan unit test is currently failing because of an LLVM bug ## rocPRIM-2.10.7 for ROCm 4.0.0 * No new features ## rocPRIM-2.10.6 for ROCm 3.10 ### Optimizations * Updates to DPP instructions for warp shuffle ### Known issues * Benchmark builds are disabled due to compiler bug ## rocPRIM-2.10.5 for ROCm 3.9.0 ### Additions * HIP CMake dependency ### Optimizations * Updates to warp shuffle for gfx10 * Disabled DPP functions on gfx10++ ### Known issues * Benchmark builds are disabled due to compiler bug ## rocPRIM-2.10.4 for ROCm 3.8.0 ### Fixes * Fix for rocPRIM texture cache iterator ## rocPRIM-2.10.3 for ROCm 3.7.0 ### Fixes * Package dependency corrected to `hip-rocclr` ### Known issues * rocPRIM texture cache iterator functionality is broken in the runtime (this will be fixed in the next release); you can use the prior release if calling this function ## rocPRIM-2.10.2 for ROCm 3.6.0 * No new features ## rocPRIM-2.10.1 for ROCm 3.5.1 ### Fixes * Point release with compilation fix ## rocPRIM-2.10.1 for ROCm 3.5.0 ### Additions * Improved tests with fixed and random seeds for test data * Network interface improvements with API v3 ### Changes * Switched to HIP-Clang as the default compiler * CMake searches for rocPRIM locally first; if it's not found, CMake downloads it from GitHub rocPRIM-rocm-7.1.0/CMakeLists.txt000066400000000000000000000254511506507210100164740ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
cmake_minimum_required(VERSION 3.16 FATAL_ERROR)
cmake_policy(VERSION 3.16...3.25)

# Install prefix
set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories")

# rocPRIM project
project(rocprim LANGUAGES CXX)

# Set CXX flags
# A user-provided CMAKE_CXX_STANDARD is respected here, but anything other
# than 17 is rejected by the check further below.
if (NOT DEFINED CMAKE_CXX_STANDARD)
  set(CMAKE_CXX_STANDARD 17)
endif()
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Set HIP flags
# Keep the HIP language standard in step with the only supported C++ standard
# so that sources compiled through CMake's HIP language support (USE_HIPCXX)
# see the same language level as sources compiled as CXX.
set(CMAKE_HIP_STANDARD 17)
set(CMAKE_HIP_STANDARD_REQUIRED ON)
set(CMAKE_HIP_EXTENSIONS OFF)

if(NOT CMAKE_CXX_STANDARD EQUAL 17)
  message(FATAL_ERROR "Only C++17 is supported")
endif()

# TRUE when rocPRIM is the root project, FALSE when it is pulled in by a
# parent project (e.g. via add_subdirectory).
if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_SOURCE_DIR)
  set(ROCPRIM_PROJECT_IS_TOP_LEVEL TRUE)
else()
  set(ROCPRIM_PROJECT_IS_TOP_LEVEL FALSE)
endif()

#Adding CMAKE_PREFIX_PATH
# Default ROCm root: the HIP_PATH environment variable on Windows,
# /opt/rocm everywhere else. Both are cache entries the user can override.
if(WIN32)
  set(ROCM_ROOT "$ENV{HIP_PATH}" CACHE PATH "Root directory of the ROCm installation")
else()
  set(ROCM_ROOT "/opt/rocm" CACHE PATH "Root directory of the ROCm installation")
endif()

include(CheckLanguage)
include(CMakeDependentOption)

# Build options
# Disables building tests, benchmarks, examples
option(ONLY_INSTALL "Only install" OFF)
# The dependent options below are forced OFF whenever ONLY_INSTALL is ON.
cmake_dependent_option(BUILD_TEST "Build tests (requires googletest)" OFF "NOT ONLY_INSTALL" OFF)
option(WITH_ROCRAND "Build tests with device-side data generation(requires rocRAND)" OFF)
cmake_dependent_option(BUILD_BENCHMARK "Build benchmarks" OFF "NOT ONLY_INSTALL" OFF)
cmake_dependent_option(BUILD_EXAMPLE "Build examples" OFF "NOT ONLY_INSTALL" OFF)
option(BUILD_NAIVE_BENCHMARK "Build naive benchmarks" OFF)
cmake_dependent_option(BUILD_DOCS "Build documentation (requires sphinx)" OFF "NOT ONLY_INSTALL" OFF)
option(BUILD_CODE_COVERAGE "Build with code coverage enabled" OFF)
option(ROCPRIM_INSTALL "Enable installation of rocPRIM (projects embedding rocPRIM may want to turn this OFF)" ON)
option(ROCPRIM_ENABLE_ASSERTS "Enable asserts in release build" OFF)
option(BUILD_OFFLOAD_COMPRESS "Build rocPRIM with offload compression" ON)
# USE_SYSTEM_LIB is only selectable when tests are built; otherwise it is OFF.
cmake_dependent_option(USE_SYSTEM_LIB "Use installed ROCm libs when building tests" OFF BUILD_TEST OFF)

# Probe for a HIP compiler; USE_HIPCXX is only selectable when one was found.
check_language(HIP)
cmake_dependent_option(USE_HIPCXX "Use CMake HIP language support" OFF CMAKE_HIP_COMPILER OFF)

# Append --offload-compress to the C++ flags when requested and supported.
include(CheckCXXCompilerFlag)
if(BUILD_OFFLOAD_COMPRESS)
  check_cxx_compiler_flag("--offload-compress" CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS)
  if(CXX_COMPILER_SUPPORTS_OFFLOAD_COMPRESS)
    set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} --offload-compress")
  else()
    message(STATUS "Warning: BUILD_OFFLOAD_COMPRESS=ON but flag not supported by compiler. Ignoring option.")
  endif()
endif()

# ROCPRIM_PROJECT_IS_TOP_LEVEL and ROCM_ROOT are already set earlier in this
# file (before the build options), so they are not recomputed here.

# CMake modules
list(APPEND CMAKE_MODULE_PATH
  ${CMAKE_CURRENT_SOURCE_DIR}/cmake
  ${ROCM_PATH}/lib/cmake/hip
  ${HIP_PATH}/cmake
  ${ROCM_ROOT}/lib/cmake/hip
  ${ROCM_ROOT}/hip/cmake # FindHIP.cmake
)

# Set a default build type if none was specified
if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
  message(STATUS "Setting build type to 'Release' as none was specified.")
  set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." FORCE)
  set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo")
endif()

# Strip -DNDEBUG from the global and per-build-type C/C++ flags so that
# assert() stays active even in optimized builds.
if(ROCPRIM_ENABLE_ASSERTS)
  if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "")
    string(TOUPPER ${CMAKE_BUILD_TYPE} BUILD_TYPE)
    set(BUILD_TYPE_CXX_FLAGS "CMAKE_CXX_FLAGS_${BUILD_TYPE}")
    set(BUILD_TYPE_C_FLAGS "CMAKE_C_FLAGS_${BUILD_TYPE}")
  endif()

  string(REGEX REPLACE "-DNDEBUG( |$)" "" CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS}")
  string(REGEX REPLACE "-DNDEBUG( |$)" "" CMAKE_C_FLAGS "${CMAKE_C_FLAGS}")
  if(NOT "${CMAKE_BUILD_TYPE}" STREQUAL "")
    # BUILD_TYPE_*_FLAGS hold variable *names*, hence the double dereference.
    string(REGEX REPLACE "-DNDEBUG( |$)" "" ${BUILD_TYPE_CXX_FLAGS} "${${BUILD_TYPE_CXX_FLAGS}}")
    string(REGEX REPLACE "-DNDEBUG( |$)" "" ${BUILD_TYPE_C_FLAGS} "${${BUILD_TYPE_C_FLAGS}}")
  endif()
endif()

set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath")

# Remember the requested BUILD_SHARED_LIBS for packaging (restored below),
# then force static builds while dependencies are configured.
if(DEFINED BUILD_SHARED_LIBS)
  set(PKG_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
else()
  set(PKG_BUILD_SHARED_LIBS ON)
endif()
set(BUILD_SHARED_LIBS OFF) # don't build client dependencies as shared

# Get dependencies (required here to get rocm-cmake)
include(cmake/Dependencies.cmake)

# Use target ID syntax if supported for GPU_TARGETS
if(USE_HIPCXX)
  enable_language(HIP)
else()
  # AMDGPU_TARGETS, when defined, seeds GPU_TARGETS; otherwise default to "all".
  if (NOT DEFINED AMDGPU_TARGETS)
    set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for")
  else()
    set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for")
  endif()
  set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all")

  # Expand "all" into the concrete default target list.
  if(GPU_TARGETS STREQUAL "all")
    if(BUILD_ADDRESS_SANITIZER)
      # ASAN builds require xnack
      rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
        TARGETS "gfx908:xnack+;gfx90a:xnack+;gfx942:xnack+;gfx950:xnack+"
      )
    else()
      rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS
        TARGETS "gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx942;gfx950;gfx1030;gfx1100;gfx1101;gfx1102;gfx1151;gfx1200;gfx1201"
      )
    endif()
    set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE)
  endif()
endif()

# Compressed offload binaries are currently not working with the SPIR-V target
if("amdgcnspirv" IN_LIST GPU_TARGETS)
  if(BUILD_OFFLOAD_COMPRESS)
    message(FATAL_ERROR "Cannot combine SPIR-V and BUILD_OFFLOAD_COMPRESS")
  endif()
endif()

# TODO: Fix VerifyCompiler for HIP on Windows
if (NOT WIN32)
  include(cmake/VerifyCompiler.cmake)
endif()
list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/llvm ${ROCM_ROOT}/llvm ${ROCM_ROOT} ${ROCM_ROOT}/hip)
find_package(hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm)

if(BUILD_CODE_COVERAGE)
  add_compile_options(-fprofile-arcs -ftest-coverage)
  add_link_options(--coverage)
endif()

# Setup VERSION
set(VERSION_STRING "4.1.0")
rocm_setup_version(VERSION ${VERSION_STRING})
# Single integer encoding of the version: major * 100000 + minor * 100 + patch.
math(EXPR rocprim_VERSION_NUMBER "${rocprim_VERSION_MAJOR} * 100000 + ${rocprim_VERSION_MINOR} * 100 + ${rocprim_VERSION_PATCH}")

# Print configuration summary
include(cmake/Summary.cmake)
print_configuration_summary()

# rocPRIM library
add_subdirectory(rocprim)

if(ROCPRIM_PROJECT_IS_TOP_LEVEL AND (BUILD_TEST OR BUILD_BENCHMARK))
  rocm_package_setup_component(clients)
endif()

# Tests
if(BUILD_TEST)
  if(USE_SYSTEM_LIB)
    find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim")
    if (NOT ${rocprim_VERSION} VERSION_EQUAL ${VERSION_STRING})
      message(WARNING "The installed rocprim version, ${rocprim_VERSION}, does not match project version ${VERSION_STRING}. Building tests with USE_SYSTEM_LIB=ON may not work properly.")
    endif()
  endif()
  if (ROCPRIM_PROJECT_IS_TOP_LEVEL)
    rocm_package_setup_client_component(tests)
  endif()
  enable_testing()
  add_subdirectory(test)
endif()

# Benchmarks
if(BUILD_BENCHMARK)
  if (ROCPRIM_PROJECT_IS_TOP_LEVEL)
    rocm_package_setup_client_component(benchmarks)
  endif()
  add_subdirectory(benchmark)
endif()

# Examples
if(BUILD_EXAMPLE)
  add_subdirectory(example)
endif()

# Docs
if(BUILD_DOCS)
  add_subdirectory(docs)
endif()

# set BUILD_SHARED_LIBS for packaging
set(BUILD_SHARED_LIBS ${PKG_BUILD_SHARED_LIBS})

# Package
if (ROCPRIM_PROJECT_IS_TOP_LEVEL)
  # add dependency on HIP runtime
  set(HIP_RUNTIME_MINIMUM 4.5.0)
  if(BUILD_ADDRESS_SANITIZER)
    set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" )
  else()
    set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
  endif()
  rocm_package_add_dependencies(SHARED_DEPENDS "${DEPENDS_HIP_RUNTIME} >= ${HIP_RUNTIME_MINIMUM}")
  rocm_package_add_deb_dependencies(STATIC_DEPENDS "hip-static-dev >= ${HIP_RUNTIME_MINIMUM}")
  rocm_package_add_rpm_dependencies(STATIC_DEPENDS "hip-static-devel >= ${HIP_RUNTIME_MINIMUM}")

  set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt")
  set(CPACK_RPM_PACKAGE_LICENSE "MIT")

  # The escaped \${...} defers expansion of the install prefix to CPack time.
  set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" )

  # NOTE(review): the MAINTAINER string ends in a trailing space; a contact
  # address appears to be missing here — confirm against the repository.
  rocm_create_package(
    NAME rocprim
    DESCRIPTION "rocPRIM is a header-only library that provides HIP parallel primitives."
    MAINTAINER "rocPRIM Maintainer "
    HEADER_ONLY
  )
endif()

#
# ADDITIONAL TARGETS FOR CODE COVERAGE
#
if(BUILD_CODE_COVERAGE)
  #
  # > make coverage_cleanup (clean coverage related files.)
  # > # run your tests
  # > make coverage (generate html documentation)
  #

  #
  # Prepare coverage output
  # This little script is generated because the option '--gcov-tool ' of lcov cannot take arguments.
  #
  add_custom_target(coverage
    DEPENDS rocprim
    COMMAND mkdir -p lcoverage
    COMMAND echo "\\#!/bin/bash" > llvm-gcov.sh
    COMMAND echo "\\# THIS FILE HAS BEEN GENERATED" >> llvm-gcov.sh
    COMMAND printf "exec /opt/rocm/llvm/bin/llvm-cov gcov $$\\@" >> llvm-gcov.sh
    COMMAND chmod +x llvm-gcov.sh
  )

  #
  # Generate coverage output.
  #
  add_custom_command(TARGET coverage
    COMMAND lcov --directory . --base-directory . --gcov-tool ${CMAKE_BINARY_DIR}/llvm-gcov.sh --capture -o lcoverage/raw_main_coverage.info
    COMMAND lcov --remove lcoverage/raw_main_coverage.info "'/opt/*'" "'/usr/*'" -o lcoverage/main_coverage.info
    COMMAND genhtml lcoverage/main_coverage.info --output-directory lcoverage
  )

  #
  # Coverage cleanup
  #
  add_custom_target(coverage_cleanup
    COMMAND find ${CMAKE_BINARY_DIR} -name *.gcda -delete
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
  )
endif()
rocPRIM-rocm-7.1.0/CONTRIBUTING.md000066400000000000000000000435311506507210100161640ustar00rootroot00000000000000 # Contributing to rocPRIM # We welcome contributions to rocPRIM. Please follow these details to help ensure your contributions will be successfully accepted. ## Issue Discussion ## Please use the GitHub Issues tab to notify us of issues. * Use your best judgement for issue creation. If your issue is already listed, upvote the issue and comment or post to provide additional details, such as how you reproduced this issue. * If you're not sure if your issue is the same, err on the side of caution and file your issue. You can add a comment to include the issue number (and link) for the similar issue. If we evaluate your issue as being the same as the existing issue, we'll close the duplicate. * If your issue doesn't exist, use the issue template to file a new issue. * When filing an issue, be sure to provide as much information as possible, including script output so we can collect information about your configuration. This helps reduce the time required to reproduce your issue. 
* Check your issue regularly, as we may require additional information to successfully reproduce the issue. * You may also open an issue to ask questions to the maintainers about whether a proposed change meets the acceptance criteria, or to discuss an idea pertaining to the library. ## Acceptance Criteria ## rocPRIM provides a number of foundational parallel algorithms that are optimized for AMD ROCm platforms. The purpose of the library is to provide a reliable, performant foundation upon which other libraries can be built. The library is written in HIP, targeting AMD's ROCm platform. Correctness and performance are both important goals in rocPRIM. Because of this, new changes should include both **test** and **benchmark** coverage. Tests and benchmarks should be broad enough to ensure that code runs correctly and performs well across a variety of input types and sizes. More specifically: - Tests must cover all the functionality added to the public API. - Tests must cover the whole range of supported sizes, not by testing every single possible size but rather using representative sizes that ensure that the algorithms run successfully with any size from the range. - On this note, it also needs to be taken into account that some algorithms have support for large indices (indices that cannot be stored in a 32-bit integer), so input sizes should also cover that case. - Tests and benchmarks must be instantiated with all supported data types. - If the algorithm uses multiple data types (for instance, if it uses different types for input and output), a selected and representative few combinations should be tested instead of the full combination matrix. Any utility needed by the tests **and** benchmarks must be added to the appropriate header within the `common` folder. Non-common utilities may be hosted in the corresponding headers from the `test` or `benchmark` folders.
For a more detailed description of the cases to be considered for adding new utilities, please check [common](/common/README.md). We also employ automated testing and benchmarking via checks that are run when a pull request is created. These checks: - test all algorithms for correctness across a variety of input configurations (eg. types, sizes, etc.) - run benchmarks to check for performance degradation - test the change on various OS platforms (Ubuntu, RHEL, etc.) - build and run the code on different GPU architectures (MI-series, Radeon series cards, etc.) ## Code Structure ## rocPRIM is a header-only library. Library code is located inside of `rocprim/include/rocprim/`, and within the `rocprim` namespace. Note that all the symbols inside the `rocprim::detail` namespace are not part of the public API. Algorithms are grouped by the level-of-scope at which they operate. The following subdirectories organize them by hardware-level scope: * `device/`: contains headers for device-level algorithms, which are to be called from host code. * `block/`: contains headers for block-level algorithms, only callable from device code. * `warp/`: contains headers for warp/wavefront-level algorithms, only callable from device code. * `thread/`: contains headers for thread-level algorithms, only callable from device code. Supporting code is distributed into several subdirectories depending on its scope: * `detail/`: utility functions and structs for the internal state of algorithms. * `detail/config/`: configs for tuned algorithms (see [tuning](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/concepts/tuning.html)). * `intrinsics/`: specialized intrinsic functions (eg. atomics, warp-shuffling, bit manipulation, etc.). Some of them are just wrappers around HIP's intrinsics, atomic/warp-shuffle functions or compiler's intrinsics. 
* `iterator/`: iterators that are used to interact with most algorithms in the library (like `constant_iterator` for iterating over a homogeneous range of values or `transform_iterator` for applying a transformation to a given range of values). * `types/`: a number of convenient types used in the library (eg. for storing future values, compile-time integer sequences, etc.). Correctness code (tests) is located inside the `test` folder. Several test suites exist depending on what they assess: * `extra`: test suite that should be run after rocPRIM is installed from package or from source. It is a short smoke test to verify the correctness of the installation or packaging process. * `hip`: test suite that checks HIP functionality that is of particular interest to rocPRIM. * `hipgraph`: test suite for verifying that rocPRIM's algorithms work with `hipGraph`. * `rocprim`: test suite for checking the correctness of rocPRIM's algorithms. Finally, performance code (benchmarks) is located inside the `benchmark` folder. Tuned algorithms use three files: * `benchmark/benchmark_.cpp` * `benchmark/benchmark_.parallel.cpp.in` * `benchmark/benchmark_.parallel.hpp` while non-tuned algorithms have only one `benchmark/benchmark_.cpp` file. ## Coding Style ## C and C++ code should be formatted using `clang-format`. Use the clang-format version shipped with ROCm, which is available in the `/opt/rocm` directory. Please do not use your system's built-in `clang-format`, as this is an older version that will have different results. The check_format script (`scripts/code-format/check-format.sh`) allows to check for formatting violations. These can be easily fixed as described below. 
To format a file, use: ```bash /opt/rocm/llvm/bin/clang-format -style=file -i <path-to-source-file> ``` To format all modified (staged) files, use the following command inside the root directory of rocPRIM: ```bash /opt/rocm/llvm/bin/git-clang-format --style=file --binary /opt/rocm/llvm/bin/clang-format ``` Format modifications will stay unstaged, so that they can be reviewed before committing. The formatting can also be done on a per-commit basis, by running: ```bash /opt/rocm/llvm/bin/git-clang-format --style=file --binary /opt/rocm/llvm/bin/clang-format <commit> ``` or installing githooks: ```bash ./.githooks/install ``` The githooks installed will both format the code and update the copyright dates (see [deliverables](#deliverables)). Additionally, some code editors (such as Visual Studio Code, CLion, XCode, Eclipse, Vim, etc.) have clang-format plugins available, so that formatting can be done from the editor instead of from the command line. This is especially useful for formatting while coding. ### Namespaces ### As mentioned in [Code Structure](#code-structure), rocPRIM's symbols are exposed within the `rocprim` namespace, with the exception of the ones intended for internal use which are inside `rocprim::detail`. This is done so that users can place rocPRIM in a different namespace (keeping `rocprim` as the innermost namespace) to prevent a namespace collision when two independent rocPRIM libraries end up in the same compute unit through, for instance, indirect inclusion. Therefore, files from `rocprim/include/rocprim` containing any rocPRIM symbol should start with `BEGIN_ROCPRIM_NAMESPACE` and end with `END_ROCPRIM_NAMESPACE`. These are macros that wrap the namespace opening and closing, respectively. Implementation details are put into the `rocprim::detail` namespace. No wrapping macros are defined for this one, so just the usual ```c++ namespace detail { ... } ``` should be used when needed.
## Documenting Style ## Apart from the usual comments to ease understanding of the code, Sphinx and Doxygen are used to document the functionality available from rocPRIM. The Sphinx docs for the API are organized mostly following the code structure. The folders `<level>_ops` (block_ops, device_ops, etc.) contain the documentation files for methods operating in the corresponding hardware levels. The documentation for supporting code is placed in separate files, located inside `docs/reference`. To connect Sphinx with Doxygen, Breathe is used. There is a Doxygen group defined for each folder under `rocprim/include/rocprim/` which has documented functionality named as `<folder>module` (for example, `threadmodule` for members of `rocprim/include/rocprim/thread` or `intrinsicsmodule` for members of `rocprim/include/rocprim/intrinsics`). Placing the contents of a file inside the corresponding Doxygen group guarantees that Sphinx will get access to the documentation inside that file. Only members of the public API need to be documented with these two tools, that is, the ones outside the `rocprim::detail` namespace, as all symbols inside said namespace are excluded from the documentation. If some member does not need documentation (such as a specialization of a class that doesn't need any extra clarifications) it can be left out of Doxygen docs by encapsulating the code as shown below: ```c++ /// \cond // code without doxygen documentation here /// \endcond ``` This isn't always possible (for instance, when base classes need to be excluded), so a pre-processor approach is also available: ```c++ #ifndef DOXYGEN_DOCUMENTATION_BUILD // code without doxygen documentation here #endif // DOXYGEN_DOCUMENTATION_BUILD ``` Some files also use the following structure: ```c++ #ifndef DOXYGEN_SHOULD_SKIP_THIS // code without doxygen documentation here #endif // DOXYGEN_SHOULD_SKIP_THIS ``` New code should prefer `DOXYGEN_DOCUMENTATION_BUILD` over `DOXYGEN_SHOULD_SKIP_THIS`, as it's easier to understand.
`DOXYGEN_SHOULD_SKIP_THIS` is defined to be 1 when Doxygen is parsing, logically making its correct usage a double-negation. In general terms, a file properly documented should look like something along the lines of: ```c++ /// \addtogroup /// @{ BEGIN_ROCPRIM_NAMESPACE namespace detail { // here just add comments if needed ... } // end namespace detail /// \brief Some public class. /// /// Here some more info can be added to the brief description. /// \tparam A Template type used by the class. /// \tparam ... template class some_class { /// \brief A type used within the class. using class_type = some_other_type; /// \brief A method member of the class. /// /// \tparam B Another template parameter. /// \param [in] param_in_first Input parameter description. /// \param [in] param_in_second [optional] Optional input parameter description. /// \param [out] param_out Output parameter description. /// \param [in,out] param_in_out Input/Output parameter description. /// \return Returned object description. template return_type some_class_method(A param_in_first, B param_in_second = {}) { ... } } ... END_ROCPRIM_NAMESPACE /// @} // end of group ``` ## Pull Request Guidelines ## Our code contribution guidelines closely follows the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). We also mostly abide [GitHub's best practices for pull-requests](https://docs.github.com/en/pull-requests/collaborating-with-pull-requests/getting-started/best-practices-for-pull-requests), namely: 1. **Write small PRs**. PRs should be feature-focused and respect the scope of the issue(s) that it refers to. This makes reviews easier and faster, and yields less chances of overlooking bugs in the new/modified code. 2. **Review your own PR**. Before opening/undrafting your PR, take your time to review all the changes as if you were one of the reviewers. This helps catching typos or small errors in advance. 3. **Provide context and guidance**. 
PRs should generally have a descriptive title and an explanatory body that includes: - **scope** (purpose) of the PR: explanation of the scope of the PR (for instance, what feature/bug the PR adds/fixes). This helps identify new issues to be spawned from the comments received in the PR: if any comment suggests any addition/fix that falls out of this scope, a new issue should be created so that the comment is tackled in another (feature-focused) PR. - some **notes** explaining the changes/additions made so that reviewers know which decisions were taken and why. Here you can also explicitly request feedback on specific matters that you think may need to be discussed. - if necessary, **how to verify** that the issue(s) at hand are indeed tackled with this PR (something like "the newly added test covering the fixed bug's case is passing"). When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch, which serves as our integration branch. Releases are cut to `release/rocm-rel-x.y`, where x and y refer to the release major and minor numbers. ### Deliverables ### #### Correctness, performance and documentation #### Code that introduces new features should have **test coverage** and **benchmark coverage**. **Documentation** must also be added following the guidelines described in [Documenting Style](#documenting-style). If modifying existing functionality, tests, benchmarks and documentation must be updated to fit the new behavior and/or parameters. If the autotuning is run, benchmarks should be re-run to check whether performance indeed improves. If so, the new configuration files generated should be added to the corresponding PR. #### License #### rocPRIM is an open source library. Because of this, we include the **license description** shown below at the top of every source file.
If you create new source files in the repository, please include this text in them as well (replacing "xx" with the digits for the current year): ```c++ // Copyright (c) 20xx Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. ``` If you modify existing files licensed in a previous year, add a dash followed by the modification year to indicate that the license also covers the most recent changes (like so: `Copyright (c) 20xx-20yy`). It may also happen that such an interval is already specified in the license, but the last year indicated is previous to the current modification date; in this case, just change it accordingly. Under the `scripts/copyright-date` folder there is a `check-copyright` script that we use to check if the copyright date updates are done. It can also be used to automate those updates.
Run ```bash scripts/copyright-date/check-copyright.sh -u ``` inside rocPRIM's root directory to update the copyright statements of modified files or set ```bash git config --local hooks.updateCopyright true ``` to automatically update copyrights when committing. #### Changes Record #### All noticeable changes are recorded in the `CHANGELOG.md` file. For every release, we annotate the additions, fixes, changes, deprecations and/or optimizations introduced within that release. When opening a PR, make sure to add to the correspondent sections under the latest unreleased release all the meaningful changes introduced. ### Process ### After you create a PR, you can take a look at a diff of the changes you made using the PR's "Files" tab. PRs must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged. Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table to view logs associated with a check if it fails. During code reviews, another developer(s) will take a look through your proposed change. If any modifications are requested (or further discussion about anything is needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas. When a modification request has been completed, the conversation thread about it will be marked as resolved. To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request. rocPRIM-rocm-7.1.0/LICENSE.md000066400000000000000000000020701506507210100153300ustar00rootroot00000000000000MIT License Copyright (C) Advanced Micro Devices, Inc. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. rocPRIM-rocm-7.1.0/NOTICES.txt000066400000000000000000000066161506507210100156030ustar00rootroot00000000000000Notices and Licenses file ______________________________________________________________________________ AMD copyrighted code (MIT) Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. 
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ROCmSoftwarePlatform-rocPRIM v2.5.0 (MIT) Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
florianrappl-cmdparser v-u (MIT) Copyright (c) 2015 - 2016 Florian Rappl Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. rocPRIM-rocm-7.1.0/README.md000066400000000000000000000273501506507210100152130ustar00rootroot00000000000000# rocPRIM > [!NOTE] > The published rocPRIM documentation is available [here](https://rocm.docs.amd.com/projects/rocPRIM/en/latest/) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the `docs` folder of this repository. As with all ROCm projects, the documentation is open source. For more information on contributing to the documentation, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). rocPRIM is a header-only library that provides HIP parallel primitives. You can use this library to develop performant GPU-accelerated code on AMD ROCm platforms. 
## Requirements * Git * CMake (3.16 or later) * AMD [ROCm](https://rocm.docs.amd.com/en/latest/) platform (1.8.2 or later) * Including [HIP-clang](https://github.com/ROCm/HIP/blob/master/INSTALL.md#hip-clang) compiler * C++17 * Python 3.6 or higher (HIP on Windows only, required only for install script) * Visual Studio 2019 with Clang support (HIP on Windows only) * Strawberry Perl (HIP on Windows only) Optional: * [GoogleTest](https://github.com/google/googletest) * Required only for tests. Building tests is on by default. * This is automatically downloaded and built by the CMake script. * [Google Benchmark](https://github.com/google/benchmark) * Required only for benchmarks. Building benchmarks is off by default. * This is automatically downloaded and built by the CMake script. ## Build and install You can build and install rocPRIM on Linux or Windows. * Linux: ```shell git clone https://github.com/ROCm/rocPRIM.git # Go to rocPRIM directory, create and go to the build directory. cd rocPRIM; mkdir build; cd build # Configure rocPRIM, setup options for your system. # Build options: # ONLY_INSTALL - OFF by default, If this flag is on, the build ignore the BUILD_* flags # BUILD_TEST - OFF by default, # BUILD_EXAMPLE - OFF by default, # BUILD_BENCHMARK - OFF by default. # BENCHMARK_CONFIG_TUNING - OFF by default. The purpose of this flag to find the best kernel config parameters. # At ON the compilation time can be increased significantly. # AMDGPU_TARGETS - list of AMD architectures, default: gfx803;gfx900;gfx906;gfx908. # You can make compilation faster if you want to test/benchmark only on one architecture, # for example, add -DAMDGPU_TARGETS=gfx906 to 'cmake' parameters. # AMDGPU_TEST_TARGETS - list of AMD architectures, default: "" (default system device) # If you want to detect failures on a per GFX IP basis, setting it to some set of ips will create # separate tests with the ip name embedded into the test name. 
Building for all, but selecting # tests only of a specific architecture is possible for eg: ctest -R gfx803|gfx900 # USE_SYSTEM_LIB - OFF by default. Setting this flag to ON will build tests from the installed ROCm libs provided by the system. This only takes effect when BUILD_TEST is ON. # # ! IMPORTANT ! # Set C++ compiler to HIP-clang. You can do it by adding 'CXX=<path-to-compiler>' # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the compiler. # Using HIP-clang: [CXX=hipcc] cmake -DBUILD_BENCHMARK=ON ../. # Build make -j4 # Optionally, run tests if they're enabled. ctest --output-on-failure # Install [sudo] make install ``` * Windows: We've added initial support for HIP on Windows; to install, use the provided `rmake.py` python script: ```shell git clone https://github.com/ROCm/rocPRIM.git cd rocPRIM # the -i option will install rocPRIM to C:\hipSDK by default python rmake.py -i # the -c option will build all clients including unit tests python rmake.py -c ``` ### Using rocPRIM Include the `<rocprim/rocprim.hpp>` header: ```cpp #include <rocprim/rocprim.hpp> ``` We recommend including rocPRIM into a CMake project by using the package configuration files. The rocPRIM package name is `rocprim`. ```cmake # "/opt/rocm" - default install prefix find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim") ... # Includes only rocPRIM headers, HIP libraries have # to be linked manually by user target_link_libraries(<your_target> roc::rocprim) # Include rocPRIM headers and required HIP dependencies # - If using HIP language support (USE_HIPCXX=ON): target_link_libraries(<your_target> hip::host) # - Otherwise: target_link_libraries(<your_target> hip::device) ``` For more information on `hip::host` and `hip::device`, please see the [ROCm documentation](https://rocm.docs.amd.com/en/latest/conceptual/cmake-packages.html#consuming-the-hip-api-in-c-code). ## Running unit tests Unit tests are implemented in terms of GoogleTest. Collections of tests are wrapped and invoked from CTest.
```shell # Go to rocPRIM build directory cd rocPRIM; cd build # List available tests ctest --show-only # To run all tests ctest # Run specific test(s) ctest -R # To run the Google Test manually ./test/rocprim/test_ ``` ### Using multiple GPUs concurrently for testing This feature requires using CMake 3.16+ for building and testing. ```note Prior versions of CMake can't assign IDs to tests when running in parallel. Assigning tests to distinct devices could only be done at the cost of extreme complexity. ``` Unit tests can make use of the [CTest resource allocation](https://cmake.org/cmake/help/latest/manual/ctest.1.html#resource-allocation) feature, which you can use to distribute tests across multiple GPUs in an intelligent manner. This feature can accelerate testing when multiple GPUs of the same family are in a system. It can also test multiple product families from one invocation without having to use the `HIP_VISIBLE_DEVICES` environment variable. The feature relies on the presence of a resource specifications file. ```important Trying to use `RESOURCE_GROUPS` and `--resource-spec-file` with CMake and CTest for versions prior to 3.16 silently omits the feature. No warnings are issued about unknown properties or command-line arguments. Make sure that the `cmake` and `ctest` versions you invoke are sufficiently recent. ``` #### Auto resource specification generation You can independently call the utility script located in the repository using the following code: ```shell # Go to rocPRIM build directory cd rocPRIM; cd build # Invoke directly or use CMake script mode via cmake -P ../cmake/GenerateResourceSpec.cmake # Assuming you have 2 compatible GPUs in the system ctest --resource-spec-file ./resources.json --parallel 2 ``` #### Manual Assuming you have two GPUs from the gfx900 family and they are the first devices enumerated by the system, you can use `-D AMDGPU_TEST_TARGETS=gfx900` during configuration to specify that only one family will be tested. 
Leaving this var empty (default) results in targeting the default device in the system. To let CMake know there are two GPUs that should be targeted, you have to provide a `JSON` file to CTest via the `--resource-spec-file ` flag. For example: ```json { "version": { "major": 1, "minor": 0 }, "local": [ { "gfx900": [ { "id": "0" }, { "id": "1" } ] } ] } ``` Invoking CTest as `ctest --resource-spec-file --parallel 2` allows two tests to run concurrently, distributed between the two GPUs. ### Using custom seeds for the tests Modify the `rocPRIM/test/rocprim/test_seed.hpp` file. ```cpp //(1) static constexpr int random_seeds_count = 10; //(2) static constexpr unsigned int seeds [] = {0, 2, 10, 1000}; //(3) static constexpr size_t seed_size = sizeof(seeds) / sizeof(seeds[0]); ``` (1) Defines a constant that sets how many passes over the tests will be done with runtime-generated seeds. Modify at will. (2) Defines the user-generated seeds. Each of the array elements will be used as seed for all tests. Modify at will. If you don't want any static seeds, leave the array empty. ```cpp static constexpr unsigned int seeds [] = {}; ``` (3) Never modify this line. ## Running benchmarks ```shell # Go to rocPRIM build directory cd rocPRIM; cd build # To run benchmark for warp functions: # Further option can be found using --help # [] Fields are optional ./benchmark/benchmark_warp_ [--size ] [--trials ] # To run benchmark for block functions: # Further option can be found using --help # [] Fields are optional ./benchmark/benchmark_block_ [--size ] [--trials ] # To run benchmark for device functions: # Further option can be found using --help # [] Fields are optional ./benchmark/benchmark_device_ [--size ] [--trials ] ``` ### Performance configuration Most device-specific primitives provided by rocPRIM can be tuned for other AMD devices, and different types and operations, by passing compile-time configuration structures as a template parameter. 
The main "knobs" are usually the size of the block and the number of items processed by a single thread. rocPRIM has built-in default configurations for each of its primitives, these will be used automatically based on the input types and the target architecture from the stream used. ## hipCUB [hipCUB](https://github.com/ROCm/hipCUB/) is a thin wrapper library on top of [rocPRIM](https://github.com/ROCm/rocPRIM) or [CUB](https://github.com/NVlabs/cub). You can use it to port projects that use the CUB library to the [HIP](https://github.com/ROCm/HIP) layer and run them on AMD hardware. In the [ROCm](https://rocm.docs.amd.com/en/latest/) environment, hipCUB uses the rocPRIM library as a backend. ## Building the documentation locally ### Requirements #### Doxygen The build system uses Doxygen [version 1.9.4](https://github.com/doxygen/doxygen/releases/tag/Release_1_9_4). You can try using a newer version, but that might cause issues. After you have downloaded Doxygen version 1.9.4: ```shell # Add doxygen to your PATH echo 'export PATH=/bin:$PATH' >> ~/.bashrc # Apply the updated .bashrc source ~/.bashrc # Confirm that you are using version 1.9.4 doxygen --version ``` #### Python The build system uses Python version 3.10. You can try using a newer version, but that might cause issues. 
You can install Python 3.10 alongside your other Python versions using [pyenv](https://github.com/pyenv/pyenv?tab=readme-ov-file#installation): ```shell # Install Python 3.10 pyenv install 3.10 # Create a Python 3.10 virtual environment pyenv virtualenv 3.10 venv_rocprim # Activate the virtual environment pyenv activate venv_rocprim ``` ### Building After cloning this repository, and `cd`ing into it: ```shell # Install Python dependencies python3 -m pip install -r docs/sphinx/requirements.txt # Build the documentation python3 -m sphinx -T -E -b html -d docs/_build/doctrees -D language=en docs docs/_build/html ``` You can then open `docs/_build/html/index.html` in your browser to view the documentation. ### Build documentation via CMake Install [rocm-cmake](https://github.com/ROCm/rocm-cmake/) ```shell # Change directory to rocPRIM cd rocPRIM # Install documentation dependencies python3 -m pip install -r docs/sphinx/requirements.txt # Set C++ compiler # This example uses hipcc and assumes it is at the path /usr/bin export CXX=hipcc export PATH=/usr/bin:$PATH # Configure the project cmake -S . -B ./build -D BUILD_DOCS=ON # Build the documentation cmake --build ./build --target doc # To serve the HTML docs locally cd ./build/docs/html python3 -m http.server ``` ## Support You can report bugs and feature requests through our GitHub [issue tracker](https://github.com/ROCm/rocPRIM/issues). ## Contributions and license Contributions of any kind are most welcome! Contribution instructions are in [CONTRIBUTING](./CONTRIBUTING.md). Licensing information is in [LICENSE](./LICENSE.txt). rocPRIM-rocm-7.1.0/benchmark/000077500000000000000000000000001506507210100156575ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/benchmark/CMakeLists.txt000066400000000000000000000175371506507210100204340ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
# When ON, only benchmarks that ship a "<name>.parallel.cpp.in" template are
# built, once per configuration listed for them (see add_rocprim_benchmark).
option(BENCHMARK_CONFIG_TUNING "Benchmark device-level functions using various configs" OFF)
include(../cmake/ConfigAutotune.cmake)
include(ConfigAutotuneSettings.cmake)

# Optional overrides for the tuning matrix. These hold ';'-separated lists, not
# booleans, so they are declared as STRING cache entries: option() is meant for
# ON/OFF switches only and would present these values as checkboxes in
# cmake-gui/ccmake. Leaving them empty keeps the per-benchmark defaults.
set(BENCHMARK_TUNE_PARAM_NAMES "" CACHE STRING "Tuning parameter names")
set(BENCHMARK_TUNE_PARAMS "" CACHE STRING "Tuning parameters")

if(BENCHMARK_CONFIG_TUNING)
  # Umbrella target that depends on every config-tuning benchmark.
  add_custom_target("benchmark_config_tuning")
endif()

# add_rocprim_benchmark(<source>)
#
# Creates an executable target named after <source> (file name without its
# extension), links it against rocprim, Google Benchmark and HIP, and places
# the binary in "${CMAKE_BINARY_DIR}/benchmark".
#
# With BENCHMARK_CONFIG_TUNING enabled the target is only created when both a
# "<target>.parallel.cpp.in" template exists next to this file and
# read_config_autotune_settings() provides parameter names for it (or the
# BENCHMARK_TUNE_* overrides are set); otherwise the function returns without
# creating anything.
function(add_rocprim_benchmark BENCHMARK_SOURCE)
  get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE)

  if(USE_HIPCXX)
    set_source_files_properties(${BENCHMARK_SOURCE} PROPERTIES LANGUAGE HIP)
  endif()

  if(BENCHMARK_CONFIG_TUNING)
    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${BENCHMARK_TARGET}.parallel.cpp.in")
      message(STATUS "found ${BENCHMARK_TARGET}.parallel.cpp.in file, compiling in parallel.")
      read_config_autotune_settings(${BENCHMARK_TARGET} list_across_names list_across output_pattern_suffix)

      # User-supplied overrides replace the names/values read above. The output
      # pattern suffix is NOT overridden, so it still refers to the default names.
      if(BENCHMARK_TUNE_PARAM_NAMES AND BENCHMARK_TUNE_PARAMS)
        set(list_across_names "${BENCHMARK_TUNE_PARAM_NAMES}")
        set(list_across "${BENCHMARK_TUNE_PARAMS}")
      endif()

      # Make sure that the variables are not empty, i.e. there actually is an entry
      # for that benchmark in benchmark/ConfigAutotuneSettings.cmake.
      if(list_across_names)
        add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE})
        target_compile_definitions(${BENCHMARK_TARGET} PRIVATE BENCHMARK_CONFIG_TUNING)
        add_matrix(
          TARGET ${BENCHMARK_TARGET}
          SHARDS 1
          CURRENT_SHARD 0
          INPUT "${BENCHMARK_TARGET}.parallel.cpp.in"
          OUTPUT_PATTERN "${BENCHMARK_TARGET}_${output_pattern_suffix}"
          NAMES ${list_across_names}
          LISTS ${list_across})
        add_dependencies(benchmark_config_tuning ${BENCHMARK_TARGET})
      else()
        message(WARNING "No config-tuning entry in benchmark/ConfigAutotuneSettings.cmake for ${BENCHMARK_TARGET}!")
        return()
      endif()
    else()
      # Do nothing if BENCHMARK_CONFIG_TUNING is ON but no
      # ${BENCHMARK_TARGET}.parallel.cpp.in exists.
      return()
    endif()
  else()
    add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE})
  endif()

  if(BUILD_NAIVE_BENCHMARK)
    target_compile_definitions(${BENCHMARK_TARGET} PUBLIC BUILD_NAIVE_BENCHMARK)
  endif()

  target_link_libraries(${BENCHMARK_TARGET}
    PRIVATE
      rocprim
      benchmark::benchmark
  )

  # NOTE(review): the two generator expressions below were reconstructed from a
  # copy of this file in which the text between '<' and '>' had been lost --
  # confirm them against the upstream CMakeLists.txt.
  if(USE_HIPCXX)
    target_link_libraries(${BENCHMARK_TARGET}
      PRIVATE
        $<IF:$<LINK_LANGUAGE:HIP>,hip::host,hip::device>
    )
  else()
    target_link_libraries(${BENCHMARK_TARGET} PRIVATE hip::device)
  endif()

  # number of sections exceeded object file format limit: compile with /bigobj
  target_compile_options(${BENCHMARK_TARGET}
    PRIVATE
      $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
  )

  set_target_properties(${BENCHMARK_TARGET}
    PROPERTIES
      RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark"
  )

  if(ROCPRIM_INSTALL)
    rocm_install(TARGETS ${BENCHMARK_TARGET} COMPONENT benchmarks)
  endif()

  # DLLS_COPIED is propagated to the calling scope so that the copy commands
  # are attached to the first benchmark target only.
  if(WIN32 AND NOT DEFINED DLLS_COPIED)
    set(DLLS_COPIED "YES")
    set(DLLS_COPIED ${DLLS_COPIED} PARENT_SCOPE)
    # for now adding in all .dll as dependency chain is not cmake based on win32
    file(GLOB third_party_dlls
      LIST_DIRECTORIES ON
      CONFIGURE_DEPENDS
      ${HIP_DIR}/bin/*.dll
      ${CMAKE_SOURCE_DIR}/rtest.*
    )
    foreach(file_i ${third_party_dlls})
      add_custom_command(TARGET ${BENCHMARK_TARGET}
        POST_BUILD
        COMMAND ${CMAKE_COMMAND}
        ARGS -E copy_if_different ${file_i} ${PROJECT_BINARY_DIR}/benchmark
      )
    endforeach()
  endif()
endfunction()

# ****************************************************************************
# Benchmarks
# ****************************************************************************

add_rocprim_benchmark(benchmark_block_adjacent_difference.cpp)
add_rocprim_benchmark(benchmark_block_discontinuity.cpp)
add_rocprim_benchmark(benchmark_block_exchange.cpp)
add_rocprim_benchmark(benchmark_block_histogram.cpp)
add_rocprim_benchmark(benchmark_block_radix_sort.cpp)
add_rocprim_benchmark(benchmark_block_radix_rank.cpp)
add_rocprim_benchmark(benchmark_block_reduce.cpp)
add_rocprim_benchmark(benchmark_block_run_length_decode.cpp)
add_rocprim_benchmark(benchmark_block_scan.cpp)
add_rocprim_benchmark(benchmark_block_sort.cpp)
add_rocprim_benchmark(benchmark_config_dispatch.cpp)
add_rocprim_benchmark(benchmark_device_adjacent_difference.cpp)
add_rocprim_benchmark(benchmark_device_adjacent_find.cpp)
add_rocprim_benchmark(benchmark_device_batch_memcpy.cpp)
add_rocprim_benchmark(benchmark_device_binary_search.cpp)
add_rocprim_benchmark(benchmark_device_find_first_of.cpp)
add_rocprim_benchmark(benchmark_device_find_end.cpp)
add_rocprim_benchmark(benchmark_device_histogram.cpp)
add_rocprim_benchmark(benchmark_device_merge.cpp)
add_rocprim_benchmark(benchmark_device_merge_inplace.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort_block_sort.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort_block_merge.cpp)
add_rocprim_benchmark(benchmark_device_nth_element.cpp)
add_rocprim_benchmark(benchmark_device_partial_sort.cpp)
add_rocprim_benchmark(benchmark_device_partial_sort_copy.cpp)
add_rocprim_benchmark(benchmark_device_partition.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort_block_sort.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort_onesweep.cpp)
add_rocprim_benchmark(benchmark_device_reduce_by_key.cpp) add_rocprim_benchmark(benchmark_device_reduce_by_key_deterministic.cpp) add_rocprim_benchmark(benchmark_device_reduce.cpp) add_rocprim_benchmark(benchmark_device_run_length_encode.cpp) add_rocprim_benchmark(benchmark_device_run_length_encode_non_trivial_runs.cpp) add_rocprim_benchmark(benchmark_device_scan.cpp) add_rocprim_benchmark(benchmark_device_scan_deterministic.cpp) add_rocprim_benchmark(benchmark_device_scan_by_key.cpp) add_rocprim_benchmark(benchmark_device_search.cpp) add_rocprim_benchmark(benchmark_device_scan_by_key_deterministic.cpp) add_rocprim_benchmark(benchmark_device_search_n.cpp) add_rocprim_benchmark(benchmark_device_select.cpp) add_rocprim_benchmark(benchmark_device_segmented_radix_sort_keys.cpp) add_rocprim_benchmark(benchmark_device_segmented_radix_sort_pairs.cpp) add_rocprim_benchmark(benchmark_device_segmented_reduce.cpp) add_rocprim_benchmark(benchmark_device_transform.cpp) add_rocprim_benchmark(benchmark_device_transform_pointer.cpp) add_rocprim_benchmark(benchmark_predicate_iterator.cpp) add_rocprim_benchmark(benchmark_warp_exchange.cpp) add_rocprim_benchmark(benchmark_warp_reduce.cpp) add_rocprim_benchmark(benchmark_warp_scan.cpp) add_rocprim_benchmark(benchmark_warp_sort.cpp) add_rocprim_benchmark(benchmark_device_memory.cpp) rocPRIM-rocm-7.1.0/benchmark/ConfigAutotuneSettings.cmake000066400000000000000000000231511506507210100233360ustar00rootroot00000000000000# MIT License # # Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # All default fallback types as listed in scripts/autotune/fallback_config.json set(TUNING_TYPES "rocprim::int128_t int64_t int short int8_t double float rocprim::half") # If config selection happens based on two types, the second type has limited fallbacks. The selection is based # on the size and it is ignored whether the type is floating-point or integral. The autotuning script uses the # benchmarks for the integral types as fallback, hence tuning for the floating-point types is not needed. 
set(LIMITED_TUNING_TYPES "rocprim::int128_t int64_t int short int8_t")

# read_config_autotune_settings(<file> <names_var> <lists_var> <suffix_var>)
#
# Looks up the config-tuning matrix for the benchmark target <file> (a target
# name such as "benchmark_device_reduce", without extension) and stores it in
# the caller's scope:
#   <names_var>  - ';'-separated list of template parameter names
#   <lists_var>  - ';'-separated list with one entry per name; each entry is a
#                  space-separated list of values to try for that parameter
#   <suffix_var> - output file name pattern using @Name@ placeholders
#
# The results are written to the variables whose names are passed in, instead
# of to fixed variable names, so callers are free to choose their own names.
# If <file> has no entry, none of the output variables is touched.
function(read_config_autotune_settings file list_across_names list_across output_pattern_suffix)
  if(file STREQUAL "benchmark_device_adjacent_difference")
    set(_names "DataType;Left;Aliasing;BlockSize")
    set(_lists "${TUNING_TYPES};true;no_alias in_place;32 64 128 256 512 1024")
    set(_suffix "@DataType@_@Left@_@Aliasing@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_adjacent_find")
    set(_names "InputType;BlockSize")
    set(_lists "${TUNING_TYPES};64 128 256 512 1024")
    set(_suffix "@InputType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_histogram")
    set(_names "DataType;BlockSize")
    set(_lists "${TUNING_TYPES};64 128 256")
    set(_suffix "@DataType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_merge_sort_block_merge")
    set(_names "KeyType;ValueType;BlockSize;UseMergePath")
    set(_lists "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};128 256 512 1024;true")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@_@UseMergePath@")
  elseif(file STREQUAL "benchmark_device_merge_sort_block_sort")
    set(_names "KeyType;ValueType;BlockSize;BlockSortMethod")
    set(_lists "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};256 512 1024;rocprim::block_sort_algorithm::stable_merge_sort")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@_@BlockSortMethod@")
  elseif(file STREQUAL "benchmark_device_radix_sort_block_sort")
    set(_names "KeyType;ValueType;BlockSize")
    set(_lists "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};64 128 256 512 1024")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_radix_sort_onesweep")
    set(_names "KeyType;ValueType;BlockSize;RadixBits")
    set(_lists "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};128 256 512 1024;4 5 6 7 8")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@_@RadixBits@")
  elseif(file STREQUAL "benchmark_device_reduce")
    set(_names "DataType;BlockSize;ItemsPerThread")
    set(_lists "${TUNING_TYPES};64 128 256;1 2 4 8 16")
    set(_suffix "@DataType@_@BlockSize@_@ItemsPerThread@")
  elseif(file STREQUAL "benchmark_device_scan")
    set(_names "DataType;Algo")
    set(_lists "${TUNING_TYPES};using_warp_scan reduce_then_scan")
    set(_suffix "@DataType@_@Algo@")
  elseif(file STREQUAL "benchmark_device_scan_by_key")
    set(_names "KeyType;ValueType;Algo")
    set(_lists "${TUNING_TYPES};${LIMITED_TUNING_TYPES};using_warp_scan reduce_then_scan")
    set(_suffix "@KeyType@_@ValueType@_@Algo@")
  elseif(file STREQUAL "benchmark_device_binary_search")
    set(_names "SubAlgorithm;ValueType;OutputType;BlockSize;ItemsPerThread")
    set(_lists "binary_search upper_bound lower_bound;${TUNING_TYPES};${LIMITED_TUNING_TYPES};64 128 256;1 2 4 8 16")
    set(_suffix "@SubAlgorithm@_@ValueType@_@OutputType@_@BlockSize@_@ItemsPerThread@")
  elseif(file STREQUAL "benchmark_device_search_n")
    set(_names "InputType;BlockSize;ItemsPerThread;Threshold")
    set(_lists "${TUNING_TYPES};64 128 256 512 1024;1 2 4 8 16;4 8 12 16")
    set(_suffix "@InputType@_@BlockSize@_@ItemsPerThread@_@Threshold@")
  elseif(file STREQUAL "benchmark_device_segmented_radix_sort_keys")
    set(_names "KeyType;RadixBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS")
    set(_lists "${TUNING_TYPES};8;256;4 8 16;8;4;256;64;16;8;256")
    set(_suffix "@KeyType@_@RadixBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@")
  elseif(file STREQUAL "benchmark_device_segmented_radix_sort_pairs")
    set(_names "KeyType;ValueType;RadixBits;BlockSize;ItemsPerThread;WarpSmallLWS;WarpSmallIPT;WarpSmallBS;WarpPartition;WarpMediumLWS;WarpMediumIPT;WarpMediumBS")
    set(_lists "${TUNING_TYPES};${LIMITED_TUNING_TYPES};8;256;4 8 16;8;4;256;64;16;8;256")
    set(_suffix "@KeyType@_@ValueType@_@RadixBits@_@BlockSize@_@ItemsPerThread@_@WarpSmallLWS@_@WarpSmallIPT@_@WarpSmallBS@_@WarpPartition@_@WarpMediumLWS@_@WarpMediumIPT@_@WarpMediumBS@")
  elseif(file STREQUAL "benchmark_device_segmented_reduce")
    set(_names "DataType;BlockSize;ItemsPerThread")
    set(_lists "${TUNING_TYPES};64 128 256;1 2 4 8 16")
    set(_suffix "@DataType@_@BlockSize@_@ItemsPerThread@")
  elseif(file STREQUAL "benchmark_device_transform")
    # The stray trailing ';' after BlockSize was dropped: two names for two lists.
    set(_names "DataType;BlockSize")
    set(_lists "${TUNING_TYPES};64 128 256 512 1024")
    set(_suffix "@DataType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_transform_pointer")
    set(_names "DataType;BlockSize;LoadType")
    set(_lists "${TUNING_TYPES};64 128 256 512 1024;rocprim::load_default rocprim::load_nontemporal")
    set(_suffix "@DataType@_@BlockSize@_@LoadType@")
  elseif(file STREQUAL "benchmark_device_partition")
    set(_names "DataType;BlockSize")
    set(_lists "${TUNING_TYPES};128 192 256 384 512")
    set(_suffix "@DataType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_select")
    set(_names "KeyType;ValueType;BlockSize")
    set(_lists "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};128 192 256 384 512")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_reduce_by_key")
    set(_names "KeyType;ValueType;BlockSize")
    set(_lists "${LIMITED_TUNING_TYPES};${TUNING_TYPES};128 192 256 384 512")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_find_first_of")
    set(_names "DataType;BlockSize")
    set(_lists "${LIMITED_TUNING_TYPES};32 64 128 256 512 1024")
    set(_suffix "@DataType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_run_length_encode")
    set(_names "KeyType;BlockSize")
    set(_lists "${TUNING_TYPES};128 192 256 384 512")
    set(_suffix "@KeyType@_@BlockSize@")
  elseif(file STREQUAL "benchmark_device_run_length_encode_non_trivial_runs")
    set(_names "KeyType;BlockSize;BlockLoadMethod")
    set(_lists "${TUNING_TYPES};64 128 256 512 1024;block_load_vectorize block_load_warp_transpose")
    set(_suffix "@KeyType@_@BlockSize@_@BlockLoadMethod@")
  elseif(file STREQUAL "benchmark_device_merge")
    set(_names "KeyType;ValueType;BlockSize")
    set(_lists "${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};32 64 128 256 512 1024")
    set(_suffix "@KeyType@_@ValueType@_@BlockSize@")
  else()
    # No entry for this benchmark: leave the caller's variables untouched.
    return()
  endif()

  # Publish the selected settings through the caller-provided variable names.
  set(${list_across_names} "${_names}" PARENT_SCOPE)
  set(${list_across} "${_lists}" PARENT_SCOPE)
  set(${output_pattern_suffix} "${_suffix}" PARENT_SCOPE)
endfunction()
rocPRIM-rocm-7.1.0/benchmark/benchmark_block_adjacent_difference.cpp000066400000000000000000000354611506507210100255030ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include #include template __global__ __launch_bounds__(BlockSize) void kernel(Args... 
args) { Benchmark::template run(args...); } struct subtract_left { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rocprim::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; ++trial) { T output[ItemsPerThread]; if(WithTile) { adjacent_diff_t().subtract_left(input, output, rocprim::minus<>{}, T(123), storage); } else { adjacent_diff_t().subtract_left(input, output, rocprim::minus<>{}, storage); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct subtract_left_partial { template __device__ static void run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rocprim::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; unsigned int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; ++trial) { T output[ItemsPerThread]; if(WithTile) { adjacent_diff_t().subtract_left_partial(input, output, rocprim::minus<>{}, T(123), tile_size, storage); } else { adjacent_diff_t().subtract_left_partial(input, output, rocprim::minus<>{}, tile_size, storage); } for(unsigned int i = 0; i < 
ItemsPerThread; ++i) { input[i] += output[i]; } // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct subtract_right { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rocprim::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; ++trial) { T output[ItemsPerThread]; if(WithTile) { adjacent_diff_t().subtract_right(input, output, rocprim::minus<>{}, T(123), storage); } else { adjacent_diff_t().subtract_right(input, output, rocprim::minus<>{}, storage); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct subtract_right_partial { template __device__ static void run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rocprim::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; unsigned int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; ++trial) { T output[ItemsPerThread]; 
adjacent_diff_t().subtract_right_partial(input, output, rocprim::minus<>{}, tile_size, storage); for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; template auto run_benchmark(benchmark_utils::state&& state) -> std::enable_if_t::value && !std::is_same::value> { const auto& bytes = state.bytes; const auto& seed = state.seed; const auto& stream = state.stream; // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const auto random_range = limit_random_range(0, 10); const std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(input.size()); state.run( [&] { kernel <<>>(d_input.get(), d_output.get(), Trials); HIP_CHECK(hipGetLastError()); }); state.set_throughput(size * Trials, sizeof(T)); } template auto run_benchmark(benchmark_utils::state&& state) -> std::enable_if_t::value || std::is_same::value> { const auto& bytes = state.bytes; const auto& seed = state.seed; const auto& stream = state.stream; // Calculate the number of elements N size_t N = bytes / sizeof(T); static constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const auto random_range_input = limit_random_range(0, 10); const auto random_range_tile_sizes = limit_random_range(0, items_per_block); const std::vector input = 
get_random_data(size, random_range_input.first, random_range_input.second, seed.get_0()); const std::vector tile_sizes = get_random_data(num_blocks, random_range_tile_sizes.first, random_range_tile_sizes.second, seed.get_1()); common::device_ptr d_input(input); common::device_ptr d_tile_sizes(tile_sizes); common::device_ptr d_output(input.size()); state.run( [&] { kernel <<>>(d_input.get(), d_tile_sizes.get(), d_output.get(), Trials); HIP_CHECK(hipGetLastError()); }); state.set_throughput(size * Trials, sizeof(T)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ executor.queue_fn( \ bench_naming::format_name("{lvl:block,algo:adjacent_difference,subalgo:" + name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ ",with_tile:" #WITH_TILE "}}") \ .c_str(), \ run_benchmark); #define BENCHMARK_TYPE(type, block, with_tile) \ CREATE_BENCHMARK(type, block, 1, with_tile) \ CREATE_BENCHMARK(type, block, 3, with_tile) \ CREATE_BENCHMARK(type, block, 4, with_tile) \ CREATE_BENCHMARK(type, block, 8, with_tile) \ CREATE_BENCHMARK(type, block, 16, with_tile) \ CREATE_BENCHMARK(type, block, 32, with_tile) template void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) { BENCHMARK_TYPE(int, 256, false) BENCHMARK_TYPE(float, 256, false) BENCHMARK_TYPE(int8_t, 256, false) BENCHMARK_TYPE(rocprim::half, 256, false) BENCHMARK_TYPE(long long, 256, false) BENCHMARK_TYPE(double, 256, false) BENCHMARK_TYPE(rocprim::int128_t, 256, false) BENCHMARK_TYPE(rocprim::uint128_t, 256, false) if(!std::is_same::value) { BENCHMARK_TYPE(int, 256, true) BENCHMARK_TYPE(float, 256, true) BENCHMARK_TYPE(int8_t, 256, true) BENCHMARK_TYPE(rocprim::half, 256, true) BENCHMARK_TYPE(long long, 256, true) BENCHMARK_TYPE(double, 256, true) BENCHMARK_TYPE(rocprim::int128_t, 256, true) BENCHMARK_TYPE(rocprim::uint128_t, 256, true) } } int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); add_benchmarks("subtract_left", 
executor); add_benchmarks("subtract_right", executor); add_benchmarks("subtract_left_partial", executor); add_benchmarks("subtract_right_partial", executor); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_discontinuity.cpp000066400000000000000000000233561506507210100244650ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include template __global__ __launch_bounds__(BlockSize) void kernel(const T* d_input, T* d_output) { Runner::template run(d_input, d_output); } struct flag_heads { template __device__ static void run(const T* d_input, T* d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { rocprim::block_discontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.flag_heads(head_flags, T(123), input, rocprim::equal_to()); } else { bdiscontinuity.flag_heads(head_flags, input, rocprim::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += head_flags[i]; } rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct flag_tails { template __device__ static void run(const T* d_input, T* d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { rocprim::block_discontinuity bdiscontinuity; bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.flag_tails(tail_flags, T(123), input, rocprim::equal_to()); } else { bdiscontinuity.flag_tails(tail_flags, input, rocprim::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += tail_flags[i]; } rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct flag_heads_and_tails { template __device__ static void run(const T* d_input, T* 
d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { rocprim::block_discontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.flag_heads_and_tails(head_flags, T(123), tail_flags, T(234), input, rocprim::equal_to()); } else { bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, rocprim::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += head_flags[i]; input[i] += tail_flags[i]; } rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; template void run_benchmark(benchmark_utils::state&& state) { const auto& bytes = state.bytes; const auto& seed = state.seed; const auto& stream = state.stream; // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); const auto random_range = limit_random_range(0, 10); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); T* d_input; T* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { kernel <<>>(d_input, d_output); HIP_CHECK(hipGetLastError()); }); state.set_throughput(size * Trials, sizeof(T)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:discontinuity,subalgo:" + name \ + ",key_type:" #T 
",cfg:{bs:" #BS ",ipt:" #IPT \ ",with_tile:" #WITH_TILE "}}") \ .c_str(), \ run_benchmark); #define BENCHMARK_TYPE(type, block, bool) \ CREATE_BENCHMARK(type, block, 1, bool) \ CREATE_BENCHMARK(type, block, 2, bool) \ CREATE_BENCHMARK(type, block, 3, bool) \ CREATE_BENCHMARK(type, block, 4, bool) \ CREATE_BENCHMARK(type, block, 8, bool) template void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) { BENCHMARK_TYPE(int, 256, false) BENCHMARK_TYPE(int, 256, true) BENCHMARK_TYPE(int8_t, 256, false) BENCHMARK_TYPE(int8_t, 256, true) BENCHMARK_TYPE(uint8_t, 256, false) BENCHMARK_TYPE(uint8_t, 256, true) BENCHMARK_TYPE(rocprim::half, 256, false) BENCHMARK_TYPE(rocprim::half, 256, true) BENCHMARK_TYPE(long long, 256, false) BENCHMARK_TYPE(long long, 256, true) BENCHMARK_TYPE(rocprim::int128_t, 256, false) BENCHMARK_TYPE(rocprim::int128_t, 256, true) BENCHMARK_TYPE(rocprim::uint128_t, 256, false) BENCHMARK_TYPE(rocprim::uint128_t, 256, true) } int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); add_benchmarks("flag_heads", executor); add_benchmarks("flag_tails", executor); add_benchmarks("flag_heads_and_tails", executor); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_exchange.cpp000066400000000000000000000262171506507210100233410ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_device_ptr.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include #include /* NOTE(review): template parameter lists, angle-bracket include targets and kernel launch configurations appear to have been stripped from this copy; confirm against the upstream file before building. */ /* Kernel entry point: forwards d_input, d_ranks and d_output unchanged to the static Runner::run. */ template __global__ __launch_bounds__(BlockSize) void kernel(const T* d_input, const unsigned int* d_ranks, T* d_output) { Runner::template run(d_input, d_ranks, d_output); } /* Runner for blocked_to_striped: each thread loads ItemsPerThread items with a striped load, calls exchange.blocked_to_striped in place Trials times with a rocprim::syncthreads() after every call, then writes the items back with a striped store. The unnamed rank pointer is ignored. */ struct blocked_to_striped { template __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { rocprim::block_exchange exchange; exchange.blocked_to_striped(input, input); ::rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; /* Same load / loop / store shape as blocked_to_striped, but the timed call is exchange.striped_to_blocked. */ struct striped_to_blocked { template __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { rocprim::block_exchange exchange; exchange.striped_to_blocked(input, input); ::rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; /* Same load / loop / store shape as blocked_to_striped, but the timed call is exchange.blocked_to_warp_striped. */ struct blocked_to_warp_striped { template __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials;
++trial) { rocprim::block_exchange exchange; exchange.blocked_to_warp_striped(input, input); ::rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; /* Same load / loop / store shape as blocked_to_striped, but the timed call is exchange.warp_striped_to_blocked. */ struct warp_striped_to_blocked { template __device__ static void run(const T* d_input, const unsigned int*, T* d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { rocprim::block_exchange exchange; exchange.warp_striped_to_blocked(input, input); ::rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; /* Runner for scatter_to_blocked: besides the items, each thread loads ItemsPerThread ranks from d_ranks with a striped load and passes them to exchange.scatter_to_blocked(input, input, ranks) on every trial. */ struct scatter_to_blocked { template __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); rocprim::block_load_direct_striped(lid, d_ranks + block_offset, ranks); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { rocprim::block_exchange exchange; exchange.scatter_to_blocked(input, input, ranks); ::rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; /* Same as the scatter_to_blocked runner, but the timed call is exchange.scatter_to_striped(input, input, ranks). */ struct scatter_to_striped { template __device__ static void run(const T* d_input, const unsigned int* d_ranks, T* d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rocprim::block_load_direct_striped(lid, d_input + block_offset, input); rocprim::block_load_direct_striped(lid, d_ranks + block_offset, ranks); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial
< Trials; ++trial) { rocprim::block_exchange exchange; exchange.scatter_to_striped(input, input, ranks); ::rocprim::syncthreads(); } rocprim::block_store_direct_striped(lid, d_output + block_offset, input); } }; /* Host-side driver for one benchmark configuration. Derives the element count from state.bytes and rounds it up to a multiple of BlockSize * ItemsPerThread, fills the input with T(i), builds one shuffled 0..items_per_block-1 sequence of ranks per block using a generator seeded from state.seed, wraps input, ranks and an output buffer in common::device_ptr (presumably device allocations initialised from the host vectors; confirm in utils_device_ptr.hpp), runs the kernel launch through state.run and reports size * Trials items of sizeof(T) bytes as throughput. */ template void run_benchmark(benchmark_utils::state&& state) { const auto& bytes = state.bytes; const auto& seed = state.seed; const auto& stream = state.stream; // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input(size); // Fill input for(size_t i = 0; i < size; ++i) { input[i] = T(i); } std::vector ranks(size); // Fill ranks (for scatter operations) engine_type gen(seed.get_0()); for(size_t bi = 0; bi < size / items_per_block; ++bi) { auto block_ranks = ranks.begin() + bi * items_per_block; std::iota(block_ranks, block_ranks + items_per_block, 0); std::shuffle(block_ranks, block_ranks + items_per_block, gen); } common::device_ptr d_input(input); common::device_ptr d_ranks(ranks); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { kernel <<>>(d_input.get(), d_ranks.get(), d_output.get()); HIP_CHECK(hipGetLastError()); }); state.set_throughput(size * Trials, sizeof(T)); } /* CREATE_BENCHMARK queues run_benchmark on the executor under a name built from the in-scope name string plus the stringized type, block size and items per thread. BENCHMARK_TYPE expands it for items-per-thread values 1, 2, 3, 4, 7 and 8. Both macros expect variables called name and executor at the expansion site. */ #define CREATE_BENCHMARK(T, BS, IPT) \ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:exchange,subalgo:" + name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ "}}") \ .c_str(), \ run_benchmark); #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1) \ CREATE_BENCHMARK(type, block, 2) \ CREATE_BENCHMARK(type, block, 3) \ CREATE_BENCHMARK(type, block, 4) \ CREATE_BENCHMARK(type, block, 7) \ CREATE_BENCHMARK(type, block, 8) /* Queues every element type listed below at block size 256 for one exchange variant; name becomes the subalgo field of each benchmark name. */ template void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) { using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; BENCHMARK_TYPE(int, 256) BENCHMARK_TYPE(int8_t,
256) BENCHMARK_TYPE(rocprim::half, 256) BENCHMARK_TYPE(long long, 256) BENCHMARK_TYPE(custom_float2, 256) BENCHMARK_TYPE(float2, 256) BENCHMARK_TYPE(custom_double2, 256) BENCHMARK_TYPE(double2, 256) BENCHMARK_TYPE(float4, 256) BENCHMARK_TYPE(rocprim::int128_t, 256) BENCHMARK_TYPE(rocprim::uint128_t, 256) } /* Builds the executor from the command line (128 MiB passed as its third argument), queues all six exchange variants and runs them. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0); add_benchmarks("blocked_to_striped", executor); add_benchmarks("striped_to_blocked", executor); add_benchmarks("blocked_to_warp_striped", executor); add_benchmarks("warp_striped_to_blocked", executor); add_benchmarks("scatter_to_blocked", executor); add_benchmarks("scatter_to_striped", executor); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_histogram.cpp000066400000000000000000000170701506507210100235510ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // HIP API #include // rocPRIM #include #include #include /* NOTE(review): template parameter lists, angle-bracket include targets and kernel launch configurations appear to have been stripped from this copy; confirm against the upstream file before building. */ /* Kernel entry point: forwards input and output unchanged to the static Runner::run. */ template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } /* Runner that times rocprim::block_histogram. Each thread reads ItemsPerThread consecutive values starting at ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; the block then fills a BinSize-entry __shared__ histogram Trials times, replacing every value by BinSize - 1 - value after each trial. Finally the bins are copied to output starting at blockIdx.x * BinSize, with the destination offset advancing by BlockSize on every copy a thread performs. */ template struct histogram { static constexpr auto algorithm_type = algorithm; template __device__ static void run(const T* input, T* output) { // TODO: Move global_offset into final loop const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; unsigned int global_offset = blockIdx.x * BinSize; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; ++k) { values[k] = input[index + k]; } using bhistogram_t = rocprim::block_histogram; __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { bhistogram_t().histogram(values, histogram, storage); for(unsigned int k = 0; k < ItemsPerThread; ++k) { values[k] = BinSize - 1 - values[k]; } } ROCPRIM_UNROLL for(unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + threadIdx.x < BinSize) { output[global_offset + threadIdx.x] = histogram[offset + threadIdx.x]; global_offset += BlockSize; } } } }; /* Host-side driver for one benchmark configuration. Rounds the element count derived from state.bytes up to a multiple of BlockSize * ItemsPerThread, creates a zero-filled input of that size and an output buffer of BinSize entries per block, runs the kernel launch through state.run and reports size * Trials items of sizeof(T) bytes as throughput. */ template void run_benchmark(benchmark_utils::state&& state) { const auto& stream = state.stream; const auto& bytes = state.bytes; // Calculate the number of elements N size_t N = bytes / sizeof(T); // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); const auto bin_size = BinSize * ((N + items_per_block - 1) /
items_per_block); // Allocate and fill memory std::vector input(size, 0.0f); common::device_ptr d_input(input); common::device_ptr d_output(bin_size); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { kernel <<>>(d_input.get(), d_output.get()); HIP_CHECK(hipGetLastError()); }); state.set_throughput(size * Trials, sizeof(T)); } /* CREATE_BENCHMARK queues run_benchmark on the executor under a name built from the stringized type, block size and items per thread plus the method string. BENCHMARK_TYPE expands it for items-per-thread values 1, 2, 3, 4, 8 and 16; BENCHMARK_TYPE_128 for 1, 2, 3, 4, 8 and 12. The macros expect a variable called executor at the expansion site. */ #define CREATE_BENCHMARK(Benchmark, method, T, BS, IPT) \ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:histogram,key_type:" #T \ ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \ + std::string(method) + "}}") \ .c_str(), \ run_benchmark); #define BENCHMARK_TYPE(Benchmark, method, T, BS) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 1) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 2) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 3) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 4) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 8) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 16) #define BENCHMARK_TYPE_128(Benchmark, method, T, BS) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 1) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 2) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 3) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 4) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 8) \ CREATE_BENCHMARK(Benchmark, method, T, BS, 12) /* Benchmark lists for the histogram_atomic_t and histogram_sort_t aliases, which must be declared where these macros are expanded (main does so). Only the sort list includes the 128-bit integer types. */ #define BENCHMARK_ATOMIC() \ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", int, 256) \ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", int, 320) \ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", int, 512) \ \ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", unsigned long long, 256) \ BENCHMARK_TYPE(histogram_atomic_t, "using_atomic", unsigned long long, 320) #define BENCHMARK_SORT() \ BENCHMARK_TYPE(histogram_sort_t, "using_sort", int, 256) \ BENCHMARK_TYPE(histogram_sort_t, "using_sort", int, 320) \ BENCHMARK_TYPE(histogram_sort_t, "using_sort", int, 512) \ \ BENCHMARK_TYPE(histogram_sort_t, "using_sort", unsigned long long, 256) \ BENCHMARK_TYPE(histogram_sort_t, "using_sort", unsigned long long, 320) \ \
BENCHMARK_TYPE_128(histogram_sort_t, "using_sort", rocprim::int128_t, 256) \ BENCHMARK_TYPE_128(histogram_sort_t, "using_sort", rocprim::uint128_t, 256) /* Builds the executor from the command line (512 MiB passed as its third argument), queues the atomic and sort benchmark lists unless BENCHMARK_CONFIG_TUNING is defined, then runs the executor. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); #ifndef BENCHMARK_CONFIG_TUNING using histogram_atomic_t = histogram; using histogram_sort_t = histogram; BENCHMARK_ATOMIC() BENCHMARK_SORT() #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_radix_rank.cpp000066400000000000000000000153261506507210100237000ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE.
#include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include /* NOTE(review): template parameter lists, angle-bracket include targets and kernel launch configurations appear to have been stripped from this copy; confirm against the upstream file before building. */ /* Kernel that times rocprim::block_radix_rank. Each thread loads ItemsPerThread keys with a striped load. Every trial then walks the key from bit 0 to sizeof(T) * 8 in steps of RadixBits, ranking at most RadixBits bits per pass (the last pass is clamped by min) with rank_keys_desc when Descending is set and rank_keys otherwise. The ranks array is overwritten by each pass, so the striped store at the end writes the result of the final pass only. */ template __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, unsigned int* ranks_output) { using rank_type = rocprim::block_radix_rank; const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; rocprim::block_load_direct_striped(lid, keys_input + block_offset, keys); unsigned int ranks[ItemsPerThread]; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { ROCPRIM_SHARED_MEMORY typename rank_type::storage_type storage; unsigned int begin_bit = 0; const unsigned int end_bit = sizeof(T) * 8; while(begin_bit < end_bit) { const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); if constexpr(Descending) { rank_type().rank_keys_desc(keys, ranks, storage, begin_bit, pass_bits); } else { rank_type().rank_keys(keys, ranks, storage, begin_bit, pass_bits); } begin_bit += RadixBits; } } rocprim::block_store_direct_striped(lid, ranks_output + block_offset, ranks); } /* Host-side driver for one benchmark configuration. Computes grid_size as the element count derived from state.bytes divided by BlockSize * ItemsPerThread, rounded up, and size as that many full blocks; generates size random keys between common::generate_limits min() and max() seeded from state.seed; runs the kernel launch through state.run and reports size * Trials items of sizeof(T) bytes as throughput. */ template void run_benchmark(benchmark_utils::state&& state) { const auto& bytes = state.bytes; const auto& seed = state.seed; const auto& stream = state.stream; // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr size_t items_per_block = BlockSize * ItemsPerThread; const size_t grid_size = ((N + items_per_block - 1) / items_per_block); const size_t size = items_per_block * grid_size; std::vector input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { rank_kernel <<>>(d_input.get(), d_output.get()); HIP_CHECK(hipGetLastError()); }); state.set_throughput(size *
Trials, sizeof(T)); } /* CREATE_BENCHMARK queues run_benchmark on the executor under a name built from the stringized type, block size, items per thread and algorithm kind. CREATE_BENCHMARK_KINDS repeats it for the basic, basic_memoize and match algorithms; BENCHMARK_TYPE repeats that for items-per-thread values 1, 4, 8, 12, 16 and 20. The macros expect a variable called executor at the expansion site. */ #define CREATE_BENCHMARK(T, BS, IPT, KIND) \ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:radix_rank,key_type:" #T \ ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" #KIND \ "}}") \ .c_str(), \ run_benchmark); // clang-format off #define CREATE_BENCHMARK_KINDS(type, block, ipt) \ CREATE_BENCHMARK(type, block, ipt, rocprim::block_radix_rank_algorithm::basic) \ CREATE_BENCHMARK(type, block, ipt, rocprim::block_radix_rank_algorithm::basic_memoize) \ CREATE_BENCHMARK(type, block, ipt, rocprim::block_radix_rank_algorithm::match) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK_KINDS(type, block, 1) \ CREATE_BENCHMARK_KINDS(type, block, 4) \ CREATE_BENCHMARK_KINDS(type, block, 8) \ CREATE_BENCHMARK_KINDS(type, block, 12) \ CREATE_BENCHMARK_KINDS(type, block, 16) \ CREATE_BENCHMARK_KINDS(type, block, 20) // clang-format on /* Builds the executor from the command line (512 MiB passed as its third argument), queues five key types at block sizes 128, 256 and 512, then runs the executor. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); BENCHMARK_TYPE(int, 128) BENCHMARK_TYPE(int, 256) BENCHMARK_TYPE(int, 512) BENCHMARK_TYPE(uint8_t, 128) BENCHMARK_TYPE(uint8_t, 256) BENCHMARK_TYPE(uint8_t, 512) BENCHMARK_TYPE(long long, 128) BENCHMARK_TYPE(long long, 256) BENCHMARK_TYPE(long long, 512) BENCHMARK_TYPE(rocprim::int128_t, 128) BENCHMARK_TYPE(rocprim::int128_t, 256) BENCHMARK_TYPE(rocprim::int128_t, 512) BENCHMARK_TYPE(rocprim::uint128_t, 128) BENCHMARK_TYPE(rocprim::uint128_t, 256) BENCHMARK_TYPE(rocprim::uint128_t, 512) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_radix_sort.cpp000066400000000000000000000233671506507210100237400ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include /* NOTE(review): template parameter lists, angle-bracket include targets and kernel launch configurations appear to have been stripped from this copy; confirm against the upstream file before building. */ /* Selects which of the two kernels run_benchmark launches. */ enum class benchmark_kinds { sort_keys, sort_pairs }; /* Decomposer type handed to sort(): either custom_type_decomposer or rocprim::identity_decomposer, picked by a compile-time condition whose operand is not visible in this copy. */ template using select_decomposer_t = std::conditional_t::value, custom_type_decomposer, rocprim::identity_decomposer>; /* Kernel that times key-only sorting: striped load of ItemsPerThread keys per thread, Trials calls to sort.sort over bits 0 to sizeof(T) * 8, then a striped store of the keys. */ template __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T* input, T* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { rocprim::block_radix_sort sort; sort.sort(keys, 0, sizeof(T) * 8, select_decomposer_t{}); } rocprim::block_store_direct_striped(lid, output + block_offset, keys); } /* Kernel that times key-value sorting: values are initialised to keys[i] + T(1), keys and values are sorted together Trials times, and each value is then added into its key before the striped store (presumably so the sorted values contribute to the output and are not optimised away; confirm). */ template __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T* input, T* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; T values[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; ++i) { values[i] = keys[i] + T(1); } ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { rocprim::block_radix_sort sort; sort.sort(keys, values, 0, sizeof(T) * 8, select_decomposer_t{}); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { keys[i] += values[i]; } rocprim::block_store_direct_striped(lid, output + block_offset, keys); } /* Host-side driver for one benchmark configuration. Rounds the element count derived from state.bytes up to a multiple of BlockSize * ItemsPerThread, generates that many random keys between common::generate_limits min() and max() seeded from state.seed, launches sort_keys_kernel or sort_pairs_kernel according to BenchmarkKind inside state.run, and reports size * Trials items of sizeof(T) bytes as throughput. */ template void run_benchmark(benchmark_utils::state&& state) { const auto& bytes = state.bytes; const auto& seed = state.seed; const auto& stream = state.stream; // Calculate the number of elements N size_t N = bytes / sizeof(T); constexpr auto items_per_block =
BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { if constexpr(BenchmarkKind == benchmark_kinds::sort_keys) { sort_keys_kernel <<>>(d_input.get(), d_output.get()); } else if constexpr(BenchmarkKind == benchmark_kinds::sort_pairs) { sort_pairs_kernel <<>>(d_input.get(), d_output.get()); } HIP_CHECK(hipGetLastError()); }); state.set_throughput(size * Trials, sizeof(T)); } /* CREATE_BENCHMARK queues run_benchmark on the executor under a name built from the stringized type, the in-scope name string, block size, radix bits and items per thread. BENCHMARK_TYPE expands it for items-per-thread values 1, 2, 3, 4 and 8. Both macros expect variables called name and executor at the expansion site. */ #define CREATE_BENCHMARK(T, BS, RB, IPT) \ executor.queue_fn( \ bench_naming::format_name("{lvl:block,algo:radix_sort,key_type:" #T ",subalgo:" + name \ + ",cfg:{bs:" #BS ",rb:" #RB ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark); #define BENCHMARK_TYPE(type, block, radix_bits) \ CREATE_BENCHMARK(type, block, radix_bits, 1) \ CREATE_BENCHMARK(type, block, radix_bits, 2) \ CREATE_BENCHMARK(type, block, radix_bits, 3) \ CREATE_BENCHMARK(type, block, radix_bits, 4) \ CREATE_BENCHMARK(type, block, radix_bits, 8) /* Queues, for every key type below, block sizes 64 and 512 with 3 radix bits and block sizes 64, 128, 192, 256, 320 and 512 with 4 radix bits; name becomes the subalgo field of each benchmark name. */ template void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) { using custom_int_type = common::custom_type; BENCHMARK_TYPE(int, 64, 3) BENCHMARK_TYPE(int, 512, 3) BENCHMARK_TYPE(int, 64, 4) BENCHMARK_TYPE(int, 128, 4) BENCHMARK_TYPE(int, 192, 4) BENCHMARK_TYPE(int, 256, 4) BENCHMARK_TYPE(int, 320, 4) BENCHMARK_TYPE(int, 512, 4) BENCHMARK_TYPE(int8_t, 64, 3) BENCHMARK_TYPE(int8_t, 512, 3) BENCHMARK_TYPE(int8_t, 64, 4) BENCHMARK_TYPE(int8_t, 128, 4) BENCHMARK_TYPE(int8_t, 192, 4) BENCHMARK_TYPE(int8_t, 256, 4) BENCHMARK_TYPE(int8_t, 320, 4) BENCHMARK_TYPE(int8_t, 512, 4) BENCHMARK_TYPE(uint8_t, 64, 3) BENCHMARK_TYPE(uint8_t, 512, 3) BENCHMARK_TYPE(uint8_t, 64, 4) BENCHMARK_TYPE(uint8_t, 128, 4) BENCHMARK_TYPE(uint8_t, 192, 4) BENCHMARK_TYPE(uint8_t, 256, 4)
BENCHMARK_TYPE(uint8_t, 320, 4) BENCHMARK_TYPE(uint8_t, 512, 4) BENCHMARK_TYPE(rocprim::half, 64, 3) BENCHMARK_TYPE(rocprim::half, 512, 3) BENCHMARK_TYPE(rocprim::half, 64, 4) BENCHMARK_TYPE(rocprim::half, 128, 4) BENCHMARK_TYPE(rocprim::half, 192, 4) BENCHMARK_TYPE(rocprim::half, 256, 4) BENCHMARK_TYPE(rocprim::half, 320, 4) BENCHMARK_TYPE(rocprim::half, 512, 4) BENCHMARK_TYPE(long long, 64, 3) BENCHMARK_TYPE(long long, 512, 3) BENCHMARK_TYPE(long long, 64, 4) BENCHMARK_TYPE(long long, 128, 4) BENCHMARK_TYPE(long long, 192, 4) BENCHMARK_TYPE(long long, 256, 4) BENCHMARK_TYPE(long long, 320, 4) BENCHMARK_TYPE(long long, 512, 4) BENCHMARK_TYPE(custom_int_type, 64, 3) BENCHMARK_TYPE(custom_int_type, 512, 3) BENCHMARK_TYPE(custom_int_type, 64, 4) BENCHMARK_TYPE(custom_int_type, 128, 4) BENCHMARK_TYPE(custom_int_type, 192, 4) BENCHMARK_TYPE(custom_int_type, 256, 4) BENCHMARK_TYPE(custom_int_type, 320, 4) BENCHMARK_TYPE(custom_int_type, 512, 4) BENCHMARK_TYPE(rocprim::int128_t, 64, 3) BENCHMARK_TYPE(rocprim::int128_t, 512, 3) BENCHMARK_TYPE(rocprim::int128_t, 64, 4) BENCHMARK_TYPE(rocprim::int128_t, 128, 4) BENCHMARK_TYPE(rocprim::int128_t, 192, 4) BENCHMARK_TYPE(rocprim::int128_t, 256, 4) BENCHMARK_TYPE(rocprim::int128_t, 320, 4) BENCHMARK_TYPE(rocprim::int128_t, 512, 4) BENCHMARK_TYPE(rocprim::uint128_t, 64, 3) BENCHMARK_TYPE(rocprim::uint128_t, 512, 3) BENCHMARK_TYPE(rocprim::uint128_t, 64, 4) BENCHMARK_TYPE(rocprim::uint128_t, 128, 4) BENCHMARK_TYPE(rocprim::uint128_t, 192, 4) BENCHMARK_TYPE(rocprim::uint128_t, 256, 4) BENCHMARK_TYPE(rocprim::uint128_t, 320, 4) BENCHMARK_TYPE(rocprim::uint128_t, 512, 4) } /* Builds the executor from the command line (512 MiB passed as its third argument), queues the "keys" and "pairs" benchmark sets and runs them. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); add_benchmarks("keys", executor); add_benchmarks("pairs", executor); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_reduce.cpp000066400000000000000000000153201506507210100230170ustar00rootroot00000000000000// MIT License // //
Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_device_ptr.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include /* NOTE(review): template parameter lists, angle-bracket include targets and kernel launch configurations appear to have been stripped from this copy; confirm against the upstream file before building. */ /* Kernel entry point: forwards input and output unchanged to the static Runner::run. */ template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } /* Runner that times rocprim::block_reduce. Each thread reads ItemsPerThread consecutive values starting at (blockIdx.x * blockDim.x + threadIdx.x) * ItemsPerThread; the block reduces them Trials times, writing the result back into values[0] after every trial so the next trial depends on the previous one; thread 0 then stores the last reduced_value to output[blockIdx.x]. */ template struct reduce { template __device__ static void run(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; T reduced_value; for(unsigned int k = 0; k < ItemsPerThread; ++k) { values[k] = input[i * ItemsPerThread + k]; } using breduce_t = rocprim::block_reduce; __shared__ typename breduce_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { breduce_t().reduce(values, reduced_value, storage); values[0] = reduced_value; } if(threadIdx.x == 0) { output[blockIdx.x] = reduced_value; } } }; /* Host-side driver for one benchmark configuration. Rounds the element count derived from state.bytes up to a multiple of BlockSize * ItemsPerThread, creates an input of that size filled with T(1) and an output buffer of the same element count, runs the kernel launch through state.run and reports size * Trials items of sizeof(T) bytes as throughput. */ template void run_benchmark(benchmark_utils::state&& state) { const auto& bytes = state.bytes; const auto& stream = state.stream; // Calculate the number of elements N size_t N = bytes / sizeof(T); // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, T(1)); common::device_ptr d_input(input); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { kernel <<>>(d_input.get(), d_output.get()); HIP_CHECK(hipGetLastError()); }); state.set_throughput(size * Trials, sizeof(T)); } /* CREATE_BENCHMARK queues run_benchmark on the executor under a name built from the stringized type, block size and items per thread plus the in-scope name string as the method. BENCHMARK_TYPE expands it for items-per-thread values 1, 2, 3, 4, 8, 11 and 16. Both macros expect variables called name and executor at the expansion site. */ #define CREATE_BENCHMARK(T, BS, IPT) \ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:reduce,key_type:" #T \ ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \ + name + "}}") \ .c_str(), \ run_benchmark); #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1) \ CREATE_BENCHMARK(type, block, 2) \ CREATE_BENCHMARK(type, block, 3) \
CREATE_BENCHMARK(type, block, 4) \ CREATE_BENCHMARK(type, block, 8) \ CREATE_BENCHMARK(type, block, 11) \ CREATE_BENCHMARK(type, block, 16) /* Queues the scalar key types at block sizes 64 and 256 with the full items-per-thread list, and the two-/four-component vector and custom types at block size 256 with items-per-thread 1, 4 and 8; name becomes the method field of each benchmark name. */ template void add_benchmarks(const std::string& name, benchmark_utils::executor& executor) { using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64) BENCHMARK_TYPE(float, 64) BENCHMARK_TYPE(double, 64) BENCHMARK_TYPE(int8_t, 64) BENCHMARK_TYPE(uint8_t, 64) BENCHMARK_TYPE(rocprim::half, 64) BENCHMARK_TYPE(rocprim::int128_t, 64) BENCHMARK_TYPE(rocprim::uint128_t, 64) BENCHMARK_TYPE(int, 256) BENCHMARK_TYPE(float, 256) BENCHMARK_TYPE(double, 256) BENCHMARK_TYPE(int8_t, 256) BENCHMARK_TYPE(uint8_t, 256) BENCHMARK_TYPE(rocprim::half, 256) BENCHMARK_TYPE(rocprim::int128_t, 256) BENCHMARK_TYPE(rocprim::uint128_t, 256) CREATE_BENCHMARK(custom_float2, 256, 1) CREATE_BENCHMARK(custom_float2, 256, 4) CREATE_BENCHMARK(custom_float2, 256, 8) CREATE_BENCHMARK(float2, 256, 1) CREATE_BENCHMARK(float2, 256, 4) CREATE_BENCHMARK(float2, 256, 8) CREATE_BENCHMARK(custom_double2, 256, 1) CREATE_BENCHMARK(custom_double2, 256, 4) CREATE_BENCHMARK(custom_double2, 256, 8) CREATE_BENCHMARK(double2, 256, 1) CREATE_BENCHMARK(double2, 256, 4) CREATE_BENCHMARK(double2, 256, 8) CREATE_BENCHMARK(float4, 256, 1) CREATE_BENCHMARK(float4, 256, 4) CREATE_BENCHMARK(float4, 256, 8) } /* Builds the executor from the command line (128 MiB passed as its third argument), queues the using_warp_reduce, raking_reduce and raking_reduce_commutative_only benchmark sets and runs them. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0); using reduce_uwr_t = reduce; add_benchmarks("using_warp_reduce", executor); using reduce_rr_t = reduce; add_benchmarks("raking_reduce", executor); using reduce_rrco_t = reduce; add_benchmarks("raking_reduce_commutative_only", executor); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_run_length_decode.cpp000066400000000000000000000213011506507210100252140ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021-2025 Advanced Micro Devices,
Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" #include #include #include #include #include #include #include #include #include #include template __global__ __launch_bounds__(BlockSize) void block_run_length_decode_kernel(const ItemT* d_run_items, const OffsetT* d_run_offsets, ItemT* d_decoded_items, bool enable_store = false) { using BlockRunLengthDecodeT = rocprim::block_run_length_decode; ItemT run_items[RunsPerThread]; OffsetT run_offsets[RunsPerThread]; const unsigned global_thread_idx = BlockSize * hipBlockIdx_x + hipThreadIdx_x; rocprim::block_load_direct_blocked(global_thread_idx, d_run_items, run_items); rocprim::block_load_direct_blocked(global_thread_idx, d_run_offsets, run_offsets); BlockRunLengthDecodeT block_run_length_decode(run_items, run_offsets); const OffsetT total_decoded_size = d_run_offsets[(hipBlockIdx_x + 1) * BlockSize * RunsPerThread] - d_run_offsets[hipBlockIdx_x * BlockSize * RunsPerThread]; #pragma nounroll for(unsigned i = 0; i < Trials; ++i) { OffsetT decoded_window_offset = 0; while(decoded_window_offset < total_decoded_size) { ItemT decoded_items[DecodedItemsPerThread]; block_run_length_decode.run_length_decode(decoded_items, decoded_window_offset); if(enable_store) { rocprim::block_store_direct_blocked(global_thread_idx, d_decoded_items + decoded_window_offset, decoded_items); } decoded_window_offset += BlockSize * DecodedItemsPerThread; } } } template void run_benchmark(benchmark_utils::state&& state) { const auto& bytes = state.bytes; const auto& seed = state.seed; const auto& stream = state.stream; // Calculate the number of elements N size_t N = bytes / sizeof(ItemT); constexpr auto runs_per_block = BlockSize * RunsPerThread; const auto target_num_runs = 2 * N / (MinRunLength + MaxRunLength); const auto num_runs = runs_per_block * ((target_num_runs + runs_per_block - 1) / runs_per_block); std::vector run_items(num_runs); std::vector run_offsets(num_runs + 
1); engine_type prng(seed.get_0()); using ItemDistribution = std::conditional_t::value, common::uniform_int_distribution, std::uniform_real_distribution>; ItemDistribution run_item_dist(0, 100); common::uniform_int_distribution run_length_dist(MinRunLength, MaxRunLength); for(size_t i = 0; i < num_runs; ++i) { run_items[i] = run_item_dist(prng); } for(size_t i = 1; i < num_runs + 1; ++i) { const OffsetT next_run_length = run_length_dist(prng); run_offsets[i] = run_offsets[i - 1] + next_run_length; } const OffsetT output_length = run_offsets.back(); common::device_ptr d_run_items(run_items); common::device_ptr d_run_offsets(run_offsets); common::device_ptr d_output(output_length); state.run( [&] { block_run_length_decode_kernel <<>>( d_run_items.get(), d_run_offsets.get(), d_output.get()); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); }); state.set_throughput(output_length * Trials, sizeof(ItemT)); } #define CREATE_BENCHMARK(IT, OT, MINRL, MAXRL, BS, RPT, DIPT) \ executor.queue_fn( \ bench_naming::format_name("{lvl:block,algo:run_length_decode" \ ",item_type:" #IT ",offset_type:" #OT ",min_run_length:" #MINRL \ ",max_run_length:" #MAXRL ",cfg:{block_size:" #BS \ ",run_per_thread:" #RPT ",decoded_items_per_thread:" #DIPT "}}") \ .c_str(), \ &run_benchmark); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0); CREATE_BENCHMARK(int, int, 1, 5, 128, 2, 4) CREATE_BENCHMARK(int, int, 1, 10, 128, 2, 4) CREATE_BENCHMARK(int, int, 1, 50, 128, 2, 4) CREATE_BENCHMARK(int, int, 1, 100, 128, 2, 4) CREATE_BENCHMARK(int, int, 1, 500, 128, 2, 4) CREATE_BENCHMARK(int, int, 1, 1000, 128, 2, 4) CREATE_BENCHMARK(int, int, 1, 5000, 128, 2, 4) CREATE_BENCHMARK(double, long long, 1, 5, 128, 2, 4) CREATE_BENCHMARK(double, long long, 1, 10, 128, 2, 4) CREATE_BENCHMARK(double, long long, 1, 50, 128, 2, 4) CREATE_BENCHMARK(double, long long, 1, 100, 128, 2, 4) CREATE_BENCHMARK(double, long long, 1, 500, 128, 
2, 4) CREATE_BENCHMARK(double, long long, 1, 1000, 128, 2, 4) CREATE_BENCHMARK(double, long long, 1, 5000, 128, 2, 4) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t, 1, 5, 128, 2, 4) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t, 1, 10, 128, 2, 4) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t, 1, 50, 128, 2, 4) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t, 1, 100, 128, 2, 4) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t, 1, 500, 128, 2, 4) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t, 1, 1000, 128, 2, 4) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t, 1, 5000, 128, 2, 4) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t, 1, 5, 128, 2, 4) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t, 1, 10, 128, 2, 4) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t, 1, 50, 128, 2, 4) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t, 1, 100, 128, 2, 4) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t, 1, 500, 128, 2, 4) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t, 1, 1000, 128, 2, 4) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t, 1, 5000, 128, 2, 4) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_scan.cpp000066400000000000000000000205171506507210100225000ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_device_ptr.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include template __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct inclusive_scan { template __device__ static void run(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; ++k) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = rocprim::block_scan; __shared__ typename bscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { bscan_t().inclusive_scan(values, values, storage); } for(unsigned int k = 0; k < ItemsPerThread; ++k) { output[i * ItemsPerThread + k] = values[k]; } } }; template struct exclusive_scan { template __device__ static void run(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; using U = typename std::remove_reference::type; T values[ItemsPerThread]; U init = U(100); for(unsigned int k = 0; k < ItemsPerThread; ++k) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = rocprim::block_scan; __shared__ typename bscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { bscan_t().exclusive_scan(values, values, init, storage); } 
for(unsigned int k = 0; k < ItemsPerThread; ++k) { output[i * ItemsPerThread + k] = values[k]; } } }; template void run_benchmark(benchmark_utils::state&& state) { const auto& bytes = state.bytes; const auto& stream = state.stream; // Calculate the number of elements N size_t N = bytes / sizeof(T); // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input(size, T(1)); common::device_ptr d_input(input); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { kernel <<>>(d_input.get(), d_output.get()); HIP_CHECK(hipGetLastError()); }); state.set_throughput(size * Trials, sizeof(T)); } #define CREATE_BENCHMARK(T, BS, IPT) \ executor.queue_fn(bench_naming::format_name("{lvl:block,algo:scan,subalgo:" + algorithm_name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ ",method:" \ + method_name + "}}") \ .c_str(), \ run_benchmark); #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1) \ CREATE_BENCHMARK(type, block, 2) \ CREATE_BENCHMARK(type, block, 3) \ CREATE_BENCHMARK(type, block, 4) \ CREATE_BENCHMARK(type, block, 8) \ CREATE_BENCHMARK(type, block, 11) \ CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(const std::string& method_name, const std::string& algorithm_name, benchmark_utils::executor& executor) { using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64) BENCHMARK_TYPE(float, 64) BENCHMARK_TYPE(double, 64) BENCHMARK_TYPE(int8_t, 64) BENCHMARK_TYPE(uint8_t, 64) BENCHMARK_TYPE(rocprim::half, 64) BENCHMARK_TYPE(int, 256) BENCHMARK_TYPE(float, 256) BENCHMARK_TYPE(double, 256) BENCHMARK_TYPE(int8_t, 256) BENCHMARK_TYPE(uint8_t, 256) BENCHMARK_TYPE(rocprim::half, 256) CREATE_BENCHMARK(custom_float2, 256, 1) 
CREATE_BENCHMARK(custom_float2, 256, 4) CREATE_BENCHMARK(custom_float2, 256, 8) CREATE_BENCHMARK(float2, 256, 1) CREATE_BENCHMARK(float2, 256, 4) CREATE_BENCHMARK(float2, 256, 8) CREATE_BENCHMARK(custom_double2, 256, 1) CREATE_BENCHMARK(custom_double2, 256, 4) CREATE_BENCHMARK(custom_double2, 256, 8) CREATE_BENCHMARK(double2, 256, 1) CREATE_BENCHMARK(double2, 256, 4) CREATE_BENCHMARK(double2, 256, 8) CREATE_BENCHMARK(float4, 256, 1) CREATE_BENCHMARK(float4, 256, 4) CREATE_BENCHMARK(float4, 256, 8) CREATE_BENCHMARK(rocprim::int128_t, 256, 1) CREATE_BENCHMARK(rocprim::int128_t, 256, 4) CREATE_BENCHMARK(rocprim::int128_t, 256, 8) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4) CREATE_BENCHMARK(rocprim::uint128_t, 256, 8) } int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 1, 0); using inclusive_scan_uws_t = inclusive_scan; add_benchmarks("inclusive_scan", "using_warp_scan", executor); using exclusive_scan_uws_t = exclusive_scan; add_benchmarks("exclusive_scan", "using_warp_scan", executor); using inclusive_scan_rts_t = inclusive_scan; add_benchmarks("inclusive_scan", "reduce_then_scan", executor); using exclusive_scan_rts_t = exclusive_scan; add_benchmarks("exclusive_scan", "reduce_then_scan", executor); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_sort.cpp000066400000000000000000000064331506507210100225440ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_block_sort.parallel.hpp" // HIP API #include // rocPRIM #ifndef BENCHMARK_CONFIG_TUNING #include #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK_IPT_ALG(K, V, BS, IPT, ALG) \ benchmark_utils::executor::queue_sorted_instance< \ block_sort_benchmark>(); \ benchmark_utils::executor::queue_sorted_instance< \ block_sort_benchmark>(); #define CREATE_BENCHMARK_IPT(K, V, BS, IPT) \ CREATE_BENCHMARK_IPT_ALG(K, V, BS, IPT, rocprim::block_sort_algorithm::merge_sort) \ CREATE_BENCHMARK_IPT_ALG(K, V, BS, IPT, rocprim::block_sort_algorithm::stable_merge_sort) \ CREATE_BENCHMARK_IPT_ALG(K, V, BS, IPT, rocprim::block_sort_algorithm::bitonic_sort) #define CREATE_BENCHMARK(K, V, BS) \ CREATE_BENCHMARK_IPT(K, V, BS, 1) \ CREATE_BENCHMARK_IPT(K, V, BS, 4) int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 10, 0); // Block sizes as large as possible are most relevant CREATE_BENCHMARK(float, rocprim::empty_type, 256) CREATE_BENCHMARK(double, rocprim::empty_type, 256) CREATE_BENCHMARK(rocprim::half, rocprim::empty_type, 256) CREATE_BENCHMARK(uint8_t, rocprim::empty_type, 256) CREATE_BENCHMARK(int, rocprim::empty_type, 256) CREATE_BENCHMARK(int, rocprim::empty_type, 512) CREATE_BENCHMARK(double, rocprim::empty_type, 512) CREATE_BENCHMARK(rocprim::int128_t, rocprim::empty_type, 256) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::empty_type, 256) CREATE_BENCHMARK(int, int, 512) CREATE_BENCHMARK(float, double, 512) CREATE_BENCHMARK(double, int64_t, 512) CREATE_BENCHMARK(rocprim::half, int16_t, 512) CREATE_BENCHMARK(uint8_t, uint32_t, 512) CREATE_BENCHMARK(int64_t, rocprim::int128_t, 512) CREATE_BENCHMARK(uint64_t, rocprim::uint128_t, 512) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_block_sort.parallel.hpp000066400000000000000000000235541506507210100243470ustar00rootroot00000000000000// MIT License // // Copyright (c) 2019-2025 Advanced 
Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_BLOCK_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_BLOCK_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include #include template::value, bool> = true> __global__ __launch_bounds__(BlockSize) void sort_kernel(const KeyType* input, KeyType* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; KeyType keys[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); rocprim::block_sort bsort; bsort.sort(keys); rocprim::block_store_direct_blocked(lid, output + block_offset, keys); } template::value, bool> = true> __global__ __launch_bounds__(BlockSize) void sort_kernel(const KeyType* input, KeyType* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; KeyType keys[ItemsPerThread]; ValueType values[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { values[item] = block_offset + lid * ItemsPerThread + item; } rocprim::block_sort bsort; bsort.sort(keys, values); ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { keys[item] = keys[item] + static_cast(values[item]); } rocprim::block_store_direct_blocked(lid, output + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void stable_sort_kernel(const KeyType* input, KeyType* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; KeyType keys[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); using stable_key_type = rocprim::tuple; stable_key_type 
stable_keys[ItemsPerThread]; ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { stable_keys[item] = rocprim::make_tuple(keys[item], ItemsPerThread * lid + item); } // Special comparison that preserves relative order of equal keys auto stable_compare_function = [](const stable_key_type& a, const stable_key_type& b) mutable -> bool { const bool ab = rocprim::less{}(rocprim::get<0>(a), rocprim::get<0>(b)); return ab || (!rocprim::less{}(rocprim::get<0>(b), rocprim::get<0>(a)) && (rocprim::get<1>(a) < rocprim::get<1>(b))); }; rocprim::block_sort bsort; bsort.sort(stable_keys, stable_compare_function); ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { keys[item] = rocprim::get<0>(stable_keys[item]); } rocprim::block_store_direct_blocked(lid, output + block_offset, keys); } template struct block_sort_benchmark : public benchmark_utils::autotune_interface { private: static constexpr bool with_values = !std::is_same::value; static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; static const char* get_block_sort_method_name(rocprim::block_sort_algorithm alg) { switch(alg) { case rocprim::block_sort_algorithm::merge_sort: return "merge_sort"; case rocprim::block_sort_algorithm::stable_merge_sort: return "stable_merge_sort"; case rocprim::block_sort_algorithm::bitonic_sort: return "bitonic_sort"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } public: std::string sort_key() const override { using namespace std::string_literals; return std::string((with_values ? "_pairs"s : "_keys"s) + (stable ? "_stable"s : ""s) + pad_string(std::to_string(items_per_block), 5) + ", " + name()); } std::string name() const override { return bench_naming::format_name( "{lvl:block,algo:sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",stable:" + (stable ? 
"true" : "false") + ",cfg:{bs:" + std::to_string(BlockSize) + ",ipt:" + std::to_string(ItemsPerThread) + ",method:" + std::string(get_block_sort_method_name(block_sort_algorithm)) + "}}"); } static auto dispatch_block_sort(std::false_type /*stable_sort*/, size_t size, const hipStream_t stream, KeyType* d_input, KeyType* d_output) { sort_kernel <<>>(d_input, d_output); } static auto dispatch_block_sort(std::true_type /*stable_sort*/, size_t size, const hipStream_t stream, KeyType* d_input, KeyType* d_output) { stable_sort_kernel <<>>(d_input, d_output); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements N size_t N = bytes / sizeof(KeyType); const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); static constexpr auto stable_tag = rocprim::detail::bool_constant{}; state.run( [&] { dispatch_block_sort(stable_tag, size, stream, d_input.get(), d_output.get()); }); state.set_throughput(size, sizeof(KeyType)); state.gbench_state.counters["sorted_size"] = benchmark::Counter(BlockSize * ItemsPerThread, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024); } }; #endif // ROCPRIM_BENCHMARK_BLOCK_SORT_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_config_dispatch.cpp000066400000000000000000000050401506507210100235200ustar00rootroot00000000000000 #include "benchmark_utils.hpp" #include #include #include #include #include enum class stream_kind { default_stream, per_thread_stream, explicit_stream, async_stream }; template static void BM_host_target_arch(benchmark_utils::state&& state) { const hipStream_t stream = []() -> hipStream_t { hipStream_t stream = 0; switch(StreamKind) 
{ case stream_kind::default_stream: return stream; case stream_kind::per_thread_stream: return hipStreamPerThread; case stream_kind::explicit_stream: HIP_CHECK(hipStreamCreate(&stream)); return stream; case stream_kind::async_stream: HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); return stream; } }(); state.run( [&] { rocprim::detail::target_arch target_arch; HIP_CHECK(rocprim::detail::host_target_arch(stream, target_arch)); benchmark::DoNotOptimize(target_arch); }); state.set_throughput(1, sizeof(char)); if(StreamKind != stream_kind::default_stream && StreamKind != stream_kind::per_thread_stream) { HIP_CHECK(hipStreamDestroy(stream)); } } __global__ void empty_kernel() {} // An empty kernel launch for baseline static void BM_kernel_launch(benchmark_utils::state&& state) { const auto& stream = state.stream; state.run( [&] { empty_kernel<<>>(); HIP_CHECK(hipGetLastError()); }); state.set_throughput(1, sizeof(char)); } #define CREATE_BENCHMARK(ST, SK) \ executor.queue_fn( \ bench_naming::format_name("{lvl:na,algo:" #ST ",cfg:default_config}").c_str(), \ BM_host_target_arch); int main(int argc, char** argv) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0, true, 100); CREATE_BENCHMARK(default_stream, stream_kind::default_stream) CREATE_BENCHMARK(per_thread_stream, stream_kind::per_thread_stream) CREATE_BENCHMARK(explicit_stream, stream_kind::explicit_stream) CREATE_BENCHMARK(async_stream, stream_kind::async_stream) executor.queue_fn( bench_naming::format_name("{lvl:na,algo:empty_kernel,cfg:default_config}").c_str(), BM_kernel_launch); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_adjacent_difference.cpp000066400000000000000000000053041506507210100256410ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_adjacent_difference.parallel.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/device_adjacent_difference.hpp" #include "../common/utils_custom_type.hpp" #endif // HIP API #include // rocPRIM #ifndef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(T, Left, Aliasing) \ executor.queue_instance(device_adjacent_difference_benchmark()); // clang-format off #define CREATE_BENCHMARKS(T) \ CREATE_BENCHMARK(T, true, common::api_variant::no_alias) \ CREATE_BENCHMARK(T, true, common::api_variant::in_place) \ CREATE_BENCHMARK(T, false, common::api_variant::no_alias) \ CREATE_BENCHMARK(T, false, common::api_variant::in_place) // clang-format on int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 2 * benchmark_utils::GiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; CREATE_BENCHMARKS(int); CREATE_BENCHMARKS(std::int64_t); CREATE_BENCHMARKS(uint8_t); CREATE_BENCHMARKS(rocprim::half); CREATE_BENCHMARKS(float); CREATE_BENCHMARKS(double); CREATE_BENCHMARKS(custom_float2); CREATE_BENCHMARKS(custom_double2); CREATE_BENCHMARKS(rocprim::int128_t); CREATE_BENCHMARKS(rocprim::uint128_t); #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in000066400000000000000000000031071506507210100300400ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_adjacent_difference.parallel.hpp" #include "benchmark_utils.hpp" #include "../common/device_adjacent_difference.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_adjacent_difference_benchmark_generator< @DataType@, @BlockSize@, @Left@, common::api_variant::@Aliasing@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_adjacent_difference.parallel.hpp000066400000000000000000000151571506507210100274500ustar00rootroot00000000000000// MIT License // // Copyright (c) 2019-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/device_adjacent_difference.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include template std::string config_name() { auto config = Config(); return "{bs:" + std::to_string(config.block_size) + ",ipt:" + std::to_string(config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_adjacent_difference_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:adjacent_difference" + (Left ? 
""s : "_right"s) + (Aliasing == common::api_variant::no_alias ? ""s : "_inplace"s) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using output_type = T; static constexpr bool debug_synchronous = false; // Generate data const size_t size = bytes / sizeof(T); const auto random_range = limit_random_range(1, 100); const std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output; if constexpr(Aliasing == common::api_variant::no_alias) { d_output.resize(size); } static constexpr auto left_tag = rocprim::detail::bool_constant{}; static constexpr auto alias_tag = std::integral_constant{}; // Allocate temporary storage std::size_t temp_storage_size; common::device_ptr d_temp_storage; const auto launch = [&] { return common::dispatch_adjacent_difference(left_tag, alias_tag, d_temp_storage.get(), temp_storage_size, d_input.get(), d_output.get(), size, rocprim::plus<>{}, stream, debug_synchronous); }; HIP_CHECK(launch()); d_temp_storage.resize(temp_storage_size); state.run([&] { HIP_CHECK(launch()); }); state.set_throughput(size, sizeof(T)); } }; template struct device_adjacent_difference_benchmark_generator { // Device Adjacent difference uses block_load/store_transpose to coalesce memory transaction to global memory // However it accesses shared memory with a stride of items per thread, which leads to reduced performance if power // of two is used for small types. Experiments shown that primes are the best choice for performance. 
static constexpr std::array primes{1, 2, 3, 5, 7, 11, 13, 17, 19, 23, 29, 31}; static constexpr unsigned int max_items_per_thread_arg = TUNING_SHARED_MEMORY_MAX / (BlockSize * sizeof(T) * 2 + sizeof(T)); template struct create_ipt { template auto operator()(std::vector>& storage) -> std::enable_if_t<(ipt_num < max_items_per_thread_arg)> { using generated_config = rocprim::adjacent_difference_config; storage.emplace_back( std::make_unique< device_adjacent_difference_benchmark>()); } template auto operator()(std::vector>&) -> std::enable_if_t {} }; static void create(std::vector>& storage) { static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_adjacent_find.cpp000066400000000000000000000060731506507210100244730ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_adjacent_find.parallel.hpp" #include "benchmark_utils.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // HIP #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif // C++ Standard Library #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(T, P) executor.queue_instance(device_adjacent_find_benchmark()); #define CREATE_ADJACENT_FIND_BENCHMARKS(T) \ CREATE_BENCHMARK(T, 1) \ CREATE_BENCHMARK(T, 5) \ CREATE_BENCHMARK(T, 9) int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 2 * benchmark_utils::GiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_int2 = common::custom_type; using custom_char_double = common::custom_type; using custom_longlong_double = common::custom_type; // Tuned types CREATE_ADJACENT_FIND_BENCHMARKS(int8_t) CREATE_ADJACENT_FIND_BENCHMARKS(int16_t) CREATE_ADJACENT_FIND_BENCHMARKS(int32_t) CREATE_ADJACENT_FIND_BENCHMARKS(int64_t) CREATE_ADJACENT_FIND_BENCHMARKS(rocprim::half) CREATE_ADJACENT_FIND_BENCHMARKS(float) CREATE_ADJACENT_FIND_BENCHMARKS(double) CREATE_ADJACENT_FIND_BENCHMARKS(rocprim::int128_t) CREATE_ADJACENT_FIND_BENCHMARKS(rocprim::uint128_t) // Custom types CREATE_ADJACENT_FIND_BENCHMARKS(custom_float2) CREATE_ADJACENT_FIND_BENCHMARKS(custom_double2) CREATE_ADJACENT_FIND_BENCHMARKS(custom_int2) CREATE_ADJACENT_FIND_BENCHMARKS(custom_char_double) CREATE_ADJACENT_FIND_BENCHMARKS(custom_longlong_double) #endif executor.run(); } 
rocPRIM-rocm-7.1.0/benchmark/benchmark_device_adjacent_find.parallel.cpp.in000066400000000000000000000027201506507210100266660ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "benchmark_device_adjacent_find.parallel.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_adjacent_find_benchmark_generator< @InputType@, @BlockSize@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_adjacent_find.parallel.hpp000066400000000000000000000176451506507210100263020ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
/// Autotune benchmark for rocprim::adjacent_find.
///
/// The input is built so that the first pair of equal neighbours sits at a chosen
/// fraction of the input length; FirstAdjPosDecimal expresses that fraction in
/// tenths (it is multiplied by 0.1f wherever it is used).
///
/// NOTE(review): the template parameter list and several template argument lists
/// (after `Traits`, `config_name`, `static_cast`, `std::vector`, `std::is_same`,
/// `get_random_data`, ...) appear to be missing from this copy of the file. The
/// body uses InputT, FirstAdjPosDecimal and Config -- confirm against upstream.
template struct device_adjacent_find_benchmark : public benchmark_utils::autotune_interface
{
    /// Builds the benchmark identifier, including the relative position of the
    /// first adjacent pair (FirstAdjPosDecimal * 0.1f) and the configuration name.
    std::string name() const override
    {
        using namespace std::string_literals;
        return bench_naming::format_name(
            "{lvl:device,algo:adjacent_find,input_type:" + std::string(Traits::name())
            + ",first_adj_pos:" + std::to_string(FirstAdjPosDecimal * 0.1f)
            + ",cfg:" + config_name() + "}");
    }

    /// Prepares the input on the host, copies it to the device and times
    /// rocprim::adjacent_find using the stream, byte budget and seed in `state`.
    void run(benchmark_utils::state&& state) override
    {
        const auto& stream = state.stream;
        const auto& bytes  = state.bytes;
        const auto& seed   = state.seed;

        using input_type  = InputT;
        using output_type = std::size_t;

        // The byte budget is converted into a whole number of input elements.
        const size_t size = bytes / sizeof(input_type);

        // Get index of the first adjacent equal pair
        std::size_t first_adj_index = static_cast(size * FirstAdjPosDecimal * 0.1f);
        // Clamp so that first_adj_index + 1 is still a valid element index.
        // NOTE(review): size is unsigned, so `size - 1` / `size - 2` wrap around when
        // size < 2 and the element accesses below would then be out of bounds --
        // confirm that the executor never supplies such a small byte budget.
        if(first_adj_index >= size - 1)
        {
            first_adj_index = size - 2;
        }

        // Generate data ensuring there is no adjacent pair before first_adj_index
        std::vector input(size);
        if(std::is_same::value)
        {
            // For int8_t that has a very limited range of values, iota initialization
            // seems to give a more reliable benchmark input
            std::iota(input.begin(), input.end(), 0);
        }
        else
        {
            input = get_random_data(size,
                                    common::generate_limits::min(),
                                    common::generate_limits::max(),
                                    seed.get_0());
            // Helper sequence 0, 1, 2, ... used to drive std::transform by index.
            std::vector iota(size);
            std::iota(iota.begin(), iota.end(), 0);
            // Visits indices 1 .. first_adj_index. The lambda re-rolls input[idx] in
            // place while it equals its predecessor; the returned value is written to
            // the same position, because the output iterator starts at
            // input.begin() + 1 just as the index range starts at 1.
            // NOTE(review): every re-roll passes the same seed.get_0(); whether
            // get_random_value can yield a different value for an identical seed is
            // not visible here -- confirm that this loop always terminates.
            std::transform(iota.begin() + 1,
                           iota.begin() + first_adj_index + 1,
                           input.begin() + 1,
                           [&](std::size_t& idx)
                           {
                               while(input[idx] == input[idx - 1])
                               {
                                   input[idx] = get_random_value(
                                       common::generate_limits::min(),
                                       common::generate_limits::max(),
                                       seed.get_0());
                               }
                               return input[idx];
                           });
        }

        // Insert first adjacent pair
        input[first_adj_index] = input[first_adj_index + 1];

        // Raw device allocations; they are released with hipFree at the end of run().
        input_type*  d_input;
        output_type* d_output;
        HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input)));
        HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output)));
        HIP_CHECK(hipMemcpy(d_input,
                            input.data(),
                            input.size() * sizeof(*d_input),
                            hipMemcpyHostToDevice));

        std::size_t tmp_storage_size;
        void*       d_tmp_storage = nullptr;

        // Captures d_tmp_storage and tmp_storage_size by reference, so the same
        // callable is used for the size query and for the timed runs.
        auto launch_adjacent_find = [&]()
        {
            HIP_CHECK(::rocprim::adjacent_find(d_tmp_storage,
                                               tmp_storage_size,
                                               d_input,
                                               d_output,
                                               size,
                                               rocprim::equal_to{},
                                               stream,
                                               false));
        };

        // Get size of temporary storage
        // (d_tmp_storage is still nullptr for this first call)
        launch_adjacent_find();
        HIP_CHECK(hipMalloc(&d_tmp_storage, tmp_storage_size));

        state.run([&] { launch_adjacent_find(); });

        // Throughput is reported in terms of first_adj_index items, not the full
        // input size.
        state.set_throughput(first_adj_index, sizeof(input_type));

        HIP_CHECK(hipFree(d_input));
        HIP_CHECK(hipFree(d_output));
        HIP_CHECK(hipFree(d_tmp_storage));
    }
};
rocPRIM-rocm-7.1.0/benchmark/benchmark_device_batch_memcpy.cpp000066400000000000000000000533031506507210100243530ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" #include "../common/device_batch_memcpy.hpp" #include "../common/utils_device_ptr.hpp" #include // rocPRIM #include #include #include #include #include #ifdef BUILD_NAIVE_BENCHMARK #include #include #include #endif #include #include #include #include #include #include #include #include #include using namespace std::string_literals; template::type = 0> void batch_copy(void* temporary_storage, size_t& storage_size, InputBufferItType sources, OutputBufferItType destinations, BufferSizeItType sizes, uint32_t num_copies, hipStream_t stream) { HIP_CHECK(rocprim::batch_memcpy(temporary_storage, storage_size, sources, destinations, sizes, num_copies, stream)); } template::type = 0> void batch_copy(void* temporary_storage, size_t& storage_size, InputBufferItType sources, OutputBufferItType destinations, BufferSizeItType sizes, uint32_t num_copies, hipStream_t stream) { HIP_CHECK(rocprim::batch_copy(temporary_storage, storage_size, sources, destinations, sizes, num_copies, stream)); } template struct BatchMemcpyData { size_t total_num_elements = 0; common::device_ptr d_input; common::device_ptr d_output; common::device_ptr d_buffer_srcs; common::device_ptr d_buffer_dsts; common::device_ptr d_buffer_sizes; BatchMemcpyData() = default; BatchMemcpyData(const BatchMemcpyData&) = delete; BatchMemcpyData(BatchMemcpyData&& other) = default; BatchMemcpyData& operator=(BatchMemcpyData&& other) = default; BatchMemcpyData& operator=(const BatchMemcpyData&) = delete; size_t total_num_bytes() const { return total_num_elements * sizeof(ValueType); } ~BatchMemcpyData() {} }; template BatchMemcpyData prepare_data(hipStream_t stream, const managed_seed& seed, const int32_t num_tlev_buffers, const int32_t num_wlev_buffers, const int32_t num_blev_buffers) { const bool shuffle_buffers = false; BatchMemcpyData result; using config = rocprim::detail::wrapped_batch_memcpy_config; rocprim::detail::target_arch target_arch; hipError_t success = 
rocprim::detail::host_target_arch(stream, target_arch); if(success != hipSuccess) { return result; } const rocprim::detail::batch_memcpy_config_params params = rocprim::detail::dispatch_target_arch(target_arch); const int32_t wlev_min_size = params.wlev_size_threshold; const int32_t blev_min_size = params.blev_size_threshold; const size_t num_buffers = num_tlev_buffers + num_wlev_buffers + num_blev_buffers; const int32_t wlev_min_elems = rocprim::detail::ceiling_div(wlev_min_size, sizeof(ValueType)); const int32_t blev_min_elems = rocprim::detail::ceiling_div(blev_min_size, sizeof(ValueType)); constexpr int32_t max_size = 1024 * 1024; constexpr int32_t max_elems = max_size / sizeof(ValueType); // Generate data std::mt19937_64 rng(seed.get_0()); // Number of elements in each buffer. std::vector h_buffer_num_elements(num_buffers); auto iter = h_buffer_num_elements.begin(); iter = generate_random_data_n(iter, num_tlev_buffers, 1, wlev_min_elems - 1, rng); iter = generate_random_data_n(iter, num_wlev_buffers, wlev_min_elems, blev_min_elems - 1, rng); iter = generate_random_data_n(iter, num_blev_buffers, blev_min_elems, max_elems, rng); // Shuffle the sizes so that size classes aren't clustered std::shuffle(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), rng); // Get the byte size of each buffer std::vector h_buffer_num_bytes(num_buffers); for(size_t i = 0; i < num_buffers; ++i) { h_buffer_num_bytes[i] = h_buffer_num_elements[i] * sizeof(ValueType); } result.total_num_elements = std::accumulate(h_buffer_num_elements.begin(), h_buffer_num_elements.end(), size_t{0}); std::vector h_input_for_memcpy; std::vector h_input_for_copy; common::init_input(h_input_for_memcpy, h_input_for_copy, rng, result.total_num_elements * sizeof(ValueType)); result.d_input.resize(result.total_num_elements); result.d_output.resize(result.total_num_elements); result.d_buffer_srcs.resize(num_buffers); result.d_buffer_dsts.resize(num_buffers); result.d_buffer_sizes.resize(num_buffers); 
using offset_type = size_t; // Generate the source and shuffled destination offsets. std::vector src_offsets; std::vector dst_offsets; if(shuffle_buffers) { src_offsets = common::shuffled_exclusive_scan(h_buffer_num_elements, rng); dst_offsets = common::shuffled_exclusive_scan(h_buffer_num_elements, rng); } else { src_offsets = std::vector(num_buffers); dst_offsets = std::vector(num_buffers); // Consecutive offsets (no shuffling). // src/dst offsets first element is 0, so skip that! std::partial_sum(h_buffer_num_elements.begin(), h_buffer_num_elements.end() - 1, src_offsets.begin() + 1); std::partial_sum(h_buffer_num_elements.begin(), h_buffer_num_elements.end() - 1, dst_offsets.begin() + 1); } // Generate the source and destination pointers. std::vector h_buffer_srcs(num_buffers); std::vector h_buffer_dsts(num_buffers); for(size_t i = 0; i < num_buffers; ++i) { h_buffer_srcs[i] = result.d_input.get() + src_offsets[i]; h_buffer_dsts[i] = result.d_output.get() + dst_offsets[i]; } // Prepare the batch memcpy. 
if(IsMemCpy) { using cast_value_type = typename decltype(result.d_input)::value_type; result.d_input.store(std::vector( reinterpret_cast(h_input_for_memcpy.data()), reinterpret_cast(h_input_for_memcpy.data()) + result.total_num_elements)); result.d_buffer_sizes.store(h_buffer_num_bytes); } else { result.d_input.store( decltype(h_input_for_copy)(h_input_for_copy.data(), h_input_for_copy.data() + result.total_num_elements)); result.d_buffer_sizes.store(h_buffer_num_elements); } result.d_buffer_srcs.store(h_buffer_srcs); result.d_buffer_dsts.store(h_buffer_dsts); return result; } template void run_benchmark(benchmark_utils::state&& state) { const auto& stream = state.stream; const auto& seed = state.seed; constexpr size_t num_buffers = NumTlevBuffers + NumWlevBuffers + NumBlevBuffers; size_t temp_storage_bytes = 0; BatchMemcpyData data; batch_copy(nullptr, temp_storage_bytes, data.d_buffer_srcs.get(), data.d_buffer_dsts.get(), data.d_buffer_sizes.get(), num_buffers, stream); common::device_ptr d_temp_storage(temp_storage_bytes); data = prepare_data(stream, seed, NumTlevBuffers, NumWlevBuffers, NumBlevBuffers); state.run( [&] { batch_copy(d_temp_storage.get(), temp_storage_bytes, data.d_buffer_srcs.get(), data.d_buffer_dsts.get(), data.d_buffer_sizes.get(), num_buffers, stream); }); state.set_throughput(data.total_num_elements, sizeof(ValueType)); } // Naive implementation used for comparison #ifdef BUILD_NAIVE_BENCHMARK template __launch_bounds__(BlockSize) __global__ void naive_kernel(void** in_ptr, void** out_ptr, const OffsetType* sizes) { using underlying_type = unsigned char; constexpr int32_t items_per_thread = 4; constexpr int32_t tile_size = items_per_thread * BlockSize; const int32_t buffer_id = rocprim::flat_block_id(); auto in = reinterpret_cast(in_ptr[buffer_id]); auto out = reinterpret_cast(out_ptr[buffer_id]); const auto size = sizes[buffer_id]; const auto size_in_elements = size / sizeof(underlying_type); const auto tiles = size_in_elements / tile_size; 
auto num_items_to_copy = size; for(size_t i = 0; i < tiles; ++i) { underlying_type data[items_per_thread]; rocprim::block_load_direct_blocked(rocprim::flat_block_thread_id(), in, data, num_items_to_copy); rocprim::block_store_direct_blocked(rocprim::flat_block_thread_id(), out, data, num_items_to_copy); in += tile_size; out += tile_size; num_items_to_copy -= tile_size; } } template void run_naive_benchmark(benchmark_utils::state&& state) { const auto& stream = state.stream; const auto& seed = state.seed; const auto data = prepare_data(stream, seed, NumTlevBuffers, NumWlevBuffers, NumBlevBuffers); constexpr size_t num_buffers = NumTlevBuffers + NumWlevBuffers + NumBlevBuffers; state.run( [&] { naive_kernel <<>>((void**)data.d_buffer_srcs.get(), (void**)data.d_buffer_dsts.get(), data.d_buffer_sizes.get()); }); state.set_throughput(data.total_num_elements, sizeof(ValueType)); } #define CREATE_NAIVE_BENCHMARK(item_size, \ item_alignment, \ size_type, \ num_tlev, \ num_wlev, \ num_blev) \ executor.queue_fn( \ bench_naming::format_name( \ "{lvl:device,item_size:" #item_size ",item_alignment:" #item_alignment \ ",size_type:" #size_type ",algo:naive_memcpy,num_tlev:" #num_tlev \ ",num_wlev:" #num_wlev ",num_blev:" #num_blev ",cfg:default_config}") \ .c_str(), \ [=](benchmark_utils::state&& state) \ { \ run_naive_benchmark, \ size_type, \ true, \ num_tlev, \ num_wlev, \ num_blev>(std::forward(state)); \ }); #endif // BUILD_NAIVE_BENCHMARK #define CREATE_BENCHMARK(item_size, \ item_alignment, \ size_type, \ num_tlev, \ num_wlev, \ num_blev, \ is_memcpy) \ executor.queue_fn(bench_naming::format_name("{lvl:device,item_size:" #item_size \ ",item_alignment:" #item_alignment \ ",size_type:" #size_type ",algo:" \ + (is_memcpy ? 
"batch_memcpy"s : "batch_copy"s) \ + ",num_tlev:" #num_tlev ",num_wlev:" #num_wlev \ ",num_blev:" #num_blev ",cfg:default_config}") \ .c_str(), \ [=](benchmark_utils::state&& state) \ { \ run_benchmark, \ size_type, \ is_memcpy, \ num_tlev, \ num_wlev, \ num_blev>(std::forward(state)); \ }); #define CREATE_NORMAL_BENCHMARK(item_size, \ item_alignment, \ size_type, \ num_tlev, \ num_wlev, \ num_blev) \ CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev, true) \ CREATE_BENCHMARK(item_size, item_alignment, size_type, num_tlev, num_wlev, num_blev, false) #ifndef BUILD_NAIVE_BENCHMARK #define BENCHMARK_TYPE(item_size, item_alignment) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, rocprim::uint128_t, 100000, 0, 0) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, rocprim::uint128_t, 0, 100000, 0) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, rocprim::uint128_t, 0, 0, 1000) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, rocprim::uint128_t, 1000, 1000, 1000) #else #define BENCHMARK_TYPE(item_size, item_alignment) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000) \ CREATE_NORMAL_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 1000) \ CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 100000, 0, 0) \ CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 100000, 0) \ CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 0, 0, 1000) \ CREATE_NAIVE_BENCHMARK(item_size, item_alignment, uint32_t, 1000, 1000, 
/// Entry point of the batch memcpy / batch copy benchmark.
///
/// Constructs the benchmark executor from the command line, queues the
/// benchmarks that BENCHMARK_TYPE expands to for each
/// (item_size, item_alignment) pair and runs them.
int32_t main(int32_t argc, char* argv[])
{
    // NOTE(review): the meaning of the trailing arguments (0, 1, 5) is defined by
    // benchmark_utils::executor and is not visible in this file.
    benchmark_utils::executor executor(argc, argv, 0, 1, 5);

    // BENCHMARK_TYPE(item_size, item_alignment)
    BENCHMARK_TYPE(1, 1)
    BENCHMARK_TYPE(1, 2)
    BENCHMARK_TYPE(1, 4)
    BENCHMARK_TYPE(1, 8)
    BENCHMARK_TYPE(2, 2)
    BENCHMARK_TYPE(4, 4)
    BENCHMARK_TYPE(8, 8)

    executor.run();
}
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_binary_search.parallel.hpp" #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_device_ptr.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(T, K, SORTED, SUBALGORITHM) \ executor.queue_fn( \ bench_naming::format_name("{lvl:device,algo:" + SUBALGORITHM{}.name() \ + ",key_type:" #T ",subalgo:" #K "_percent_" \ + std::string(SORTED ? 
"sorted" : "random") \ + "_needles,cfg:default_config}") \ .c_str(), \ [=](benchmark_utils::state&& state) \ { \ device_binary_search_benchmark().run( \ std::forward(state)); \ }); #define BENCHMARK_ALGORITHMS(T, K, SORTED) \ CREATE_BENCHMARK(T, K, SORTED, binary_search_subalgorithm) \ CREATE_BENCHMARK(T, K, SORTED, lower_bound_subalgorithm) \ CREATE_BENCHMARK(T, K, SORTED, upper_bound_subalgorithm) #define BENCHMARK_TYPE(type) \ BENCHMARK_ALGORITHMS(type, 10, true) \ BENCHMARK_ALGORITHMS(type, 10, false) int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; BENCHMARK_TYPE(float) BENCHMARK_TYPE(double) BENCHMARK_TYPE(int8_t) BENCHMARK_TYPE(uint8_t) BENCHMARK_TYPE(rocprim::half) BENCHMARK_TYPE(rocprim::int128_t) BENCHMARK_TYPE(rocprim::uint128_t) BENCHMARK_TYPE(custom_float2) BENCHMARK_TYPE(custom_double2) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_binary_search.parallel.cpp.in000066400000000000000000000031411506507210100267240ustar00rootroot00000000000000// MIT License // // Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "benchmark_device_binary_search.parallel.hpp" #include #include #include namespace { auto unused = benchmark_utils::executor::queue_sorted_instance>>(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_binary_search.parallel.hpp000066400000000000000000000175651506507210100263430ustar00rootroot00000000000000// MIT License // // Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
/// Tag type that selects the rocprim::binary_search overload of
/// dispatch_binary_search and provides the algorithm label for benchmark names.
struct binary_search_subalgorithm
{
    /// \return the label "binary_search"
    std::string name() const
    {
        return std::string("binary_search");
    }
};

/// Tag type that selects the rocprim::lower_bound overload of
/// dispatch_binary_search and provides the algorithm label for benchmark names.
struct lower_bound_subalgorithm
{
    /// \return the label "lower_bound"
    std::string name() const
    {
        return std::string("lower_bound");
    }
};

/// Tag type that selects the rocprim::upper_bound overload of
/// dispatch_binary_search and provides the algorithm label for benchmark names.
struct upper_bound_subalgorithm
{
    /// \return the label "upper_bound"
    std::string name() const
    {
        return std::string("upper_bound");
    }
};
/// Autotune benchmark for the binary search family; SubAlgorithm selects which
/// dispatch_binary_search overload is timed.
///
/// NOTE(review): the template parameter list and several template argument lists
/// (after `Traits`, `binary_search_config_name`, `std::conditional`, `std::vector`,
/// `common::device_ptr`, ...) appear to be missing from this copy of the file. The
/// body uses SubAlgorithm, T, K, SortedNeedles (and presumably an output type and a
/// Config) -- confirm against the upstream header.
template struct device_binary_search_benchmark : public benchmark_utils::autotune_interface
{
    /// Builds the benchmark identifier from the sub-algorithm label, the value and
    /// output type names and the configuration name.
    std::string name() const override
    {
        return bench_naming::format_name("{lvl:device,algo:" + SubAlgorithm{}.name()
                                         + ",value_type:" + std::string(Traits::name())
                                         + ",output_type:" + std::string(Traits::name())
                                         + ",cfg:" + binary_search_config_name() + "}");
    }

    /// Generates haystack and needles, sizes the temporary storage and times the
    /// selected search using the byte budget, seed and stream carried by `state`.
    void run(benchmark_utils::state&& state) override
    {
        const auto& bytes  = state.bytes;
        const auto& seed   = state.seed;
        const auto& stream = state.stream;

        // The needles occupy K percent of the haystack's byte budget.
        size_t needles_bytes = bytes * K / 100;

        // NOTE(review): the condition of this std::conditional is truncated in this
        // copy; it selects between half_less and rocprim::less -- confirm upstream.
        using compare_op_type = typename std::
            conditional::value, half_less, rocprim::less>::type;

        // Calculate the number of elements from byte size
        size_t haystack_size = bytes / sizeof(T);
        size_t needles_size  = needles_bytes / sizeof(T);

        compare_op_type compare_op;

        // Generate data
        // The haystack is the ascending sequence 0, 1, 2, ... produced by std::iota.
        std::vector haystack(haystack_size);
        std::iota(haystack.begin(), haystack.end(), 0);

        // Needles are drawn at random from the range returned by limit_random_range
        // for the bounds (0, haystack_size).
        const auto random_range = limit_random_range(0, haystack_size);

        std::vector needles = get_random_data(needles_size,
                                              random_range.first,
                                              random_range.second,
                                              seed.get_0());
        if(SortedNeedles)
        {
            std::sort(needles.begin(), needles.end(), compare_op);
        }

        common::device_ptr d_haystack(haystack);
        common::device_ptr d_needles(needles);
        common::device_ptr d_output(needles_size);

        size_t temporary_storage_bytes;
        auto   dispatch_helper = dispatch_binary_search_helper();
        // First call passes nullptr as the temporary storage; the value it leaves in
        // temporary_storage_bytes is then used to allocate d_temporary_storage.
        HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{},
                                                         nullptr,
                                                         temporary_storage_bytes,
                                                         d_haystack.get(),
                                                         d_needles.get(),
                                                         d_output.get(),
                                                         haystack_size,
                                                         needles_size,
                                                         compare_op,
                                                         stream));

        common::device_ptr d_temporary_storage(temporary_storage_bytes);

        state.run(
            [&]
            {
                HIP_CHECK(dispatch_helper.dispatch_binary_search(SubAlgorithm{},
                                                                 d_temporary_storage.get(),
                                                                 temporary_storage_bytes,
                                                                 d_haystack.get(),
                                                                 d_needles.get(),
                                                                 d_output.get(),
                                                                 haystack_size,
                                                                 needles_size,
                                                                 compare_op,
                                                                 stream));
            });

        // Throughput is reported per needle (needles_size items of sizeof(T) bytes).
        state.set_throughput(needles_size, sizeof(T));
    }
};
#include "benchmark_device_find_end.hpp"
#include "benchmark_utils.hpp"

#include "../common/utils_custom_type.hpp"

// HIP API
// NOTE(review): the header names of the following #include directives are
// missing in this copy of the file (they appear to have been lost together
// with every other <...> span) - restore them from upstream before building.
#include
#include
#include
#include
#include
#include

// Queues one find_end benchmark instance, constructed with KEY_SIZE and
// REPEATING, on the local variable `executor`.
// NOTE(review): TYPE is accepted but does not appear in the expansion below;
// presumably it was the template argument of device_find_end_benchmark and
// was lost in extraction - confirm against upstream.
#define CREATE_BENCHMARK_FIND_END(TYPE, KEY_SIZE, REPEATING) \
    executor.queue_instance(device_find_end_benchmark(KEY_SIZE, REPEATING));

// Queues the benchmark for TYPE with key sizes 10, 100, 1000 and 10000.
#define CREATE_BENCHMARK_PATTERN(TYPE, REPEATING)    \
    CREATE_BENCHMARK_FIND_END(TYPE, 10, REPEATING)   \
    CREATE_BENCHMARK_FIND_END(TYPE, 100, REPEATING)  \
    CREATE_BENCHMARK_FIND_END(TYPE, 1000, REPEATING) \
    CREATE_BENCHMARK_FIND_END(TYPE, 10000, REPEATING)

// Queues all key sizes for TYPE, once with REPEATING = true and once with
// REPEATING = false.
#define CREATE_BENCHMARK(TYPE)           \
    CREATE_BENCHMARK_PATTERN(TYPE, true) \
    CREATE_BENCHMARK_PATTERN(TYPE, false)

// Entry point: builds the executor from the command line, queues the find_end
// benchmarks for every listed type and runs them.
int main(int argc, char* argv[])
{
    // NOTE(review): the meaning of the trailing arguments (10, 5) is defined by
    // benchmark_utils::executor, which is not visible here; only the
    // 128 * MiB value is self-describing.
    benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5);

    CREATE_BENCHMARK(int)
    CREATE_BENCHMARK(long long)
    CREATE_BENCHMARK(int8_t)
    CREATE_BENCHMARK(uint8_t)
    CREATE_BENCHMARK(rocprim::half)
    CREATE_BENCHMARK(short)
    CREATE_BENCHMARK(float)
    CREATE_BENCHMARK(rocprim::int128_t)
    CREATE_BENCHMARK(rocprim::uint128_t)

    // NOTE(review): the template arguments of common::custom_type are missing in
    // this copy; the alias names suggest the intended element types - confirm
    // against upstream.
    using custom_float2          = common::custom_type;
    using custom_double2         = common::custom_type;
    using custom_int2            = common::custom_type;
    using custom_char_double     = common::custom_type;
    using custom_longlong_double = common::custom_type;

    CREATE_BENCHMARK(custom_float2)
    CREATE_BENCHMARK(custom_double2)
    CREATE_BENCHMARK(custom_int2)
    CREATE_BENCHMARK(custom_char_double)
    CREATE_BENCHMARK(custom_longlong_double)

    executor.run();
}
rocPRIM-rocm-7.1.0/benchmark/benchmark_device_find_end.hpp000066400000000000000000000125331506507210100234730ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_END_PARALLEL_HPP_
#define ROCPRIM_BENCHMARK_DEVICE_FIND_END_PARALLEL_HPP_

#include "benchmark_utils.hpp"

#include "../common/utils_data_generation.hpp"
#include "../common/utils_device_ptr.hpp"

// NOTE(review): the header names of the bare #include directives below are
// missing in this copy of the file (lost together with every other <...>
// span) - restore them from upstream before building.

// Google Benchmark
#include

// HIP API
#include

// rocPRIM
#include
#include
#include
#include
#include
#include
#include

// Benchmark of rocprim::find_end for one key type, one key (pattern) length and
// one input pattern kind ("repeating" or "random").
// NOTE(review): the template parameter list is missing in this copy; the body
// uses `Key`, so the struct is presumably parameterised on the key type (and
// possibly more) - confirm against upstream. The same applies to the template
// arguments of Traits, std::vector, get_random_data, common::generate_limits,
// common::device_ptr and rocprim::equal_to below.
template struct device_find_end_benchmark : public benchmark_utils::autotune_interface
{
    // Requested key length in elements; run() clamps it to the input length.
    size_t key_size_ = 10;
    // Selects the "repeating" input pattern in run() when true.
    bool repeating_ = false;

    // Stores the requested key length and pattern kind.
    device_find_end_benchmark(size_t KeySize, bool repeating)
    {
        key_size_  = KeySize;
        repeating_ = repeating;
    }

    // Builds the benchmark name from the pattern kind, the key size and the
    // value type name, formatted through bench_naming::format_name.
    std::string name() const override
    {
        using namespace std::string_literals;
        return bench_naming::format_name(
            "{lvl:device,algo:find_end,value_pattern:" + (repeating_ ? "repeating"s : "random"s)
            + ",key_size:" + std::to_string(key_size_) + ",value_type:"
            + std::string(Traits::name()) + ",cfg:default_config}");
    }

    // Generates the input and the key sequence on the host, uploads them and
    // hands a rocprim::find_end call to state.run().
    void run(benchmark_utils::state&& state) override
    {
        const auto& stream = state.stream;
        const auto& bytes  = state.bytes;
        const auto& seed   = state.seed;

        using key_type    = Key;
        using output_type = size_t;

        // Calculate the number of elements
        size_t size = bytes / sizeof(key_type);
        // The key can never be longer than the input itself.
        size_t key_size = std::min(size, key_size_);

        // Generate data
        std::vector keys_input = get_random_data(key_size,
                                                 common::generate_limits::min(),
                                                 common::generate_limits::max(),
                                                 seed.get_0());

        std::vector input(size);
        if(repeating_)
        {
            // Repeating similar pattern without early exits.
            // The input is filled with periodic copies of the key whose first
            // element is 0; afterwards the key's first element is changed to 1,
            // so the key differs from every copy in that one position.
            // NOTE(review): keys_input[0] and `i % key_size` assume key_size > 0,
            // i.e. a non-empty input - confirm callers never pass bytes == 0.
            keys_input[0] = 0;
            for(size_t i = 0; i < size; ++i)
            {
                input[i] = keys_input[i % key_size];
            }
            keys_input[0] = 1;
        }
        else
        {
            // Independent random input; a different seed than the one used for
            // the key sequence.
            input = get_random_data(size,
                                    common::generate_limits::min(),
                                    common::generate_limits::max(),
                                    seed.get_0() + 1);
        }

        common::device_ptr d_keys_input(keys_input);
        common::device_ptr d_input(input);
        common::device_ptr d_output(1);

        rocprim::equal_to compare_op;

        // First call with a null storage pointer; the value it leaves in
        // temporary_storage_bytes is used to size d_temporary_storage below.
        size_t temporary_storage_bytes = 0;
        HIP_CHECK(rocprim::find_end(nullptr,
                                    temporary_storage_bytes,
                                    d_input.get(),
                                    d_keys_input.get(),
                                    d_output.get(),
                                    size,
                                    key_size,
                                    compare_op,
                                    stream,
                                    false));
        common::device_ptr d_temporary_storage(temporary_storage_bytes);

        // The call handed to state.run() is the operation being benchmarked.
        state.run(
            [&]
            {
                HIP_CHECK(rocprim::find_end(d_temporary_storage.get(),
                                            temporary_storage_bytes,
                                            d_input.get(),
                                            d_keys_input.get(),
                                            d_output.get(),
                                            size,
                                            key_size,
                                            compare_op,
                                            stream,
                                            false));
            });

        // Report the input element count and element size.
        state.set_throughput(size, sizeof(key_type));
    }
};

#endif // ROCPRIM_BENCHMARK_DEVICE_FIND_END_PARALLEL_HPP_
rocPRIM-rocm-7.1.0/benchmark/benchmark_device_find_first_of.cpp000066400000000000000000000055241506507210100245350ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_find_first_of.parallel.hpp"
#include "benchmark_utils.hpp"

#ifndef BENCHMARK_CONFIG_TUNING
    #include "../common/utils_custom_type.hpp"
#endif

// HIP API
// NOTE(review): the header names of the bare #include directives below are
// missing in this copy of the file (lost together with every other <...>
// span) - restore them from upstream before building.
#include

#ifndef BENCHMARK_CONFIG_TUNING
    #include
#endif

#include
#include
#include
#ifndef BENCHMARK_CONFIG_TUNING
    #include
#endif

// Queues one find_first_of benchmark instance, constructed with KEYS_SIZE and
// FIRST_OCCURENCE, on the local variable `executor`.
// NOTE(review): TYPE is accepted but does not appear in the expansion below;
// presumably it was a template argument of device_find_first_of_benchmark and
// was lost in extraction - confirm against upstream.
#define CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, FIRST_OCCURENCE) \
    executor.queue_instance(device_find_first_of_benchmark(KEYS_SIZE, FIRST_OCCURENCE));

// CREATE_BENCHMARK0 queues TYPE/KEYS_SIZE with first-occurrence values 0.1,
// 0.5 and 1.0; CREATE_BENCHMARK queues TYPE with keys sizes 1, 10, 100, 1000
// and 10000.
// clang-format off
#define CREATE_BENCHMARK0(TYPE, KEYS_SIZE)               \
    CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 0.1) \
    CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 0.5) \
    CREATE_BENCHMARK_FIND_FIRST_OF(TYPE, KEYS_SIZE, 1.0)

#define CREATE_BENCHMARK(TYPE)     \
    CREATE_BENCHMARK0(TYPE, 1)     \
    CREATE_BENCHMARK0(TYPE, 10)    \
    CREATE_BENCHMARK0(TYPE, 100)   \
    CREATE_BENCHMARK0(TYPE, 1000)  \
    CREATE_BENCHMARK0(TYPE, 10000)
// clang-format on

// Entry point: builds the executor from the command line and runs it. The
// explicit benchmark list is only queued when BENCHMARK_CONFIG_TUNING is not
// defined.
int main(int argc, char* argv[])
{
    // NOTE(review): the meaning of the trailing arguments (10, 2) is defined by
    // benchmark_utils::executor, which is not visible here.
    benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 2);

#ifndef BENCHMARK_CONFIG_TUNING
    CREATE_BENCHMARK(int8_t)
    CREATE_BENCHMARK(int16_t)
    CREATE_BENCHMARK(int32_t)
    CREATE_BENCHMARK(float)
    CREATE_BENCHMARK(int64_t)
    CREATE_BENCHMARK(double)
    CREATE_BENCHMARK(rocprim::int128_t)
    CREATE_BENCHMARK(rocprim::uint128_t)

    // NOTE(review): the template arguments of common::custom_type are missing in
    // this copy - confirm the element types against upstream.
    using custom_int2            = common::custom_type;
    using custom_longlong_double = common::custom_type;

    CREATE_BENCHMARK(custom_int2)
    CREATE_BENCHMARK(custom_longlong_double)
#endif

    executor.run();
}
rocPRIM-rocm-7.1.0/benchmark/benchmark_device_find_first_of.parallel.cpp.in000066400000000000000000000027011506507210100267270ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_find_first_of.parallel.hpp" #include "benchmark_utils.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_find_first_of_benchmark_generator<@DataType@, @BlockSize@>::create); } // namespace rocPRIM-rocm-7.1.0/benchmark/benchmark_device_find_first_of.parallel.hpp000066400000000000000000000232021506507210100263260ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_
#define ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_

#include "benchmark_utils.hpp"

#include "../common/utils_data_generation.hpp"

// NOTE(review): the header names of the bare #include directives below, and
// every template parameter/argument list in this file, are missing in this copy
// (everything between '<' and '>' appears to have been stripped). Restore them
// from upstream before building.

// Google Benchmark
#include

// HIP API
#include

// rocPRIM
#include
#include
#include
#include
#include
#include
#include
#include
#include

// Returns a "{bs:...,ipt:...}" description of Config, built from the block size
// and items-per-thread of its kernel_config.
template std::string config_name()
{
    const rocprim::detail::find_first_of_config_params config = Config();
    return "{bs:" + std::to_string(config.kernel_config.block_size)
           + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}";
}

// Specialization that returns the fixed name "default_config".
// NOTE(review): the specialization argument is missing in this copy; presumably
// rocprim::default_config - confirm against upstream.
template<> inline std::string config_name()
{
    return "default_config";
}

// Benchmark of rocprim::find_first_of over a set of keys sizes and a set of
// relative first-occurrence positions. With a single value in each set it
// benchmarks one case; with several values every combination is run inside one
// measured batch (see run()).
template struct device_find_first_of_benchmark : public benchmark_utils::autotune_interface
{
    // Number of keys to search for, one entry per benchmarked case.
    std::vector keys_sizes;
    // Relative positions (multiplied by the input size in run()) at which a
    // matching element is written into the input.
    std::vector first_occurrences;

    // Single-case constructor.
    device_find_first_of_benchmark(size_t keys_size, double first_occurrence)
    {
        keys_sizes.push_back(keys_size);
        first_occurrences.push_back(first_occurrence);
    }

    // Multi-case constructor: copies both lists.
    device_find_first_of_benchmark(const std::vector& keys_sizes,
                                   const std::vector& first_occurrences)
    {
        this->keys_sizes        = keys_sizes;
        this->first_occurrences = first_occurrences;
    }

    // Builds the benchmark name. keys_size / first_occurrence are only included
    // when exactly one value of the respective kind is configured.
    std::string name() const override
    {
        using namespace std::string_literals;
        return bench_naming::format_name(
            "{lvl:device,algo:find_first_of,"s
            + (keys_sizes.size() == 1 ? "keys_size:"s + std::to_string(keys_sizes[0]) : ""s)
            + (first_occurrences.size() == 1
                   ? ",first_occurrence:"s + std::to_string(first_occurrences[0])
                   : ""s)
            + ",value_type:"s + std::string(Traits::name()) + ",cfg:" + config_name() + "}");
    }

    // Allocates one device input per first-occurrence value, uploads the keys,
    // sizes the temporary storage and hands all (input, keys size) combinations
    // to state.run().
    void run(benchmark_utils::state&& state) override
    {
        const auto& stream = state.stream;
        const auto& bytes  = state.bytes;
        const auto& seed   = state.seed;

        using type        = T;
        using key_type    = T;
        using output_type = size_t;

        const size_t size = bytes / sizeof(type);
        // NOTE(review): dereferencing max_element assumes keys_sizes is not empty;
        // both constructors visible here are called with at least one element.
        const size_t max_keys_size = *std::max_element(keys_sizes.begin(), keys_sizes.end());

        // Generate data
        // Keys are drawn from [0, 100] and the input from [101, max], so the two
        // value ranges do not overlap before a key is written into the input
        // below.
        std::vector key_input = get_random_data(max_keys_size, 0, 100, seed.get_0());
        std::vector input
            = get_random_data(size, 101, common::generate_limits::max(), seed.get_0());

        std::vector d_inputs(first_occurrences.size());
        for(size_t fi = 0; fi < first_occurrences.size(); ++fi)
        {
            type* d_input;
            HIP_CHECK(hipMalloc(&d_input, size * sizeof(*d_input)));
            HIP_CHECK(hipMemcpyAsync(d_input,
                                     input.data(),
                                     input.size() * sizeof(*d_input),
                                     hipMemcpyHostToDevice,
                                     stream));

            // Set the first occurrence of keys in input
            // A first_occurrence of 1.0 gives p == size, in which case nothing is
            // written and the input contains no key.
            const size_t p = static_cast(size * first_occurrences[fi]);
            if(p < size)
            {
                const type key = key_input[0];
                HIP_CHECK(hipMemcpyAsync(d_input + p,
                                         &key,
                                         sizeof(*d_input),
                                         hipMemcpyHostToDevice,
                                         stream));
            }
            d_inputs[fi] = d_input;
        }

        key_type*    d_key_input;
        output_type* d_output;
        HIP_CHECK(hipMalloc(&d_key_input, max_keys_size * sizeof(*d_key_input)));
        HIP_CHECK(hipMalloc(&d_output, sizeof(*d_output)));

        HIP_CHECK(hipMemcpy(d_key_input,
                            key_input.data(),
                            key_input.size() * sizeof(*d_key_input),
                            hipMemcpyHostToDevice));

        ::rocprim::equal_to compare_op;

        void*  d_temporary_storage     = nullptr;
        size_t temporary_storage_bytes = 0;

        // Captures d_temporary_storage and temporary_storage_bytes by reference,
        // so the same lambda is used first with a null storage pointer (size
        // query) and later with the allocated buffer (actual run).
        auto run = [&](size_t key_size, const type* d_input)
        {
            HIP_CHECK(rocprim::find_first_of(d_temporary_storage,
                                             temporary_storage_bytes,
                                             d_input,
                                             d_key_input,
                                             d_output,
                                             input.size(),
                                             key_size,
                                             compare_op,
                                             stream));
        };

        // Query the storage size for every keys size and keep the maximum, so one
        // allocation is large enough for all cases.
        size_t max_temporary_storage_bytes = 0;
        for(size_t keys_size : keys_sizes)
        {
            run(keys_size, d_inputs[0]);
            max_temporary_storage_bytes
                = std::max(max_temporary_storage_bytes, temporary_storage_bytes);
        }
        temporary_storage_bytes = max_temporary_storage_bytes;
        HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));

        // The measured batch runs every (first occurrence, keys size) combination.
        state.run(
            [&]
            {
                for(size_t fi = 0; fi < first_occurrences.size(); ++fi)
                {
                    for(size_t keys_size : keys_sizes)
                    {
                        run(keys_size, d_inputs[fi]);
                    }
                }
            });

        // Only a part of data (before the first occurrence) must be actually processed. In ideal
        // cases when no thread blocks do unneeded work (i.e. exit early once the match is found),
        // performance for different values of first_occurrence must be similar.
        size_t sum_effective_size = 0;
        for(double first_occurrence : first_occurrences)
        {
            sum_effective_size += static_cast(size * first_occurrence);
        }
        size_t sum_keys_size = 0;
        for(size_t keys_size : keys_sizes)
        {
            sum_keys_size += keys_size;
        }

        state.set_throughput(sum_effective_size, sizeof(type));

        // Each input is read once but all keys are read by all threads so performance is likely
        // compute-bound or bound by cache bandwidth for reading keys rather than reading inputs.
        // Let's additionally report the rate of comparisons to see if it reaches a plateau with
        // increasing keys_size.
        state.gbench_state.counters["comparisons_per_second"] = benchmark::Counter(
            static_cast(state.gbench_state.iterations() * state.batch_iterations
                        * sum_effective_size * sum_keys_size),
            benchmark::Counter::kIsRate);

        // Release every buffer allocated with hipMalloc above.
        for(size_t fi = 0; fi < first_occurrences.size(); ++fi)
        {
            HIP_CHECK(hipFree(d_inputs[fi]));
        }
        HIP_CHECK(hipFree(d_key_input));
        HIP_CHECK(hipFree(d_output));
        HIP_CHECK(hipFree(d_temporary_storage));
    }
};

// Generator used for config tuning: create() appends one multi-case benchmark
// to `storage` for each items-per-thread value visited by static_for_each.
// NOTE(review): the template parameter lists, the arguments of
// rocprim::find_first_of_config and the sequence type handed to static_for_each
// are missing in this copy, so the exact set of generated configurations cannot
// be read from here - confirm against upstream.
template struct device_find_first_of_benchmark_generator
{
    // Functor invoked once per items-per-thread value.
    template struct create_ipt
    {
        using generated_config = rocprim::find_first_of_config;

        // Appends a benchmark covering keys sizes {1, 10, 100, 1000} and first
        // occurrences {0.1, 0.5, 1.0}.
        void operator()(std::vector>& storage)
        {
            std::vector keys_sizes{1, 10, 100, 1000};
            std::vector first_occurrences{0.1, 0.5, 1.0};
            storage.emplace_back(
                std::make_unique>(
                    keys_sizes,
                    first_occurrences));
        }
    };

    // Entry point handed to the autotune registration.
    static void create(std::vector>& storage)
    {
        static constexpr unsigned int min_items_per_thread = 1;
        static constexpr unsigned int max_items_per_thread = 16;
        static_for_each, create_ipt>(storage);
    }
};

#endif // ROCPRIM_BENCHMARK_DEVICE_FIND_FIRST_OF_PARALLEL_HPP_
rocPRIM-rocm-7.1.0/benchmark/benchmark_device_histogram.cpp000066400000000000000000000525131506507210100237170ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include "benchmark_device_histogram.parallel.hpp"
#include "benchmark_utils.hpp"

#include "../common/utils_device_ptr.hpp"

// NOTE(review): the header names of the bare #include directives below, and
// every template parameter/argument list in this file, are missing in this copy
// (everything between '<' and '>' appears to have been stripped). Restore them
// from upstream before building.

// Google Benchmark
#include

// HIP API
#include

// rocPRIM
#include
#include
#include
#include
#include
#include

// Maps an entropy-reduction level to the percentage shown in the benchmark
// name; levels outside 0..4 map to 0.
int get_entropy_percents(int entropy_reduction)
{
    switch(entropy_reduction)
    {
        case 0: return 100;
        case 1: return 81;
        case 2: return 54;
        case 3: return 33;
        case 4: return 20;
        default: return 0;
    }
}

// Benchmarks rocprim::histogram_even on a single channel with `bins` bins over
// the level range [0, bins * scale).
template
void run_even_benchmark(benchmark_utils::state&& state,
                        size_t                   bins,
                        size_t                   scale,
                        int                      entropy_reduction)
{
    const auto& stream = state.stream;
    const auto& bytes  = state.bytes;

    // Calculate the number of elements
    size_t size = bytes / sizeof(T);

    using counter_type = unsigned int;
    // Levels use int instead of T when the (stripped) type trait holds and T is
    // narrower than int.
    using level_type = typename std::
        conditional_t::value && sizeof(T) < sizeof(int), int, T>;

    const level_type lower_level = 0;
    const level_type upper_level = bins * scale;

    // Generate data
    std::vector input = generate(size, entropy_reduction, lower_level, upper_level);

    common::device_ptr d_input(input);
    common::device_ptr d_histogram(bins);

    // First call with a null storage pointer; the value it leaves in
    // temporary_storage_bytes is used to size d_temporary_storage below.
    // The number of levels passed is bins + 1.
    size_t temporary_storage_bytes = 0;
    HIP_CHECK(rocprim::histogram_even(nullptr,
                                      temporary_storage_bytes,
                                      d_input.get(),
                                      size,
                                      d_histogram.get(),
                                      bins + 1,
                                      lower_level,
                                      upper_level,
                                      stream,
                                      false));

    common::device_ptr d_temporary_storage(temporary_storage_bytes);
    HIP_CHECK(hipDeviceSynchronize());

    state.run(
        [&]
        {
            HIP_CHECK(rocprim::histogram_even(d_temporary_storage.get(),
                                              temporary_storage_bytes,
                                              d_input.get(),
                                              size,
                                              d_histogram.get(),
                                              bins + 1,
                                              lower_level,
                                              upper_level,
                                              stream,
                                              false));
        });

    state.set_throughput(size, sizeof(T));
}

// Benchmarks rocprim::multi_histogram_even. The input holds size * Channels
// values and one histogram is allocated per active channel; every active
// channel uses the same level range [0, bins * scale).
template
void run_multi_even_benchmark(benchmark_utils::state&& state,
                              size_t                   bins,
                              size_t                   scale,
                              int                      entropy_reduction)
{
    const auto& stream = state.stream;
    const auto& bytes  = state.bytes;

    // Calculate the number of elements
    size_t size = bytes / sizeof(T);

    using counter_type = unsigned int;
    using level_type = typename std::
        conditional_t::value && sizeof(T) < sizeof(int), int, T>;

    unsigned int num_levels[ActiveChannels];
    level_type   lower_level[ActiveChannels];
    level_type   upper_level[ActiveChannels];
    for(unsigned int channel = 0; channel < ActiveChannels; ++channel)
    {
        lower_level[channel] = 0;
        upper_level[channel] = bins * scale;
        num_levels[channel]  = bins + 1;
    }

    // Generate data
    std::vector input
        = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]);

    common::device_ptr d_input(input);

    // Per-channel histograms are raw hipMalloc allocations, freed at the end of
    // this function.
    counter_type* d_histogram[ActiveChannels];
    for(unsigned int channel = 0; channel < ActiveChannels; ++channel)
    {
        HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type)));
    }

    // Size query (null storage pointer), then allocation of the storage.
    size_t temporary_storage_bytes = 0;
    HIP_CHECK((rocprim::multi_histogram_even(nullptr,
                                             temporary_storage_bytes,
                                             d_input.get(),
                                             size,
                                             d_histogram,
                                             num_levels,
                                             lower_level,
                                             upper_level,
                                             stream,
                                             false)));

    common::device_ptr d_temporary_storage(temporary_storage_bytes);
    HIP_CHECK(hipDeviceSynchronize());

    state.run(
        [&]
        {
            HIP_CHECK(
                (rocprim::multi_histogram_even(d_temporary_storage.get(),
                                               temporary_storage_bytes,
                                               d_input.get(),
                                               size,
                                               d_histogram,
                                               num_levels,
                                               lower_level,
                                               upper_level,
                                               stream,
                                               false)));
        });

    state.set_throughput(size * Channels, sizeof(T));

    for(unsigned int channel = 0; channel < ActiveChannels; ++channel)
    {
        HIP_CHECK(hipFree(d_histogram[channel]));
    }
}

// Benchmarks rocprim::histogram_range on a single channel with explicit level
// boundaries 0, 1, ..., bins.
template
void run_range_benchmark(benchmark_utils::state&& state, size_t bins)
{
    const auto& stream = state.stream;
    const auto& bytes  = state.bytes;
    const auto& seed   = state.seed;

    // Calculate the number of elements
    size_t size = bytes / sizeof(T);

    using counter_type = unsigned int;
    using level_type = typename std::
        conditional_t::value && sizeof(T) < sizeof(int), int, T>;

    // Generate data
    const auto  random_range = limit_random_range(0, bins);
    std::vector input
        = get_random_data(size, random_range.first, random_range.second, seed.get_0());

    // bins + 1 level boundaries: levels[i] == i.
    std::vector levels(bins + 1);
    for(size_t i = 0; i < levels.size(); ++i)
    {
        levels[i] = static_cast(i);
    }

    common::device_ptr d_input(input);
    common::device_ptr d_levels(levels);
    common::device_ptr d_histogram(bins);

    // Size query (null storage pointer), then allocation of the storage.
    size_t temporary_storage_bytes = 0;
    HIP_CHECK(rocprim::histogram_range(nullptr,
                                       temporary_storage_bytes,
                                       d_input.get(),
                                       size,
                                       d_histogram.get(),
                                       bins + 1,
                                       d_levels.get(),
                                       stream,
                                       false));

    common::device_ptr d_temporary_storage(temporary_storage_bytes);
    HIP_CHECK(hipDeviceSynchronize());

    state.run(
        [&]
        {
            HIP_CHECK(rocprim::histogram_range(d_temporary_storage.get(),
                                               temporary_storage_bytes,
                                               d_input.get(),
                                               size,
                                               d_histogram.get(),
                                               bins + 1,
                                               d_levels.get(),
                                               stream,
                                               false));
        });

    state.set_throughput(size, sizeof(T));
}

// Benchmarks rocprim::multi_histogram_range. The input holds size * Channels
// values; every active channel gets its own device copy of the level
// boundaries 0, 1, ..., bins and its own histogram.
template
void run_multi_range_benchmark(benchmark_utils::state&& state, size_t bins)
{
    const auto& stream = state.stream;
    const auto& bytes  = state.bytes;
    const auto& seed   = state.seed;

    // Calculate the number of elements
    size_t size = bytes / sizeof(T);

    using counter_type = unsigned int;
    using level_type = typename std::
        conditional_t::value && sizeof(T) < sizeof(int), int, T>;

    const int    num_levels_channel = bins + 1;
    unsigned int num_levels[ActiveChannels];
    std::vector  levels[ActiveChannels];
    for(unsigned int channel = 0; channel < ActiveChannels; ++channel)
    {
        levels[channel].resize(num_levels_channel);
        for(size_t i = 0; i < levels[channel].size(); ++i)
        {
            levels[channel][i] = static_cast(i);
        }
        num_levels[channel] = num_levels_channel;
    }

    // Generate data
    const auto  random_range = limit_random_range(0, bins);
    std::vector input
        = get_random_data(size * Channels, random_range.first, random_range.second, seed.get_0());

    common::device_ptr d_input(input);

    // Per-channel level and histogram buffers are raw hipMalloc allocations,
    // freed at the end of this function.
    level_type*   d_levels[ActiveChannels];
    counter_type* d_histogram[ActiveChannels];
    for(unsigned int channel = 0; channel < ActiveChannels; ++channel)
    {
        HIP_CHECK(hipMalloc(&d_levels[channel], num_levels_channel * sizeof(level_type)));
        HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type)));
    }

    for(unsigned int channel = 0; channel < ActiveChannels; ++channel)
    {
        HIP_CHECK(hipMemcpy(d_levels[channel],
                            levels[channel].data(),
                            num_levels_channel * sizeof(level_type),
                            hipMemcpyHostToDevice));
    }

    // Size query (null storage pointer), then allocation of the storage.
    size_t temporary_storage_bytes = 0;
    HIP_CHECK((rocprim::multi_histogram_range(nullptr,
                                              temporary_storage_bytes,
                                              d_input.get(),
                                              size,
                                              d_histogram,
                                              num_levels,
                                              d_levels,
                                              stream,
                                              false)));

    common::device_ptr d_temporary_storage(temporary_storage_bytes);
    HIP_CHECK(hipDeviceSynchronize());

    state.run(
        [&]
        {
            HIP_CHECK(
                (rocprim::multi_histogram_range(d_temporary_storage.get(),
                                                temporary_storage_bytes,
                                                d_input.get(),
                                                size,
                                                d_histogram,
                                                num_levels,
                                                d_levels,
                                                stream,
                                                false)));
        });

    state.set_throughput(size * Channels, sizeof(T));

    for(unsigned int channel = 0; channel < ActiveChannels; ++channel)
    {
        HIP_CHECK(hipFree(d_levels[channel]));
        HIP_CHECK(hipFree(d_histogram[channel]));
    }
}

// Registration macros. Each CREATE_* macro queues one benchmark function on the
// local variable `executor`; the even variants additionally read the local
// variable `entropy_reduction`, so they must be expanded where it is in scope.
// The BENCHMARK_*_TYPE macros expand to the CREATE_* macro for 10, 100, 1000
// and 10000 bins.
// NOTE(review): the template arguments of the run_*_benchmark calls and of
// std::forward inside the macros are missing in this copy - confirm against
// upstream.
#define CREATE_EVEN_BENCHMARK(T, BINS, SCALE)                                                     \
    executor.queue_fn(                                                                            \
        bench_naming::format_name("{lvl:device,algo:histogram_even,value_type:" #T ",entropy:"    \
                                  + std::to_string(get_entropy_percents(entropy_reduction))       \
                                  + ",bins:" + std::to_string(BINS) + ",cfg:default_config}")     \
            .c_str(),                                                                             \
        [=](benchmark_utils::state&& state)                                                       \
        {                                                                                         \
            run_even_benchmark(std::forward(state),                                               \
                               BINS,                                                              \
                               SCALE,                                                             \
                               entropy_reduction);                                                \
        });

#define BENCHMARK_EVEN_TYPE(T, S)      \
    CREATE_EVEN_BENCHMARK(T, 10, S)    \
    CREATE_EVEN_BENCHMARK(T, 100, S)   \
    CREATE_EVEN_BENCHMARK(T, 1000, S)  \
    CREATE_EVEN_BENCHMARK(T, 10000, S)

#define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE)                    \
    executor.queue_fn(bench_naming::format_name(                                                  \
                          "{lvl:device,algo:multi_histogram_even,value_type:" #T                  \
                          ",channels:" #CHANNELS ",active_channels:" #ACTIVE_CHANNELS ",entropy:" \
                          + std::to_string(get_entropy_percents(entropy_reduction))               \
                          + ",bins:" + std::to_string(BINS) + ",cfg:default_config}")             \
                          .c_str(),                                                               \
                      [=](benchmark_utils::state&& state)                                         \
                      {                                                                           \
                          run_multi_even_benchmark(                                               \
                              std::forward(state),                                                \
                              BINS,                                                               \
                              SCALE,                                                              \
                              entropy_reduction);                                                 \
                      });

#define BENCHMARK_MULTI_EVEN_TYPE(C, A, T, S)      \
    CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 10, S)    \
    CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 100, S)   \
    CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 1000, S)  \
    CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 10000, S)

#define CREATE_RANGE_BENCHMARK(T, BINS)                                                       \
    executor.queue_fn(                                                                        \
        bench_naming::format_name("{lvl:device,algo:histogram_range,value_type:" #T ",bins:"  \
                                  + std::to_string(BINS) + ",cfg:default_config}")            \
            .c_str(),                                                                         \
        [=](benchmark_utils::state&& state)                                                   \
        { run_range_benchmark(std::forward(state), BINS); });

#define BENCHMARK_RANGE_TYPE(T)      \
    CREATE_RANGE_BENCHMARK(T, 10)    \
    CREATE_RANGE_BENCHMARK(T, 100)   \
    CREATE_RANGE_BENCHMARK(T, 1000)  \
    CREATE_RANGE_BENCHMARK(T, 10000)

#define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS)                       \
    executor.queue_fn(bench_naming::format_name(                                               \
                          "{lvl:device,algo:multi_histogram_range,value_type:" #T              \
                          ",channels:" #CHANNELS ",active_channels:" #ACTIVE_CHANNELS ",bins:" \
                          + std::to_string(BINS) + ",cfg:default_config}")                     \
                          .c_str(),                                                            \
                      [=](benchmark_utils::state&& state)                                      \
                      {                                                                        \
                          run_multi_range_benchmark(                                           \
                              std::forward(state),                                             \
                              BINS);                                                           \
                      });

#define BENCHMARK_MULTI_RANGE_TYPE(C, A, T)      \
    CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 10)    \
    CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 100)   \
    CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 1000)  \
    CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 10000)

// Entry point: builds the executor from the command line and runs it. The
// explicit benchmark list is only queued when BENCHMARK_CONFIG_TUNING is not
// defined.
int main(int argc, char* argv[])
{
    // NOTE(review): the meaning of the trailing arguments (10, 5) is defined by
    // benchmark_utils::executor, which is not visible here.
    benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5);

#ifndef BENCHMARK_CONFIG_TUNING
    // Level 6 is not one of the cases handled by get_entropy_percents(), so
    // those benchmarks are named with "entropy:0".
    const int entropy_reductions[] = {0, 2, 4, 6};

    // Even benchmarks
    for(int entropy_reduction : entropy_reductions)
    {
        BENCHMARK_EVEN_TYPE(long long, 12345)
        BENCHMARK_EVEN_TYPE(int, 1234)
        BENCHMARK_EVEN_TYPE(short, 5)
        CREATE_EVEN_BENCHMARK(unsigned char, 16, 16)
        CREATE_EVEN_BENCHMARK(unsigned char, 256, 1)
        BENCHMARK_EVEN_TYPE(double, 1234)
        BENCHMARK_EVEN_TYPE(float, 1234)
        BENCHMARK_EVEN_TYPE(rocprim::half, 5)
        CREATE_EVEN_BENCHMARK(rocprim::int128_t, 16, 16)
        CREATE_EVEN_BENCHMARK(rocprim::int128_t, 256, 1)
        CREATE_EVEN_BENCHMARK(rocprim::uint128_t, 16, 16)
        CREATE_EVEN_BENCHMARK(rocprim::uint128_t, 256, 1)
    }

    // Multi-even benchmarks
    for(int entropy_reduction : entropy_reductions)
    {
        BENCHMARK_MULTI_EVEN_TYPE(4, 4, int, 1234)
        BENCHMARK_MULTI_EVEN_TYPE(4, 3, short, 5)
        CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 16, 16)
        CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 256, 1)
        BENCHMARK_MULTI_EVEN_TYPE(3, 3, float, 1234)
        CREATE_MULTI_EVEN_BENCHMARK(4, 3, rocprim::int128_t, 16, 16)
        CREATE_MULTI_EVEN_BENCHMARK(4, 3, rocprim::int128_t, 256, 1)
        CREATE_MULTI_EVEN_BENCHMARK(4, 3, rocprim::uint128_t, 16, 16)
        CREATE_MULTI_EVEN_BENCHMARK(4, 3, rocprim::uint128_t, 256, 1)
    }

    // Range benchmarks
    BENCHMARK_RANGE_TYPE(long long)
    BENCHMARK_RANGE_TYPE(int)
    BENCHMARK_RANGE_TYPE(short)
    CREATE_RANGE_BENCHMARK(unsigned char, 16)
    CREATE_RANGE_BENCHMARK(unsigned char, 256)
    BENCHMARK_RANGE_TYPE(double)
    BENCHMARK_RANGE_TYPE(float)
    BENCHMARK_RANGE_TYPE(rocprim::half)
    CREATE_RANGE_BENCHMARK(rocprim::int128_t, 16)
    CREATE_RANGE_BENCHMARK(rocprim::int128_t, 256)
    CREATE_RANGE_BENCHMARK(rocprim::uint128_t, 16)
    CREATE_RANGE_BENCHMARK(rocprim::uint128_t, 256)

    // Multi-range benchmarks
    BENCHMARK_MULTI_RANGE_TYPE(4, 4, int)
    BENCHMARK_MULTI_RANGE_TYPE(4, 3, short)
    CREATE_MULTI_RANGE_BENCHMARK(4, 3, unsigned char, 16)
    CREATE_MULTI_RANGE_BENCHMARK(4, 3, unsigned char, 256)
    BENCHMARK_MULTI_RANGE_TYPE(3, 3, float)
    BENCHMARK_MULTI_RANGE_TYPE(2, 2, double)
    CREATE_MULTI_RANGE_BENCHMARK(4, 3, rocprim::int128_t, 16)
    CREATE_MULTI_RANGE_BENCHMARK(4, 3, rocprim::int128_t, 256)
    CREATE_MULTI_RANGE_BENCHMARK(4, 3, rocprim::uint128_t, 16)
    CREATE_MULTI_RANGE_BENCHMARK(4, 3, rocprim::uint128_t, 256)
#endif

    executor.run();
}
rocPRIM-rocm-7.1.0/benchmark/benchmark_device_histogram.parallel.cpp.in000066400000000000000000000026641506507210100261210ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#include "benchmark_utils.hpp"

#include "benchmark_device_histogram.parallel.hpp"

// NOTE(review): the header names of the two bare #include directives below are
// missing in this copy of the file - restore them from upstream.
#include
#include

namespace
{
// Template for a generated translation unit: @DataType@ and @BlockSize@ are
// placeholders that are presumably substituted by the build system (the file
// name ends in .cpp.in). The initializer of `unused` passes the generator's
// `create` function to benchmark_utils::executor::queue_autotune.
auto unused = benchmark_utils::executor::queue_autotune(
    device_histogram_benchmark_generator<@DataType@, @BlockSize@>::create);
}
rocPRIM-rocm-7.1.0/benchmark/benchmark_device_histogram.parallel.hpp000066400000000000000000000354241506507210100255210ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2023-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_HISTOGRAM_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_HISTOGRAM_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include #include // Builds a host vector of `size` samples, each computed as lower_level + v % (upper_level - lower_level). // Only the first max_random_size samples are drawn from the generator (fixed seed 123); the rest of the // vector repeats that prefix. entropy_reduction >= 5 skips the generator and fills the vector with the // midpoint (lower_level + upper_level) / 2. template std::vector generate(size_t size, int entropy_reduction, int lower_level, int upper_level) { if(entropy_reduction >= 5) { return std::vector(size, static_cast((lower_level + upper_level) / 2)); } const size_t max_random_size = 1024 * 1024 + 4321; const unsigned int seed = 123; engine_type gen(seed); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { // Reduce entropy by applying bitwise AND to random bits // "An Improved Supercomputer Sorting Benchmark", 1992 // Kurt Thearling & Stephen Smith auto v = gen(); for(int e = 0; e < entropy_reduction; ++e) { v &= gen(); } return T(lower_level + v % (upper_level - lower_level)); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } // Cache for input data when multiple cases must be benchmarked with various configurations and // same inputs can be used for consecutive benchmarks. // It must be used as a singleton. template class input_cache { public: ~input_cache() { clear(); } void clear() { total_cache_size = 0; cache.clear(); } // The function returns an existing buffer if main_key matches and there is additional_key // in the cache, or generates a new buffer using gen(). // If main_key does not match, it frees all device buffers and resets the cache. template T* get_or_generate(const std::string& main_key, const std::string& additional_key, F gen) { // Experimentally determined maximum size, before the GPU runs out of memory. 
static constexpr short max_default_bytes_count = 88; if(this->main_key != main_key) { // The main key (for example, data type) has been changed, clear the cache clear(); this->main_key = main_key; } auto result = cache.find(additional_key); if(result != cache.end()) { return reinterpret_cast(result->second.get()); } // Generate a new buffer std::vector data = gen(); common::device_ptr d_buffer; if(total_cache_size >= max_default_bytes_count) { // the memory space of the value of last key-value pair is held by d_buffer // and the pair is erased from the cache map auto iter = cache.end(); --iter; d_buffer = std::move(iter->second); cache.erase(iter); } else { // it will generate a new memory space to store in cache // so records the new size in advance total_cache_size += sizeof(T); } d_buffer.store(data); cache[additional_key] = std::move(d_buffer); return cache[additional_key].get(); } static input_cache& instance() { static input_cache instance; return instance; } private: std::string main_key; std::map> cache; short total_cache_size = 0; }; template std::string config_name() { const rocprim::detail::histogram_config_params config = Config(); return "{bs:" + std::to_string(config.histogram_config.block_size) + ",ipt:" + std::to_string(config.histogram_config.items_per_thread) + ",max_grid_size:" + std::to_string(config.max_grid_size) + ",shared_impl_max_bins:" + std::to_string(config.shared_impl_max_bins) + ",shared_impl_histograms:" + std::to_string(config.shared_impl_histograms) + ",global_hist_bs:" + std::to_string(config.histogram_global_config.block_size) + ",global_hist_ipt:" + std::to_string(config.histogram_global_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_histogram_benchmark : public benchmark_utils::autotune_interface { std::vector cases; device_histogram_benchmark(const std::vector& cases) : cases(cases) {} std::string name() const override { using namespace 
std::string_literals; return bench_naming::format_name( "{lvl:device,algo:histogram,value_type:" + std::string(Traits::name()) + ",channels:" + std::to_string(Channels) + ",active_channels:" + std::to_string(ActiveChannels) + ",cfg:" + config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; using counter_type = unsigned int; using level_type = typename std:: conditional_t::value && sizeof(T) < sizeof(int), int, T>; struct case_data { unsigned int bins; int entropy_reduction; level_type lower_level[ActiveChannels]{}; level_type upper_level[ActiveChannels]{}; unsigned int num_levels[ActiveChannels]{}; T* get_d_input(size_t bytes) { return input_cache::instance().get_or_generate( std::string(Traits::name()), std::to_string(bins) + "_" + std::to_string(entropy_reduction), [&]() { return generate(bytes, entropy_reduction, 0, bins); }); }; }; const std::size_t size = bytes / Channels; size_t temporary_storage_bytes = 0; counter_type* d_histogram[ActiveChannels]; unsigned int max_bins = 0; std::vector cases_data; for(const auto& bins : cases) { for(int entropy_reduction : {0, 2, 4, 6}) { case_data data = {bins, entropy_reduction}; // Reuse inputs for the same sample type. This autotune uses multiple inputs for all // combinations of bins and entropy, but the inputs do not depend on autotuned // params (bs, ipt, shared_impl_max_bins) and can be reused saving time needed for // generating and copying to device. 
for(unsigned int channel = 0; channel < ActiveChannels; ++channel) { data.lower_level[channel] = 0; data.upper_level[channel] = bins; data.num_levels[channel] = bins + 1; } cases_data.push_back(data); size_t current_temporary_storage_bytes = 0; HIP_CHECK((rocprim::multi_histogram_even( nullptr, current_temporary_storage_bytes, data.get_d_input(bytes), size, d_histogram, data.num_levels, data.lower_level, data.upper_level, stream, false))); temporary_storage_bytes = std::max(temporary_storage_bytes, current_temporary_storage_bytes); max_bins = std::max(max_bins, bins); } } common::device_ptr d_temporary_storage(temporary_storage_bytes); for(unsigned int channel = 0; channel < ActiveChannels; ++channel) { HIP_CHECK(hipMalloc(&d_histogram[channel], max_bins * sizeof(counter_type))); } HIP_CHECK(hipDeviceSynchronize()); size_t total_size = 0; for(auto& data : cases_data) { T* d_input = data.get_d_input(bytes); state.run( [&] { HIP_CHECK((rocprim::multi_histogram_even( d_temporary_storage.get(), temporary_storage_bytes, d_input, size, d_histogram, data.num_levels, data.lower_level, data.upper_level, stream, false))); }); total_size += size * Channels; } state.set_throughput(total_size, sizeof(T)); for(unsigned int channel = 0; channel < ActiveChannels; ++channel) { HIP_CHECK(hipFree(d_histogram[channel])); } } }; template struct device_histogram_benchmark_generator { static constexpr unsigned int min_items_per_thread = 1; static constexpr unsigned int max_items_per_thread = 16; static constexpr unsigned int min_shared_impl_histograms = 2; static constexpr unsigned int max_shared_impl_histograms = 4; template struct create_ipt { template struct create_shared_impl_histograms { using generated_config = rocprim::histogram_config, 2048, 2048, SharedImplHistograms, rocprim::kernel_config<1024, 4>>; template auto create(std::vector>& storage, const std::vector& cases) -> typename std::enable_if<(items_per_thread * Channels <= max_items_per_thread), void>::type { 
storage.emplace_back( std::make_unique< device_histogram_benchmark>( cases)); } template auto create( std::vector>& /*storage*/, const std::vector& /*cases*/) -> typename std::enable_if::type {} void operator()( std::vector>& storage, const std::vector& cases) { // Tune histograms for single-channel data (histogram_even) create<1, 1>(storage, cases); // and some multi-channel configurations (multi_histogram_even) create<2, 2>(storage, cases); create<3, 3>(storage, cases); create<4, 4>(storage, cases); create<4, 3>(storage, cases); } }; void operator()(std::vector>& storage, const std::vector& cases) { static_for_each, create_shared_impl_histograms>(storage, cases); } }; static void create(std::vector>& storage) { // Benchmark multiple cases (with various sample distributions) and use sum of all cases // as a measurement for autotuning std::vector cases; if(std::is_same::value) { cases = {16, 127}; } else { cases = { 10, 100, 1000, 10000 // Multiple bins to trigger a global memory implementation }; } static_for_each, create_ipt>(storage, cases); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_HISTOGRAM_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_memory.cpp000066400000000000000000001611661506507210100232370ustar00rootroot00000000000000// MIT License // // Copyright (c) 2018-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // rocPRIM #include #include #include #include #include #include #include #include #include #include #include enum memory_operation_method { block_primitives_transpose, striped, vectorized, block_primitive_direct, }; enum kernel_operation { no_operation, block_scan, custom_operation, atomics_no_collision, atomics_inter_block_collision, atomics_inter_warp_collision, }; template struct operation; // no operation template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&)[ItemsPerThread], void* = nullptr, unsigned int = 0, T* = nullptr) const { // No operation } }; #define repeats 30 // custom operation template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) const { (void)shared_storage; (void)shared_storage_size; (void)global_mem_output; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] = input[i] + 666; ROCPRIM_UNROLL for(unsigned int j = 0; j < repeats; ++j) { input[i] = input[i] * (input[j % ItemsPerThread]); } } } }; // block scan template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) const { (void)global_mem_output; using block_scan_type = 
typename rocprim:: block_scan; block_scan_type bscan; // when using vectorized or striped functions // NOTE: This is not safe but it is the easiest way to prevent code repetition if(shared_storage == nullptr || shared_storage_size < sizeof(typename block_scan_type::storage_type)) { __shared__ typename block_scan_type::storage_type storage; shared_storage = &storage; } bscan.inclusive_scan( input, input, *(reinterpret_cast(shared_storage))); __syncthreads(); } }; // atomics_no_collision template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) { (void)shared_storage; (void)shared_storage_size; (void)input; unsigned int index = threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // atomics_inter_warp_collision (presumably - TODO confirm against the specialization tag): the thread // index is taken modulo the wavefront size while the per-block offset is kept, so threads of different // wavefronts in one block target the same addresses template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) { (void)shared_storage; (void)shared_storage_size; (void)input; unsigned int index = (threadIdx.x % rocprim::arch::wavefront::min_size()) * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // atomics_inter_block_collision template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) { (void)shared_storage; (void)shared_storage_size; (void)input; unsigned int index = threadIdx.x * ItemsPerThread; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { atomicAdd(&global_mem_output[index + i], T(666)); } } 
}; // block_primitive_direct method base kernel template::value_type, typename std::enable_if::type = 0> __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; using block_load_type = typename rocprim:: block_load; using block_store_type = typename rocprim:: block_store; block_load_type load; block_store_type store; __shared__ union { typename block_load_type::storage_type load; typename block_store_type::storage_type store; } storage; int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); __syncthreads(); op(items, &storage, sizeof(storage), output); store.store(output + offset, items, storage.store); } // vectorized method base kernel template::value_type, typename std::enable_if::type = 0> __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; rocprim::block_load_direct_blocked_vectorized(threadIdx.x, input + offset, items); __syncthreads(); op(items, nullptr, 0, output); rocprim::block_store_direct_blocked_vectorized(threadIdx.x, output + offset, items); } // striped method base kernel template::value_type, typename std::enable_if::type = 0> __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T items[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, items); op(items, nullptr, 0, output); rocprim::block_store_direct_striped(lid, output + block_offset, items); } // block_primitives_transpose method base kernel template::value_type, typename std::enable_if::type = 0> __global__ __launch_bounds__(BlockSize) void operation_kernel(T* 
input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; using block_load_type = typename rocprim:: block_load; using block_store_type = typename rocprim::block_store; block_load_type load; block_store_type store; __shared__ union { typename block_load_type::storage_type load; typename block_store_type::storage_type store; } storage; int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); __syncthreads(); op(items, &storage, sizeof(storage), output); store.store(output + offset, items, storage.store); } template void run_benchmark(benchmark_utils::state&& state) { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; const size_t size = bytes / sizeof(T); const size_t grid_size = size / (BlockSize * ItemsPerThread); std::vector input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); operation selected_operation; state.run( [&] { hipLaunchKernelGGL( HIP_KERNEL_NAME(operation_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input.get(), d_output.get(), selected_operation); }); state.set_throughput(size, sizeof(T)); } template void run_benchmark_memcpy(benchmark_utils::state&& state) { const auto& bytes = state.bytes; const size_t size = bytes / sizeof(T); // Allocate device buffers // Note: since this benchmark only tests performance by memcpying between device buffers, // we don't really need to transfer data into these from the host - whatever happens // to be in device memory will do. 
common::device_ptr d_input(size); common::device_ptr d_output(size); state.run( [&] { HIP_CHECK(hipMemcpy(d_output.get(), d_input.get(), size * sizeof(T), hipMemcpyDeviceToDevice)); }); state.set_throughput(size, sizeof(T)); } #define CREATE_BENCHMARK(METHOD, OPERATION, T, BLOCK_SIZE, IPT) \ executor.queue_fn(bench_naming::format_name("{lvl:device,algo:memory,subalgo:" #METHOD \ ",operation:" #OPERATION ",key_type:" #T \ ",cfg:{bs:" #BLOCK_SIZE ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark); #define CREATE_BENCHMARK_MEMCPY(T) \ executor.queue_fn( \ bench_naming::format_name("{lvl:device,algo:memory,subalgo:copy,key_type:" #T \ ",cfg:default_config}") \ .c_str(), \ run_benchmark_memcpy); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 10); // simple memory copy not running kernel CREATE_BENCHMARK_MEMCPY(int) // simple memory copy CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 1024, 1) 
CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 1024, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 128, 16) 
CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::int128_t, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, no_operation, rocprim::uint128_t, 1024, 2) // 
simple memory copy using vector type CREATE_BENCHMARK(vectorized, no_operation, int, 128, 1) CREATE_BENCHMARK(vectorized, no_operation, int, 128, 2) CREATE_BENCHMARK(vectorized, no_operation, int, 128, 4) CREATE_BENCHMARK(vectorized, no_operation, int, 128, 8) CREATE_BENCHMARK(vectorized, no_operation, int, 128, 16) CREATE_BENCHMARK(vectorized, no_operation, int, 256, 1) CREATE_BENCHMARK(vectorized, no_operation, int, 256, 2) CREATE_BENCHMARK(vectorized, no_operation, int, 256, 4) CREATE_BENCHMARK(vectorized, no_operation, int, 256, 8) CREATE_BENCHMARK(vectorized, no_operation, int, 256, 16) CREATE_BENCHMARK(vectorized, no_operation, int, 512, 1) CREATE_BENCHMARK(vectorized, no_operation, int, 512, 2) CREATE_BENCHMARK(vectorized, no_operation, int, 512, 4) CREATE_BENCHMARK(vectorized, no_operation, int, 512, 8) CREATE_BENCHMARK(vectorized, no_operation, int, 1024, 1) CREATE_BENCHMARK(vectorized, no_operation, int, 1024, 2) CREATE_BENCHMARK(vectorized, no_operation, int, 1024, 4) CREATE_BENCHMARK(vectorized, no_operation, int, 1024, 8) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 128, 1) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 128, 2) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 128, 4) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 128, 8) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 128, 16) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 256, 1) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 256, 2) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 256, 4) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 256, 8) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 256, 16) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 512, 1) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 512, 2) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 512, 4) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 512, 8) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 1024, 1) CREATE_BENCHMARK(vectorized, 
no_operation, uint64_t, 1024, 2) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 1024, 4) CREATE_BENCHMARK(vectorized, no_operation, uint64_t, 1024, 8) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 128, 1) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 128, 2) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 128, 4) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 128, 8) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 128, 16) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 256, 1) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 256, 2) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 256, 4) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 256, 8) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 256, 16) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 512, 1) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 512, 2) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 512, 4) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 512, 8) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 1024, 1) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 1024, 2) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 1024, 4) CREATE_BENCHMARK(vectorized, no_operation, rocprim::int128_t, 1024, 8) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 128, 1) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 128, 2) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 128, 4) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 128, 8) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 128, 16) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 256, 1) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 256, 2) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 256, 4) 
CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 256, 8) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 256, 16) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 512, 1) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 512, 2) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 512, 4) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 512, 8) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 1024, 1) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 1024, 2) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 1024, 4) CREATE_BENCHMARK(vectorized, no_operation, rocprim::uint128_t, 1024, 8) // simple memory copy using striped CREATE_BENCHMARK(striped, no_operation, int, 128, 1) CREATE_BENCHMARK(striped, no_operation, int, 128, 2) CREATE_BENCHMARK(striped, no_operation, int, 128, 4) CREATE_BENCHMARK(striped, no_operation, int, 128, 8) CREATE_BENCHMARK(striped, no_operation, int, 128, 16) CREATE_BENCHMARK(striped, no_operation, int, 256, 1) CREATE_BENCHMARK(striped, no_operation, int, 256, 2) CREATE_BENCHMARK(striped, no_operation, int, 256, 4) CREATE_BENCHMARK(striped, no_operation, int, 256, 8) CREATE_BENCHMARK(striped, no_operation, int, 256, 16) CREATE_BENCHMARK(striped, no_operation, int, 512, 1) CREATE_BENCHMARK(striped, no_operation, int, 512, 2) CREATE_BENCHMARK(striped, no_operation, int, 512, 4) CREATE_BENCHMARK(striped, no_operation, int, 512, 8) CREATE_BENCHMARK(striped, no_operation, int, 1024, 1) CREATE_BENCHMARK(striped, no_operation, int, 1024, 2) CREATE_BENCHMARK(striped, no_operation, int, 1024, 4) CREATE_BENCHMARK(striped, no_operation, int, 1024, 8) CREATE_BENCHMARK(striped, no_operation, uint64_t, 128, 1) CREATE_BENCHMARK(striped, no_operation, uint64_t, 128, 2) CREATE_BENCHMARK(striped, no_operation, uint64_t, 128, 4) CREATE_BENCHMARK(striped, no_operation, uint64_t, 128, 8) CREATE_BENCHMARK(striped, no_operation, uint64_t, 128, 
16) CREATE_BENCHMARK(striped, no_operation, uint64_t, 256, 1) CREATE_BENCHMARK(striped, no_operation, uint64_t, 256, 2) CREATE_BENCHMARK(striped, no_operation, uint64_t, 256, 4) CREATE_BENCHMARK(striped, no_operation, uint64_t, 256, 8) CREATE_BENCHMARK(striped, no_operation, uint64_t, 256, 16) CREATE_BENCHMARK(striped, no_operation, uint64_t, 512, 1) CREATE_BENCHMARK(striped, no_operation, uint64_t, 512, 2) CREATE_BENCHMARK(striped, no_operation, uint64_t, 512, 4) CREATE_BENCHMARK(striped, no_operation, uint64_t, 512, 8) CREATE_BENCHMARK(striped, no_operation, uint64_t, 1024, 1) CREATE_BENCHMARK(striped, no_operation, uint64_t, 1024, 2) CREATE_BENCHMARK(striped, no_operation, uint64_t, 1024, 4) CREATE_BENCHMARK(striped, no_operation, uint64_t, 1024, 8) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 128, 1) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 128, 2) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 128, 4) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 128, 8) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 128, 16) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 256, 1) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 256, 2) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 256, 4) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 256, 8) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 256, 16) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 512, 1) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 512, 2) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 512, 4) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 512, 8) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 1024, 1) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 1024, 2) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 1024, 4) CREATE_BENCHMARK(striped, no_operation, rocprim::int128_t, 1024, 8) 
CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 128, 1) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 128, 2) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 128, 4) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 128, 8) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 128, 16) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 256, 1) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 256, 2) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 256, 4) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 256, 8) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 256, 16) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 512, 1) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 512, 2) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 512, 4) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 512, 8) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 1024, 1) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 1024, 2) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 1024, 4) CREATE_BENCHMARK(striped, no_operation, rocprim::uint128_t, 1024, 8) // block_scan CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 128, 32) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, 
block_scan, int, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, 1024, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, 1024, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 256, 1) 
CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, 
uint64_t, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::int128_t, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, 
block_scan, rocprim::uint128_t, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, block_scan, rocprim::uint128_t, 1024, 2) // vectorized - block_scan CREATE_BENCHMARK(vectorized, block_scan, int, 128, 1) CREATE_BENCHMARK(vectorized, block_scan, int, 128, 2) CREATE_BENCHMARK(vectorized, block_scan, int, 128, 4) CREATE_BENCHMARK(vectorized, block_scan, int, 128, 8) CREATE_BENCHMARK(vectorized, block_scan, int, 128, 16) CREATE_BENCHMARK(vectorized, block_scan, int, 256, 1) CREATE_BENCHMARK(vectorized, block_scan, int, 256, 2) CREATE_BENCHMARK(vectorized, block_scan, int, 256, 4) CREATE_BENCHMARK(vectorized, block_scan, int, 256, 8) CREATE_BENCHMARK(vectorized, block_scan, int, 256, 16) CREATE_BENCHMARK(vectorized, block_scan, int, 512, 1) CREATE_BENCHMARK(vectorized, block_scan, int, 512, 2) CREATE_BENCHMARK(vectorized, block_scan, int, 512, 4) CREATE_BENCHMARK(vectorized, block_scan, int, 512, 8) CREATE_BENCHMARK(vectorized, block_scan, int, 1024, 1) CREATE_BENCHMARK(vectorized, block_scan, int, 1024, 2) CREATE_BENCHMARK(vectorized, block_scan, int, 1024, 4) CREATE_BENCHMARK(vectorized, block_scan, int, 1024, 8) CREATE_BENCHMARK(vectorized, block_scan, float, 128, 1) CREATE_BENCHMARK(vectorized, block_scan, float, 128, 2) CREATE_BENCHMARK(vectorized, block_scan, float, 128, 4) CREATE_BENCHMARK(vectorized, block_scan, float, 128, 8) CREATE_BENCHMARK(vectorized, block_scan, float, 128, 16) CREATE_BENCHMARK(vectorized, block_scan, float, 256, 1) CREATE_BENCHMARK(vectorized, block_scan, float, 256, 2) CREATE_BENCHMARK(vectorized, block_scan, float, 256, 4) CREATE_BENCHMARK(vectorized, block_scan, float, 256, 8) 
CREATE_BENCHMARK(vectorized, block_scan, float, 256, 16) CREATE_BENCHMARK(vectorized, block_scan, float, 512, 1) CREATE_BENCHMARK(vectorized, block_scan, float, 512, 2) CREATE_BENCHMARK(vectorized, block_scan, float, 512, 4) CREATE_BENCHMARK(vectorized, block_scan, float, 512, 8) CREATE_BENCHMARK(vectorized, block_scan, float, 1024, 1) CREATE_BENCHMARK(vectorized, block_scan, float, 1024, 2) CREATE_BENCHMARK(vectorized, block_scan, float, 1024, 4) CREATE_BENCHMARK(vectorized, block_scan, float, 1024, 8) CREATE_BENCHMARK(vectorized, block_scan, double, 128, 1) CREATE_BENCHMARK(vectorized, block_scan, double, 128, 2) CREATE_BENCHMARK(vectorized, block_scan, double, 128, 4) CREATE_BENCHMARK(vectorized, block_scan, double, 128, 8) CREATE_BENCHMARK(vectorized, block_scan, double, 128, 16) CREATE_BENCHMARK(vectorized, block_scan, double, 256, 1) CREATE_BENCHMARK(vectorized, block_scan, double, 256, 2) CREATE_BENCHMARK(vectorized, block_scan, double, 256, 4) CREATE_BENCHMARK(vectorized, block_scan, double, 256, 8) CREATE_BENCHMARK(vectorized, block_scan, double, 256, 16) CREATE_BENCHMARK(vectorized, block_scan, double, 512, 1) CREATE_BENCHMARK(vectorized, block_scan, double, 512, 2) CREATE_BENCHMARK(vectorized, block_scan, double, 512, 4) CREATE_BENCHMARK(vectorized, block_scan, double, 512, 8) CREATE_BENCHMARK(vectorized, block_scan, double, 1024, 1) CREATE_BENCHMARK(vectorized, block_scan, double, 1024, 2) CREATE_BENCHMARK(vectorized, block_scan, double, 1024, 4) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 128, 1) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 128, 2) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 128, 4) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 128, 8) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 128, 16) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 256, 1) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 256, 2) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 256, 4) CREATE_BENCHMARK(vectorized, block_scan, 
uint64_t, 256, 8) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 256, 16) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 512, 1) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 512, 2) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 512, 4) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 512, 8) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 1024, 1) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 1024, 2) CREATE_BENCHMARK(vectorized, block_scan, uint64_t, 1024, 4) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 128, 1) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 128, 2) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 128, 4) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 128, 8) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 128, 16) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 256, 1) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 256, 2) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 256, 4) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 256, 8) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 256, 16) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 512, 1) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 512, 2) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 512, 4) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 512, 8) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 1024, 1) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 1024, 2) CREATE_BENCHMARK(vectorized, block_scan, rocprim::int128_t, 1024, 4) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 128, 1) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 128, 2) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 128, 4) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 128, 8) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 128, 
16) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 256, 1) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 256, 2) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 256, 4) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 256, 8) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 256, 16) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 512, 1) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 512, 2) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 512, 4) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 512, 8) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 1024, 1) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 1024, 2) CREATE_BENCHMARK(vectorized, block_scan, rocprim::uint128_t, 1024, 4) // custom_op CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 1024, 1) 
CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 256, 1) 
CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, 
uint64_t, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::int128_t, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 256, 
2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, custom_operation, rocprim::uint128_t, 1024, 2) // block_primitives_transpose - atomics no collision CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 1024, 2) 
CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, 1024, 8) // block_primitives_transpose - atomics inter block collision CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 128, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, 1024, 8) // block_primitives_transpose - atomics inter warp collision CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 128, 1) 
CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 128, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 128, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 128, 8) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 128, 16) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 256, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 256, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 256, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 256, 8) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 256, 16) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 512, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 512, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 512, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 512, 8) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 1024, 1) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 1024, 2) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 1024, 4) CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, 1024, 8) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge.cpp000066400000000000000000000060601506507210100230150ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_merge.parallel.hpp" #include "benchmark_utils.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // HIP API #include // rocPRIM #ifndef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(...) 
executor.queue_instance(device_merge_benchmark<__VA_ARGS__>()); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_int2 = common::custom_type; using custom_double2 = common::custom_type; using huge_float2_1024 = common::custom_huge_type<1024, float, float>; using huge_float2_2048 = common::custom_huge_type<2048, float, float>; CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) CREATE_BENCHMARK(huge_float2_1024) CREATE_BENCHMARK(huge_float2_2048) CREATE_BENCHMARK(int, int) CREATE_BENCHMARK(long long, long long) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(custom_int2, custom_int2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t) CREATE_BENCHMARK(huge_float2_1024, huge_float2_1024) CREATE_BENCHMARK(huge_float2_2048, huge_float2_2048) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge.parallel.cpp.in000066400000000000000000000026611506507210100252200ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "benchmark_device_merge.parallel.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_merge_benchmark_generator<@KeyType@, @ValueType@, @BlockSize@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge.parallel.hpp000066400000000000000000000313071506507210100246170ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_MERGE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_MERGE_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM HIP API #include #include #include #include #include #include #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #endif template std::string config_name() { const rocprim::detail::merge_config_params params = Config(); return "{bs:" + std::to_string(params.kernel_config.block_size) + ",ipt:" + std::to_string(params.kernel_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_merge_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:merge,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } // keys benchmark template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = KeyType; using compare_op_type = typename std::conditional::value, half_less, rocprim::less>::type; size_t size = bytes / sizeof(key_type); const size_t size1 = size / 2; const size_t size2 = size - size1; compare_op_type compare_op; // Generate data const auto random_range = limit_random_range(0, size); std::vector keys_input1 = get_random_data(size1, random_range.first, random_range.second, seed.get_0()); std::vector keys_input2 = get_random_data(size2, random_range.first, random_range.second, seed.get_1()); std::sort(keys_input1.begin(), keys_input1.end(), compare_op); std::sort(keys_input2.begin(), keys_input2.end(), compare_op); common::device_ptr d_keys_input1(keys_input1); common::device_ptr d_keys_input2(keys_input2); 
common::device_ptr d_keys_output(size); common::device_ptr d_temporary_storage; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::merge(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input1.get(), d_keys_input2.get(), d_keys_output.get(), size1, size2, compare_op, stream, false)); d_temporary_storage.resize(temporary_storage_bytes); state.run( [&] { HIP_CHECK(rocprim::merge(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input1.get(), d_keys_input2.get(), d_keys_output.get(), size1, size2, compare_op, stream, false)); }); state.set_throughput(size, sizeof(key_type)); } // pairs benchmark template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = KeyType; using value_type = ValueType; using compare_op_type = typename std::conditional::value, half_less, rocprim::less>::type; size_t size = bytes / sizeof(key_type); const size_t size1 = size / 2; const size_t size2 = size - size1; compare_op_type compare_op; // Generate data const auto random_range = limit_random_range(0, size); std::vector keys_input1 = get_random_data(size1, random_range.first, random_range.second, seed.get_0()); std::vector keys_input2 = get_random_data(size2, random_range.first, random_range.second, seed.get_1()); std::sort(keys_input1.begin(), keys_input1.end(), compare_op); std::sort(keys_input2.begin(), keys_input2.end(), compare_op); std::vector values_input1(size1); std::vector values_input2(size2); std::iota(values_input1.begin(), values_input1.end(), 0); std::iota(values_input2.begin(), values_input2.end(), size1); common::device_ptr d_keys_input1(keys_input1); common::device_ptr d_keys_input2(keys_input2); common::device_ptr d_keys_output(size); common::device_ptr d_values_input1(size1); common::device_ptr d_values_input2(size2); common::device_ptr d_values_output(size); common::device_ptr 
d_temporary_storage; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::merge(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input1.get(), d_keys_input2.get(), d_keys_output.get(), d_values_input1.get(), d_values_input2.get(), d_values_output.get(), size1, size2, compare_op, stream, false)); d_temporary_storage.resize(temporary_storage_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(rocprim::merge(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input1.get(), d_keys_input2.get(), d_keys_output.get(), d_values_input1.get(), d_values_input2.get(), d_values_output.get(), size1, size2, compare_op, stream, false)); }); state.set_throughput(size, sizeof(key_type) + sizeof(value_type)); } void run(benchmark_utils::state&& state) override { do_run(std::forward(state)); } }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_merge_benchmark_generator { template struct create_ipt { static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExponent; using generated_config = rocprim::merge_config; using benchmark_struct = device_merge_benchmark; void operator()(std::vector>& storage) { storage.emplace_back(std::make_unique()); } }; struct create_default_config { using default_config = typename rocprim::detail::default_merge_config_base::type; using benchmark_struct = device_merge_benchmark; void operator()(std::vector>& storage) { storage.emplace_back(std::make_unique()); } }; static void create(std::vector>& storage) { static constexpr unsigned int min_items_per_thread_exponent = 0u; // Very large block sizes don't work with large items_per_thread since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = sizeof(KeyType) + sizeof(ValueType); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * max_size_per_element); static constexpr unsigned int max_items_per_thread_exponent = 
rocprim::Log2::VALUE - 1; create_default_config()(storage); static_for_each, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_MERGE_PARALLEL_HPP_rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge_inplace.cpp000066400000000000000000000153501506507210100245120ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" #include #include #include #include #include #include #include #include #include #include #include #include #include template struct random_monotonic_iterator { const unsigned int seed; random_monotonic_iterator(unsigned int seed) : seed(seed) {} using difference_type = std::ptrdiff_t; using value_type = T; // not all integral types are valid for int distribution using dist_value_type = std::conditional_t::value && !common::is_valid_for_int_distribution::value, int, T>; using dist_type = std::conditional_t::value, common::uniform_int_distribution, std::uniform_real_distribution>; std::mt19937 engine{seed}; dist_type dist{dist_value_type{0}, dist_value_type{increment}}; dist_value_type value = dist_value_type{0}; int operator*() const { return limit_cast(value); } random_monotonic_iterator& operator++() { // prefix value += dist(engine); return *this; } random_monotonic_iterator operator++(int) { // postfix random_monotonic_iterator retval{*this}; value += dist(engine); return retval; } }; template struct inplace_runner { using value_type = ValueT; using compare_op_type = typename std::conditional::value, half_less, rocprim::less>::type; value_type* d_data; size_t left_size; size_t right_size; hipStream_t stream; common::device_ptr d_temporary_storage; size_t temporary_storage_bytes = 0; compare_op_type compare_op{}; inplace_runner(value_type* data, size_t left_size, size_t right_size, hipStream_t stream) : d_data(data), left_size(left_size), right_size(right_size), stream(stream) {} void prepare() { HIP_CHECK(rocprim::merge_inplace(d_temporary_storage.get(), temporary_storage_bytes, d_data, left_size, right_size, compare_op, stream)); d_temporary_storage.resize(temporary_storage_bytes); } void run() { HIP_CHECK(rocprim::merge_inplace(d_temporary_storage.get(), temporary_storage_bytes, d_data, left_size, right_size, compare_op, stream)); } }; 
template void run_merge_inplace_benchmarks(benchmark_utils::state&& state) { const auto& stream = state.stream; const auto& size_a = state.bytes; const auto& size_b = state.bytes; const auto& seed = state.seed; using value_type = ValueT; using runner_type = RunnerT; size_t total_size = size_a + size_b; std::vector h_data(total_size); auto gen_a_it = random_monotonic_iterator{seed.get_0()}; auto gen_b_it = random_monotonic_iterator{seed.get_1()}; // generate left array for(size_t i = 0; i < size_a; ++i) { h_data[i] = static_cast(*(gen_a_it++)); } // generate right array for(size_t i = 0; i < size_b; ++i) { h_data[size_a + i] = static_cast(*(gen_b_it++)); } common::device_ptr d_data(total_size); runner_type runner{d_data.get(), size_a, size_b, stream}; runner.prepare(); state.run_before_every_iteration([&] { d_data.store(h_data); }); state.run([&] { runner.run(); }); state.set_throughput(total_size, sizeof(value_type)); } #define CREATE_BENCHMARK(Value) \ executor.queue_fn( \ bench_naming::format_name("{lvl:device,algo:merge_inplace,value_type:" #Value \ ",cfg:default_config}") \ .c_str(), \ [=](benchmark_utils::state&& state) \ { \ run_merge_inplace_benchmarks>( \ std::forward(state)); \ }); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0); CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(int16_t) CREATE_BENCHMARK(int32_t) CREATE_BENCHMARK(int64_t) CREATE_BENCHMARK(rocprim::int128_t) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge_sort.cpp000066400000000000000000000071131506507210100240640ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_merge_sort.hpp" #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" // HIP API #include #include #include #include #include #include #define CREATE_BENCHMARK(...) 
executor.queue_instance(device_merge_sort_benchmark<__VA_ARGS__>()); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_double2_copy = common::custom_type_copyable; // specific benchmark for specialization workaround using custom_int2 = common::custom_type; using custom_char_double = common::custom_type; // used by ssbk using custom_char_double_copy = common::custom_type_copyable; // specific benchmark for specialization workaround using custom_longlong_double = common::custom_type; using huge_float2_1024 = common::custom_huge_type<1024, float, float>; using huge_float2_2048 = common::custom_huge_type<2048, float, float>; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(huge_float2_1024) CREATE_BENCHMARK(huge_float2_2048) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2_copy) CREATE_BENCHMARK(custom_int2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_char_double) CREATE_BENCHMARK(custom_int2, custom_char_double_copy) CREATE_BENCHMARK(custom_int2, custom_longlong_double) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge_sort.hpp000066400000000000000000000216461506507210100241000ustar00rootroot00000000000000// MIT License // // Copyright (c) 
2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_MERGE_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_MERGE_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include namespace rp = rocprim; template struct device_merge_sort_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:merge_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } // keys benchmark template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); ::rp::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, lesser_op, stream, false)); }); state.set_throughput(size, sizeof(key_type)); HIP_CHECK(hipFree(d_temporary_storage)); 
HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector values_input(size); if constexpr(common::is_custom_type_copyable::value) { for(size_t i = 0; i < size; i++) { value_type value; value.x = static_cast(i); value.y = static_cast(i); values_input[i] = value; } } else { std::iota(values_input.begin(), values_input.end(), 0); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); ::rp::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, 
d_values_output, size, lesser_op, stream, false)); }); state.set_throughput(size, sizeof(key_type) + sizeof(value_type)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark_utils::state&& state) override { do_run(std::forward(state)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_MERGE_SORT_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge_sort_block_merge.cpp000066400000000000000000000061361506507210100264210ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_merge_sort_block_merge.parallel.hpp" #include "benchmark_utils.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // HIP API #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(...) \ executor.queue_instance(device_merge_sort_block_merge_benchmark<__VA_ARGS__>()); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_int2 = common::custom_type; using custom_char_double = common::custom_type; using custom_longlong_double = common::custom_type; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_char_double) CREATE_BENCHMARK(custom_int2, custom_longlong_double) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge_sort_block_merge.parallel.cpp.in000066400000000000000000000027461506507210100306240ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_merge_sort_block_merge.parallel.hpp" #include "benchmark_utils.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_merge_sort_block_merge_benchmark_generator<@BlockSize@, @UseMergePath@, @KeyType@, @ValueType@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge_sort_block_merge.parallel.hpp000066400000000000000000000420371506507210100302210ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_MERGE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_MERGE_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include #include template std::string config_name() { const rocprim::detail::merge_sort_block_merge_config_params config = Config(); return "{oddeven_bs:" + std::to_string(config.merge_oddeven_config.block_size) + ",oddeven_ipt:" + std::to_string(config.merge_oddeven_config.items_per_thread) + ",oddeven_size_limit:" + std::to_string(config.merge_oddeven_config.size_limit) + ",mergepath_partition_bs:" + std::to_string(config.merge_mergepath_partition_config.block_size) + ",mergepath_bs:" + std::to_string(config.merge_mergepath_config.block_size) + ",mergepath_ipt:" + std::to_string(config.merge_mergepath_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_merge_sort_block_merge_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:merge_sort_block_merge,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } // Because merge_sort_block_merge expects partially sorted input: using block_sort_config = rocprim::default_config; // keys benchmark template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = 
get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys(size); HIP_CHECK(hipDeviceSynchronize()); ::rocprim::less lesser_op; rocprim::empty_type* values_ptr = nullptr; // Merge_sort_block_merge algorithm expects partially sorted input: unsigned int sorted_block_size; HIP_CHECK(rocprim::detail::merge_sort_block_sort(d_keys_input.get(), d_keys_input.get(), values_ptr, values_ptr, size, sorted_block_size, lesser_op, stream, false)); size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::detail::merge_sort_block_merge(nullptr, temporary_storage_bytes, d_keys.get(), values_ptr, size, sorted_block_size, lesser_op, stream, false)); common::device_ptr d_temporary_storage(temporary_storage_bytes); HIP_CHECK(hipDeviceSynchronize()); hipError_t err = rocprim::detail::merge_sort_block_merge(d_temporary_storage.get(), temporary_storage_bytes, d_keys.get(), values_ptr, size, sorted_block_size, lesser_op, stream, false); if(err == hipError_t::hipErrorAssert) { state.gbench_state.SkipWithError("SKIPPING: block_sort_items_per_block >= " "block_merge_items_per_block does not hold"); return; } else if(err != hipSuccess) { std::cout << "HIP error: " << err << " line: " << __LINE__ << std::endl; exit(err); } HIP_CHECK(hipDeviceSynchronize()); state.run_before_every_iteration( [&] { HIP_CHECK(hipMemcpyAsync(d_keys.get(), d_keys_input.get(), size * sizeof(key_type), hipMemcpyDeviceToDevice, stream)); }); state.run( [&] { HIP_CHECK(rocprim::detail::merge_sort_block_merge(d_temporary_storage.get(), temporary_storage_bytes, d_keys.get(), values_ptr, size, sorted_block_size, lesser_op, stream, false)); }); state.set_throughput(size, sizeof(key_type)); } // pairs benchmark template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; 
using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys(size); common::device_ptr d_values_input(values_input); common::device_ptr d_values(size); HIP_CHECK(hipDeviceSynchronize()); ::rocprim::less lesser_op; // Merge_sort_block_merge algorithm expects partially sorted input: unsigned int sorted_block_size; HIP_CHECK(rocprim::detail::merge_sort_block_sort(d_keys_input.get(), d_keys_input.get(), d_values_input.get(), d_values_input.get(), size, sorted_block_size, lesser_op, stream, false)); size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::detail::merge_sort_block_merge(nullptr, temporary_storage_bytes, d_keys.get(), d_values.get(), size, sorted_block_size, lesser_op, stream, false)); common::device_ptr d_temporary_storage(temporary_storage_bytes); HIP_CHECK(hipDeviceSynchronize()); hipError_t err = rocprim::detail::merge_sort_block_merge(d_temporary_storage.get(), temporary_storage_bytes, d_keys.get(), d_values.get(), size, sorted_block_size, lesser_op, stream, false); if(err == hipError_t::hipErrorAssert) { state.gbench_state.SkipWithError("SKIPPING: block_sort_items_per_block >= " "block_merge_items_per_block does not hold"); return; } else if(err != hipSuccess) { std::cout << "HIP error: " << err << " line: " << __LINE__ << std::endl; exit(err); } HIP_CHECK(hipDeviceSynchronize()); state.run_before_every_iteration( [&] { HIP_CHECK(hipMemcpyAsync(d_keys.get(), d_keys_input.get(), size * sizeof(key_type), hipMemcpyDeviceToDevice, stream)); HIP_CHECK(hipMemcpyAsync(d_values.get(), d_values_input.get(), size * sizeof(value_type), hipMemcpyDeviceToDevice, stream)); }); state.run( [&] { 
HIP_CHECK(rocprim::detail::merge_sort_block_merge(d_temporary_storage.get(), temporary_storage_bytes, d_keys.get(), d_values.get(), size, sorted_block_size, lesser_op, stream, false)); }); state.set_throughput(size, sizeof(key_type)); } void run(benchmark_utils::state&& state) override { do_run(std::forward(state)); } }; template struct device_merge_sort_block_merge_benchmark_generator { static constexpr unsigned int get_limit() { return use_mergepath ? 0 : UINT32_MAX; } template struct create_ipt { static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExponent; using generated_config = rocprim::detail::merge_sort_block_merge_config; using benchmark_struct = device_merge_sort_block_merge_benchmark; void operator()(std::vector>& storage) { storage.emplace_back(std::make_unique()); } }; static void create(std::vector>& storage) { static constexpr unsigned int min_items_per_thread_exponent = 0u; // Very large block sizes don't work with large items_per_thread since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = sizeof(Key) + sizeof(Value); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * max_size_per_element); static constexpr unsigned int max_items_per_thread_exponent = rocprim::Log2::VALUE - 1; static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_MERGE_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge_sort_block_sort.cpp000066400000000000000000000063161506507210100263110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_merge_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // HIP API #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(...) 
\ executor.queue_instance(device_merge_sort_block_sort_benchmark<__VA_ARGS__>()); /* Entry point of the merge_sort_block_sort benchmark: constructs the executor from the command line, queues the benchmark instances listed below (none when BENCHMARK_CONFIG_TUNING is defined) and runs them. NOTE(review): the meaning of the trailing executor arguments (128 MiB, 10, 5) is defined by benchmark_utils::executor, which is not visible here. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING /* Instantiations with a single type argument. */ CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) /* NOTE(review): the template arguments of the common::custom_type aliases below are missing from this copy of the file. */ using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_int2 = common::custom_type; using custom_char_double = common::custom_type; using custom_longlong_double = common::custom_type; using custom_char_short = common::custom_type; /* Instantiations with two type arguments. */ CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_char_double) CREATE_BENCHMARK(custom_int2, custom_longlong_double) CREATE_BENCHMARK(int, custom_char_short) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge_sort_block_sort.parallel.cpp.in000066400000000000000000000027471506507210100305150ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_merge_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" #include #include /* Namespace-scope variable whose initializer passes the generator's create function to executor::queue_autotune; the variable itself is otherwise unused. The @BlockSize@, @BlockSortMethod@, @KeyType@ and @ValueType@ placeholders are filled in when this .in template is instantiated (presumably by the build system -- confirm). */ namespace { auto unused = benchmark_utils::executor::queue_autotune( device_merge_sort_block_sort_benchmark_generator<@BlockSize@, @BlockSortMethod@, @KeyType@, @ValueType@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp000066400000000000000000000235421506507210100301110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include #include #include /* Maps a rocprim::block_sort_algorithm value to its printable name; a value not handled by the switch yields "unknown_algorithm". */ constexpr const char* get_block_sort_method_name(rocprim::block_sort_algorithm alg) { switch(alg) { case rocprim::block_sort_algorithm::merge_sort: return "merge_sort"; case rocprim::block_sort_algorithm::bitonic_sort: return "bitonic_sort"; case rocprim::block_sort_algorithm::stable_merge_sort: return "stable_merge_sort"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } /* Formats the block size and items per thread of Config as "{bs:<block_size>,ipt:<items_per_thread>}". */ template std::string config_name() { const rocprim::detail::merge_sort_block_sort_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; } /* Specialization returning the fixed label "default_config". NOTE(review): the specialization argument is missing from this copy of the file; presumably rocprim::default_config -- confirm. */ template<> inline std::string config_name() { return "default_config"; } /* Benchmark of rocprim::detail::merge_sort_block_sort exposed through benchmark_utils::autotune_interface. NOTE(review): the template parameter list is missing from this copy; Key, Value and Config are presumably its parameters -- confirm. */ template struct device_merge_sort_block_sort_benchmark : public benchmark_utils::autotune_interface { /* Builds the benchmark identifier from the key type, value type and configuration names. */ std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:merge_sort_block_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } // keys benchmark /* Sorts keys only: both value arguments are a null rocprim::empty_type pointer, and throughput is reported as sizeof(key_type) per element. */ template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size,
common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys_output(size); ::rocprim::less lesser_op; rocprim::empty_type* values_ptr = nullptr; /* Passed by name to merge_sort_block_sort; its value is not read afterwards in this function. */ unsigned int items_per_block; state.run( [&] { HIP_CHECK(rocprim::detail::merge_sort_block_sort(d_keys_input.get(), d_keys_output.get(), values_ptr, values_ptr, size, items_per_block, lesser_op, stream, false)); }); state.set_throughput(size, sizeof(key_type)); } // pairs benchmark /* Sorts key/value pairs: values are 0, 1, 2, ... produced by std::iota, and throughput is reported as sizeof(key_type) + sizeof(value_type) per element. */ template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys_output(size); common::device_ptr d_values_input(values_input); common::device_ptr d_values_output(size); ::rocprim::less lesser_op; unsigned int items_per_block; HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(rocprim::detail::merge_sort_block_sort(d_keys_input.get(), d_keys_output.get(), d_values_input.get(), d_values_output.get(), size, items_per_block, lesser_op, stream, false)); }); state.set_throughput(size, sizeof(key_type) + sizeof(value_type)); } /* autotune_interface entry point: forwards the state to do_run. */ void run(benchmark_utils::state&& state) override { do_run(std::forward(state)); } }; /* Autotune generator for the block-sort benchmark: create() appends benchmark instances to `storage`, each using an items_per_thread value that is a power of two. NOTE(review): template parameter list missing from this copy; BlockSize, Key and Value are presumably among its parameters -- confirm. */ template struct device_merge_sort_block_sort_benchmark_generator { /* Functor for one exponent: appends a benchmark whose items_per_thread equals 2^ItemsPerThreadExponent. */ template struct create_ipt { static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExponent; using generated_config = rocprim::detail::merge_sort_block_sort_config; void operator()(std::vector>& storage) {
storage.emplace_back( std::make_unique< device_merge_sort_block_sort_benchmark>()); } }; /* Computes the minimum and maximum items_per_thread exponents and hands `storage` to static_for_each together with create_ipt. */ static void create(std::vector>& storage) { // Sort_items_per_block must be equal or larger than merge_items_per_block, so make // the items_per_thread at least as large so the sort_items_per_block // would be at least 1024. static constexpr unsigned int min_items_per_thread_exponent = rocprim::Log2<(1024 / BlockSize)>::VALUE; // Very large block sizes don't work with large items_per_blocks since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = std::max(sizeof(Key) + sizeof(unsigned int), sizeof(Value)); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * max_size_per_element); static constexpr unsigned int max_items_per_thread_exponent = rocprim::Log2::VALUE - 1; static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_SORT_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_nth_element.cpp000066400000000000000000000054251506507210100242240ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software.
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_nth_element.hpp" #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" // HIP API #include #include #include #include #include #include /* Queues one nth_element benchmark on `executor`; SMALL_N is passed to the benchmark constructor. */ #define CREATE_BENCHMARK_NTH_ELEMENT(TYPE, SMALL_N) \ executor.queue_instance(device_nth_element_benchmark(SMALL_N)); /* Queues both the SMALL_N = true and SMALL_N = false variants for TYPE. */ #define CREATE_BENCHMARK(TYPE) \ { \ CREATE_BENCHMARK_NTH_ELEMENT(TYPE, true) CREATE_BENCHMARK_NTH_ELEMENT(TYPE, false) \ } /* Entry point of the nth_element benchmark: constructs the executor from the command line, queues the benchmarks for each listed type and runs them. NOTE(review): the meaning of the trailing executor arguments (128 MiB, 10, 5) is defined by benchmark_utils::executor, which is not visible here. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(float) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) /* NOTE(review): the template arguments of the common::custom_type aliases below are missing from this copy of the file. */ using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_int2 = common::custom_type; using custom_char_double = common::custom_type; using custom_longlong_double = common::custom_type; CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_char_double) CREATE_BENCHMARK(custom_longlong_double) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_nth_element.hpp000066400000000000000000000106501506507210100242250ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_NTH_ELEMENT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_NTH_ELEMENT_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include /* Benchmark of rocprim::nth_element. When small_n is true the selected position is 10, otherwise it is size / 2. NOTE(review): the template parameter list is missing from this copy of the file; Key is presumably a template parameter -- confirm. */ template struct device_nth_element_benchmark : public benchmark_utils::autotune_interface { bool small_n = false; /* Stores the SmallN flag that selects between the two nth positions. */ device_nth_element_benchmark(bool SmallN) { small_n = SmallN; } /* Builds the benchmark identifier; the nth field is "small" or "large" depending on small_n. */ std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:nth_element,nth:" + (small_n ?
"small"s : "large"s) + ",key_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } /* Generates size = bytes / sizeof(key_type) random keys, calls nth_element once with a null storage pointer so that temporary_storage_bytes can be used to size the temporary buffer, then times nth_element and reports sizeof(key_type) bytes per element. */ void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); size_t nth = 10; if(!small_n) { nth = size / 2; } // Generate data std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys_output(size); ::rocprim::less lesser_op; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::nth_element(nullptr, temporary_storage_bytes, d_keys_input.get(), d_keys_output.get(), nth, size, lesser_op, stream, false)); common::device_ptr d_temporary_storage(temporary_storage_bytes); state.run( [&] { HIP_CHECK(rocprim::nth_element(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input.get(), d_keys_output.get(), nth, size, lesser_op, stream, false)); }); state.set_throughput(size, sizeof(key_type)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_NTH_ELEMENT_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_partial_sort.cpp000066400000000000000000000054371506507210100244300ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_partial_sort.hpp" #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" // HIP API #include #include #include #include #include #include /* Queues one partial_sort benchmark on `executor`; SMALL_N is passed to the benchmark constructor. */ #define CREATE_BENCHMARK_PARTIAL_SORT(TYPE, SMALL_N) \ executor.queue_instance(device_partial_sort_benchmark(SMALL_N)); /* Queues both the SMALL_N = true and SMALL_N = false variants for TYPE. */ #define CREATE_BENCHMARK(TYPE) \ { \ CREATE_BENCHMARK_PARTIAL_SORT(TYPE, true) CREATE_BENCHMARK_PARTIAL_SORT(TYPE, false) \ } /* Entry point of the partial_sort benchmark: constructs the executor from the command line, queues the benchmarks for each listed type and runs them. NOTE(review): the meaning of the trailing executor arguments (128 MiB, 10, 5) is defined by benchmark_utils::executor, which is not visible here. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(float) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) /* NOTE(review): the template arguments of the common::custom_type aliases below are missing from this copy of the file. */ using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_int2 = common::custom_type; using custom_char_double = common::custom_type; using custom_longlong_double = common::custom_type; CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_char_double) CREATE_BENCHMARK(custom_longlong_double) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_partial_sort.hpp000066400000000000000000000121461506507210100244300ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include /* Benchmark of rocprim::partial_sort. When small_n is true the `middle` position is 10, otherwise it is size / 2. NOTE(review): the template parameter list is missing from this copy of the file; Key is presumably a template parameter -- confirm. */ template struct device_partial_sort_benchmark : public benchmark_utils::autotune_interface { bool small_n = false; /* Stores the SmallN flag that selects between the two `middle` positions. */ device_partial_sort_benchmark(bool SmallN) { small_n = SmallN; } /* Builds the benchmark identifier; the nth field is "small" or "half" depending on small_n. */ std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:partial_sort,nth:" + (small_n ?
"small"s : "half"s) + ",key_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } /* Generates size = bytes / sizeof(key_type) random keys and uploads them to d_keys_new_data, calls partial_sort once with a null storage pointer so that temporary_storage_bytes can be used to size the temporary buffer, copies d_keys_new_data into d_keys_input before every iteration, times partial_sort on d_keys_input and finally frees the three device buffers. NOTE(review): this block manages device memory with raw hipMalloc/hipFree and restores the input with hipMemcpy, whereas the other benchmarks in this archive use common::device_ptr -- consider aligning. */ void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); size_t middle = 10; if(!small_n) { middle = size / 2; } // Generate data std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_new_data; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(*d_keys_input))); HIP_CHECK(hipMalloc(&d_keys_new_data, size * sizeof(*d_keys_new_data))); HIP_CHECK(hipMemcpy(d_keys_new_data, keys_input.data(), size * sizeof(*d_keys_input), hipMemcpyHostToDevice)); rocprim::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::partial_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, middle, size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); state.run_before_every_iteration( [&] { HIP_CHECK(hipMemcpy(d_keys_input, d_keys_new_data, size * sizeof(*d_keys_input), hipMemcpyDeviceToDevice)); }); state.run( [&] { HIP_CHECK(rocprim::partial_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, middle, size, lesser_op, stream, false)); }); state.set_throughput(size, sizeof(key_type)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_new_data)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_partial_sort_copy.cpp000066400000000000000000000053701506507210100254560ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_partial_sort_copy.hpp" #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" // HIP API #include #include #include #include #include #include /* Queues one partial_sort_copy benchmark on `executor`; SMALL_N is passed to the benchmark constructor. */ #define CREATE_BENCHMARK_PARTIAL_SORT_COPY(TYPE, SMALL_N) \ executor.queue_instance(device_partial_sort_copy_benchmark(SMALL_N)); /* Queues both the SMALL_N = true and SMALL_N = false variants for TYPE. */ #define CREATE_BENCHMARK(TYPE) \ { \ CREATE_BENCHMARK_PARTIAL_SORT_COPY(TYPE, true) \ CREATE_BENCHMARK_PARTIAL_SORT_COPY(TYPE, false) \ } /* Entry point of the partial_sort_copy benchmark: constructs the executor from the command line, queues the benchmarks for each listed type and runs them. NOTE(review): the meaning of the trailing executor arguments (128 MiB, 10, 5) is defined by benchmark_utils::executor, which is not visible here. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(float) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) /* NOTE(review): the template arguments of the common::custom_type aliases below are missing from this copy of the file. */ using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_int2 = common::custom_type; using custom_char_double = common::custom_type; using custom_longlong_double = common::custom_type; CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_char_double) CREATE_BENCHMARK(custom_longlong_double) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_partial_sort_copy.hpp000066400000000000000000000112121506507210100254530ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_COPY_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_COPY_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include /* Benchmark of rocprim::partial_sort_copy. When small_n is true the `middle` position is 10, otherwise it is size / 2. NOTE(review): the template parameter list is missing from this copy of the file; Key is presumably a template parameter -- confirm. */ template struct device_partial_sort_copy_benchmark : public benchmark_utils::autotune_interface { bool small_n = false; /* Stores the SmallN flag that selects between the two `middle` positions. */ device_partial_sort_copy_benchmark(bool SmallN) { small_n = SmallN; } /* Builds the benchmark identifier; the nth field is "small" or "half" depending on small_n. */ std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:partial_sort_copy,nth:" + (small_n ?
"small"s : "half"s) + ",key_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } /* Generates size = bytes / sizeof(key_type) random keys, calls partial_sort_copy once with the still-empty d_temporary_storage so that temporary_storage_bytes can be used to resize it, then times partial_sort_copy from d_keys_input into d_keys_output and reports sizeof(key_type) bytes per element. */ void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); size_t middle = 10; if(!small_n) { middle = size / 2; } // Generate data std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys_output(size); rocprim::less lesser_op; common::device_ptr d_temporary_storage; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::partial_sort_copy(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input.get(), d_keys_output.get(), middle, size, lesser_op, stream, false)); d_temporary_storage.resize(temporary_storage_bytes); state.run( [&] { HIP_CHECK(rocprim::partial_sort_copy(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input.get(), d_keys_output.get(), middle, size, lesser_op, stream, false)); }); state.set_throughput(size, sizeof(key_type)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_PARTIAL_SORT_COPY_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_partition.cpp000066400000000000000000000163541506507210100237360ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_partition.parallel.hpp" #include "benchmark_utils.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // HIP API #include // rocPRIM #ifndef BENCHMARK_CONFIG_TUNING #include #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif /* CREATE_PARTITION_*_BENCHMARK: each macro queues one benchmark instance of the named kind on `executor`. NOTE(review): the template argument lists of the benchmark types are missing from this copy of the file, so how T, F and p are used cannot be seen here. */ #define CREATE_PARTITION_FLAG_BENCHMARK(T, F, p) \ executor.queue_instance(device_partition_flag_benchmark()); #define CREATE_PARTITION_PREDICATE_BENCHMARK(T, p) \ executor.queue_instance(device_partition_predicate_benchmark()); #define CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(T, F, p) \ executor.queue_instance( \ device_partition_two_way_flag_benchmark()); #define CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(T, p) \ executor.queue_instance( \ device_partition_two_way_predicate_benchmark()); #define CREATE_PARTITION_THREE_WAY_BENCHMARK(T, p) \ executor.queue_instance(device_partition_three_way_benchmark()); /* BENCHMARK_*_TYPE: each macro expands to four CREATE_* invocations, one per probability enumerator listed in its body. */ #define BENCHMARK_FLAG_TYPE(type, flag_type) \ CREATE_PARTITION_FLAG_BENCHMARK(type, flag_type, partition_probability::p005) \ CREATE_PARTITION_FLAG_BENCHMARK(type, flag_type, partition_probability::p025) \ CREATE_PARTITION_FLAG_BENCHMARK(type, flag_type, partition_probability::p050) \ CREATE_PARTITION_FLAG_BENCHMARK(type, flag_type, partition_probability::p075) #define BENCHMARK_PREDICATE_TYPE(type) \ CREATE_PARTITION_PREDICATE_BENCHMARK(type, partition_probability::p005) \ CREATE_PARTITION_PREDICATE_BENCHMARK(type, partition_probability::p025) \ CREATE_PARTITION_PREDICATE_BENCHMARK(type, partition_probability::p050) \ CREATE_PARTITION_PREDICATE_BENCHMARK(type, partition_probability::p075) #define BENCHMARK_TWO_WAY_FLAG_TYPE(type, flag_type) \ CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(type, flag_type, partition_probability::p005) \ CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(type, flag_type, partition_probability::p025) \ CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(type, flag_type, partition_probability::p050) \
CREATE_PARTITION_TWO_WAY_FLAG_BENCHMARK(type, flag_type, partition_probability::p075) #define BENCHMARK_TWO_WAY_PREDICATE_TYPE(type) \ CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(type, partition_probability::p005) \ CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(type, partition_probability::p025) \ CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(type, partition_probability::p050) \ CREATE_PARTITION_TWO_WAY_PREDICATE_BENCHMARK(type, partition_probability::p075) #define BENCHMARK_THREE_WAY_TYPE(type) \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, partition_three_way_probability::p005_p025) \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, partition_three_way_probability::p025_p050) \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, partition_three_way_probability::p050_p075) \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, partition_three_way_probability::p075_p100) /* Entry point of the partition benchmark: constructs the executor from the command line, queues the flag, predicate, two-way and three-way benchmarks for each listed type (none when BENCHMARK_CONFIG_TUNING is defined) and runs them. NOTE(review): the meaning of the trailing executor arguments (128 MiB, 10, 5) is defined by benchmark_utils::executor, which is not visible here. */ int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_double2 = common::custom_type; using custom_int_double = common::custom_type; using huge_float2 = common::custom_huge_type<1024, float, float>; BENCHMARK_FLAG_TYPE(int, unsigned char) BENCHMARK_FLAG_TYPE(float, unsigned char) BENCHMARK_FLAG_TYPE(double, unsigned char) BENCHMARK_FLAG_TYPE(uint8_t, uint8_t) BENCHMARK_FLAG_TYPE(int8_t, int8_t) BENCHMARK_FLAG_TYPE(rocprim::half, int8_t) BENCHMARK_FLAG_TYPE(custom_double2, unsigned char) BENCHMARK_FLAG_TYPE(rocprim::int128_t, int8_t) BENCHMARK_FLAG_TYPE(rocprim::uint128_t, uint8_t) BENCHMARK_FLAG_TYPE(huge_float2, uint8_t) BENCHMARK_PREDICATE_TYPE(int) BENCHMARK_PREDICATE_TYPE(float) BENCHMARK_PREDICATE_TYPE(double) BENCHMARK_PREDICATE_TYPE(uint8_t) BENCHMARK_PREDICATE_TYPE(int8_t) BENCHMARK_PREDICATE_TYPE(rocprim::half) BENCHMARK_PREDICATE_TYPE(custom_int_double) BENCHMARK_PREDICATE_TYPE(rocprim::int128_t) BENCHMARK_PREDICATE_TYPE(rocprim::uint128_t) BENCHMARK_PREDICATE_TYPE(huge_float2) BENCHMARK_TWO_WAY_FLAG_TYPE(int,
unsigned char) BENCHMARK_TWO_WAY_FLAG_TYPE(float, unsigned char) BENCHMARK_TWO_WAY_FLAG_TYPE(double, unsigned char) BENCHMARK_TWO_WAY_FLAG_TYPE(uint8_t, uint8_t) BENCHMARK_TWO_WAY_FLAG_TYPE(int8_t, int8_t) BENCHMARK_TWO_WAY_FLAG_TYPE(rocprim::half, int8_t) BENCHMARK_TWO_WAY_FLAG_TYPE(custom_double2, unsigned char) BENCHMARK_TWO_WAY_FLAG_TYPE(rocprim::int128_t, int8_t) BENCHMARK_TWO_WAY_FLAG_TYPE(rocprim::uint128_t, uint8_t) BENCHMARK_TWO_WAY_FLAG_TYPE(huge_float2, uint8_t) BENCHMARK_TWO_WAY_PREDICATE_TYPE(int) BENCHMARK_TWO_WAY_PREDICATE_TYPE(float) BENCHMARK_TWO_WAY_PREDICATE_TYPE(double) BENCHMARK_TWO_WAY_PREDICATE_TYPE(uint8_t) BENCHMARK_TWO_WAY_PREDICATE_TYPE(int8_t) BENCHMARK_TWO_WAY_PREDICATE_TYPE(rocprim::half) BENCHMARK_TWO_WAY_PREDICATE_TYPE(custom_int_double) BENCHMARK_TWO_WAY_PREDICATE_TYPE(rocprim::int128_t) BENCHMARK_TWO_WAY_PREDICATE_TYPE(rocprim::uint128_t) BENCHMARK_TWO_WAY_PREDICATE_TYPE(huge_float2) BENCHMARK_THREE_WAY_TYPE(int) BENCHMARK_THREE_WAY_TYPE(float) BENCHMARK_THREE_WAY_TYPE(double) BENCHMARK_THREE_WAY_TYPE(uint8_t) BENCHMARK_THREE_WAY_TYPE(int8_t) BENCHMARK_THREE_WAY_TYPE(rocprim::half) BENCHMARK_THREE_WAY_TYPE(custom_int_double) BENCHMARK_THREE_WAY_TYPE(rocprim::int128_t) BENCHMARK_THREE_WAY_TYPE(rocprim::uint128_t) BENCHMARK_THREE_WAY_TYPE(huge_float2) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_partition.parallel.cpp.in000066400000000000000000000026711506507210100261330ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_partition.parallel.hpp" #include "benchmark_utils.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_partition_benchmark_generator<@DataType@, @BlockSize@>::create); } // namespace rocPRIM-rocm-7.1.0/benchmark/benchmark_device_partition.parallel.hpp000066400000000000000000000602371506507210100255350ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_PARTITION_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_PARTITION_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" #include "cmdparser.hpp" #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #endif // BENCHMARK_CONFIG_TUNING #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #include #endif // BENCHMARK_CONFIG_TUNING enum class partition_probability { p005, p025, p050, p075, tuning }; inline float get_probability(partition_probability probability) { switch(probability) { case partition_probability::p005: return 0.05f; case partition_probability::p025: return 0.25f; case partition_probability::p050: return 0.50f; case partition_probability::p075: return 0.75f; case partition_probability::tuning: return 0.0f; // not used } return 0.0f; } inline const char* get_probability_name(partition_probability probability) { switch(probability) { case partition_probability::p005: return "0.05"; case partition_probability::p025: return "0.25"; case partition_probability::p050: return "0.50"; case partition_probability::p075: return "0.75"; case partition_probability::tuning: return "tuning"; } return "invalid"; } enum class partition_three_way_probability { p005_p025, p025_p050, p050_p075, p075_p100, tuning }; inline std::pair get_probability(partition_three_way_probability probability) { switch(probability) { case partition_three_way_probability::p005_p025: return std::make_pair(0.05f, 0.25f); case partition_three_way_probability::p025_p050: return std::make_pair(0.25f, 0.50f); case partition_three_way_probability::p050_p075: return std::make_pair(0.50f, 0.75f); case partition_three_way_probability::p075_p100: return std::make_pair(0.75f, 1.00f); case partition_three_way_probability::tuning: return std::make_pair(0.00f, 0.00f); // not used } return std::make_pair(0.00f, 0.00f); } inline const char* 
get_probability_name(partition_three_way_probability probability) { switch(probability) { case partition_three_way_probability::p005_p025: return "0.05:0.25"; case partition_three_way_probability::p025_p050: return "0.25:0.50"; case partition_three_way_probability::p050_p075: return "0.50:0.75"; case partition_three_way_probability::p075_p100: return "0.75:1.00"; case partition_three_way_probability::tuning: return "tuning"; } return "invalid"; } template struct device_partition_flag_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:partition,subalgo:flag,data_type:" + std::string(Traits::name()) + ",flag_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector flags_0; std::vector flags_1; std::vector flags_2; if(is_tuning) { flags_0 = get_random_data01(size, 0.0f, seed.get_1()); flags_1 = get_random_data01(size, 0.5f, seed.get_1()); flags_2 = get_random_data01(size, 1.0f, seed.get_1()); } else { flags_0 = get_random_data01(size, get_probability(Probability), seed.get_1()); } common::device_ptr d_input(input); common::device_ptr d_flags_0(flags_0); common::device_ptr d_flags_1; common::device_ptr d_flags_2; if(is_tuning) { d_flags_1.store(flags_1); d_flags_2.store(flags_2); } common::device_ptr d_output(size); common::device_ptr d_selected_count_output(1); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](FlagType* d_flags) { 
HIP_CHECK(rocprim::partition(d_temp_storage, temp_storage_size_bytes, d_input.get(), d_flags, d_output.get(), d_selected_count_output.get(), size, stream)); }; dispatch_flags(d_flags_0.get()); if(is_tuning) { dispatch_flags(d_flags_1.get()); dispatch_flags(d_flags_2.get()); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(DataType)); } static constexpr bool is_tuning = Probability == partition_probability::tuning; }; template struct device_partition_predicate_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:partition,subalgo:predicate,data_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(DataType); // all data types can represent [0, 127], -1 so a predicate can select all std::vector input = get_random_data(size, static_cast(0), static_cast(126), seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(size); common::device_ptr d_selected_count_output(1); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_predicate = [&](float probability) { auto predicate = [probability](const DataType& value) -> bool { return value < static_cast(127 * probability); }; HIP_CHECK(rocprim::partition(d_temp_storage, temp_storage_size_bytes, d_input.get(), d_output.get(), d_selected_count_output.get(), size, predicate, stream)); }; if(is_tuning) { 
dispatch_predicate(0.0f); dispatch_predicate(0.5f); dispatch_predicate(1.0f); } else { dispatch_predicate(get_probability(Probability)); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(DataType)); } static constexpr bool is_tuning = Probability == partition_probability::tuning; }; template struct device_partition_two_way_flag_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:partition_two_way,subalgo:flag,data_type:" + std::string(Traits::name()) + ",flag_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector flags_0; std::vector flags_1; std::vector flags_2; if(is_tuning) { flags_0 = get_random_data01(size, 0.0f, seed.get_1()); flags_1 = get_random_data01(size, 0.5f, seed.get_1()); flags_2 = get_random_data01(size, 1.0f, seed.get_1()); } else { flags_0 = get_random_data01(size, get_probability(Probability), seed.get_1()); } common::device_ptr d_input(input); common::device_ptr d_flags_0(flags_0); common::device_ptr d_flags_1; common::device_ptr d_flags_2; if(is_tuning) { d_flags_1.store(flags_1); d_flags_2.store(flags_2); } common::device_ptr d_output_selected(size); common::device_ptr d_output_rejected(size); 
common::device_ptr d_selected_count_output(1); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](FlagType* d_flags) { HIP_CHECK(rocprim::partition_two_way(d_temp_storage, temp_storage_size_bytes, d_input.get(), d_flags, d_output_selected.get(), d_output_rejected.get(), d_selected_count_output.get(), size, stream)); }; dispatch_flags(d_flags_0.get()); if(is_tuning) { dispatch_flags(d_flags_1.get()); dispatch_flags(d_flags_2.get()); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes = 0; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(DataType)); } static constexpr bool is_tuning = Probability == partition_probability::tuning; }; template struct device_partition_two_way_predicate_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:partition_two_way,subalgo:predicate,data_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(DataType); // all data types can represent [0, 127], -1 so a predicate can select all std::vector input = get_random_data(size, static_cast(0), static_cast(126), seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output_selected(size); common::device_ptr d_output_rejected(size); common::device_ptr d_selected_count_output(1); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_predicate = [&](float 
probability) { auto predicate = [probability](const DataType& value) -> bool { return value < static_cast(127 * probability); }; HIP_CHECK(rocprim::partition_two_way(d_temp_storage, temp_storage_size_bytes, d_input.get(), d_output_selected.get(), d_output_rejected.get(), d_selected_count_output.get(), size, predicate, stream)); }; if(is_tuning) { dispatch_predicate(0.0f); dispatch_predicate(0.5f); dispatch_predicate(1.0f); } else { dispatch_predicate(get_probability(Probability)); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(DataType)); } static constexpr bool is_tuning = Probability == partition_probability::tuning; }; template struct device_partition_three_way_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:partition_three_way,data_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(DataType); // all data types can represent [0, 127], -1 so a predicate can select all std::vector input = get_random_data(size, static_cast(0), static_cast(126), seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output_first(size); common::device_ptr d_output_second(size); common::device_ptr d_output_unselected(size); common::device_ptr d_selected_count_output(2); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_predicate = [&](std::pair 
probability) { const float probability_one = probability.first; auto predicate_one = [probability_one](const DataType& value) { return value < DataType(127 * probability_one); }; const float probability_two = probability.second; auto predicate_two = [probability_two](const DataType& value) { return value < DataType(127 * probability_two); }; HIP_CHECK(rocprim::partition_three_way(d_temp_storage, temp_storage_size_bytes, d_input.get(), d_output_first.get(), d_output_second.get(), d_output_unselected.get(), d_selected_count_output.get(), size, predicate_one, predicate_two, stream)); }; if(is_tuning) { // clang-format off std::array, 7> probabilities = {{ {0.33f, 0.66f}, // 1st, 2nd, and 3rd bin {0.50f, 1.00f}, // 1st and 2nd bin {0.00f, 0.50f}, // 2nd and 3rd bin {0.50f, 0.50f}, // 1st and 3rd bin {1.00f, 1.00f}, // 1st bin {0.00f, 1.00f}, // 2nd bin {0.00f, 0.00f}}}; // 3rd bin // clang-format on for(const std::pair& probability : probabilities) { dispatch_predicate(probability); } } else { dispatch_predicate(get_probability(Probability)); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(DataType)); } static constexpr bool is_tuning = Probability == partition_three_way_probability::tuning; }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_partition_benchmark_generator { template struct create_ipt { void operator()(std::vector>& storage) { using config = rocprim::select_config; storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); } }; static void create(std::vector>& storage) { static constexpr int max_items_per_thread = std::min(64 / sizeof(DataType), 
size_t{32}); static_for_each, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_PARTITION_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_radix_sort.cpp000066400000000000000000000062641506507210100241020ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_radix_sort.hpp" #include "benchmark_utils.hpp" // HIP API #include #include #include #include #define CREATE_RADIX_SORT_BENCHMARK(...) 
\ executor.queue_instance(device_radix_sort_benchmark<__VA_ARGS__>()); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); using custom_key = common::custom_type; CREATE_RADIX_SORT_BENCHMARK(int) CREATE_RADIX_SORT_BENCHMARK(float) CREATE_RADIX_SORT_BENCHMARK(long long) CREATE_RADIX_SORT_BENCHMARK(int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half) CREATE_RADIX_SORT_BENCHMARK(short) CREATE_RADIX_SORT_BENCHMARK(custom_key) CREATE_RADIX_SORT_BENCHMARK(rocprim::int128_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::uint128_t) using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_key = common::custom_type; CREATE_RADIX_SORT_BENCHMARK(int, float) CREATE_RADIX_SORT_BENCHMARK(int, double) CREATE_RADIX_SORT_BENCHMARK(int, float2) CREATE_RADIX_SORT_BENCHMARK(int, custom_float2) CREATE_RADIX_SORT_BENCHMARK(int, double2) CREATE_RADIX_SORT_BENCHMARK(int, custom_double2) CREATE_RADIX_SORT_BENCHMARK(long long, float) CREATE_RADIX_SORT_BENCHMARK(long long, double) CREATE_RADIX_SORT_BENCHMARK(long long, float2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_float2) CREATE_RADIX_SORT_BENCHMARK(long long, double2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_double2) CREATE_RADIX_SORT_BENCHMARK(int8_t, int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t, uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half, rocprim::half) CREATE_RADIX_SORT_BENCHMARK(custom_key, double) CREATE_RADIX_SORT_BENCHMARK(rocprim::int128_t, rocprim::int128_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_radix_sort.hpp000066400000000000000000000323601506507210100241030ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_data_generation.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include template struct device_radix_sort_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:radix_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg: default_config}"); } // keys benchmark template auto do_run(benchmark_utils::state&& state) const -> std::enable_if_t::value, void> { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, static_cast(nullptr), static_cast(nullptr), size, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, static_cast(nullptr), static_cast(nullptr), size, stream)); }); state.set_throughput(size, sizeof(key_type)); 
HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark_utils::state&& state) const -> std::enable_if_t::value, void> { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector values_input(size); for(size_t i = 0; i < size; ++i) { values_input[i] = value_type(i); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, stream)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(invoke_radix_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, stream)); }); state.set_throughput(size, sizeof(key_type) + sizeof(value_type)); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); 
HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark_utils::state&& state) override { do_run(std::forward(state)); } private: template static auto invoke_radix_sort(void* d_temporary_storage, size_t& temp_storage_bytes, K* keys_input, K* keys_output, V* values_input, V* values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value && std::is_same::value, hipError_t> { (void)values_input; (void)values_output; return rocprim::radix_sort_keys(d_temporary_storage, temp_storage_bytes, keys_input, keys_output, size, 0, sizeof(K) * 8, stream); } template static auto invoke_radix_sort(void* d_temporary_storage, size_t& temp_storage_bytes, K* keys_input, K* keys_output, V* values_input, V* values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value && std::is_same::value, hipError_t> { (void)values_input; (void)values_output; return rocprim::radix_sort_keys(d_temporary_storage, temp_storage_bytes, keys_input, keys_output, size, custom_type_decomposer{}, stream); } template static auto invoke_radix_sort(void* d_temporary_storage, size_t& temp_storage_bytes, K* keys_input, K* keys_output, V* values_input, V* values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value && !std::is_same::value, hipError_t> { return rocprim::radix_sort_pairs(d_temporary_storage, temp_storage_bytes, keys_input, keys_output, values_input, values_output, size, 0, sizeof(K) * 8, stream); } template static auto invoke_radix_sort(void* d_temporary_storage, size_t& temp_storage_bytes, K* keys_input, K* keys_output, V* values_input, V* values_output, size_t size, hipStream_t stream) -> std::enable_if_t::value && !std::is_same::value, hipError_t> { return rocprim::radix_sort_pairs(d_temporary_storage, temp_storage_bytes, keys_input, keys_output, values_input, values_output, size, custom_type_decomposer{}, stream); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_PARALLEL_HPP_ 
rocPRIM-rocm-7.1.0/benchmark/benchmark_device_radix_sort_block_sort.cpp000066400000000000000000000054321506507210100263170ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. // HIP API #include #include "benchmark_device_radix_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif #ifndef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(...) 
\ executor.queue_instance(device_radix_sort_block_sort_benchmark<__VA_ARGS__>()); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_char_double = common::custom_type; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(int, custom_char_double) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(rocprim::int128_t, rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_radix_sort_block_sort.parallel.cpp.in000066400000000000000000000027241506507210100305200ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_radix_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_radix_sort_block_sort_benchmark_generator<@BlockSize@, @KeyType@, @ValueType@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_radix_sort_block_sort.parallel.hpp000066400000000000000000000177411506507210100301250ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_RADIX_SORT_BLOCK_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_RADIX_SORT_BLOCK_SORT_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include template std::string config_name() { const rocprim::detail::kernel_config_params config = Config(); return "{bs:" + std::to_string(config.block_size) + ",ipt:" + std::to_string(config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_radix_sort_block_sort_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:radix_sort_block_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } // keys benchmark template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys_output(size); rocprim::empty_type* values_ptr = nullptr; unsigned int items_per_block; state.run( [&] { 
HIP_CHECK((rocprim::detail::radix_sort_block_sort( d_keys_input.get(), d_keys_output.get(), values_ptr, values_ptr, size, items_per_block, rocprim::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); }); state.set_throughput(size, sizeof(key_type)); } // pairs benchmark template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); // Generate data std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector values_input(size); for(size_t i = 0; i < size; ++i) { values_input[i] = value_type(i); } common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys_output(size); common::device_ptr d_values_input(values_input); common::device_ptr d_values_output(size); unsigned int items_per_block; HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK((rocprim::detail::radix_sort_block_sort( d_keys_input.get(), d_keys_output.get(), d_values_input.get(), d_values_output.get(), size, items_per_block, rocprim::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false))); }); state.set_throughput(size, sizeof(key_type) + sizeof(value_type)); } void run(benchmark_utils::state&& state) override { do_run(std::forward(state)); } }; template struct device_radix_sort_block_sort_benchmark_generator { template struct create_ipt { using generated_config = rocprim::kernel_config; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique< device_radix_sort_block_sort_benchmark>()); } }; static void create(std::vector>& storage) { // Sort_items_per_block must be equal or larger than radix_items_per_block, so make // the items_per_thread at least as large so the sort_items_per_block // would be 
atleast 1024. static constexpr unsigned int min_items_per_thread = 1024 / BlockSize; // Very large block sizes don't work with large items_per_blocks since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX - 2000; static constexpr unsigned int max_size_per_element = std::max(sizeof(Key), sizeof(Value)); static constexpr unsigned int max_items_per_thread = std::min(32u, max_shared_memory / (BlockSize * max_size_per_element)); static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_RADIX_SORT_BLOCK_SORT_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_radix_sort_onesweep.cpp000066400000000000000000000060321506507210100260000ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// HIP API #include #include "benchmark_device_radix_sort_onesweep.parallel.hpp" #include "benchmark_utils.hpp" #include #include #include #define CREATE_RADIX_SORT_BENCHMARK(...) \ executor.queue_instance(device_radix_sort_onesweep_benchmark<__VA_ARGS__>()); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING CREATE_RADIX_SORT_BENCHMARK(int) CREATE_RADIX_SORT_BENCHMARK(float) CREATE_RADIX_SORT_BENCHMARK(long long) CREATE_RADIX_SORT_BENCHMARK(int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half) CREATE_RADIX_SORT_BENCHMARK(short) CREATE_RADIX_SORT_BENCHMARK(rocprim::int128_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::uint128_t) using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; CREATE_RADIX_SORT_BENCHMARK(int, float) CREATE_RADIX_SORT_BENCHMARK(int, double) CREATE_RADIX_SORT_BENCHMARK(int, float2) CREATE_RADIX_SORT_BENCHMARK(int, custom_float2) CREATE_RADIX_SORT_BENCHMARK(int, double2) CREATE_RADIX_SORT_BENCHMARK(int, custom_double2) CREATE_RADIX_SORT_BENCHMARK(long long, float) CREATE_RADIX_SORT_BENCHMARK(long long, double) CREATE_RADIX_SORT_BENCHMARK(long long, float2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_float2) CREATE_RADIX_SORT_BENCHMARK(long long, double2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_double2) CREATE_RADIX_SORT_BENCHMARK(int8_t, int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t, uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half, rocprim::half) CREATE_RADIX_SORT_BENCHMARK(rocprim::int128_t, rocprim::int128_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::uint128_t, rocprim::uint128_t) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_radix_sort_onesweep.parallel.cpp.in000066400000000000000000000027411506507210100302030ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "benchmark_device_radix_sort_onesweep.parallel.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_radix_sort_onesweep_benchmark_generator<@BlockSize@, @RadixBits@, @KeyType@, @ValueType@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_radix_sort_onesweep.parallel.hpp000066400000000000000000000345621506507210100276110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_ONESWEEP_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_ONESWEEP_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #else #include #endif constexpr const char* radix_rank_algorithm_name(rocprim::block_radix_rank_algorithm algorithm) { switch(algorithm) { case rocprim::block_radix_rank_algorithm::basic: return "block_radix_rank_algorithm::basic"; case rocprim::block_radix_rank_algorithm::basic_memoize: return "block_radix_rank_algorithm::basic_memoize"; case rocprim::block_radix_rank_algorithm::match: return "block_radix_rank_algorithm::match"; } return ""; // unknown algorithm } template std::string config_name() { constexpr rocprim::detail::radix_sort_onesweep_config_params params = Config(); return "{histogram:{bs:" + std::to_string(params.histogram.block_size) + ",ipt:" + std::to_string(params.histogram.items_per_thread) + "},sort:{" + "bs:" + std::to_string(params.sort.block_size) + ",ipt:" + std::to_string(params.sort.items_per_thread) + "},bits_per_place:" + std::to_string(params.radix_bits_per_place) + ",algorithm:" + radix_rank_algorithm_name(params.radix_rank_algorithm) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_radix_sort_onesweep_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:radix_sort_onesweep,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } // keys benchmark template auto do_run(benchmark_utils::state&& state) const -> 
typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; // Calculate the number of elements size_t size = bytes / sizeof(key_type); std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys_output(size); common::device_ptr d_temporary_storage; size_t temporary_storage_bytes = 0; bool is_result_in_output = true; rocprim::empty_type* d_values_ptr = nullptr; HIP_CHECK(( rocprim::detail::radix_sort_onesweep_impl(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input.get(), nullptr, d_keys_output.get(), d_values_ptr, nullptr, d_values_ptr, size, is_result_in_output, rocprim::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false, false))); d_temporary_storage.resize(temporary_storage_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK((rocprim::detail::radix_sort_onesweep_impl( d_temporary_storage.get(), temporary_storage_bytes, d_keys_input.get(), nullptr, d_keys_output.get(), d_values_ptr, nullptr, d_values_ptr, size, is_result_in_output, rocprim::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false, false))); }); state.set_throughput(size, sizeof(key_type)); } // pairs benchmark template auto do_run(benchmark_utils::state&& state) const -> typename std::enable_if::value, void>::type { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; using value_type = Value; // Calculate the number of elements size_t size = bytes / sizeof(key_type); std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector values_input(size); for(size_t i = 0; i < size; ++i) { values_input[i] = value_type(i); } common::device_ptr d_keys_input(keys_input); 
common::device_ptr d_keys_output(size); common::device_ptr d_values_input(values_input); common::device_ptr d_values_output(size); common::device_ptr d_temporary_storage; size_t temporary_storage_bytes = 0; bool is_result_in_output = true; HIP_CHECK(( rocprim::detail::radix_sort_onesweep_impl(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input.get(), nullptr, d_keys_output.get(), d_values_input.get(), nullptr, d_values_output.get(), size, is_result_in_output, rocprim::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false, false))); d_temporary_storage.resize(temporary_storage_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK((rocprim::detail::radix_sort_onesweep_impl( d_temporary_storage.get(), temporary_storage_bytes, d_keys_input.get(), nullptr, d_keys_output.get(), d_values_input.get(), nullptr, d_values_output.get(), size, is_result_in_output, rocprim::identity_decomposer{}, 0, sizeof(key_type) * 8, stream, false, false))); }); state.set_throughput(size, sizeof(key_type) + sizeof(value_type)); } void run(benchmark_utils::state&& state) override { do_run(std::forward(state)); } }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_radix_sort_onesweep_benchmark_generator { template static constexpr bool is_buildable() { // Calculation uses `rocprim::arch::wavefront::min_size()`, which is 32 on host side unless overridden. // However, this does not affect the total size of shared memory for the current configuration space. // Were the implementation to change, causing retuning, this needs to be re-evaluated and possibly taken into account. 
using sharedmem_storage = typename rocprim::detail::onesweep_iteration_helper< Key, Value, size_t, BlockSize, ItemsPerThread, RadixBits, false, RadixRankAlgorithm, rocprim::identity_decomposer>::storage_type; return sizeof(sharedmem_storage) < TUNING_SHARED_MEMORY_MAX; } template struct create_ipt; template struct create_ipt())>> { using generated_config = rocprim::radix_sort_onesweep_config, rocprim::kernel_config, RadixBits, RadixRankAlgorithm>; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique< device_radix_sort_onesweep_benchmark>()); } }; template struct create_ipt())>> { void operator()(std::vector>&) const {} }; template static void create_algo(std::vector>& storage) { create_ipt<1u, RadixRankAlgorithm>()(storage); create_ipt<4u, RadixRankAlgorithm>()(storage); create_ipt<6u, RadixRankAlgorithm>()(storage); create_ipt<8u, RadixRankAlgorithm>()(storage); create_ipt<12u, RadixRankAlgorithm>()(storage); create_ipt<16u, RadixRankAlgorithm>()(storage); create_ipt<18u, RadixRankAlgorithm>()(storage); create_ipt<22u, RadixRankAlgorithm>()(storage); } static void create(std::vector>& storage) { create_algo(storage); create_algo(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_ONESWEEP_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_reduce.cpp000066400000000000000000000052131506507210100231640ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_reduce.parallel.hpp" #include "benchmark_utils.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // HIP API #include #ifndef BENCHMARK_CONFIG_TUNING #include #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(T, REDUCE_OP) \ executor.queue_instance(device_reduce_benchmark()); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; CREATE_BENCHMARK(int, rocprim::plus) CREATE_BENCHMARK(long long, rocprim::plus) CREATE_BENCHMARK(float, rocprim::plus) CREATE_BENCHMARK(double, rocprim::plus) CREATE_BENCHMARK(int8_t, rocprim::plus) CREATE_BENCHMARK(uint8_t, rocprim::plus) CREATE_BENCHMARK(rocprim::half, rocprim::plus) CREATE_BENCHMARK(custom_float2, rocprim::plus) CREATE_BENCHMARK(custom_double2, rocprim::plus) CREATE_BENCHMARK(rocprim::int128_t, rocprim::plus) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::plus) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_reduce.parallel.cpp.in000066400000000000000000000032571506507210100253720ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "benchmark_device_reduce.parallel.hpp" #include #include #include #include #include namespace { auto unused = benchmark_utils::executor::queue_sorted_instance, rocprim::reduce_config<@BlockSize@u, @ItemsPerThread@u, rocprim::block_reduce_algorithm::using_warp_reduce>>>(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_reduce.parallel.hpp000066400000000000000000000121621506507210100247650ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_REDUCE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_REDUCE_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM HIP API #include #include #include #include #include #include #include #include constexpr const char* get_reduce_method_name(rocprim::block_reduce_algorithm alg) { switch(alg) { case rocprim::block_reduce_algorithm::raking_reduce: return "raking_reduce"; case rocprim::block_reduce_algorithm::raking_reduce_commutative_only: return "raking_reduce_commutative_only"; case rocprim::block_reduce_algorithm::using_warp_reduce: return "using_warp_reduce"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } template std::string config_name() { const rocprim::detail::reduce_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",method:" + std::string(get_reduce_method_name(config.block_reduce_method)) + "}"; } template<> inline std::string config_name() { return "default_config"; } template, typename Config = rocprim::default_config> struct device_reduce_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:reduce,key_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(T); BinaryFunction 
reduce_op{}; const auto random_range = limit_random_range(0, 1000); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(1); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK(rocprim::reduce(nullptr, temp_storage_size_bytes, d_input.get(), d_output.get(), T(), size, reduce_op, stream)); common::device_ptr d_temp_storage(temp_storage_size_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(rocprim::reduce(d_temp_storage.get(), temp_storage_size_bytes, d_input.get(), d_output.get(), T(), size, reduce_op, stream)); }); state.set_throughput(size, sizeof(T)); } }; #endif rocPRIM-rocm-7.1.0/benchmark/benchmark_device_reduce_by_key.cpp000066400000000000000000000027061506507210100245320ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_reduce_by_key.parallel.hpp" #include "benchmark_utils.hpp" int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 2 * benchmark_utils::GiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING add_benchmarks(executor); #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_reduce_by_key.parallel.cpp.in000066400000000000000000000027101506507210100267250ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" #include "benchmark_device_reduce_by_key.parallel.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_reduce_by_key_benchmark_generator<@KeyType@, @ValueType@, @BlockSize@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_reduce_by_key.parallel.hpp000066400000000000000000000265361506507210100263410ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_REDUCE_BY_KEY_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_REDUCE_BY_KEY_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM HIP API #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #include #endif #include #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #include #endif template std::string config_name() { const rocprim::detail::reduce_by_key_config_params params = Config(); return "{bs:" + std::to_string(params.kernel_config.block_size) + ",ipt:" + std::to_string(params.kernel_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_reduce_by_key_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:reduce_by_key,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",max_segment_length:" + std::to_string(MaxSegmentLength) + ",cfg:" + config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; constexpr std::array tuning_max_segment_lengths = {10, 1000}; constexpr int num_input_arrays = is_tuning ? 
tuning_max_segment_lengths.size() : 1; constexpr size_t item_size = sizeof(KeyType) + sizeof(ValueType); const size_t size = bytes / item_size; std::vector key_inputs[num_input_arrays]; if(is_tuning) { for(size_t i = 0; i < tuning_max_segment_lengths.size(); ++i) { key_inputs[i] = get_random_segments_iota(size, tuning_max_segment_lengths[i], seed.get_0()); } } else { key_inputs[0] = get_random_segments_iota(size, MaxSegmentLength, seed.get_0()); } std::vector value_input(size); std::iota(value_input.begin(), value_input.end(), 0); common::device_ptr d_key_inputs[num_input_arrays]; for(int i = 0; i < num_input_arrays; ++i) { d_key_inputs[i].store(key_inputs[i]); } common::device_ptr d_value_input(value_input); common::device_ptr d_unique_output(size); common::device_ptr d_aggregates_output(size); common::device_ptr d_unique_count_output(1); rocprim::plus reduce_op; rocprim::equal_to key_compare_op; const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_input = [&](KeyType* d_key_input) { if constexpr(!Deterministic) { HIP_CHECK(rocprim::reduce_by_key(d_temp_storage, temp_storage_size_bytes, d_key_input, d_value_input.get(), size, d_unique_output.get(), d_aggregates_output.get(), d_unique_count_output.get(), reduce_op, key_compare_op, stream)); } else { HIP_CHECK( rocprim::deterministic_reduce_by_key(d_temp_storage, temp_storage_size_bytes, d_key_input, d_value_input.get(), size, d_unique_output.get(), d_aggregates_output.get(), d_unique_count_output.get(), reduce_op, key_compare_op, stream)); } }; // One tuning iteration runs multiple inputs with different distributions, // preventing overfitting the config to a specific data distrubution. // Note that this does not weigh the inputs/distributions equally as // generally larger segments perform better. 
for(int i = 0; i < num_input_arrays; ++i) { dispatch_input(d_key_inputs[i].get()); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(KeyType) + sizeof(ValueType)); } static constexpr bool is_tuning = !std::is_same::value; }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_reduce_by_key_benchmark_generator { template struct create_ipt { void operator()(std::vector>& storage) { using config = rocprim::reduce_by_key_config; // max segment length argument is irrelevant, tuning overrides segment length storage.emplace_back( std::make_unique< device_reduce_by_key_benchmark>()); } }; static void create(std::vector>& storage) { static constexpr unsigned int max_items_per_thread = std::min( TUNING_SHARED_MEMORY_MAX / std::max(sizeof(KeyType), sizeof(ValueType)) / BlockSize - 1, size_t{15}); static_for_each, create_ipt>( storage); } }; #endif // BENCHMARK_CONFIG_TUNING #define CREATE_BENCHMARK(KEY, VALUE, MAX_SEGMENT_LENGTH) \ executor.queue_instance( \ device_reduce_by_key_benchmark()); #define CREATE_BENCHMARK_TYPE(KEY, VALUE) \ CREATE_BENCHMARK(KEY, VALUE, 10) \ CREATE_BENCHMARK(KEY, VALUE, 1000) // some of the tuned types #define CREATE_BENCHMARK_TYPES(KEY) \ CREATE_BENCHMARK_TYPE(KEY, int8_t) \ CREATE_BENCHMARK_TYPE(KEY, rocprim::half) \ CREATE_BENCHMARK_TYPE(KEY, int32_t) \ CREATE_BENCHMARK_TYPE(KEY, rocprim::int128_t) \ CREATE_BENCHMARK_TYPE(KEY, rocprim::uint128_t) \ CREATE_BENCHMARK_TYPE(KEY, float) \ CREATE_BENCHMARK_TYPE(KEY, double) // all of the tuned types #define CREATE_BENCHMARK_TYPE_TUNING(KEY) \ CREATE_BENCHMARK_TYPE(KEY, int8_t) \ CREATE_BENCHMARK_TYPE(KEY, int16_t) \ CREATE_BENCHMARK_TYPE(KEY, int32_t) \ CREATE_BENCHMARK_TYPE(KEY, int64_t) \ CREATE_BENCHMARK_TYPE(KEY, rocprim::int128_t) \ 
CREATE_BENCHMARK_TYPE(KEY, rocprim::uint128_t) \ CREATE_BENCHMARK_TYPE(KEY, rocprim::half) \ CREATE_BENCHMARK_TYPE(KEY, float) \ CREATE_BENCHMARK_TYPE(KEY, double) template void add_benchmarks(benchmark_utils::executor& executor) { // tuned types CREATE_BENCHMARK_TYPES(int8_t) CREATE_BENCHMARK_TYPES(int16_t) CREATE_BENCHMARK_TYPE_TUNING(int32_t) CREATE_BENCHMARK_TYPE_TUNING(int64_t) CREATE_BENCHMARK_TYPES(rocprim::half) CREATE_BENCHMARK_TYPES(float) CREATE_BENCHMARK_TYPES(double) CREATE_BENCHMARK_TYPES(rocprim::int128_t) CREATE_BENCHMARK_TYPES(rocprim::uint128_t) // custom types using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; CREATE_BENCHMARK_TYPE(int, custom_float2) CREATE_BENCHMARK_TYPE(int, custom_double2) CREATE_BENCHMARK_TYPE(long long, custom_float2) CREATE_BENCHMARK_TYPE(long long, custom_double2) } #endif // ROCPRIM_BENCHMARK_DEVICE_REDUCE_BY_KEY_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_reduce_by_key_deterministic.cpp000066400000000000000000000026361506507210100274570ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_reduce_by_key.parallel.hpp" #include "benchmark_utils.hpp" int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 2 * benchmark_utils::GiB, 10, 5); add_benchmarks(executor); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_run_length_encode.cpp000066400000000000000000000051701506507210100254010ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_run_length_encode.parallel.hpp" #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #define CREATE_ENCODE_BENCHMARK(T, ML) \ executor.queue_instance(device_run_length_encode_benchmark()); template void add_encode_benchmarks(benchmark_utils::executor& executor) { using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; // all tuned types CREATE_ENCODE_BENCHMARK(int8_t, MaxLength) CREATE_ENCODE_BENCHMARK(int16_t, MaxLength) CREATE_ENCODE_BENCHMARK(int32_t, MaxLength) CREATE_ENCODE_BENCHMARK(int64_t, MaxLength) CREATE_ENCODE_BENCHMARK(rocprim::int128_t, MaxLength) CREATE_ENCODE_BENCHMARK(rocprim::uint128_t, MaxLength) CREATE_ENCODE_BENCHMARK(rocprim::half, MaxLength) CREATE_ENCODE_BENCHMARK(float, MaxLength) CREATE_ENCODE_BENCHMARK(double, MaxLength) // custom types CREATE_ENCODE_BENCHMARK(custom_float2, MaxLength) CREATE_ENCODE_BENCHMARK(custom_double2, MaxLength) } int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 2 * benchmark_utils::GiB, 10, 10); #ifndef BENCHMARK_CONFIG_TUNING add_encode_benchmarks<1000>(executor); add_encode_benchmarks<10>(executor); #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_run_length_encode.parallel.cpp.in000066400000000000000000000027411506507210100276020ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_run_length_encode.parallel.hpp" #include "benchmark_utils.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_run_length_encode_benchmark_generator< @KeyType@, @BlockSize@>::create); } // namespace rocPRIM-rocm-7.1.0/benchmark/benchmark_device_run_length_encode.parallel.hpp000066400000000000000000000165641506507210100272120ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #include #endif #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #endif template std::string run_length_encode_config_name() { const rocprim::detail::reduce_by_key_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; } template<> inline std::string run_length_encode_config_name() { return "default_config"; } template struct device_run_length_encode_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:run_length_encode,key_type:" + std::string(Traits::name()) + ",keys_max_length:" + std::to_string(MaxLength) + ",cfg:" + run_length_encode_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = T; using count_type = unsigned int; const size_t size = bytes / sizeof(T); // Generate data std::vector input(size); unsigned int runs_count = 0; const auto random_range = limit_random_range(1, MaxLength); std::vector key_counts = get_random_data(100000, random_range.first, random_range.second, seed.get_0()); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; ++i) { input[i] = runs_count; } ++runs_count; offset += key_count; } common::device_ptr d_input(input); common::device_ptr d_unique_output(runs_count); common::device_ptr 
d_counts_output(runs_count); common::device_ptr d_runs_count_output(1); size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::run_length_encode(nullptr, temporary_storage_bytes, d_input.get(), size, d_unique_output.get(), d_counts_output.get(), d_runs_count_output.get(), stream, false)); common::device_ptr d_temporary_storage(temporary_storage_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(rocprim::run_length_encode(d_temporary_storage.get(), temporary_storage_bytes, d_input.get(), size, d_unique_output.get(), d_counts_output.get(), d_runs_count_output.get(), stream, false)); }); state.set_throughput(size, sizeof(key_type)); } }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_run_length_encode_benchmark_generator { template struct create_ipt { void operator()(std::vector>& storage) { using config = rocprim::reduce_by_key_config; storage.emplace_back( std::make_unique>()); storage.emplace_back( std::make_unique>()); } }; static void create(std::vector>& storage) { static constexpr unsigned int max_items_per_thread = std::min(TUNING_SHARED_MEMORY_MAX / sizeof(T) / BlockSize - 1, size_t{15}); static_for_each, create_ipt>( storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_run_length_encode_non_trivial_runs.cpp000066400000000000000000000056641506507210100310640ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_run_length_encode_non_trivial_runs.parallel.hpp" #include "benchmark_utils.hpp" // CmdParser #include "cmdparser.hpp" #include "../common/utils_custom_type.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include // CHANGE #define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T, ML) \ executor.queue_instance(device_non_trivial_runs_benchmark()); template void add_non_trivial_runs_benchmarks(benchmark_utils::executor& executor) { using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int8_t, MaxLength) CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int16_t, MaxLength) CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int32_t, MaxLength) CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int64_t, MaxLength) CREATE_NON_TRIVIAL_RUNS_BENCHMARK(rocprim::int128_t, MaxLength) CREATE_NON_TRIVIAL_RUNS_BENCHMARK(rocprim::uint128_t, MaxLength) CREATE_NON_TRIVIAL_RUNS_BENCHMARK(rocprim::half, MaxLength) CREATE_NON_TRIVIAL_RUNS_BENCHMARK(float, MaxLength) CREATE_NON_TRIVIAL_RUNS_BENCHMARK(double, MaxLength) // custom types CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_float2, MaxLength) CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_double2, MaxLength) } int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 2 * benchmark_utils::GiB, 10, 5); // Add benchmarks #ifndef BENCHMARK_CONFIG_TUNING add_non_trivial_runs_benchmarks<16>(executor); add_non_trivial_runs_benchmarks<256>(executor); add_non_trivial_runs_benchmarks<4096>(executor); #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_run_length_encode_non_trivial_runs.parallel.cpp.in000066400000000000000000000031201506507210100332450ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_run_length_encode_non_trivial_runs.parallel.hpp" #include "benchmark_utils.hpp" #include #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_non_trivial_runs_benchmark_generator< @KeyType@, @BlockSize@, rocprim::block_load_method::@BlockLoadMethod@>::create); } // namespace rocPRIM-rocm-7.1.0/benchmark/benchmark_device_run_length_encode_non_trivial_runs.parallel.hpp000066400000000000000000000205501506507210100326530ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_NON_TRIVIAL_RUNS_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_NON_TRIVIAL_RUNS_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #include #include #include #include #endif #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #include #endif template std::string non_trivial_runs_config_name() { const rocprim::detail::non_trivial_runs_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",load_method:" + get_block_load_method_name(config.load_input_method) + "}"; } template<> inline std::string non_trivial_runs_config_name() { return "default_config"; } template struct device_non_trivial_runs_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:run_length_encode,subalgo:non_trivial,key_type:" + std::string(Traits::name()) + ",keys_max_length:" + std::to_string(MaxLength) + ",cfg:" + non_trivial_runs_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using offset_type = unsigned int; using count_type = unsigned int; constexpr std::array tuning_max_segment_lengths = {10, 1000}; constexpr int num_input_arrays = is_tuning ? 
tuning_max_segment_lengths.size() : 1; constexpr size_t item_size = sizeof(T) + sizeof(offset_type) + sizeof(count_type); const size_t size = bytes / item_size; // Generate data std::vector input[num_input_arrays]; if(is_tuning) { for(size_t i = 0; i < tuning_max_segment_lengths.size(); ++i) { input[i] = get_random_segments_iota(size, tuning_max_segment_lengths[i], seed.get_0()); } } else { input[0] = get_random_segments_iota(size, MaxLength, seed.get_0()); } common::device_ptr d_input[num_input_arrays]; for(int i = 0; i < num_input_arrays; ++i) { d_input[i].store(input[i]); } common::device_ptr d_offsets_output(size); common::device_ptr d_counts_output(size); common::device_ptr d_runs_count_output(1); const auto dispatch = [&](void* d_temporary_storage, size_t& temporary_storage_bytes) { const auto dispatch_input = [&](T* d_input) { HIP_CHECK( rocprim::run_length_encode_non_trivial_runs(d_temporary_storage, temporary_storage_bytes, d_input, size, d_offsets_output.get(), d_counts_output.get(), d_runs_count_output.get(), stream, false)); }; for(int i = 0; i < num_input_arrays; ++i) { dispatch_input(d_input[i].get()); } }; // Allocate temporary storage memory size_t temporary_storage_bytes = 0; dispatch(nullptr, temporary_storage_bytes); common::device_ptr d_temporary_storage(temporary_storage_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run([&] { dispatch(d_temporary_storage.get(), temporary_storage_bytes); }); state.set_throughput(size, sizeof(T) + sizeof(offset_type) + sizeof(count_type)); } static constexpr bool is_tuning = !std::is_same::value; }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_non_trivial_runs_benchmark_generator { using OffsetCountPairT = ::rocprim::tuple; static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = std::max(sizeof(T), sizeof(OffsetCountPairT)); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * 
max_size_per_element); static constexpr unsigned int min_items_per_thread_exponent = 3u; static constexpr unsigned int max_items_per_thread_exponent = std::max(static_cast(rocprim::Log2::VALUE), min_items_per_thread_exponent) - 1u; static constexpr bool is_load_warp_transpose = BlockLoadMethod == ::rocprim::block_load_method::block_load_warp_transpose; static constexpr bool is_warp_load_supp = is_load_warp_transpose && BlockSize == ROCPRIM_WARP_SIZE_64; template struct create_ipt { void operator()(std::vector>& storage) { if(!is_load_warp_transpose || is_warp_load_supp) { using config = rocprim::non_trivial_runs_config< BlockSize, items_per_thread, BlockLoadMethod, rocprim::block_scan_algorithm::using_warp_scan>; storage.emplace_back( std::make_unique>()); } } private: static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExp; }; static void create(std::vector>& storage) { static_for_each< make_index_range, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_RUN_LENGTH_ENCODE_NON_TRIVIAL_RUNS_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_scan.cpp000066400000000000000000000026771506507210100226540ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_scan.parallel.hpp" #include "benchmark_utils.hpp" int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING add_benchmarks(executor); #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_scan.parallel.cpp.in000066400000000000000000000027611506507210100250460ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_scan.parallel.hpp" #include "benchmark_utils.hpp" #include #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_scan_benchmark_generator<@DataType@, rocprim::block_scan_algorithm::@Algo@>::create); } // namespace rocPRIM-rocm-7.1.0/benchmark/benchmark_device_scan.parallel.hpp000066400000000000000000000304421506507210100244430ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SCAN_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SCAN_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #include #include #else #include #include #endif #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #else #include #endif template std::string config_name() { const rocprim::detail::scan_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",method:" + std::string(get_block_scan_algorithm_name(config.block_scan_method)) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_scan_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:scan,exclusive:" + (Exclusive ? 
"true"s : "false"s) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } template auto run_device_scan(void* temporary_storage, size_t& storage_size, T* input, T* output, const T initial_value, const size_t input_size, ScanOp scan_op, const hipStream_t stream, const bool debug = false) const -> typename std::enable_if::type { if constexpr(!Deterministic) { return rocprim::exclusive_scan(temporary_storage, storage_size, input, output, initial_value, input_size, scan_op, stream, debug); } else { return rocprim::deterministic_exclusive_scan(temporary_storage, storage_size, input, output, initial_value, input_size, scan_op, stream, debug); } } template auto run_device_scan(void* temporary_storage, size_t& storage_size, T* input, T* output, const T initial_value, const size_t input_size, ScanOp scan_op, const hipStream_t stream, const bool debug = false) const -> typename std::enable_if::type { (void)initial_value; if constexpr(!Deterministic) { return rocprim::inclusive_scan(temporary_storage, storage_size, input, output, input_size, scan_op, stream, debug); } else { return rocprim::deterministic_inclusive_scan(temporary_storage, storage_size, input, output, input_size, scan_op, stream, debug); } } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(T); ScanOp scan_op{}; const auto random_range = limit_random_range(0, 1000); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); T initial_value = T(123); common::device_ptr d_input(input); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK((run_device_scan(nullptr, temp_storage_size_bytes, d_input.get(), d_output.get(), initial_value, size, scan_op, stream))); 
common::device_ptr d_temp_storage(temp_storage_size_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK((run_device_scan(d_temp_storage.get(), temp_storage_size_bytes, d_input.get(), d_output.get(), initial_value, size, scan_op, stream))); }); state.set_throughput(size, sizeof(T)); } }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_scan_benchmark_generator { template struct create_block_scan_algorithm { template struct create_block_size { template struct create_ipt { void operator()( std::vector>& storage) { storage.emplace_back( std::make_unique, false, rocprim::scan_config>>()); } }; void operator()( std::vector>& storage) { // Limit items per thread to not over-use shared memory static constexpr unsigned int max_items_per_thread = ::rocprim::min(65536 / (block_size * sizeof(T)) - 1, 24); static_for_each, create_ipt>(storage); } static constexpr unsigned int block_size = 1u << BlockSizeExponent; }; static void create(std::vector>& storage) { static_for_each(storage); } }; static void create(std::vector>& storage) { // Block sizes 64, 128, 256 create_block_scan_algorithm>::create(storage); } }; #else #define CREATE_EXCL_INCL_BENCHMARK(EXCL, T, SCAN_OP) \ executor.queue_instance(device_scan_benchmark()); #define CREATE_BENCHMARK(T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(false, T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(true, T, SCAN_OP) template void add_benchmarks(benchmark_utils::executor& executor) { using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; CREATE_BENCHMARK(int, rocprim::plus) CREATE_BENCHMARK(float, rocprim::plus) CREATE_BENCHMARK(double, rocprim::plus) CREATE_BENCHMARK(long long, rocprim::plus) CREATE_BENCHMARK(float2, rocprim::plus) CREATE_BENCHMARK(custom_float2, rocprim::plus) CREATE_BENCHMARK(double2, rocprim::plus) CREATE_BENCHMARK(custom_double2, rocprim::plus) CREATE_BENCHMARK(int8_t, rocprim::plus) CREATE_BENCHMARK(uint8_t, rocprim::plus) CREATE_BENCHMARK(rocprim::half, rocprim::plus) 
CREATE_BENCHMARK(rocprim::int128_t, rocprim::plus) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::plus) } #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_SCAN_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_scan_by_key.cpp000066400000000000000000000027061506507210100242070ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_scan_by_key.parallel.hpp" #include "benchmark_utils.hpp" int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING add_benchmarks(executor); #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_scan_by_key.parallel.cpp.in000066400000000000000000000030131506507210100263770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_scan_by_key.parallel.hpp" #include "benchmark_utils.hpp" #include #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_scan_by_key_benchmark_generator<@KeyType@, @ValueType@, rocprim::block_scan_algorithm::@Algo@>::create); } // namespace rocPRIM-rocm-7.1.0/benchmark/benchmark_device_scan_by_key.parallel.hpp000066400000000000000000000373611506507210100260140ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SCAN_BY_KEY_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SCAN_BY_KEY_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #include #include #else #include #include #endif #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #else #include #endif template std::string config_name() { const rocprim::detail::scan_by_key_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",method:" + std::string(get_block_scan_algorithm_name(config.block_scan_method)) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_scan_by_key_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:scan_by_key,exclusive:" + (Exclusive ? 
"true"s : "false"s) + ",key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",max_segment_length:" + std::to_string(MaxSegmentLength) + ",cfg:" + config_name() + "}"); } template auto run_device_scan_by_key(void* temporary_storage, size_t& storage_size, const Key* keys, const Value* input, Value* output, const Value initial_value, const size_t input_size, const ScanOp scan_op, const CompareOp compare_op, const hipStream_t stream, const bool debug = false) const -> typename std::enable_if::type { if constexpr(!Deterministic) { return rocprim::exclusive_scan_by_key(temporary_storage, storage_size, keys, input, output, initial_value, input_size, scan_op, compare_op, stream, debug); } else { return rocprim::deterministic_exclusive_scan_by_key(temporary_storage, storage_size, keys, input, output, initial_value, input_size, scan_op, compare_op, stream, debug); } } template auto run_device_scan_by_key(void* temporary_storage, size_t& storage_size, const Key* keys, const Value* input, Value* output, const Value /*initial_value*/, const size_t input_size, const ScanOp scan_op, const CompareOp compare_op, const hipStream_t stream, const bool debug = false) const -> typename std::enable_if::type { if constexpr(!Deterministic) { return rocprim::inclusive_scan_by_key(temporary_storage, storage_size, keys, input, output, input_size, scan_op, compare_op, stream, debug); } else { return rocprim::deterministic_inclusive_scan_by_key(temporary_storage, storage_size, keys, input, output, input_size, scan_op, compare_op, stream, debug); } } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(Value); constexpr bool debug = false; const std::vector keys = get_random_segments(size, MaxSegmentLength, seed.get_0()); const auto random_range = limit_random_range(0, 1000); const std::vector 
input = get_random_data(size, random_range.first, random_range.second, seed.get_1()); ScanOp scan_op{}; CompareOp compare_op{}; Value initial_value = Value(123); common::device_ptr d_input(input); common::device_ptr d_keys(keys); common::device_ptr d_output(input.size()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK((run_device_scan_by_key(nullptr, temp_storage_size_bytes, d_keys.get(), d_input.get(), d_output.get(), initial_value, size, scan_op, compare_op, stream, debug))); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run( [&] { HIP_CHECK((run_device_scan_by_key(d_temp_storage.get(), temp_storage_size_bytes, d_keys.get(), d_input.get(), d_output.get(), initial_value, size, scan_op, compare_op, stream, debug))); }); state.set_throughput(size, sizeof(Key) + sizeof(Value)); } }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_scan_by_key_benchmark_generator { template struct create_block_scan_algorithm { template struct create_block_size { template struct create_ipt { void operator()( std::vector>& storage) { storage.emplace_back(std::make_unique, rocprim::equal_to, 1024, false, rocprim::scan_by_key_config< block_size, ItemsPerThread, rocprim::block_load_method::block_load_transpose, rocprim::block_store_method::block_store_transpose, BlockScanAlgorithm>>>()); } }; void operator()( std::vector>& storage) { // Limit items per thread to not over-use shared memory static constexpr unsigned int max_items_per_thread = ::rocprim::min( 65536 / (block_size * (sizeof(KeyType) + sizeof(ValueType) + (sizeof(KeyType) == 16 && sizeof(ValueType) == 1))), 24); static_for_each, create_ipt>(storage); } static constexpr unsigned int block_size = 1u << BlockSizeExponent; }; static void create(std::vector>& storage) { static_for_each(storage); } }; static void create(std::vector>& storage) { // Block sizes 64, 128, 256 create_block_scan_algorithm>::create(storage); } }; #else // 
BENCHMARK_CONFIG_TUNING #define CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, MAX_SEGMENT_LENGTH) \ executor.queue_instance(device_scan_by_key_benchmark, \ MAX_SEGMENT_LENGTH, \ Deterministic>()); #define CREATE_EXCL_INCL_BENCHMARK(EXCL, T, SCAN_OP) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 1) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 16) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 256) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 4096) \ CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 65536) #define CREATE_BENCHMARK(T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(false, T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(true, T, SCAN_OP) template void add_benchmarks(benchmark_utils::executor& executor) { using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; CREATE_BENCHMARK(int, rocprim::plus) CREATE_BENCHMARK(float, rocprim::plus) CREATE_BENCHMARK(double, rocprim::plus) CREATE_BENCHMARK(long long, rocprim::plus) CREATE_BENCHMARK(float2, rocprim::plus) CREATE_BENCHMARK(custom_float2, rocprim::plus) CREATE_BENCHMARK(double2, rocprim::plus) CREATE_BENCHMARK(custom_double2, rocprim::plus) CREATE_BENCHMARK(int8_t, rocprim::plus) CREATE_BENCHMARK(uint8_t, rocprim::plus) CREATE_BENCHMARK(rocprim::half, rocprim::plus) CREATE_BENCHMARK(rocprim::int128_t, rocprim::plus) CREATE_BENCHMARK(rocprim::uint128_t, rocprim::plus) } #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_SCAN_BY_KEY_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_scan_by_key_deterministic.cpp000066400000000000000000000027051506507210100271310ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_scan_by_key.parallel.hpp" #include "benchmark_utils.hpp" int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING add_benchmarks(executor); #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_scan_deterministic.cpp000066400000000000000000000026271506507210100255720ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_scan.parallel.hpp" #include "benchmark_utils.hpp" int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); add_benchmarks(executor); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_search.cpp000066400000000000000000000060111506507210100231570ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_search.hpp" #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include #include #define CREATE_BENCHMARK_SEARCH(TYPE, KEY_SIZE, REPEATING) \ executor.queue_instance(device_search_benchmark(KEY_SIZE, REPEATING)); #define CREATE_BENCHMARK_PATTERN(TYPE, REPEATING) \ { \ CREATE_BENCHMARK_SEARCH(TYPE, 10, REPEATING) \ CREATE_BENCHMARK_SEARCH(TYPE, 100, REPEATING) \ CREATE_BENCHMARK_SEARCH(TYPE, 1000, REPEATING) \ CREATE_BENCHMARK_SEARCH(TYPE, 10000, REPEATING) \ } #define CREATE_BENCHMARK(TYPE) \ { \ CREATE_BENCHMARK_PATTERN(TYPE, true) CREATE_BENCHMARK_PATTERN(TYPE, false) \ } int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) CREATE_BENCHMARK(float) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; using custom_int2 = common::custom_type; using custom_char_double = common::custom_type; using custom_longlong_double = common::custom_type; CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(custom_int2) CREATE_BENCHMARK(custom_char_double) CREATE_BENCHMARK(custom_longlong_double) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_search.hpp000066400000000000000000000124731506507210100231750ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_SEARCH_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SEARCH_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include template struct device_search_benchmark : public benchmark_utils::autotune_interface { size_t key_size_ = 10; bool repeating_ = false; device_search_benchmark(size_t KeySize, bool repeating) { key_size_ = KeySize; repeating_ = repeating; } std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:search,value_pattern:" + (repeating_ ? 
"repeating"s : "random"s) + ",key_size:" + std::to_string(key_size_) + ",value_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using key_type = Key; using output_type = size_t; // Calculate the number of elements size_t size = bytes / sizeof(key_type); size_t key_size = std::min(size, key_size_); // Generate data std::vector keys_input = get_random_data(key_size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector input(size); if(repeating_) { // Repeating similar pattern without early exits. keys_input[key_size - 1] = 0; for(size_t i = 0; i < size; ++i) { input[i] = keys_input[i % key_size]; } keys_input[key_size - 1] = 1; } else { input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0() + 1); } common::device_ptr d_keys_input(keys_input); common::device_ptr d_input(input); common::device_ptr d_output(1); rocprim::equal_to compare_op; size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::search(nullptr, temporary_storage_bytes, d_input.get(), d_keys_input.get(), d_output.get(), size, key_size, compare_op, stream, false)); common::device_ptr d_temporary_storage(temporary_storage_bytes); state.run( [&] { HIP_CHECK(rocprim::search(d_temporary_storage.get(), temporary_storage_bytes, d_input.get(), d_keys_input.get(), d_output.get(), size, key_size, compare_op, stream, false)); }); state.set_throughput(size, sizeof(key_type)); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_SEARCH_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_search_n.cpp000066400000000000000000000051671506507210100235070ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_search_n.parallel.hpp" #include "benchmark_utils.hpp" // HIP API #include #include #include #include #define CREATE_BENCHMARK(T, S, C) executor.queue_instance(benchmark_search_n()); #define CREATE_BENCHMARKS(T) \ CREATE_BENCHMARK(T, size_t, count_equal_to<1>) \ CREATE_BENCHMARK(T, size_t, count_equal_to<6>) \ CREATE_BENCHMARK(T, size_t, count_equal_to<10>) \ CREATE_BENCHMARK(T, size_t, count_equal_to<14>) \ CREATE_BENCHMARK(T, size_t, count_equal_to<25>) \ CREATE_BENCHMARK(T, size_t, count_is_percent_of_size<50>) \ CREATE_BENCHMARK(T, size_t, count_is_percent_of_size<100>) int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 2 * benchmark_utils::GiB, 10, 10); #ifndef BENCHMARK_CONFIG_TUNING using custom_int2 = common::custom_type; using custom_longlong_double = common::custom_type; CREATE_BENCHMARKS(custom_int2) CREATE_BENCHMARKS(custom_longlong_double) CREATE_BENCHMARKS(int8_t) CREATE_BENCHMARKS(int16_t) CREATE_BENCHMARKS(int32_t) CREATE_BENCHMARKS(int64_t) CREATE_BENCHMARKS(rocprim::int128_t) CREATE_BENCHMARKS(rocprim::uint128_t) CREATE_BENCHMARKS(rocprim::half) CREATE_BENCHMARKS(float) CREATE_BENCHMARKS(double) #endif // Run benchmarks executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_search_n.parallel.cpp.in000066400000000000000000000027651506507210100257100ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_search_n.parallel.hpp" #include "benchmark_utils.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_search_n_benchmark_generator< @InputType@, @BlockSize@, @ItemsPerThread@, @Threshold@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_search_n.parallel.hpp000066400000000000000000000165231506507210100253050ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SEARCH_N_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SEARCH_N_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "cmdparser.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_device_ptr.hpp" // gbench #include // HIP #include // rocPRIM #include #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #else #include #include #include #endif namespace { template struct type_arr { using type = First; using next = type_arr; }; template struct type_arr { using type = First; }; template using void_type = void; template constexpr bool is_type_arr_end = true; template constexpr bool is_type_arr_end> = false; template std::string search_n_config_name() { const rocprim::detail::search_n_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",threshold:" + std::to_string(config.threshold) + "}"; } #ifndef BENCHMARK_CONFIG_TUNING template<> std::string search_n_config_name() { return "default_config"; } #endif template struct count_equal_to { std::string name() const { return "count_equal_to<" + std::to_string(Value) + ">"; } constexpr size_t resolve(size_t) const { return Value; } }; template struct count_is_percent_of_size { std::string name() const { return "count_is_percent_of_size<" + std::to_string(Value) + ">"; } constexpr size_t resolve(size_t size) const { return size * Value / 100; } }; } // namespace template class benchmark_search_n : public benchmark_utils::autotune_interface { public: void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& size_byte = state.bytes; InputType h_noise{0}; InputType h_value{1}; common::device_ptr d_temp_storage; size_t temp_storage_size = 0; size_t size; size_t count; std::vector input{}; common::device_ptr d_input; common::device_ptr 
d_output(1); common::device_ptr d_value(std::vector{h_value}, stream); size = size_byte / sizeof(InputType); count = CountCalculator{}.resolve(size); size_t cur_tile = 0; size_t last_tile = size / count - 1; input = std::vector(size, h_value); while(cur_tile != last_tile) { input[cur_tile * count + count - 1] = h_noise; ++cur_tile; } d_input.store_async(input, stream); auto launch_search_n = [&]() { HIP_CHECK(::rocprim::search_n(d_temp_storage.get(), temp_storage_size, d_input.get(), d_output.get(), size, count, d_value.get(), rocprim::equal_to{}, stream, false)); }; // allocate temp memory launch_search_n(); d_temp_storage.resize_async(temp_storage_size, stream); state.run([&] { launch_search_n(); }); state.set_throughput(size, sizeof(InputType)); } std::string name() const override { return bench_naming::format_name("{lvl:device,algo:search_n,data_type:" + std::string(Traits::name()) + ",count_calculator:" + CountCalculator{}.name() + ",cfg:" + search_n_config_name() + "}") .c_str(); } }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_search_n_benchmark_generator { static void create(std::vector>& storage) { using config = rocprim::search_n_config; storage.emplace_back( std::make_unique, config>>()); storage.emplace_back( std::make_unique, config>>()); storage.emplace_back( std::make_unique, config>>()); storage.emplace_back( std::make_unique, config>>()); storage.emplace_back( std::make_unique, config>>()); storage.emplace_back( std::make_unique< benchmark_search_n, config>>()); storage.emplace_back( std::make_unique< benchmark_search_n, config>>()); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_SEARCH_N_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_segmented_radix_sort_keys.cpp000066400000000000000000000070421506507210100271630ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_segmented_radix_sort_keys.parallel.hpp" #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif // This benchmark only handles the rocprim::segmented_radix_sort_keys function. The benchmark was separated into two (keys and pairs), // because the binary became too large to link. Runs into a "relocation R_X86_64_PC32 out of range" error. // This happens partially, because of the algorithm has 4 kernels, and decides at runtime which one to call. 
template void add_benchmarks(benchmark_utils::executor& executor, size_t bytes) { constexpr std::array segment_counts{10, 100, 1000, 2500, 5000, 7500, 10000, 100000}; constexpr std::array segment_lengths{30, 256, 3000, 300000}; constexpr size_t min_size = 30000; size_t max_size = bytes / sizeof(KeyT); for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { // This check is also present in device_segmented_radix_sort_keys_benchmark its run() // We need it here to prevent Google Benchmark causing an infinite loop const auto number_of_elements = segment_count * segment_length; if(number_of_elements < min_size || number_of_elements > max_size) { continue; } executor.queue_instance( device_segmented_radix_sort_keys_benchmark(segment_count, segment_length)); } } } int main(int argc, char* argv[]) { size_t bytes = 128 * benchmark_utils::MiB; benchmark_utils::executor executor(argc, argv, bytes, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_segmented_radix_sort_keys.parallel.cpp.in000066400000000000000000000033251506507210100313630ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_segmented_radix_sort_keys.parallel.hpp" #include "benchmark_utils.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_segmented_radix_sort_keys_benchmark_generator< @RadixBits@, @BlockSize@, @ItemsPerThread@, @WarpSmallLWS@, @WarpSmallIPT@, @WarpSmallBS@, @WarpPartition@, @WarpMediumLWS@, @WarpMediumIPT@, @WarpMediumBS@, @KeyType@, true >::create); } // namespace rocPRIM-rocm-7.1.0/benchmark/benchmark_device_segmented_radix_sort_keys.parallel.hpp000066400000000000000000000274471506507210100307760ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_KEYS_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_KEYS_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include template std::string warp_sort_config_name(T const& warp_sort_config) { return "{pa:" + std::to_string(warp_sort_config.partitioning_allowed) + ",lwss:" + std::to_string(warp_sort_config.logical_warp_size_small) + ",ipts:" + std::to_string(warp_sort_config.items_per_thread_small) + ",bss:" + std::to_string(warp_sort_config.block_size_small) + ",pt:" + std::to_string(warp_sort_config.partitioning_threshold) + ",lwsm:" + std::to_string(warp_sort_config.logical_warp_size_medium) + ",iptm:" + std::to_string(warp_sort_config.items_per_thread_medium) + ",bsm:" + std::to_string(warp_sort_config.block_size_medium) + "}"; } template std::string config_name() { const rocprim::detail::segmented_radix_sort_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",rb:" + std::to_string(config.radix_bits) + ",eupws:" + std::to_string(config.enable_unpartitioned_warp_sort) + ",wsc:" + warp_sort_config_name(config.warp_sort_config) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_segmented_radix_sort_keys_benchmark : public benchmark_utils::autotune_interface { private: std::vector segment_counts; std::vector segment_lengths; size_t total_size; public: device_segmented_radix_sort_keys_benchmark(size_t segment_count, size_t segment_length) { segment_counts.push_back(segment_count); segment_lengths.push_back(segment_length); } device_segmented_radix_sort_keys_benchmark(const std::vector& segment_counts, 
const std::vector& segment_lengths) { this->segment_counts = segment_counts; this->segment_lengths = segment_lengths; } std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:segmented_radix_sort,key_type:" + std::string(Traits::name()) + ",value_type:empty_type" + (segment_counts.size() == 1 ? ",segment_count:"s + std::to_string(segment_counts[0]) : ""s) + (segment_lengths.size() == 1 ? ",segment_length:"s + std::to_string(segment_lengths[0]) : ""s) + ",cfg:" + config_name() + "}"); } void run_benchmark(benchmark_utils::state&& state, size_t num_segments, size_t mean_segment_length) { const auto& stream = state.stream; const auto& seed = state.seed; using offset_type = int; using key_type = Key; std::vector offsets; offsets.push_back(0); static constexpr int iseed = 716; engine_type gen(iseed); std::normal_distribution segment_length_dis( static_cast(mean_segment_length), 0.1 * mean_segment_length); size_t offset = 0; for(size_t segment_index = 0; segment_index < num_segments;) { const double segment_length_candidate = std::round(segment_length_dis(gen)); if(segment_length_candidate < 0) { continue; } const offset_type segment_length = static_cast(segment_length_candidate); offset += segment_length; offsets.push_back(offset); ++segment_index; } const size_t size = offset; const size_t segments_count = offsets.size() - 1; std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_offsets(offsets); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys_output(size); size_t temporary_storage_bytes = 0; HIP_CHECK(rocprim::segmented_radix_sort_keys(nullptr, temporary_storage_bytes, d_keys_input.get(), d_keys_output.get(), size, segments_count, d_offsets.get(), d_offsets.get() + 1, 0, sizeof(key_type) * 8, stream, false)); common::device_ptr d_temporary_storage(temporary_storage_bytes); 
HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(rocprim::segmented_radix_sort_keys(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input.get(), d_keys_output.get(), size, segments_count, d_offsets.get(), d_offsets.get() + 1, 0, sizeof(key_type) * 8, stream, false)); }); total_size += size; } void run(benchmark_utils::state&& state) override { total_size = 0; if(segment_counts.size() == 1) { run_benchmark(std::forward(state), segment_counts[0], segment_lengths[0]); } else { state.accumulate_total_gbench_iterations_every_run(); constexpr size_t min_size = 300000; constexpr size_t max_size = 33554432; for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { const auto number_of_elements = segment_count * segment_length; if(number_of_elements < min_size || number_of_elements > max_size) { continue; } run_benchmark(std::forward(state), segment_count, segment_length); } } } state.set_throughput(total_size, sizeof(Key)); } }; template struct device_segmented_radix_sort_keys_benchmark_generator { template static auto _create(std::vector>& storage) -> std::enable_if_t<(key_size * BlockSize * ItemsPerThread < TUNING_SHARED_MEMORY_MAX)> { const std::vector segment_counts{10, 100, 1000, 2500, 5000, 7500, 10000, 100000}; const std::vector segment_lengths{30, 256, 3000, 300000}; storage.emplace_back(std::make_unique, rocprim::WarpSortConfig, UnpartitionWarpAllowed>>>(segment_counts, segment_lengths)); } template static auto _create(std::vector>&) -> std::enable_if_t {} static void create(std::vector>& storage) { _create(storage); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_KEYS_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_segmented_radix_sort_pairs.cpp000066400000000000000000000101021506507210100273150ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_segmented_radix_sort_pairs.parallel.hpp" #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // HIP API #include // rocPRIM #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #include #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif // This benchmark only handles the rocprim::segmented_radix_sort_pairs function. The benchmark was separated into two (keys and pairs), // because the binary became too large to link. Runs into a "relocation R_X86_64_PC32 out of range" error. // This happens partially, because of the algorithm has 4 kernels, and decides at runtime which one to call. 
template void add_benchmarks(benchmark_utils::executor& executor, size_t bytes) { constexpr std::array segment_counts{10, 100, 1000, 2500, 5000, 7500, 10000, 100000}; constexpr std::array segment_lengths{30, 256, 3000, 300000}; constexpr size_t min_size = 30000; size_t max_size = bytes / sizeof(KeyT); for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { // This check is also present in device_segmented_radix_sort_pairs_benchmark its run() // We need it here to prevent Google Benchmark causing an infinite loop const auto number_of_elements = segment_count * segment_length; if(number_of_elements < min_size || number_of_elements > max_size) { continue; } executor.queue_instance( device_segmented_radix_sort_pairs_benchmark(segment_count, segment_length)); } } } int main(int argc, char* argv[]) { size_t bytes = 128 * benchmark_utils::MiB; benchmark_utils::executor executor(argc, argv, bytes, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); add_benchmarks(executor, bytes); #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_segmented_radix_sort_pairs.parallel.cpp.in000066400000000000000000000033541506507210100315300ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_segmented_radix_sort_pairs.parallel.hpp" #include "benchmark_utils.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_segmented_radix_sort_pairs_benchmark_generator< @RadixBits@, @BlockSize@, @ItemsPerThread@, @WarpSmallLWS@, @WarpSmallIPT@, @WarpSmallBS@, @WarpPartition@, @WarpMediumLWS@, @WarpMediumIPT@, @WarpMediumBS@, @KeyType@, @ValueType@, true >::create); } // namespace rocPRIM-rocm-7.1.0/benchmark/benchmark_device_segmented_radix_sort_pairs.parallel.hpp000066400000000000000000000316611506507210100311320ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_PAIRS_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_PAIRS_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #include template std::string warp_sort_config_name(T const& warp_sort_config) { return "{pa:" + std::to_string(warp_sort_config.partitioning_allowed) + ",lwss:" + std::to_string(warp_sort_config.logical_warp_size_small) + ",ipts:" + std::to_string(warp_sort_config.items_per_thread_small) + ",bss:" + std::to_string(warp_sort_config.block_size_small) + ",pt:" + std::to_string(warp_sort_config.partitioning_threshold) + ",lwsm:" + std::to_string(warp_sort_config.logical_warp_size_medium) + ",iptm:" + std::to_string(warp_sort_config.items_per_thread_medium) + ",bsm:" + std::to_string(warp_sort_config.block_size_medium) + "}"; } template std::string config_name() { const rocprim::detail::segmented_radix_sort_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",rb:" + std::to_string(config.radix_bits) + ",eupws:" + std::to_string(config.enable_unpartitioned_warp_sort) + ",wsc:" + warp_sort_config_name(config.warp_sort_config) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_segmented_radix_sort_pairs_benchmark : public benchmark_utils::autotune_interface { private: std::vector segment_counts; std::vector segment_lengths; size_t total_size; public: device_segmented_radix_sort_pairs_benchmark(size_t segment_count, size_t segment_length) { segment_counts.push_back(segment_count); segment_lengths.push_back(segment_length); } device_segmented_radix_sort_pairs_benchmark(const std::vector& 
segment_counts, const std::vector& segment_lengths) { this->segment_counts = segment_counts; this->segment_lengths = segment_lengths; } std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:segmented_radix_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + (segment_counts.size() == 1 ? ",segment_count:"s + std::to_string(segment_counts[0]) : ""s) + (segment_lengths.size() == 1 ? ",segment_length:"s + std::to_string(segment_lengths[0]) : ""s) + ",cfg:" + config_name() + "}"); } void run_benchmark(benchmark_utils::state&& state, size_t num_segments, size_t mean_segment_length) { const auto& stream = state.stream; const auto& seed = state.seed; using offset_type = int; using key_type = Key; using value_type = Value; std::vector offsets; offsets.push_back(0); static constexpr int iseed = 716; engine_type gen(iseed); std::normal_distribution segment_length_dis( static_cast(mean_segment_length), 0.1 * mean_segment_length); size_t offset = 0; for(size_t segment_index = 0; segment_index < num_segments;) { const double segment_length_candidate = std::round(segment_length_dis(gen)); if(segment_length_candidate < 0) { continue; } const offset_type segment_length = static_cast(segment_length_candidate); offset += segment_length; offsets.push_back(offset); ++segment_index; } const size_t size = offset; const size_t segments_count = offsets.size() - 1; std::vector keys_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector values_input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); common::device_ptr d_offsets(offsets); common::device_ptr d_keys_input(keys_input); common::device_ptr d_keys_output(size); common::device_ptr d_values_input(values_input); common::device_ptr d_values_output(size); size_t temporary_storage_bytes = 0; 
HIP_CHECK(rocprim::segmented_radix_sort_pairs(nullptr, temporary_storage_bytes, d_keys_input.get(), d_keys_output.get(), d_values_input.get(), d_values_output.get(), size, segments_count, d_offsets.get(), d_offsets.get() + 1, 0, sizeof(key_type) * 8, stream, false)); common::device_ptr d_temporary_storage(temporary_storage_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(rocprim::segmented_radix_sort_pairs(d_temporary_storage.get(), temporary_storage_bytes, d_keys_input.get(), d_keys_output.get(), d_values_input.get(), d_values_output.get(), size, segments_count, d_offsets.get(), d_offsets.get() + 1, 0, sizeof(key_type) * 8, stream, false)); }); total_size += size; } void run(benchmark_utils::state&& state) override { total_size = 0; if(segment_counts.size() == 1) { run_benchmark(std::forward(state), segment_counts[0], segment_lengths[0]); } else { state.accumulate_total_gbench_iterations_every_run(); constexpr size_t min_size = 300000; constexpr size_t max_size = 33554432; for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { const auto number_of_elements = segment_count * segment_length; if(number_of_elements < min_size || number_of_elements > max_size) { continue; } run_benchmark(std::forward(state), segment_count, segment_length); } } } state.set_throughput(total_size, sizeof(Key) + sizeof(Value)); } }; template struct device_segmented_radix_sort_pairs_benchmark_generator { template static auto _create(std::vector>& storage) -> std::enable_if_t<((key_size + value_type) * BlockSize * ItemsPerThread <= TUNING_SHARED_MEMORY_MAX)> { const std::vector segment_counts{10, 100, 1000, 2500, 5000, 7500, 10000, 100000}; const std::vector segment_lengths{30, 256, 3000, 300000}; storage.emplace_back(std::make_unique, rocprim::WarpSortConfig, UnpartitionWarpAllowed>>>(segment_counts, segment_lengths)); } template static auto _create(std::vector>&) -> std::enable_if_t {} static void create(std::vector>& 
storage) { _create(storage); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_RADIX_SORT_PAIRS_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_segmented_reduce.cpp000066400000000000000000000047751506507210100252330ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_segmented_reduce.parallel.hpp" #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include #include #define CREATE_BENCHMARK(T, SEGMENTS) \ executor.queue_instance(device_segmented_reduce_benchmark(SEGMENTS)); #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 1) \ CREATE_BENCHMARK(type, 10) \ CREATE_BENCHMARK(type, 100) \ CREATE_BENCHMARK(type, 1000) \ CREATE_BENCHMARK(type, 10000) int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; BENCHMARK_TYPE(float) BENCHMARK_TYPE(double) BENCHMARK_TYPE(int8_t) BENCHMARK_TYPE(uint8_t) BENCHMARK_TYPE(rocprim::half) BENCHMARK_TYPE(int) BENCHMARK_TYPE(custom_float2) BENCHMARK_TYPE(custom_double2) BENCHMARK_TYPE(rocprim::int128_t) BENCHMARK_TYPE(rocprim::uint128_t) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_segmented_reduce.parallel.cpp.in000066400000000000000000000030311506507210100274130ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_utils.hpp" #include "benchmark_device_segmented_reduce.parallel.hpp" namespace { auto unused = benchmark_utils::executor::queue_sorted_instance, rocprim::reduce_config<@BlockSize@u, @ItemsPerThread@u, rocprim::block_reduce_algorithm::using_warp_reduce>>>(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_segmented_reduce.parallel.hpp000066400000000000000000000156171506507210100270300ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_REDUCE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SEGMENTED_REDUCE_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include namespace rp = rocprim; constexpr const char* get_reduce_method_name(rocprim::block_reduce_algorithm alg) { switch(alg) { case rocprim::block_reduce_algorithm::raking_reduce: return "raking_reduce"; case rocprim::block_reduce_algorithm::raking_reduce_commutative_only: return "raking_reduce_commutative_only"; case rocprim::block_reduce_algorithm::using_warp_reduce: return "using_warp_reduce"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } template std::string config_name() { const rocprim::detail::reduce_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + ",method:" + std::string(get_reduce_method_name(config.block_reduce_method)) + "}"; } template<> inline std::string config_name() { return "default_config"; } template, typename Config = rocprim::default_config> struct device_segmented_reduce_benchmark : public benchmark_utils::autotune_interface { private: std::vector desired_segments; size_t total_size; public: device_segmented_reduce_benchmark() { this->desired_segments = std::vector{1, 10, 100, 1000, 10000}; } device_segmented_reduce_benchmark(size_t desired_segment) { desired_segments.push_back(desired_segment); } std::string name() const override { return bench_naming::format_name( 
"{lvl:device,algo:segmented_reduce,key_type:" + std::string(Traits::name()) + (desired_segments.size() == 1 ? ",segment_count:" + std::to_string(desired_segments[0]) : "") + ",cfg:" + config_name() + "}"); } void run_benchmark(benchmark_utils::state&& state, size_t desired_segment) { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using offset_type = int; using value_type = T; // Calculate the number of elements size_t size = bytes / sizeof(T); // Generate data engine_type gen(seed.get_0()); const double avg_segment_length = static_cast(size) / desired_segment; std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); std::vector offsets; unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); segments_count++; offset += segment_length; } offsets.push_back(size); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); common::device_ptr d_offsets(offsets); common::device_ptr d_values_input(values_input); common::device_ptr d_aggregates_output(segments_count); rocprim::plus reduce_op; value_type init(0); size_t temporary_storage_bytes = 0; HIP_CHECK(rp::segmented_reduce(nullptr, temporary_storage_bytes, d_values_input.get(), d_aggregates_output.get(), segments_count, d_offsets.get(), d_offsets.get() + 1, reduce_op, init, stream)); common::device_ptr d_temporary_storage(temporary_storage_bytes); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { HIP_CHECK(rp::segmented_reduce(d_temporary_storage.get(), temporary_storage_bytes, d_values_input.get(), d_aggregates_output.get(), segments_count, d_offsets.get(), d_offsets.get() + 1, reduce_op, init, stream)); }); total_size += size; } void run(benchmark_utils::state&& state) override { total_size = 0; for(const auto desired_segment : desired_segments) { run_benchmark(std::forward(state), desired_segment); } 
state.set_throughput(total_size, sizeof(T)); } }; #endifrocPRIM-rocm-7.1.0/benchmark/benchmark_device_select.cpp000066400000000000000000000154461506507210100232050ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_select.parallel.hpp" #include "benchmark_utils.hpp" #define CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(T, F, p) \ executor.queue_instance( \ device_select_predicated_flag_benchmark()); #define CREATE_SELECT_FLAG_BENCHMARK(T, F, p) \ executor.queue_instance(device_select_flag_benchmark()); #define CREATE_SELECT_PREDICATE_BENCHMARK(T, p) \ executor.queue_instance(device_select_predicate_benchmark()); #define CREATE_UNIQUE_BENCHMARK(T, p) \ executor.queue_instance(device_select_unique_benchmark()); #define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ executor.queue_instance( \ device_select_unique_by_key_benchmark()); #define BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(type, value) \ CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(type, value, select_probability::p005) \ CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(type, value, select_probability::p025) \ CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(type, value, select_probability::p050) \ CREATE_SELECT_PREDICATED_FLAG_BENCHMARK(type, value, select_probability::p075) #define BENCHMARK_SELECT_FLAG_TYPE(type, value) \ CREATE_SELECT_FLAG_BENCHMARK(type, value, select_probability::p005) \ CREATE_SELECT_FLAG_BENCHMARK(type, value, select_probability::p025) \ CREATE_SELECT_FLAG_BENCHMARK(type, value, select_probability::p050) \ CREATE_SELECT_FLAG_BENCHMARK(type, value, select_probability::p075) #define BENCHMARK_SELECT_PREDICATE_TYPE(type) \ CREATE_SELECT_PREDICATE_BENCHMARK(type, select_probability::p005) \ CREATE_SELECT_PREDICATE_BENCHMARK(type, select_probability::p025) \ CREATE_SELECT_PREDICATE_BENCHMARK(type, select_probability::p050) \ CREATE_SELECT_PREDICATE_BENCHMARK(type, select_probability::p075) #define BENCHMARK_UNIQUE_TYPE(type) \ CREATE_UNIQUE_BENCHMARK(type, select_probability::p005) \ CREATE_UNIQUE_BENCHMARK(type, select_probability::p025) \ CREATE_UNIQUE_BENCHMARK(type, select_probability::p050) \ CREATE_UNIQUE_BENCHMARK(type, select_probability::p075) #define BENCHMARK_UNIQUE_BY_KEY_TYPE(K, V) \ 
CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, select_probability::p005) \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, select_probability::p025) \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, select_probability::p050) \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, select_probability::p075) int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_double2 = common::custom_type; using custom_int_double = common::custom_type; using huge_float2 = common::custom_huge_type<1024, float, float>; BENCHMARK_SELECT_FLAG_TYPE(int, unsigned char) BENCHMARK_SELECT_FLAG_TYPE(float, unsigned char) BENCHMARK_SELECT_FLAG_TYPE(double, unsigned char) BENCHMARK_SELECT_FLAG_TYPE(uint8_t, uint8_t) BENCHMARK_SELECT_FLAG_TYPE(int8_t, int8_t) BENCHMARK_SELECT_FLAG_TYPE(rocprim::half, int8_t) BENCHMARK_SELECT_FLAG_TYPE(custom_double2, unsigned char) BENCHMARK_SELECT_FLAG_TYPE(rocprim::int128_t, unsigned char) BENCHMARK_SELECT_FLAG_TYPE(rocprim::uint128_t, unsigned char) BENCHMARK_SELECT_FLAG_TYPE(huge_float2, unsigned char) BENCHMARK_SELECT_PREDICATE_TYPE(int) BENCHMARK_SELECT_PREDICATE_TYPE(float) BENCHMARK_SELECT_PREDICATE_TYPE(double) BENCHMARK_SELECT_PREDICATE_TYPE(uint8_t) BENCHMARK_SELECT_PREDICATE_TYPE(int8_t) BENCHMARK_SELECT_PREDICATE_TYPE(rocprim::half) BENCHMARK_SELECT_PREDICATE_TYPE(custom_int_double) BENCHMARK_SELECT_PREDICATE_TYPE(rocprim::int128_t) BENCHMARK_SELECT_PREDICATE_TYPE(rocprim::uint128_t) BENCHMARK_SELECT_PREDICATE_TYPE(huge_float2) BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(int, unsigned char) BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(float, unsigned char) BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(double, unsigned char) BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(uint8_t, uint8_t) BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(int8_t, int8_t) BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(rocprim::half, int8_t) BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(custom_double2, unsigned char) 
BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(rocprim::int128_t, unsigned char) BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(rocprim::uint128_t, unsigned char) BENCHMARK_SELECT_PREDICATED_FLAG_TYPE(huge_float2, unsigned char) BENCHMARK_UNIQUE_TYPE(int) BENCHMARK_UNIQUE_TYPE(float) BENCHMARK_UNIQUE_TYPE(double) BENCHMARK_UNIQUE_TYPE(uint8_t) BENCHMARK_UNIQUE_TYPE(int8_t) BENCHMARK_UNIQUE_TYPE(rocprim::half) BENCHMARK_UNIQUE_TYPE(custom_int_double) BENCHMARK_UNIQUE_TYPE(rocprim::int128_t) BENCHMARK_UNIQUE_TYPE(rocprim::uint128_t) BENCHMARK_UNIQUE_TYPE(huge_float2) BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int) BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double) BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2) BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t) BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double) BENCHMARK_UNIQUE_BY_KEY_TYPE(rocprim::half, rocprim::half) BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double) BENCHMARK_UNIQUE_BY_KEY_TYPE(rocprim::int128_t, rocprim::int128_t) BENCHMARK_UNIQUE_BY_KEY_TYPE(rocprim::uint128_t, rocprim::int128_t) BENCHMARK_UNIQUE_BY_KEY_TYPE(huge_float2, huge_float2) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_select.parallel.cpp.in000066400000000000000000000026771506507210100254070ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_device_select.parallel.hpp" #include "benchmark_utils.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_select_benchmark_generator<@KeyType@, @ValueType@, @BlockSize@>::create); } // namespace rocPRIM-rocm-7.1.0/benchmark/benchmark_device_select.parallel.hpp000066400000000000000000000576101506507210100250040ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_SELECT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_SELECT_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_device_ptr.hpp" #include "cmdparser.hpp" #include #include #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #include #ifdef BENCHMARK_CONFIG_TUNING #include #include #endif enum class select_probability { p005, p025, p050, p075, tuning }; inline float get_probability(select_probability probability) { switch(probability) { case select_probability::p005: return 0.05f; case select_probability::p025: return 0.25f; case select_probability::p050: return 0.50f; case select_probability::p075: return 0.75f; case select_probability::tuning: return 0.0f; // not used } return 0.0f; } inline const char* get_probability_name(select_probability probability) { switch(probability) { case select_probability::p005: return "0.05"; case select_probability::p025: return "0.25"; case select_probability::p050: return "0.50"; case select_probability::p075: return "0.75"; case select_probability::tuning: return "tuning"; } return "invalid"; } template struct device_select_flag_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:select,subalgo:flag,data_type:" + std::string(Traits::name()) + ",flag_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& 
bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector flags_0; std::vector flags_1; std::vector flags_2; if(is_tuning) { flags_0 = get_random_data01(size, 0.0f, seed.get_1()); flags_1 = get_random_data01(size, 0.5f, seed.get_1()); flags_2 = get_random_data01(size, 1.0f, seed.get_1()); } else { flags_0 = get_random_data01(size, get_probability(Probability), seed.get_1()); } common::device_ptr d_input(input); common::device_ptr d_flags_0(flags_0); common::device_ptr d_flags_1; common::device_ptr d_flags_2; if(is_tuning) { d_flags_1.store(flags_1); d_flags_2.store(flags_2); } common::device_ptr d_output(size); common::device_ptr d_selected_count_output(1); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](FlagType* d_flags) { HIP_CHECK(rocprim::select(d_temp_storage, temp_storage_size_bytes, d_input.get(), d_flags, d_output.get(), d_selected_count_output.get(), size, stream)); }; dispatch_flags(d_flags_0.get()); if(is_tuning) { dispatch_flags(d_flags_1.get()); dispatch_flags(d_flags_2.get()); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(DataType)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; template struct device_select_predicate_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:select,subalgo:predicate,data_type:" + std::string(Traits::name()) + ",probability:" + 
get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(DataType); // all data types can represent [0, 127], -1 so a predicate can select all std::vector input = get_random_data(size, static_cast(0), static_cast(126), seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(size); common::device_ptr d_selected_count_output(1); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_predicate = [&](float probability) { auto predicate = [probability](const DataType& value) -> bool { return value < static_cast(127 * probability); }; HIP_CHECK(rocprim::select(d_temp_storage, temp_storage_size_bytes, d_input.get(), d_output.get(), d_selected_count_output.get(), size, predicate, stream)); }; if(is_tuning) { dispatch_predicate(0.0f); dispatch_predicate(0.5f); dispatch_predicate(1.0f); } else { dispatch_predicate(get_probability(Probability)); } }; size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(DataType)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; template struct device_select_predicated_flag_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:select,subalgo:predicated_flag,data_type:" + std::string(Traits::name()) + ",flag_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) 
override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input = get_random_data(size, common::generate_limits::min(), common::generate_limits::max(), seed.get_0()); std::vector flags_0; std::vector flags_1; std::vector flags_2; if(is_tuning) { flags_0 = get_random_data01(size, 0.0f, seed.get_1()); flags_1 = get_random_data01(size, 0.5f, seed.get_1()); flags_2 = get_random_data01(size, 1.0f, seed.get_1()); } else { flags_0 = get_random_data01(size, get_probability(Probability), seed.get_1()); } common::device_ptr d_input(input); common::device_ptr d_flags_0(flags_0); common::device_ptr d_flags_1; common::device_ptr d_flags_2; if(is_tuning) { d_flags_1.store(flags_1); d_flags_2.store(flags_2); } common::device_ptr d_output(size); common::device_ptr d_selected_count_output(1); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_predicated_flags = [&](FlagType* d_flags) { auto predicate = [](const FlagType& value) -> bool { return value; }; HIP_CHECK(rocprim::select(d_temp_storage, temp_storage_size_bytes, d_input.get(), d_flags, d_output.get(), d_selected_count_output.get(), size, predicate, stream)); }; dispatch_predicated_flags(d_flags_0.get()); if(is_tuning) { dispatch_predicated_flags(d_flags_1.get()); dispatch_predicated_flags(d_flags_2.get()); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(DataType)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; template inline std::vector get_unique_input(size_t size, float probability, unsigned int seed) { using op_type = typename std::conditional::value, 
half_plus, rocprim::plus>::type; op_type op; std::vector input(size); auto input01 = get_random_data01(size, probability, seed); auto acc = input01[0]; input[0] = acc; for(size_t i = 1; i < input01.size(); ++i) { input[i] = op(acc, input01[i]); } return input; } template struct device_select_unique_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:select,subalgo:unique,data_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(DataType); std::vector input_0; std::vector input_1; std::vector input_2; if(is_tuning) { input_0 = get_unique_input(size, 0.0f, seed.get_0()); input_1 = get_unique_input(size, 0.5f, seed.get_0()); input_2 = get_unique_input(size, 1.0f, seed.get_0()); } else { input_0 = get_unique_input(size, get_probability(Probability), seed.get_0()); } common::device_ptr d_input_0(input_0); common::device_ptr d_input_1; common::device_ptr d_input_2; if(is_tuning) { d_input_1.store(input_1); d_input_2.store(input_2); } common::device_ptr d_output(size); common::device_ptr d_selected_count_output(1); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](DataType* d_input) { HIP_CHECK(rocprim::unique(d_temp_storage, temp_storage_size_bytes, d_input, d_output.get(), d_selected_count_output.get(), size, rocprim::equal_to(), stream)); }; dispatch_flags(d_input_0.get()); if(is_tuning) { dispatch_flags(d_input_1.get()); dispatch_flags(d_input_2.get()); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); 
common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(DataType)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; template struct device_select_unique_by_key_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:select,subalgo:unique_by_key,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",probability:" + get_probability_name(Probability) + ",cfg:" + partition_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(KeyType); std::vector input_keys_0; std::vector input_keys_1; std::vector input_keys_2; if(is_tuning) { input_keys_0 = get_unique_input(size, 0.0f, seed.get_0()); input_keys_1 = get_unique_input(size, 0.5f, seed.get_0()); input_keys_2 = get_unique_input(size, 1.0f, seed.get_0()); } else { input_keys_0 = get_unique_input(size, get_probability(Probability), seed.get_0()); } const auto random_range = limit_random_range(-1000, 1000); const auto input_values = get_random_data(size, random_range.first, random_range.second, seed.get_1()); common::device_ptr d_keys_input_0(input_keys_0); common::device_ptr d_keys_input_1; common::device_ptr d_keys_input_2; if(is_tuning) { d_keys_input_1.store(input_keys_1); d_keys_input_2.store(input_keys_2); } common::device_ptr d_values_input(input_values); common::device_ptr d_keys_output(size); common::device_ptr d_values_output(size); common::device_ptr d_selected_count_output(1); const auto dispatch = [&](void* d_temp_storage, size_t& temp_storage_size_bytes) { const auto dispatch_flags = [&](KeyType* d_keys_input) { 
HIP_CHECK(rocprim::unique_by_key(d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input.get(), d_keys_output.get(), d_values_output.get(), d_selected_count_output.get(), size, rocprim::equal_to(), stream)); }; dispatch_flags(d_keys_input_0.get()); if(is_tuning) { dispatch_flags(d_keys_input_1.get()); dispatch_flags(d_keys_input_2.get()); } }; // Allocate temporary storage memory size_t temp_storage_size_bytes{}; dispatch(nullptr, temp_storage_size_bytes); common::device_ptr d_temp_storage(temp_storage_size_bytes); state.run([&] { dispatch(d_temp_storage.get(), temp_storage_size_bytes); }); state.set_throughput(size, sizeof(KeyType) + sizeof(ValueType)); } static constexpr bool is_tuning = Probability == select_probability::tuning; }; #ifdef BENCHMARK_CONFIG_TUNING template struct create_benchmark { static constexpr unsigned int block_size = Config().kernel_config.block_size; static constexpr unsigned int items_per_thread = Config().kernel_config.items_per_thread; static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = sizeof(KeyType) + sizeof(ValueType); static constexpr unsigned int max_items_per_thread = max_shared_memory / (block_size * max_size_per_element); void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique>()); if(items_per_thread <= max_items_per_thread) { storage.emplace_back( std::make_unique< device_select_predicated_flag_benchmark>()); } } }; template struct create_benchmark { void operator()(std::vector>& storage) { storage.emplace_back(std::make_unique>()); storage.emplace_back( std::make_unique>()); storage.emplace_back(std::make_unique>()); } }; template struct device_select_benchmark_generator { template struct create_ipt { void operator()(std::vector>& storage) { using config = rocprim::select_config; create_benchmark{}(storage); } }; static void create(std::vector>& storage) { static constexpr int max_items_per_thread = std::min(64 
/ std::max(sizeof(KeyType), sizeof(ValueType)), size_t{32}); static_for_each, create_ipt>(storage); } }; #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_SELECT_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_transform.cpp000066400000000000000000000051531506507210100237330ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_transform.parallel.hpp" #include "benchmark_utils.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // HIP API #include // rocPRIM #ifndef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(T) executor.queue_instance(device_transform_benchmark()); #define CREATE_BENCHMARK_BINARY(T) \ executor.queue_instance(device_transform_benchmark()); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(float) CREATE_BENCHMARK(double) CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) CREATE_BENCHMARK_BINARY(int) CREATE_BENCHMARK_BINARY(float) CREATE_BENCHMARK_BINARY(int8_t) CREATE_BENCHMARK_BINARY(rocprim::int128_t) CREATE_BENCHMARK_BINARY(custom_double2) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_transform.parallel.cpp.in000066400000000000000000000027651506507210100261410ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "benchmark_device_transform.parallel.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_transform_benchmark_generator< @DataType@, false, @BlockSize@, rocprim::load_default>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_transform.parallel.hpp000066400000000000000000000143461506507210100255370ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_TRANSFORM_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_TRANSFORM_PARALLEL_HPP_ #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include template std::string transform_config_name() { auto config = Config(); return "{bs:" + std::to_string(config.block_size) + ",ipt:" + std::to_string(config.items_per_thread) + ",lt:" + get_thread_load_method_name(config.load_type) + "}"; } template<> inline std::string transform_config_name() { return "default_config"; } template struct device_transform_benchmark : public benchmark_utils::autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:transform" + std::string(IsPointer ? 
"_pointer" : "") + ",op:" + std::string(IsBinary ? "binary" : "unary") + ",value_type:" + std::string(Traits::name()) + ",cfg:" + transform_config_name() + "}"); } void run(benchmark_utils::state&& state) override { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using output_type = T; // Calculate the number of elements size_t size = bytes / sizeof(T); static constexpr bool debug_synchronous = false; // Generate data const auto random_range = limit_random_range(1, 100); const std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(size); if constexpr(IsBinary) { const std::vector input2 = get_random_data(size, random_range.first, random_range.second, seed.get_0()); common::device_ptr d_input2(input2); // If it is not a unary operator, it can not make use of the pointer optimization. const auto launch = [&] { auto transform_op = [](T v1, T v2) { return v1 + v2; }; return rocprim::transform(rocprim::tuple(d_input.get(), d_input2.get()), d_output.get(), size, transform_op, stream, debug_synchronous); }; state.run([&] { HIP_CHECK(launch()); }); state.set_throughput(size, sizeof(T) + sizeof(T)); } else { const auto launch = [&] { auto transform_op = [](T v) { return v + T(5); }; return rocprim::detail::transform_impl(d_input.get(), d_output.get(), size, transform_op, stream, debug_synchronous); }; state.run([&] { HIP_CHECK(launch()); }); state.set_throughput(size, sizeof(T)); } } }; template struct device_transform_benchmark_generator { template struct create_ipt { using generated_config = rocprim:: transform_config; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique< device_transform_benchmark>()); } }; static void create(std::vector>& storage) { static constexpr unsigned int min_items_per_thread = 0; static constexpr unsigned int max_items_per_thread = rocprim::Log2<16>::VALUE; 
static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_TRANSFORM_PARALLEL_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_device_transform_pointer.cpp000066400000000000000000000044601506507210100254730ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_device_transform.parallel.hpp" #include "benchmark_utils.hpp" #ifndef BENCHMARK_CONFIG_TUNING #include "../common/utils_custom_type.hpp" #endif // HIP API #include // rocPRIM #ifndef BENCHMARK_CONFIG_TUNING #include #endif #include #include #include #ifndef BENCHMARK_CONFIG_TUNING #include #endif #define CREATE_BENCHMARK(T) executor.queue_instance(device_transform_benchmark()); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 10, 5); #ifndef BENCHMARK_CONFIG_TUNING using custom_float2 = common::custom_type; using custom_double2 = common::custom_type; CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(float) CREATE_BENCHMARK(double) CREATE_BENCHMARK(custom_float2) CREATE_BENCHMARK(custom_double2) CREATE_BENCHMARK(rocprim::int128_t) CREATE_BENCHMARK(rocprim::uint128_t) #endif executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_device_transform_pointer.parallel.cpp.in000066400000000000000000000027441506507210100276760ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "benchmark_device_transform.parallel.hpp" #include #include namespace { auto unused = benchmark_utils::executor::queue_autotune( device_transform_benchmark_generator< @DataType@, true, @BlockSize@, @LoadType@>::create); } rocPRIM-rocm-7.1.0/benchmark/benchmark_predicate_iterator.cpp000066400000000000000000000135661506507210100242610ustar00rootroot00000000000000// MIT License // // Copyright (c) 2024-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "../common/predicate_iterator.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_device_ptr.hpp" #include #include // rocPRIM #include #include #include #include #include #include #include #include #include template struct less_than { __device__ bool operator()(T value) const { return value < T{C}; } }; template struct transform_op { __device__ auto operator()(T v) const { return Predicate{}(v) ? Transform{}(v) : v; } }; template struct transform_it { using value_type = T; void operator()(T* d_input, T* d_output, const size_t size, const hipStream_t stream) { auto t_it = rocprim::make_transform_iterator(d_input, transform_op{}); HIP_CHECK(rocprim::transform(t_it, d_output, size, rocprim::identity{}, stream)); } }; template struct read_predicate_it { using value_type = T; void operator()(T* d_input, T* d_output, const size_t size, const hipStream_t stream) { auto t_it = rocprim::make_transform_iterator(d_input, Transform{}); auto r_it = rocprim::make_predicate_iterator(t_it, d_input, Predicate{}); HIP_CHECK(rocprim::transform(r_it, d_output, size, rocprim::identity{}, stream)); } }; template struct write_predicate_it { using value_type = T; void operator()(T* d_input, T* d_output, const size_t size, const hipStream_t stream) { auto t_it = rocprim::make_transform_iterator(d_input, Transform{}); auto w_it = rocprim::make_predicate_iterator(d_output, d_input, Predicate{}); HIP_CHECK(rocprim::transform(t_it, w_it, size, rocprim::identity{}, stream)); } }; template void run_benchmark(benchmark_utils::state&& state) { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using T = typename 
IteratorBenchmark::value_type; // Calculate the number of elements size_t size = bytes / sizeof(T); const auto random_range = limit_random_range(0, 99); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); common::device_ptr d_input(input); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); state.run([&] { IteratorBenchmark{}(d_input.get(), d_output.get(), size, stream); }); state.set_throughput(size, sizeof(T)); } #define CREATE_BENCHMARK(B, T, C) \ executor.queue_fn(bench_naming::format_name("{lvl:device,algo:" #B ",p:p" #C ",key_type:" #T \ ",cfg:default_config}") \ .c_str(), \ run_benchmark, common::increment_by<5>>>); // clang-format off #define CREATE_TYPED_BENCHMARK(T) \ CREATE_BENCHMARK(transform_it, T, 0) \ CREATE_BENCHMARK(read_predicate_it, T, 0) \ CREATE_BENCHMARK(write_predicate_it, T, 0) \ CREATE_BENCHMARK(transform_it, T, 25) \ CREATE_BENCHMARK(read_predicate_it, T, 25) \ CREATE_BENCHMARK(write_predicate_it, T, 25) \ CREATE_BENCHMARK(transform_it, T, 50) \ CREATE_BENCHMARK(read_predicate_it, T, 50) \ CREATE_BENCHMARK(write_predicate_it, T, 50) \ CREATE_BENCHMARK(transform_it, T, 75) \ CREATE_BENCHMARK(read_predicate_it, T, 75) \ CREATE_BENCHMARK(write_predicate_it, T, 75) \ CREATE_BENCHMARK(transform_it, T, 100) \ CREATE_BENCHMARK(read_predicate_it, T, 100) \ CREATE_BENCHMARK(write_predicate_it, T, 100) // clang-format on int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 512 * benchmark_utils::MiB, 10, 5); using custom_128 = common::custom_type; CREATE_TYPED_BENCHMARK(int8_t) CREATE_TYPED_BENCHMARK(int16_t) CREATE_TYPED_BENCHMARK(int32_t) CREATE_TYPED_BENCHMARK(int64_t) CREATE_TYPED_BENCHMARK(custom_128) CREATE_TYPED_BENCHMARK(rocprim::int128_t) CREATE_TYPED_BENCHMARK(rocprim::uint128_t) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_utils.hpp000066400000000000000000001517531506507210100215560ustar00rootroot00000000000000// Copyright (c) 
2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_UTILS_HPP_ #define ROCPRIM_BENCHMARK_UTILS_HPP_ #include "../common/utils.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_data_generation.hpp" #include "../common/utils_half.hpp" #include // rocPRIM #include #include #include #include #include // partition_config_params #include #include #include #include #include // CmdParser #include "cmdparser.hpp" #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #include #define TUNING_SHARED_MEMORY_MAX 65536u // Support half operators on host side inline const char* get_seed_message() { return "seed for input generation, either an unsigned integer value for deterministic results " "or 'random' for different inputs for each repetition"; } /// \brief Provides a sequence of seeds. class managed_seed { public: /// \param[in] seed_string Either "random" to get random seeds, /// or an unsigned integer to get a sequence of deterministic seeds. managed_seed(const std::string& seed_string) { is_random = seed_string == "random"; if(!is_random) { const unsigned int seed = std::stoul(seed_string); std::seed_seq seq{seed}; seq.generate(seeds.begin(), seeds.end()); } } /* Default construction yields a well-defined deterministic state (value-initialized seeds, is_random == false) instead of leaving both members indeterminate, so get_0()/get_1()/get_2() are safe to call on a default-constructed object. */ managed_seed() : seeds{}, is_random(false) {} /* Each getter returns a fresh std::random_device value when is_random is set, otherwise the stored seed at the matching index. */ unsigned int get_0() const { return is_random ? std::random_device{}() : seeds[0]; } unsigned int get_1() const { return is_random ? std::random_device{}() : seeds[1]; } unsigned int get_2() const { return is_random ? 
std::random_device{}() : seeds[2]; } private: std::array seeds; bool is_random; }; struct half_less { ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, const rocprim::half& b) const { #if __HIP_DEVICE_COMPILE__ return a < b; #else return common::half_to_native(a) < common::half_to_native(b); #endif } }; struct half_plus { ROCPRIM_HOST_DEVICE inline rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const { #if __HIP_DEVICE_COMPILE__ return a + b; #else return common::native_to_half(common::half_to_native(a) + common::half_to_native(b)); #endif } }; struct half_equal_to { ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, const rocprim::half& b) const { #if __HIP_DEVICE_COMPILE__ return a == b; #else return common::half_to_native(a) == common::half_to_native(b); #endif } }; using engine_type = std::minstd_rand; // generate_random_data_n() generates only part of sequence and replicates it, // because benchmarks usually do not need "true" random sequence. 
template inline auto generate_random_data_n( OutputIter it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) -> typename std::enable_if_t>::value, OutputIter> { using T = common::it_value_t; using dis_type = typename std::conditional< common::is_valid_for_int_distribution::value, T, typename std::conditional::value, int, unsigned int>::type>::type; common::uniform_int_distribution distribution((T)min, (T)max); std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(it, std::min(size - i, max_random_size), it + i); } return it + size; } template inline auto generate_random_data_n(OutputIterator it, size_t size, U min, V max, Generator& gen, size_t max_random_size = 1024 * 1024) -> std::enable_if_t>::value, OutputIterator> { using T = typename std::iterator_traits::value_type; // Generate floats when T is half using dis_type = std::conditional_t::value || std::is_same::value, float, T>; std::uniform_real_distribution distribution((dis_type)min, (dis_type)max); std::generate_n(it, std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(it, std::min(size - i, max_random_size), it + i); } return it + size; } template inline std::vector get_random_data01(size_t size, float p, unsigned int seed, size_t max_random_size = 1024 * 1024) { engine_type gen(seed); std::bernoulli_distribution distribution(p); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template struct is_comparable { private: // A dummy template function that attempts to compare two objects of types T and U template static 
auto test(V&& v, W&& w) -> decltype(std::declval() < std::declval(), std::true_type{}); // Fallback if the above template function is not valid template static std::false_type test(...); public: // Final result static constexpr bool value = decltype(test(std::declval(), std::declval()))::value; }; template struct is_comparable, T> : std::conditional_t::value || !std::is_same>::value, std::false_type, std::true_type> {}; template struct custom_type_decomposer { static_assert( common::is_custom_type::value, "custom_type_decomposer can only be used with instantiations of common::custom_type"); using T = typename CustomType::first_type; using U = typename CustomType::second_type; __host__ __device__ ::rocprim::tuple operator()(CustomType& key) const { return ::rocprim::tuple{key.x, key.y}; } }; namespace common { template struct generate_limits::value>> { using F = typename T::first_type; using S = typename T::second_type; static inline T min() { return T(generate_limits::min(), generate_limits::min()); } static inline T max() { return T(generate_limits::max(), generate_limits::max()); } }; } // namespace common template inline auto generate_random_data_n(OutputIterator it, size_t size, common::it_value_t min, common::it_value_t max, Generator& gen, size_t max_random_size = 1024 * 1024) -> std::enable_if_t>::value, OutputIterator> { using T = common::it_value_t; using first_type = typename T::first_type; using second_type = typename T::second_type; std::vector fdata(size); std::vector sdata(size); generate_random_data_n(fdata.begin(), size, min.x, max.x, gen, max_random_size); generate_random_data_n(sdata.begin(), size, min.y, max.y, gen, max_random_size); for(size_t i = 0; i < size; ++i) { it[i] = T(fdata[i], sdata[i]); } return it + size; } template inline auto generate_random_data_n(OutputIterator it, size_t size, common::it_value_t min, common::it_value_t max, Generator& gen, size_t max_random_size = 1024 * 1024) -> std::enable_if_t>::value && !std::is_same::value, 
OutputIterator> { using T = common::it_value_t; using field_type = decltype(max.x); std::vector field_data(size); generate_random_data_n(field_data.begin(), size, min.x, max.x, gen, max_random_size); for(size_t i = 0; i < size; ++i) { it[i] = T(field_data[i]); } return it + size; } template inline std::vector get_random_data( size_t size, U min, V max, unsigned int seed, size_t max_random_size = 1024 * 1024) { std::vector data(size); engine_type gen(seed); generate_random_data_n(data.begin(), size, min, max, gen, max_random_size); return data; } template auto limit_cast(U value) -> T { static_assert(rocprim::is_arithmetic::value && rocprim::is_arithmetic::value && is_comparable::value, "Cannot use limit_cast with chosen types of T and U"); using common_type = typename std::common_type::type; if(rocprim::is_unsigned::value) { if(value < 0) { return rocprim::numeric_limits::min(); } if(static_cast(value) > static_cast(rocprim::numeric_limits::max())) { return rocprim::numeric_limits::max(); } } else if(rocprim::is_signed::value && rocprim::is_unsigned::value) { if(value > rocprim::numeric_limits::max()) { return rocprim::numeric_limits::max(); } } else if(rocprim::is_floating_point::value) { return static_cast(value); } else // Both T and U are signed { if(value < static_cast(rocprim::numeric_limits::min())) { return rocprim::numeric_limits::min(); } else if(value > static_cast(rocprim::numeric_limits::max())) { return rocprim::numeric_limits::max(); } } return static_cast(value); } // This overload below is selected for non-standard float types, e.g. half, which cannot be compared with the limit types. 
/* Pass-through overload: converts both bounds with static_cast and returns them as a pair; no clamping or ordering check is performed here. */ template inline auto limit_random_range(U range_start, V range_end) -> std::enable_if_t::value && (!is_comparable::value || !is_comparable::value), std::pair> { return {static_cast(range_start), static_cast(range_end)}; } /* Custom-type overload: each returned bound is a T built from two limit_cast conversions of the same scalar bound (range_start for the first element of the pair, range_end for the second). */ template auto limit_random_range(U range_start, V range_end) -> std::enable_if_t<(common::is_custom_type::value && is_comparable::value && is_comparable::value && is_comparable::value && is_comparable::value && rocprim::is_arithmetic::value && rocprim::is_arithmetic::value && rocprim::is_arithmetic::value && rocprim::is_arithmetic::value), std::pair> { return { T{limit_cast(range_start), limit_cast(range_start)}, T{ limit_cast(range_end), limit_cast(range_end) } }; } /* Clamping overload: when the two bounds are comparable, throws std::range_error if range_start is greater than range_end (compared in their common type); afterwards each bound is converted with limit_cast and the pair (start, end) is returned. */ template inline auto limit_random_range(U range_start, V range_end) -> std::enable_if_t::value && is_comparable::value && is_comparable::value, std::pair> { if(is_comparable::value) { using common_type = typename std::common_type::type; if(static_cast(range_start) > static_cast(range_end)) { throw std::range_error("limit_random_range: Incorrect range used!"); } } T start = limit_cast(range_start); T end = limit_cast(range_end); return std::make_pair(start, end); } /* Returns true when the warp size that rocprim::host_warp_size reports for device_id is at least required_warp_size; the query result is checked with HIP_CHECK. */ inline bool is_warp_size_supported(const unsigned int required_warp_size, const int device_id) { unsigned int warp_size; HIP_CHECK(::rocprim::host_warp_size(device_id, warp_size)); return warp_size >= required_warp_size; } /// \brief Get segments of uniform random size in [1, max_segment_length] with random key. 
/* Returns a vector of `size` keys made of consecutive runs: every loop iteration draws one run length and one key from the engine seeded with `seed`, then writes that key over the run, truncated at `size`. NOTE(review): the length distribution's lower bound is numeric_limits::min() rather than the literal 1 used by get_random_segments_iota; a drawn length of 0 writes nothing and the loop simply draws again - confirm this is intended. */ template std::vector get_random_segments(const size_t size, const size_t max_segment_length, unsigned int seed) { static_assert(rocprim::is_arithmetic::value, "Key type must be arithmetic"); engine_type prng(seed); common::uniform_int_distribution segment_length_distribution( std::numeric_limits::min(), max_segment_length); // std::uniform_real_distribution cannot handle rocprim::half, use float instead using dis_type = typename std::conditional::value, float, T>::type; using key_distribution_type = std::conditional_t::value, common::uniform_int_distribution, std::uniform_real_distribution>; /* NOTE(review): the key distribution is constructed from a single argument (the numeric_limits max). For std::uniform_real_distribution a lone argument is the lower bound a, with the upper bound b defaulting to 1.0 - confirm this produces the intended key range. */ key_distribution_type key_distribution(rocprim::numeric_limits::max()); std::vector keys(size); size_t keys_start_index = 0; while(keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); const T key = key_distribution(prng); std::fill(keys.begin() + keys_start_index, keys.begin() + new_segment_end, key); keys_start_index += new_segment_length; } return keys; } /// \brief Get segments of uniform random size in [1, max_segment_length] with unique incrementing key. 
template std::vector get_random_segments_iota(const size_t size, const size_t max_segment_length, unsigned int seed) { engine_type prng(seed); common::uniform_int_distribution segment_length_distribution(1, max_segment_length); std::vector keys(size); size_t segment_index = 0; size_t keys_start_index = 0; while(keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); const T key = segment_index++; std::fill(keys.begin() + keys_start_index, keys.begin() + new_segment_end, key); keys_start_index += new_segment_length; } return keys; } template inline auto get_random_value(U min, V max, size_t seed_value) -> std::enable_if_t::value, T> { T result; engine_type gen(seed_value); generate_random_data_n(&result, 1, min, max, gen); return result; } template inline auto get_random_value(T min, T max, size_t seed_value) -> std::enable_if_t::value, T> { typename T::first_type result_first; typename T::second_type result_second; engine_type gen(seed_value); generate_random_data_n(&result_first, 1, min.x, max.x, gen); generate_random_data_n(&result_second, 1, min.y, max.y, gen); return T{result_first, result_second}; } template struct make_index_range_impl; template struct make_index_range_impl> { using type = std::integer_sequence; }; // make a std::integer_sequence with values from Start to End inclusive template using make_index_range = typename make_index_range_impl>::type; template class Function, T... I, typename... Args> void static_for_each_impl(std::integer_sequence, Args&&... args) { int a[] = {(Function{}(std::forward(args)...), 0)...}; static_cast(a); } // call the supplied template with all values of the std::integer_sequence Indices template class Function, typename... Args> void static_for_each(Args&&... 
args) { static_for_each_impl(Indices{}, std::forward(args)...); } // Inserts spaces at beginning of string if string shorter than specified length. inline std::string pad_string(std::string str, const size_t len) { if(len > str.size()) { str.insert(str.begin(), len - str.size(), ' '); } return str; } struct bench_naming { public: enum format { json, human, txt }; static format& get_format() { static format storage = human; return storage; } static void set_format(const std::string& argument) { format result = human; if(argument == "json") { result = json; } else if(argument == "txt") { result = txt; } get_format() = result; } private: static std::string matches_as_json(std::sregex_iterator& matches) { std::stringstream result; int brackets_count = 1; result << "{"; bool insert_comma = false; for(std::sregex_iterator i = matches; i != std::sregex_iterator(); ++i) { std::smatch m = *i; if(insert_comma) { result << ","; } else { insert_comma = true; } result << "\"" << m[1].str() << "\":"; if(m[2].length() > 0) { if(m[2].str().find_first_not_of("0123456789") == std::string::npos) { result << m[2].str(); } else { result << "\"" << m[2].str() << "\""; } if(m[3].length() > 0 && brackets_count > 0) { int n = std::min(brackets_count, static_cast(m[3].length())); brackets_count -= n; for(int c = 0; c < n; ++c) { result << "}"; } } } else { ++brackets_count; result << "{"; insert_comma = false; } } while(brackets_count > 0) { --brackets_count; result << "}"; } return result.str(); } static std::string matches_as_human(std::sregex_iterator& matches) { std::stringstream result; int brackets_count = 0; bool insert_comma = false; for(std::sregex_iterator i = matches; i != std::sregex_iterator(); ++i) { std::smatch m = *i; if(insert_comma) { result << ","; } else { insert_comma = true; } if(m[2].length() > 0) { result << m[2].str(); if(m[3].length() > 0 && brackets_count > 0) { int n = std::min(brackets_count, static_cast(m[3].length())); brackets_count -= n; for(int c = 0; c < 
n; ++c) { result << ">"; } } } else { ++brackets_count; result << "<"; insert_comma = false; } } while(brackets_count > 0) { --brackets_count; result << ">"; } return result.str(); } public: static std::string format_name(std::string string) { format format = get_format(); std::regex r("([A-z0-9]*):\\s*((?:common::custom_type<[A-z0-9,]*>)|(?:common::custom_type_" "copyable<[A-z0-9,]*>)|[A-z:\\(\\)\\.<>\\s0-9]*)(\\}*)"); // First we perform some checks bool checks[4] = {false}; for(std::sregex_iterator i = std::sregex_iterator(string.begin(), string.end(), r); i != std::sregex_iterator(); ++i) { std::smatch m = *i; if(m[1].str() == "lvl") { checks[0] = true; } else if(m[1].str() == "algo") { checks[1] = true; } else if(m[1].str() == "cfg") { checks[2] = true; } } std::string string_substitute = std::regex_replace(string, r, ""); checks[3] = string_substitute.find_first_not_of(" ,{}") == std::string::npos; for(bool check_name_format : checks) { if(!check_name_format) { std::cout << "Benchmark name \"" << string << "\" not in the correct format (e.g. 
" "{lvl:block,algo:reduce,cfg:default_config} )" << std::endl; exit(1); } } // Now we generate the desired format std::sregex_iterator matches = std::sregex_iterator(string.begin(), string.end(), r); switch(format) { case format::json: return matches_as_json(matches); case format::human: return matches_as_human(matches); case format::txt: return string; } return string; } }; template struct Traits { //static inline method instead of static inline attribute because that's only supported from C++17 onwards static inline const char* name() { static_assert(sizeof(T) == 0, "Traits::name() unknown"); return "unknown"; } }; // Explicit definitions template<> inline const char* Traits::name() { return "char"; } template<> inline const char* Traits::name() { return "int"; } template<> inline const char* Traits::name() { return "short"; } template<> inline const char* Traits::name() { return "int8_t"; } template<> inline const char* Traits::name() { return "uint8_t"; } template<> inline const char* Traits::name() { return "uint16_t"; } template<> inline const char* Traits::name() { return "uint32_t"; } template<> inline const char* Traits::name() { return "rocprim::half"; } template<> inline const char* Traits::name() { return "rocprim::bfloat16"; } template<> inline const char* Traits::name() { return "int64_t"; } // On MSVC `int64_t` and `long long` are the same, leading to multiple definition errors #ifndef _WIN32 template<> inline const char* Traits::name() { return "int64_t"; } #endif // On MSVC `uint64_t` and `unsigned long long` are the same, leading to multiple definition errors #ifndef _WIN32 template<> inline const char* Traits::name() { return "uint64_t"; } #else template<> inline const char* Traits::name() { return "unsigned long long"; } #endif template<> inline const char* Traits::name() { return "float"; } template<> inline const char* Traits::name() { return "double"; } template<> inline const char* Traits>::name() { return "common::custom_type"; } template<> 
inline const char* Traits>::name() { return "common::custom_type"; } template<> inline const char* Traits>::name() { return "common::custom_type<1024,float,float>"; } template<> inline const char* Traits>::name() { return "common::custom_type<2048,float,float>"; } template<> inline const char* Traits>::name() { return "common::custom_type"; } template<> inline const char* Traits>::name() { return "common::custom_type"; } template<> inline const char* Traits>::name() { return "common::custom_type"; } template<> inline const char* Traits>::name() { return "common::custom_type"; } template<> inline const char* Traits>::name() { return "common::custom_type"; } template<> inline const char* Traits>::name() { return "common::custom_type"; } template<> inline const char* Traits>::name() { return "common::custom_type"; } template<> inline const char* Traits::name() { return "empty_type"; } template<> inline const char* Traits>::name() { return "float2"; } template<> inline const char* Traits>::name() { return "double2"; } template<> inline const char* Traits::name() { return "rocprim::int128_t"; } template<> inline const char* Traits::name() { return "rocprim::uint128_t"; } template<> inline const char* Traits>::name() { return "common::custom_type_copyable"; } template<> inline const char* Traits>::name() { return "common::custom_type_copyable"; } inline const char* get_block_scan_algorithm_name(rocprim::block_scan_algorithm alg) { switch(alg) { case rocprim::block_scan_algorithm::using_warp_scan: return "block_scan_algorithm::using_warp_scan"; case rocprim::block_scan_algorithm::reduce_then_scan: return "block_scan_algorithm::reduce_then_scan"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "default_algorithm"; } inline const char* get_block_load_method_name(rocprim::block_load_method method) { switch(method) { case rocprim::block_load_method::block_load_direct: return "block_load_method::block_load_direct"; case 
rocprim::block_load_method::block_load_striped: return "block_load_method::block_load_striped"; case rocprim::block_load_method::block_load_vectorize: return "block_load_method::block_load_vectorize"; case rocprim::block_load_method::block_load_transpose: return "block_load_method::block_load_transpose"; case rocprim::block_load_method::block_load_warp_transpose: return "block_load_method::block_load_warp_transpose"; } return "default_method"; } inline const char* get_thread_load_method_name(rocprim::cache_load_modifier method) { switch(method) { case rocprim::load_default: return "load_default"; case rocprim::load_ca: return "load_ca"; case rocprim::load_cg: return "load_cg"; case rocprim::load_nontemporal: return "load_nontemporal"; case rocprim::load_cv: return "load_cv"; case rocprim::load_ldg: return "load_ldg"; case rocprim::load_volatile: return "load_volatile"; case rocprim::load_count: return "load_count"; } return "load_default"; } template struct alignas(Alignment) custom_aligned_type { unsigned char data[Size]; }; template std::string partition_config_name() { const rocprim::detail::partition_config_params config = Config(); return "{bs:" + std::to_string(config.kernel_config.block_size) + ",ipt:" + std::to_string(config.kernel_config.items_per_thread) + "}"; } template<> inline std::string partition_config_name() { return "default_config"; } namespace benchmark_utils { constexpr size_t KiB = 1024; constexpr size_t MiB = 1024 * KiB; constexpr size_t GiB = 1024 * MiB; class state { public: state(hipStream_t stream, size_t size, const managed_seed& seed, size_t batch_iterations, benchmark::State& gbench_state, size_t warmup_iterations, bool cold, bool record_as_whole) : stream(stream) , size(size) , bytes(size) , seed(seed) , batch_iterations(batch_iterations) , gbench_state(gbench_state) , warmup_iterations(warmup_iterations) , cold(cold) , record_as_whole(record_as_whole) , events(record_as_whole ? 
2 : batch_iterations * 2) {} // Used to reset the input array of algorithms like device_merge_inplace. void run_before_every_iteration(std::function lambda) { run_before_every_iteration_lambda = lambda; } // Used to accumulate the results of state.run() calls. void accumulate_total_gbench_iterations_every_run() { reset_total_gbench_iterations_every_run = false; } void run(std::function kernel) { for(auto& event : events) { HIP_CHECK(hipEventCreate(&event)); } // Warm-up for(size_t i = 0; i < warmup_iterations; ++i) { // Benchmarks may expect their kernel input to be prepared by this lambda, // so to prevent any potential crashes, we call the lambda during warm-up. if(run_before_every_iteration_lambda) { run_before_every_iteration_lambda(); } kernel(); } HIP_CHECK(hipDeviceSynchronize()); if(run_before_every_iteration_lambda && batch_iterations > 1 && record_as_whole) { std::cerr << "Error: This benchmark calls run_before_every_iteration() and has a " "batch_iterations count that is higher than 1, which means it does not " "support using --record_as_whole.\n"; exit(EXIT_FAILURE); } // Run for(auto _ : gbench_state) { if(record_as_whole) { if(run_before_every_iteration_lambda) { run_before_every_iteration_lambda(); } HIP_CHECK(hipEventRecord(events[0], stream)); for(size_t i = 0; i < batch_iterations; ++i) { kernel(); } HIP_CHECK(hipEventRecord(events[1], stream)); HIP_CHECK(hipEventSynchronize(events[1])); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, events[0], events[1])); times.emplace_back(elapsed_mseconds); gbench_state.SetIterationTime(elapsed_mseconds / 1000); } else { for(size_t i = 0; i < batch_iterations; ++i) { if(run_before_every_iteration_lambda) { run_before_every_iteration_lambda(); } if(cold) { clear_gpu_cache(stream); } // Even events record the start time. HIP_CHECK(hipEventRecord(events[i * 2], stream)); kernel(); // Odd events record the stop time. 
HIP_CHECK(hipEventRecord(events[i * 2 + 1], stream)); } // Wait until the last record event has completed. HIP_CHECK(hipEventSynchronize(events[batch_iterations * 2 - 1])); // Accumulate the total elapsed time. double elapsed_mseconds = 0.0; for(size_t i = 0; i < batch_iterations; i++) { float iteration_mseconds; HIP_CHECK( hipEventElapsedTime(&iteration_mseconds, events[i * 2], events[i * 2 + 1])); times.emplace_back(iteration_mseconds); elapsed_mseconds += iteration_mseconds; } gbench_state.SetIterationTime(elapsed_mseconds / 1000); } } if(reset_total_gbench_iterations_every_run) { total_gbench_iterations = 0; } total_gbench_iterations += gbench_state.iterations(); for(const auto& event : events) { HIP_CHECK(hipEventDestroy(event)); } } void set_throughput(size_t actual_size, size_t type_size) { if(has_set_throughput) { std::cerr << "Error: Benchmarks should only ever call set_throughput() once, at the " "very end.\n"; exit(EXIT_FAILURE); } has_set_throughput = true; gbench_state.SetBytesProcessed(total_gbench_iterations * batch_iterations * actual_size * type_size); gbench_state.SetItemsProcessed(total_gbench_iterations * batch_iterations * actual_size); output_statistics(); } hipStream_t stream; size_t size; size_t bytes; managed_seed seed; size_t batch_iterations; benchmark::State& gbench_state; private: // Zeros a 256 MiB buffer, used to clear the cache before each kernel call. // 256 MiB is the size of the largest cache on any AMD GPU. // It is currently not possible to fetch the L3 cache size from the runtime. 
inline void clear_gpu_cache(hipStream_t stream) { constexpr size_t buf_size = 256 * MiB; static void* buf = nullptr; if(!buf) { HIP_CHECK(hipMalloc(&buf, buf_size)); } HIP_CHECK(hipMemsetAsync(buf, 0, buf_size, stream)); } void output_statistics() { double mean = get_mean(); double median = get_median(); double stddev = get_stddev(mean); double cv = get_cv(stddev, mean); gbench_state.counters["mean"] = mean; gbench_state.counters["median"] = median; gbench_state.counters["stddev"] = stddev; gbench_state.counters["cv"] = cv; } double get_mean() { return std::reduce(times.begin(), times.end()) / times.size(); } // Technically when times.size() is even, the median is the arithmetic mean // of the elements k=N/2 and k=N/2+1. This would be overkill here, // as times.size() is large enough, and recorded times are similar enough. double get_median() { size_t center_index = times.size() / 2; std::nth_element(times.begin(), times.begin() + center_index, times.end()); return times[center_index]; } double get_stddev(double mean) { auto SumSquares = [](const std::vector& v) { return std::transform_reduce(v.begin(), v.end(), v.begin(), 0.0); }; auto Sqr = [](double dat) { return dat * dat; }; auto Sqrt = [](double dat) { return dat < 0.0 ? 0.0 : std::sqrt(dat); }; double stddev = 0.0; if(times.size() > 1) { double avg_squares = SumSquares(times) * (1.0 / times.size()); stddev = Sqrt(times.size() / (times.size() - 1.0) * (avg_squares - Sqr(mean))); } return stddev; } double get_cv(double stddev, double mean) { return times.size() >= 2 ? 
stddev / mean : 0.0; } size_t warmup_iterations; bool cold; bool record_as_whole; std::vector events; std::function run_before_every_iteration_lambda = nullptr; size_t total_gbench_iterations = 0; bool reset_total_gbench_iterations_every_run = true; std::vector times; bool has_set_throughput = false; }; struct autotune_interface { virtual std::string name() const = 0; virtual std::string sort_key() const { return name(); }; virtual ~autotune_interface() = default; virtual void run(state&& state) = 0; }; class executor { public: executor(int argc, char* argv[], size_t default_bytes, size_t default_batch_iterations, size_t default_warmup_iterations, bool default_cold = true, int default_trials = -1) { cli::Parser parser(argc, argv); set_optional_parser_flags(parser, default_bytes, default_batch_iterations, default_warmup_iterations, default_cold, default_trials); parser.run_and_exit_if_error(); benchmark::Initialize(&argc, argv); parse(parser); add_context(); } template void queue_fn(const std::string& name, T bench_fn) { apply_settings(benchmark::RegisterBenchmark(name.c_str(), [=](benchmark::State& gbench_state) { bench_fn(new_state(gbench_state)); })); } template void queue_instance(Benchmark&& instance) { apply_settings(benchmark::RegisterBenchmark( instance.name().c_str(), [=](benchmark::State& gbench_state) { // run() requires a mutable instance, so create a mutable copy. // Using [&instance] doesn't work, as it creates a dangling reference at runtime. // Marking the lambda mutable doesn't work, as the &&instance it copies is const. Benchmark(std::move(instance)).run(new_state(gbench_state)); })); } template static bool queue_sorted_instance() { sorted_benchmarks().push_back(std::make_unique()); return true; // Must return something, as this function gets called in global scope. 
} template static bool queue_autotune(BulkCreateFunction&& f) { std::forward(f)(sorted_benchmarks()); return true; // Must return something, as this function gets called in global scope. } void run() { register_sorted_subset(parallel_instance, parallel_instances); benchmark::ConsoleReporter cr; benchmark::RunSpecifiedBenchmarks(&cr); } private: void set_optional_parser_flags(cli::Parser& parser, size_t default_bytes, size_t default_batch_iterations, size_t default_warmup_iterations, bool default_cold, int default_trials) { parser.set_optional("size", "size", default_bytes, "size in bytes"); parser.set_optional("batch_iterations", "batch_iterations", default_batch_iterations, "number of batch iterations"); parser.set_optional("warmup_iterations", "warmup_iterations", default_warmup_iterations, "number of warmup iterations"); parser.set_optional("hot", "hot", !default_cold, "don't clear the gpu cache on every batch iteration"); parser.set_optional( "record_as_whole", "record_as_whole", false, "record the batch iterations as a whole, at the very start and end, which necessitates " "that gpu cache clearing between iterations can't be done"); parser.set_optional("seed", "seed", "random", get_seed_message()); parser.set_optional("trials", "trials", default_trials, "number of iterations"); parser.set_optional("name_format", "name_format", "json", "either json, human, or txt"); // Optionally run an evenly split subset of benchmarks for autotuning. 
parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); } void parse(cli::Parser& parser) { size = parser.get("size"); seed_type = parser.get("seed"); seed = managed_seed(seed_type); batch_iterations = parser.get("batch_iterations"); warmup_iterations = parser.get("warmup_iterations"); cold = !parser.get("hot"); record_as_whole = parser.get("record_as_whole"); trials = parser.get("trials"); parallel_instance = parser.get("parallel_instance"); parallel_instances = parser.get("parallel_instances"); bench_naming::set_format(parser.get("name_format")); } void add_context() { benchmark::AddCustomContext("size", std::to_string(size)); benchmark::AddCustomContext("seed", seed_type); benchmark::AddCustomContext("batch_iterations", std::to_string(batch_iterations)); benchmark::AddCustomContext("warmup_iterations", std::to_string(warmup_iterations)); hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); auto str = [](const std::string& name, const std::string& val) { benchmark::AddCustomContext(name, val); }; auto num = [](const std::string& name, const auto& value) { benchmark::AddCustomContext(name, std::to_string(value)); }; auto dim2 = [num](const std::string& name, const auto* values) { num(name + "_x", values[0]); num(name + "_y", values[1]); }; auto dim3 = [num, dim2](const std::string& name, const auto* values) { dim2(name, values); num(name + "_z", values[2]); }; str("hdp_name", devProp.name); num("hdp_total_global_mem", devProp.totalGlobalMem); num("hdp_shared_mem_per_block", devProp.sharedMemPerBlock); num("hdp_regs_per_block", devProp.regsPerBlock); num("hdp_warp_size", devProp.warpSize); num("hdp_max_threads_per_block", devProp.maxThreadsPerBlock); dim3("hdp_max_threads_dim", devProp.maxThreadsDim); dim3("hdp_max_grid_size", devProp.maxGridSize); 
num("hdp_clock_rate", devProp.clockRate); num("hdp_memory_clock_rate", devProp.memoryClockRate); num("hdp_memory_bus_width", devProp.memoryBusWidth); num("hdp_total_const_mem", devProp.totalConstMem); num("hdp_major", devProp.major); num("hdp_minor", devProp.minor); num("hdp_multi_processor_count", devProp.multiProcessorCount); num("hdp_l2_cache_size", devProp.l2CacheSize); num("hdp_max_threads_per_multiprocessor", devProp.maxThreadsPerMultiProcessor); num("hdp_compute_mode", devProp.computeMode); num("hdp_clock_instruction_rate", devProp.clockInstructionRate); num("hdp_concurrent_kernels", devProp.concurrentKernels); num("hdp_pci_domain_id", devProp.pciDomainID); num("hdp_pci_bus_id", devProp.pciBusID); num("hdp_pci_device_id", devProp.pciDeviceID); num("hdp_max_shared_memory_per_multi_processor", devProp.maxSharedMemoryPerMultiProcessor); num("hdp_is_multi_gpu_board", devProp.isMultiGpuBoard); num("hdp_can_map_host_memory", devProp.canMapHostMemory); str("hdp_gcn_arch_name", devProp.gcnArchName); num("hdp_integrated", devProp.integrated); num("hdp_cooperative_launch", devProp.cooperativeLaunch); num("hdp_cooperative_multi_device_launch", devProp.cooperativeMultiDeviceLaunch); num("hdp_max_texture_1d_linear", devProp.maxTexture1DLinear); num("hdp_max_texture_1d", devProp.maxTexture1D); dim2("hdp_max_texture_2d", devProp.maxTexture2D); dim3("hdp_max_texture_3d", devProp.maxTexture3D); num("hdp_mem_pitch", devProp.memPitch); num("hdp_texture_alignment", devProp.textureAlignment); num("hdp_texture_pitch_alignment", devProp.texturePitchAlignment); num("hdp_kernel_exec_timeout_enabled", devProp.kernelExecTimeoutEnabled); num("hdp_ecc_enabled", devProp.ECCEnabled); num("hdp_tcc_driver", devProp.tccDriver); num("hdp_cooperative_multi_device_unmatched_func", devProp.cooperativeMultiDeviceUnmatchedFunc); num("hdp_cooperative_multi_device_unmatched_grid_dim", devProp.cooperativeMultiDeviceUnmatchedGridDim); num("hdp_cooperative_multi_device_unmatched_block_dim", 
devProp.cooperativeMultiDeviceUnmatchedBlockDim); num("hdp_cooperative_multi_device_unmatched_shared_mem", devProp.cooperativeMultiDeviceUnmatchedSharedMem); num("hdp_is_large_bar", devProp.isLargeBar); num("hdp_asic_revision", devProp.asicRevision); num("hdp_managed_memory", devProp.managedMemory); num("hdp_direct_managed_mem_access_from_host", devProp.directManagedMemAccessFromHost); num("hdp_concurrent_managed_access", devProp.concurrentManagedAccess); num("hdp_pageable_memory_access", devProp.pageableMemoryAccess); num("hdp_pageable_memory_access_uses_host_page_tables", devProp.pageableMemoryAccessUsesHostPageTables); const auto arch = devProp.arch; num("hdp_arch_has_global_int32_atomics", arch.hasGlobalInt32Atomics); num("hdp_arch_has_global_float_atomic_exch", arch.hasGlobalFloatAtomicExch); num("hdp_arch_has_shared_int32_atomics", arch.hasSharedInt32Atomics); num("hdp_arch_has_shared_float_atomic_exch", arch.hasSharedFloatAtomicExch); num("hdp_arch_has_float_atomic_add", arch.hasFloatAtomicAdd); num("hdp_arch_has_global_int64_atomics", arch.hasGlobalInt64Atomics); num("hdp_arch_has_shared_int64_atomics", arch.hasSharedInt64Atomics); num("hdp_arch_has_doubles", arch.hasDoubles); num("hdp_arch_has_warp_vote", arch.hasWarpVote); num("hdp_arch_has_warp_ballot", arch.hasWarpBallot); num("hdp_arch_has_warp_shuffle", arch.hasWarpShuffle); num("hdp_arch_has_funnel_shift", arch.hasFunnelShift); num("hdp_arch_has_thread_fence_system", arch.hasThreadFenceSystem); num("hdp_arch_has_sync_threads_ext", arch.hasSyncThreadsExt); num("hdp_arch_has_surface_funcs", arch.hasSurfaceFuncs); num("hdp_arch_has_3d_grid", arch.has3dGrid); num("hdp_arch_has_dynamic_parallelism", arch.hasDynamicParallelism); } static std::vector>& sorted_benchmarks() { static std::vector> sorted_benchmarks; return sorted_benchmarks; } state new_state(benchmark::State& gbench_state) { return state(stream, size, seed, batch_iterations, gbench_state, warmup_iterations, cold, record_as_whole); } void 
apply_settings(benchmark::internal::Benchmark* b) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); // trials is -1 by default. if(trials > 0) { b->Iterations(trials); } } // Register a subset of all benchmarks for the current parallel instance. void register_sorted_subset(int parallel_instance_index, int parallel_instance_count) { // Sort to get a consistent order, because the order of static variable initialization is undefined by the C++ standard. std::sort(sorted_benchmarks().begin(), sorted_benchmarks().end(), [](const auto& l, const auto& r) { return l->sort_key() < r->sort_key(); }); size_t configs_per_instance = (sorted_benchmarks().size() + parallel_instance_count - 1) / parallel_instance_count; size_t start = std::min(parallel_instance_index * configs_per_instance, sorted_benchmarks().size()); size_t end = std::min((parallel_instance_index + 1) * configs_per_instance, sorted_benchmarks().size()); for(size_t i = start; i < end; ++i) { autotune_interface* benchmark = sorted_benchmarks().at(i).get(); apply_settings(benchmark::RegisterBenchmark( benchmark->name().c_str(), [=](benchmark::State& gbench_state) { benchmark->run(new_state(gbench_state)); })); } } hipStream_t stream = hipStreamDefault; size_t size; std::string seed_type; managed_seed seed; size_t batch_iterations; size_t warmup_iterations; bool cold; bool record_as_whole; int trials; int parallel_instance; int parallel_instances; }; } // namespace benchmark_utils #endif // ROCPRIM_BENCHMARK_UTILS_HPP_ rocPRIM-rocm-7.1.0/benchmark/benchmark_warp_exchange.cpp000066400000000000000000000452471506507210100232240ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include "benchmark_utils.hpp" #include "../common/utils.hpp" #include "../common/utils_device_ptr.hpp" #include "../common/warp_exchange.hpp" #include // HIP API #include #include #include #include #include #include #include #include #include #include struct ScatterToStripedOp { template ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&thread_data)[ItemsPerThread], const OffsetT (&ranks)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) const { warp_exchange.scatter_to_striped(thread_data, thread_data, ranks, storage); } }; template __device__ auto warp_exchange_benchmark(T* d_output, unsigned int trials) -> std::enable_if_t && !std::is_same::value> { T thread_data[ItemsPerThread]; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { // generate unique value each data-element thread_data[i] = static_cast(threadIdx.x * ItemsPerThread + i); } using warp_exchange_type = ::rocprim::warp_exchange; constexpr unsigned int warps_in_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; ROCPRIM_SHARED_MEMORY typename warp_exchange_type::storage_type storage[warps_in_block]; ROCPRIM_NO_UNROLL for(unsigned int i = 0; i < trials; ++i) { Op{}(warp_exchange_type(), thread_data, storage[warp_id]); ::rocprim::wave_barrier(); } ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { const unsigned int global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; d_output[global_idx] = thread_data[i]; } } template __device__ auto warp_exchange_benchmark(T* d_output, unsigned int trials) -> std::enable_if_t && std::is_same::value> { T thread_data[ItemsPerThread]; unsigned int thread_ranks[ItemsPerThread]; constexpr unsigned int warps_in_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = threadIdx.x / LogicalWarpSize; const unsigned int lane_id = threadIdx.x % LogicalWarpSize; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) 
{ // generate unique value each data-element thread_data[i] = static_cast(threadIdx.x * ItemsPerThread + i); // generate unique destination location for each data-element const unsigned int s_lane_id = i % 2 == 0 ? LogicalWarpSize - 1 - lane_id : lane_id; thread_ranks[i] = s_lane_id * ItemsPerThread + i; // scatter values in warp across whole storage } using warp_exchange_type = ::rocprim::warp_exchange; ROCPRIM_SHARED_MEMORY typename warp_exchange_type::storage_type storage[warps_in_block]; ROCPRIM_NO_UNROLL for(unsigned int i = 0; i < trials; ++i) { Op{}(warp_exchange_type(), thread_data, thread_ranks, storage[warp_id]); ::rocprim::wave_barrier(); } ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; ++i) { const unsigned int global_idx = (BlockSize * blockIdx.x + threadIdx.x) * ItemsPerThread + i; d_output[global_idx] = thread_data[i]; } } template __device__ auto warp_exchange_benchmark(T* /*d_output*/, unsigned int /*trials*/) -> std::enable_if_t> {} template __global__ __launch_bounds__(BlockSize) void warp_exchange_kernel(T* d_output, unsigned int trials) { warp_exchange_benchmark(d_output, trials); } template void run_benchmark(benchmark_utils::state&& state) { const auto& stream = state.stream; const auto& bytes = state.bytes; // Calculate the number of elements size_t N = bytes / sizeof(T); constexpr uint64_t trials = 200; constexpr uint64_t items_per_block = BlockSize * ItemsPerThread; const uint64_t size = items_per_block * ((N + items_per_block - 1) / items_per_block); common::device_ptr d_output(size); state.run( [&] { warp_exchange_kernel <<>>(d_output.get(), trials); }); state.set_throughput(trials * size, sizeof(T)); } #define CREATE_BENCHMARK(T, BS, IT, WS, OP) \ executor.queue_fn(bench_naming::format_name("{lvl:warp,algo:exchange,key_type:" #T \ ",operation:" #OP ",ws:" #WS ",cfg:{bs:" #BS \ ",ipt:" #IT "}}") \ .c_str(), \ run_benchmark); int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * 
benchmark_utils::MiB, 1, 0); CREATE_BENCHMARK(int, 256, 1, 16, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 1, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 4, 16, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 4, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 16, 16, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 16, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 32, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 1, 16, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 1, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 4, 16, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 4, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 16, 16, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 16, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 32, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 1, 16, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 1, 32, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 4, 16, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 4, 32, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 16, 16, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 16, 32, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 32, 32, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 1, 16, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 1, 32, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 4, 16, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 4, 32, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 16, 16, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 16, 32, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 32, 32, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 1, 16, ScatterToStripedOp) CREATE_BENCHMARK(int, 256, 1, 32, ScatterToStripedOp) 
CREATE_BENCHMARK(int, 256, 4, 16, ScatterToStripedOp) CREATE_BENCHMARK(int, 256, 4, 32, ScatterToStripedOp) CREATE_BENCHMARK(int, 256, 16, 16, ScatterToStripedOp) CREATE_BENCHMARK(int, 256, 16, 32, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 16, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 16, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 16, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 16, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 16, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 16, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 16, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 32, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 16, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 32, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 16, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 32, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 16, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 32, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 16, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 32, common::StripedToBlockedShuffleOp) 
CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 16, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 32, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 16, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 32, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 16, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 32, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 16, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 32, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 16, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 16, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 16, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 32, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 16, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 16, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 16, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 32, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 16, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 32, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 16, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 32, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 16, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 32, 
common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 16, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 32, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 16, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 32, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 16, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 32, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 16, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 32, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 16, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 32, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 16, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 32, ScatterToStripedOp) int hip_device = 0; HIP_CHECK(::rocprim::detail::get_device_from_stream(hipStreamDefault, hip_device)); if(is_warp_size_supported(64, hip_device)) { CREATE_BENCHMARK(int, 256, 1, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 4, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 16, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 64, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(int, 256, 1, 64, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 4, 64, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 16, 64, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 64, 64, common::StripedToBlockedOp) CREATE_BENCHMARK(int, 256, 1, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 4, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 16, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 64, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(int, 256, 1, 64, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 4, 64, 
common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 16, 64, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 64, 64, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(int, 256, 1, 64, ScatterToStripedOp) CREATE_BENCHMARK(int, 256, 4, 64, ScatterToStripedOp) CREATE_BENCHMARK(int, 256, 16, 64, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 64, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 64, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 64, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 64, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 64, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 64, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 1, 64, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 4, 64, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::int128_t, 256, 16, 64, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 64, common::BlockedToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 64, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 64, common::StripedToBlockedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 64, common::StripedToBlockedOp) 
CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 64, common::BlockedToStripedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 64, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 64, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 64, common::StripedToBlockedShuffleOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 1, 64, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 4, 64, ScatterToStripedOp) CREATE_BENCHMARK(rocprim::uint128_t, 256, 16, 64, ScatterToStripedOp) } executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_warp_reduce.cpp000066400000000000000000000162711506507210100227040ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "../common/utils_device_ptr.hpp" #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include template __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_reduce_kernel(const T* d_input, T* d_output) { if constexpr(VirtualWaveSize <= rocprim::arch::wavefront::max_size()) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { wreduce_t().reduce(value, value, storage); } d_output[i] = value; } } template __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) { if constexpr(VirtualWaveSize <= rocprim::arch::wavefront::max_size()) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; auto flag = d_flags[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { wreduce_t().head_segmented_reduce(value, value, flag, storage); } d_output[i] = value; } } template inline auto execute_warp_reduce_kernel( T* input, T* output, Flag* /* flags */, size_t size, hipStream_t stream) -> typename std::enable_if::type { hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_reduce_kernel), dim3(size / BlockSize), dim3(BlockSize), 0, stream, input, output); HIP_CHECK(hipGetLastError()); } template inline auto execute_warp_reduce_kernel(T* input, T* output, Flag* flags, size_t size, hipStream_t stream) 
-> typename std::enable_if::type { hipLaunchKernelGGL( HIP_KERNEL_NAME(segmented_warp_reduce_kernel), dim3(size / BlockSize), dim3(BlockSize), 0, stream, input, flags, output); HIP_CHECK(hipGetLastError()); } template void run_benchmark(benchmark_utils::state&& state) { const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; using flag_type = unsigned char; // Calculate the number of elements size_t N = bytes / sizeof(T); const auto size = BlockSize * ((N + BlockSize - 1) / BlockSize); const auto random_range = limit_random_range(0, 10); std::vector input = get_random_data(size, random_range.first, random_range.second, seed.get_0()); std::vector flags = get_random_data(size, 0, 1, seed.get_1()); common::device_ptr d_input(input); common::device_ptr d_flags(flags); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { execute_warp_reduce_kernel( d_input.get(), d_output.get(), d_flags.get(), size, stream); }); state.set_throughput(Trials * size, sizeof(T)); } #define CREATE_BENCHMARK(T, WS, BS) \ executor.queue_fn( \ bench_naming::format_name("{lvl:warp,algo:reduce,key_type:" #T ",broadcast_result:" \ + std::string(AllReduce ? "true" : "false") \ + ",segmented:" + std::string(Segmented ? 
"true" : "false") \ + ",ws:" #WS ",cfg:{bs:" #BS "}}") \ .c_str(), \ run_benchmark); // clang-format off #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 32, 64) \ CREATE_BENCHMARK(type, 37, 64) \ CREATE_BENCHMARK(type, 61, 64) \ CREATE_BENCHMARK(type, 64, 64) // clang-format on template void add_benchmarks(benchmark_utils::executor& executor) { BENCHMARK_TYPE(int) BENCHMARK_TYPE(float) BENCHMARK_TYPE(double) BENCHMARK_TYPE(int8_t) BENCHMARK_TYPE(uint8_t) BENCHMARK_TYPE(rocprim::half) BENCHMARK_TYPE(rocprim::int128_t) BENCHMARK_TYPE(rocprim::uint128_t) } int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0); add_benchmarks(executor); add_benchmarks(executor); add_benchmarks(executor); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_warp_scan.cpp000066400000000000000000000175701506507210100223640ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include "../common/utils_device_ptr.hpp" #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include enum class scan_type { inclusive_scan, exclusive_scan, broadcast }; template __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void kernel(const T* input, T* output, const T init) { if constexpr(VirtualWaveSize <= rocprim::arch::wavefront::max_size()) { Runner::template run(input, output, init); } } struct inclusive_scan { template __device__ static void run(const T* input, T* output, const T init) { (void)init; const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rocprim::warp_scan; __shared__ typename wscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { wscan_t().inclusive_scan(value, value, storage); } output[i] = value; } }; struct exclusive_scan { template __device__ static void run(const T* input, T* output, const T init) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rocprim::warp_scan; __shared__ typename wscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { wscan_t().exclusive_scan(value, value, init, storage); } output[i] = value; } }; struct broadcast { template __device__ static void run(const T* input, T* output, const T init) { (void)init; const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; const unsigned int warp_id = i / VirtualWaveSize; const unsigned int src_lane = warp_id % VirtualWaveSize; auto value = input[i]; using wscan_t = 
rocprim::warp_scan; __shared__ typename wscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { value = wscan_t().broadcast(value, src_lane, storage); } output[i] = value; } }; template void run_benchmark(benchmark_utils::state&& state) { const auto& stream = state.stream; const auto& bytes = state.bytes; // Calculate the number of elements size_t size = bytes / sizeof(T); // Make sure size is a multiple of BlockSize size = BlockSize * ((size + BlockSize - 1) / BlockSize); // Allocate and fill memory std::vector input(size, (T)1); common::device_ptr d_input(input); common::device_ptr d_output(size); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { hipLaunchKernelGGL(HIP_KERNEL_NAME(kernel), dim3(size / BlockSize), dim3(BlockSize), 0, stream, d_input.get(), d_output.get(), input[0]); }); state.set_throughput(Trials * size, sizeof(T)); } #define CREATE_BENCHMARK(T, BS, WS) \ executor.queue_fn(bench_naming::format_name("{lvl:warp,algo:scan,key_type:" #T ",subalgo:" \ + method_name + ",ws:" #WS ",cfg:{bs:" #BS "}}") \ .c_str(), \ run_benchmark); // clang-format off #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 64, 64) \ CREATE_BENCHMARK(type, 128, 64) \ CREATE_BENCHMARK(type, 256, 64) \ CREATE_BENCHMARK(type, 256, 32) \ CREATE_BENCHMARK(type, 256, 16) \ CREATE_BENCHMARK(type, 63, 63) \ CREATE_BENCHMARK(type, 62, 31) \ CREATE_BENCHMARK(type, 60, 15) // clang-format on // clang-format off #define BENCHMARK_TYPE_P2(type) \ CREATE_BENCHMARK(type, 64, 64) \ CREATE_BENCHMARK(type, 128, 64) \ CREATE_BENCHMARK(type, 256, 64) \ CREATE_BENCHMARK(type, 256, 32) \ CREATE_BENCHMARK(type, 256, 16) // clang-format on template auto add_benchmarks(benchmark_utils::executor& executor, const std::string& method_name) -> std::enable_if_t::value || std::is_same::value> { using custom_double2 = common::custom_type; using custom_int_double = common::custom_type; BENCHMARK_TYPE(int) BENCHMARK_TYPE(float) BENCHMARK_TYPE(double) 
BENCHMARK_TYPE(int8_t) BENCHMARK_TYPE(uint8_t) BENCHMARK_TYPE(rocprim::half) BENCHMARK_TYPE(custom_double2) BENCHMARK_TYPE(custom_int_double) BENCHMARK_TYPE(rocprim::int128_t) BENCHMARK_TYPE(rocprim::uint128_t) } template auto add_benchmarks(benchmark_utils::executor& executor, const std::string& method_name) -> std::enable_if_t::value> { using custom_double2 = common::custom_type; using custom_int_double = common::custom_type; BENCHMARK_TYPE_P2(int) BENCHMARK_TYPE_P2(float) BENCHMARK_TYPE_P2(double) BENCHMARK_TYPE_P2(int8_t) BENCHMARK_TYPE_P2(uint8_t) BENCHMARK_TYPE_P2(rocprim::half) BENCHMARK_TYPE_P2(custom_double2) BENCHMARK_TYPE_P2(custom_int_double) BENCHMARK_TYPE_P2(rocprim::int128_t) BENCHMARK_TYPE_P2(rocprim::uint128_t) } int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0); add_benchmarks(executor, "inclusive_scan"); add_benchmarks(executor, "exclusive_scan"); add_benchmarks(executor, "broadcast"); executor.run(); } rocPRIM-rocm-7.1.0/benchmark/benchmark_warp_sort.cpp000066400000000000000000000252511506507210100224220ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include "benchmark_utils.hpp" #include "../common/utils_custom_type.hpp" #include // HIP API #include // rocPRIM #include #include #include #include #include #include #include #include #include template __global__ __launch_bounds__(BlockSize) void warp_sort_kernel(K* input_keys, K* output_keys) { const unsigned int flat_tid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int block_offset = blockIdx.x * items_per_block; K keys[ItemsPerThread]; rocprim::block_load_direct_striped(flat_tid, input_keys + block_offset, keys); rocprim::warp_sort wsort; wsort.sort(keys); rocprim::block_store_direct_blocked(flat_tid, output_keys + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void warp_sort_by_key_kernel(K* input_keys, V* input_values, K* output_keys, V* output_values) { const unsigned int flat_tid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int block_offset = blockIdx.x * items_per_block; K keys[ItemsPerThread]; V values[ItemsPerThread]; rocprim::block_load_direct_striped(flat_tid, input_keys + block_offset, keys); rocprim::block_load_direct_striped(flat_tid, input_values + block_offset, values); rocprim::warp_sort wsort; wsort.sort(keys, values); rocprim::block_store_direct_blocked(flat_tid, output_keys + block_offset, keys); rocprim::block_store_direct_blocked(flat_tid, output_values + block_offset, values); } template void run_benchmark(benchmark_utils::state&& state) { 
const auto& stream = state.stream; const auto& bytes = state.bytes; const auto& seed = state.seed; // Calculate the number of elements size_t size = bytes / sizeof(Key); // Make sure size is a multiple of items_per_block constexpr auto items_per_block = BlockSize * ItemsPerThread; size = BlockSize * ((size + items_per_block - 1) / items_per_block); // Allocate and fill memory const auto random_range = limit_random_range(0, 10'000); std::vector input_key = get_random_data(size, random_range.first, random_range.second, seed.get_0()); std::vector input_value(size_t(1)); if(SortByKey) { const auto random_range = limit_random_range(0, 10'000); input_value = get_random_data(size, random_range.first, random_range.second, seed.get_1()); } Key* d_input_key = nullptr; Key* d_output_key = nullptr; Value* d_input_value = nullptr; Value* d_output_value = nullptr; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_key), size * sizeof(Key))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output_key), size * sizeof(Key))); if(SortByKey) { HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_value), size * sizeof(Value))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output_value), size * sizeof(Value))); } HIP_CHECK(hipMemcpy(d_input_key, input_key.data(), size * sizeof(Key), hipMemcpyHostToDevice)); if(SortByKey) HIP_CHECK(hipMemcpy(d_input_value, input_value.data(), size * sizeof(Value), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); state.run( [&] { if(SortByKey) { ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { hipLaunchKernelGGL(HIP_KERNEL_NAME(warp_sort_by_key_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input_key, d_input_value, d_output_key, d_output_value); } } else { ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_sort_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input_key, d_output_key); } } }); auto type_size = SortByKey ? 
sizeof(Key) + sizeof(Value) : sizeof(Key); state.set_throughput(size * Trials, type_size); HIP_CHECK(hipFree(d_input_key)); HIP_CHECK(hipFree(d_output_key)); HIP_CHECK(hipFree(d_input_value)); HIP_CHECK(hipFree(d_output_value)); } #define CREATE_SORT_BENCHMARK(K, BS, WS, IPT) \ executor.queue_fn(bench_naming::format_name("{lvl:warp,algo:sort,key_type:" #K ",value_type:" \ + std::string(Traits::name()) \ + ",ws:" #WS ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark); #define CREATE_SORTBYKEY_BENCHMARK(K, V, BS, WS, IPT) \ executor.queue_fn(bench_naming::format_name("{lvl:warp,algo:sort,key_type:" #K \ ",value_type:" #V ",ws:" #WS ",cfg:{bs:" #BS \ ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark); // clang-format off #define BENCHMARK_TYPE(type) \ CREATE_SORT_BENCHMARK(type, 64, 64, 1) \ CREATE_SORT_BENCHMARK(type, 64, 64, 2) \ CREATE_SORT_BENCHMARK(type, 64, 64, 4) \ CREATE_SORT_BENCHMARK(type, 128, 64, 1) \ CREATE_SORT_BENCHMARK(type, 128, 64, 2) \ CREATE_SORT_BENCHMARK(type, 128, 64, 4) \ CREATE_SORT_BENCHMARK(type, 256, 64, 1) \ CREATE_SORT_BENCHMARK(type, 256, 64, 2) \ CREATE_SORT_BENCHMARK(type, 256, 64, 4) \ CREATE_SORT_BENCHMARK(type, 64, 32, 1) \ CREATE_SORT_BENCHMARK(type, 64, 32, 2) \ CREATE_SORT_BENCHMARK(type, 64, 16, 1) \ CREATE_SORT_BENCHMARK(type, 64, 16, 2) \ CREATE_SORT_BENCHMARK(type, 64, 16, 4) // clang-format on // clang-format off #define BENCHMARK_KEY_TYPE(type, value) \ CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 1) \ CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 2) \ CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 4) \ CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 1) \ CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 2) \ CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 4) // clang-format on int main(int argc, char* argv[]) { benchmark_utils::executor executor(argc, argv, 128 * benchmark_utils::MiB, 1, 0); using custom_double2 = common::custom_type; using custom_int_double = common::custom_type; using custom_int2 = 
common::custom_type; using custom_char_double = common::custom_type; using custom_longlong_double = common::custom_type; BENCHMARK_TYPE(int) BENCHMARK_TYPE(float) BENCHMARK_TYPE(double) BENCHMARK_TYPE(int8_t) BENCHMARK_TYPE(uint8_t) BENCHMARK_TYPE(rocprim::half) BENCHMARK_TYPE(rocprim::int128_t) BENCHMARK_TYPE(rocprim::uint128_t) BENCHMARK_KEY_TYPE(float, float) BENCHMARK_KEY_TYPE(unsigned int, int) BENCHMARK_KEY_TYPE(int, custom_double2) BENCHMARK_KEY_TYPE(int, custom_int_double) BENCHMARK_KEY_TYPE(custom_int2, custom_double2) BENCHMARK_KEY_TYPE(custom_int2, custom_char_double) BENCHMARK_KEY_TYPE(custom_int2, custom_longlong_double) BENCHMARK_KEY_TYPE(int8_t, int8_t) BENCHMARK_KEY_TYPE(uint8_t, uint8_t) BENCHMARK_KEY_TYPE(rocprim::half, rocprim::half) BENCHMARK_KEY_TYPE(rocprim::int128_t, rocprim::int128_t) BENCHMARK_KEY_TYPE(rocprim::uint128_t, rocprim::uint128_t) executor.run(); } rocPRIM-rocm-7.1.0/benchmark/cmdparser.hpp000066400000000000000000000420621506507210100203540ustar00rootroot00000000000000// The MIT License (MIT) // // Copyright (c) 2015 - 2016 Florian Rappl // Modifications Copyright (c) 2019-2024, Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. /* This file is part of the C++ CmdParser utility. Copyright (c) 2015 - 2016 Florian Rappl */ #pragma once #include #include #include #include #include #include namespace cli { struct CallbackArgs { const std::vector& arguments; std::ostream& output; std::ostream& error; }; class Parser { private: class CmdBase { public: explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) : name(name), command(name.size() > 0 ? "-" + name : ""), alternative(alternative.size() > 0 ? "--" + alternative : ""), description(description), required(required), handled(false), arguments({}), dominant(dominant), variadic(variadic) { } virtual ~CmdBase() { } std::string name; std::string command; std::string alternative; std::string description; bool required; bool handled; std::vector arguments; bool const dominant; bool const variadic; virtual std::string print_value() const = 0; virtual bool parse(std::ostream& output, std::ostream& error) = 0; bool is(const std::string& given) const { return given == command || given == alternative; } }; template struct ArgumentCountChecker { static constexpr bool Variadic = false; }; template struct ArgumentCountChecker> { static constexpr bool Variadic = true; }; template class CmdFunction final : public CmdBase { public: explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { } virtual bool parse(std::ostream& output, std::ostream& error) override { try { CallbackArgs args { arguments, output, error }; value = 
callback(args); return true; } catch (...) { return false; } } virtual std::string print_value() const override { return ""; } std::function callback; T value; }; template class CmdArgument final : public CmdBase { public: explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic), value(T()) { } virtual bool parse(std::ostream&, std::ostream&) override { try { value = Parser::parse(arguments, value); return true; } catch (...) { return false; } } virtual std::string print_value() const override { return stringify(value); } T value; }; static int parse(const std::vector& elements, const int&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoi(elements[0]); } static bool parse(const std::vector& elements, const bool& defval) { if (elements.size() != 0) throw std::runtime_error("A boolean command line parameter cannot have any arguments."); return !defval; } static double parse(const std::vector& elements, const double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stod(elements[0]); } static float parse(const std::vector& elements, const float&) { if (elements.size() != 1) throw std::bad_cast(); return std::stof(elements[0]); } static long double parse(const std::vector& elements, const long double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stold(elements[0]); } static unsigned int parse(const std::vector& elements, const unsigned int&) { if (elements.size() != 1) throw std::bad_cast(); return static_cast(std::stoul(elements[0])); } static unsigned long parse(const std::vector& elements, const unsigned long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoul(elements[0]); } static unsigned long long parse(const std::vector& elements, const unsigned long long&) { if (elements.size() != 1) throw std::bad_cast(); return 
std::stoull(elements[0]); } static long parse(const std::vector& elements, const long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stol(elements[0]); } static std::string parse(const std::vector& elements, const std::string&) { if (elements.size() != 1) throw std::bad_cast(); return elements[0]; } template static std::vector parse(const std::vector& elements, const std::vector&) { const T defval = T(); std::vector values { }; std::vector buffer(1); for (const auto& element : elements) { buffer[0] = element; values.push_back(parse(buffer, defval)); } return values; } template static std::string stringify(const T& value) { return std::to_string(value); } template static std::string stringify(const std::vector& values) { std::stringstream ss { }; ss << "[ "; for (const auto& value : values) { ss << stringify(value) << " "; } ss << "]"; return ss.str(); } static std::string stringify(const std::string& str) { return str; } public: explicit Parser(int argc, const char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } explicit Parser(int argc, char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } ~Parser() { for (int i = 0, n = _commands.size(); i < n; ++i) { delete _commands[i]; } } bool has_help() const { for (const auto command : _commands) { if (command->name == "h" && command->alternative == "--help") { return true; } } return false; } void enable_help() { set_callback("h", "help", std::function([this](CallbackArgs& args){ args.output << this->usage(); /*exit(0);*/ return false; }), "", true); } void disable_help() { for (auto command = _commands.begin(); command != _commands.end(); ++command) { if ((*command)->name == "h" && (*command)->alternative == "--help") { _commands.erase(command); break; } } } template void set_default(bool is_required, const std::string& description = "") { auto command = new CmdArgument { "", 
"", description, is_required, false }; _commands.push_back(command); } template void set_required(const std::string& name, const std::string& alternative, const std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, true, dominant }; _commands.push_back(command); } template void set_optional(const std::string& name, const std::string& alternative, const T& defaultValue, const std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, false, dominant }; command->value = defaultValue; _commands.push_back(command); } template void set_callback(const std::string& name, const std::string& alternative, std::function callback, const std::string& description = "", bool dominant = false) { auto command = new CmdFunction { name, alternative, description, false, dominant }; command->callback = callback; _commands.push_back(command); } inline void run_and_exit_if_error() { if (run() == false) { exit(1); } } inline bool run() { return run(std::cout, std::cerr); } inline bool run(std::ostream& output) { return run(output, std::cerr); } bool run(std::ostream& output, std::ostream& error) { if (_arguments.size() > 0) { auto current = find_default(); for (int i = 0, n = _arguments.size(); i < n; ++i) { auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; auto associated = isarg ? find(_arguments[i]) : nullptr; if (associated != nullptr) { current = associated; associated->handled = true; } else if (current == nullptr) { current = find(_arguments[i]); // Code was commented out so cmdparser can ignore unknown options // error << no_default(); // return false; } else { current->arguments.push_back(_arguments[i]); current->handled = true; if (!current->variadic) { // If the current command is not variadic, then no more arguments // should be added to it. In this case, switch back to the default // command. 
current = find_default(); } } } } // First, parse dominant arguments since they succeed even if required // arguments are missing. for (auto command : _commands) { if (command->handled && command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } // Next, check for any missing arguments. for (auto command : _commands) { if (command->required && !command->handled) { error << howto_required(command); return false; } } // Finally, parse all remaining arguments. for (auto command : _commands) { if (command->handled && !command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } return true; } template T get(const std::string& name) const { for (const auto& command : _commands) { if (command->name == name) { auto cmd = dynamic_cast*>(command); if (cmd == nullptr) { throw std::runtime_error("Invalid usage of the parameter " + name + " detected."); } return cmd->value; } } throw std::runtime_error("The parameter " + name + " could not be found."); } template T get_if(const std::string& name, std::function callback) const { auto value = get(name); return callback(value); } int requirements() const { int count = 0; for (const auto& command : _commands) { if (command->required) { ++count; } } return count; } int commands() const { return static_cast(_commands.size()); } inline const std::string& app_name() const { return _appname; } protected: CmdBase* find(const std::string& name) { for (auto command : _commands) { if (command->is(name)) { return command; } } return nullptr; } CmdBase* find_default() { for (auto command : _commands) { if (command->name == "") { return command; } } return nullptr; } std::string usage() const { std::stringstream ss { }; ss << "Available parameters:\n\n"; for (const auto& command : _commands) { ss << " " << command->command << "\t" << command->alternative; if (command->required == true) { ss << "\t(required)"; } ss << "\n " << command->description; if 
(command->required == false) { ss << "\n " << "This parameter is optional. The default value is '" + command->print_value() << "'."; } ss << "\n\n"; } return ss.str(); } void print_help(std::stringstream& ss) const { if (has_help()) { ss << "For more help use --help or -h.\n"; } } std::string howto_required(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " is required.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string howto_use(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " has invalid arguments.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string no_default() const { std::stringstream ss { }; ss << "No default parameter has been specified.\n"; ss << "The given argument must be used with a parameter.\n"; print_help(ss); return ss.str(); } private: const std::string _appname; std::vector _arguments; std::vector _commands; }; } rocPRIM-rocm-7.1.0/cmake/000077500000000000000000000000001506507210100150055ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/cmake/ConfigAutotune.cmake000066400000000000000000000116321506507210100207440ustar00rootroot00000000000000# MIT License # # Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# Function to add a configured source file to a target.
# It parses arguments, prepares the output file name, and configures the file.
#
# Keyword arguments:
#   TARGET          target that receives the generated source file
#   INPUT           template file passed to configure_file()
#   OUTPUT_PATTERN  pattern (with @NAME@ placeholders) used to build the output file name
#   NAMES           placeholder names
#   VALUES          one value per entry of NAMES, in the same order
function(add_configured_source)
  # Parse arguments and ensure proper usage
  cmake_parse_arguments(PARSE_ARGV 0 ARG "" "INPUT;TARGET;OUTPUT_PATTERN" "NAMES;VALUES")

  list(LENGTH ARG_NAMES NAMES_LEN)
  list(LENGTH ARG_VALUES VALS_LEN)
  if(NOT NAMES_LEN EQUAL VALS_LEN)
    message(FATAL_ERROR "add_configured_source: The same number of names (${NAMES_LEN}) and values (${VALS_LEN}) must be provided!")
  endif()

  # Define one function-local variable per NAME holding the matching VALUE, so
  # that the @NAME@ references below are substituted.
  # foreach(RANGE) rejects a negative stop value, so skip the loop entirely when
  # no name/value pairs were given.
  if(NAMES_LEN GREATER 0)
    math(EXPR max "${NAMES_LEN} - 1")
    foreach(i RANGE ${max})
      list(GET ARG_NAMES ${i} curr_name)
      list(GET ARG_VALUES ${i} "${curr_name}")
    endforeach()
  endif()

  # Configure the output file and add it to the target
  string(CONFIGURE "${ARG_OUTPUT_PATTERN}" output @ONLY)
  # Quoted so that an empty or ';'-containing result stays a single argument.
  string(MAKE_C_IDENTIFIER "${output}" output)
  set(output_path "${ARG_TARGET}.parallel/${output}.cpp")
  configure_file("${ARG_INPUT}" "${output_path}" @ONLY)
  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_CLEAN_FILES "${ARG_TARGET}.parallel")
  target_sources("${ARG_TARGET}" PRIVATE "${output_path}")
  target_include_directories("${ARG_TARGET}" PRIVATE "../benchmark")

  # Ensure reconfiguration if necessary
  set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${ARG_INPUT}" "${output_path}")
endfunction()

# Function to divide two numbers and round up.
# Computes ceil(dividend / divisor) with integer arithmetic and stores the
# result in the caller's variable named by result_var.
function(div_round_up dividend divisor result_var)
  math(EXPR result "(${dividend} + ${divisor} - 1) / ${divisor}")
  set("${result_var}" "${result}" PARENT_SCOPE)
endfunction()

# Function to add a matrix of configured sources.
# It handles permutations of input parameters and calls add_configured_source accordingly.
#
# Keyword arguments:
#   TARGET, INPUT, OUTPUT_PATTERN  forwarded to add_configured_source()
#   NAMES          placeholder names, one per entry of LISTS
#   LISTS          one space-separated list of candidate values per name
#   SHARDS         number of shards the permutations are split into (default: 1)
#   CURRENT_SHARD  zero-based index of the shard to generate (default: 0)
function(add_matrix)
  set(single_value_args "TARGET" "INPUT" "OUTPUT_PATTERN" "SHARDS" "CURRENT_SHARD")
  cmake_parse_arguments(PARSE_ARGV 0 ARG "" "${single_value_args}" "NAMES;LISTS")

  # Validate argument lengths
  list(LENGTH ARG_NAMES NAMES_LEN)
  list(LENGTH ARG_LISTS LISTS_LEN)
  if(NOT NAMES_LEN EQUAL LISTS_LEN)
    message(FATAL_ERROR "add_matrix: The same number of names (${NAMES_LEN}) and lists (${LISTS_LEN}) must be provided!")
  endif()

  # Calculate the total number of permutations
  set(total_len 1)
  foreach(LIST IN LISTS ARG_LISTS)
    string(REPLACE " " ";" list "${LIST}")
    list(LENGTH list LIST_LEN)
    math(EXPR total_len "${total_len} * ${LIST_LEN}")
  endforeach()

  # Handle sharding
  if(NOT DEFINED ARG_SHARDS)
    set(ARG_SHARDS 1)
  endif()
  if(NOT DEFINED ARG_CURRENT_SHARD)
    set(ARG_CURRENT_SHARD 0)
  endif()
  div_round_up("${total_len}" "${ARG_SHARDS}" per_shard)

  # Determine the range of permutations for the current shard
  math(EXPR start "${ARG_CURRENT_SHARD} * ${per_shard}")
  math(EXPR stop "${start} + ${per_shard} - 1")

  # The last shard may be only partially filled. Without clamping, indices past
  # the final permutation wrap around through the modulo below and regenerate
  # permutations that already belong to an earlier shard.
  math(EXPR last "${total_len} - 1")
  if(stop GREATER last)
    set(stop "${last}")
  endif()
  if(start GREATER stop)
    # Nothing left for this shard (more shards than permutations, or an empty list).
    return()
  endif()

  # Process each permutation
  foreach(i RANGE ${start} ${stop})
    # Decode the flat index as a mixed-radix number: one digit per input list.
    set(index ${i})
    set(values "")
    foreach(input_list IN LISTS ARG_LISTS)
      string(REPLACE " " ";" curr_list "${input_list}")
      list(LENGTH curr_list curr_length)
      math(EXPR curr_index "${index} % ${curr_length}")
      list(GET curr_list ${curr_index} curr_item)
      list(APPEND values "${curr_item}")
      math(EXPR index "${index} / ${curr_length}")
    endforeach()

    # Add the configured source for each permutation
    add_configured_source(TARGET "${ARG_TARGET}"
                          INPUT "${ARG_INPUT}"
                          OUTPUT_PATTERN "${ARG_OUTPUT_PATTERN}"
                          NAMES ${ARG_NAMES}
                          VALUES ${values})
  endforeach()
endfunction()

# Function to filter out odd block sizes.
# It sets a variable in the parent scope based on the condition.
function(reject_odd_blocksize RESULT BlockSize) math(EXPR res "${BlockSize} % 2") if(res EQUAL 0) set("${RESULT}" ON PARENT_SCOPE) else() set("${RESULT}" OFF PARENT_SCOPE) endif() endfunction()rocPRIM-rocm-7.1.0/cmake/Dependencies.cmake000066400000000000000000000231561506507210100204040ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ########################### # rocPRIM dependencies # ########################### # NOTE1: the reason we don't scope global state meddling using add_subdirectory # is because CMake < 3.24 lacks CMAKE_FIND_PACKAGE_TARGETS_GLOBAL which # would promote IMPORTED targets of find_package(CONFIG) to be visible # by other parts of the build. So we save and restore global state. # # NOTE2: We disable the ROCMChecks.cmake warning noting that we meddle with # global state. 
#        This is a consequence of abusing the CMake CXX language
#        which HIP piggybacks on top of. This kind of HIP support has one chance
#        at observing the global flags, at the find_package(HIP) invocation.
#        The device compiler won't be able to pick up changes after that, hence
#        the warning.

# Remember the user's global settings so they can be restored once all
# dependencies have been processed.
set(USER_CXX_FLAGS ${CMAKE_CXX_FLAGS})
if(DEFINED BUILD_SHARED_LIBS)
  set(USER_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
endif()
set(USER_ROCM_WARN_TOOLCHAIN_VAR ${ROCM_WARN_TOOLCHAIN_VAR})
set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")

# Turn off warnings and errors for all warnings in dependencies
# The flags are passed as one quoted string so that an empty or ';'-containing
# CMAKE_CXX_FLAGS still forms a single, well-formed argument.
separate_arguments(CXX_FLAGS_LIST NATIVE_COMMAND "${CMAKE_CXX_FLAGS}")
# Drop the flags that promote warnings to errors.
list(REMOVE_ITEM CXX_FLAGS_LIST /WX -Werror -Werror=pedantic -pedantic-errors)
if(MSVC)
  list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "/[Ww]([0-4]?)(all)?") # Remove MSVC warning flags
  list(APPEND CXX_FLAGS_LIST /w)
else()
  list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "-W(all|extra|everything)") # Remove GCC/LLVM flags
  list(APPEND CXX_FLAGS_LIST -w)
endif()
list(JOIN CXX_FLAGS_LIST " " CMAKE_CXX_FLAGS)

# Don't build client dependencies as shared
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Global flag to cause add_library() to create shared libraries if on." FORCE)

# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included.

include(FetchContent)

# For downloading, building, and installing required dependencies
include(cmake/DownloadProject.cmake)

# Test dependencies
if(BUILD_TEST)
  # NOTE1: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake
  #
  #        FindGTest.cmake defines:   GTest::GTest, GTest::Main, GTEST_FOUND
  #
  #        GTestConfig.cmake defines: GTest::gtest, GTest::gtest_main, GTest::gmock, GTest::gmock_main
  #
  # NOTE2: Finding GTest in MODULE mode, one cannot invoke find_package in CONFIG mode, because targets
  #        will be duplicately defined.
  #
  # NOTE3: The following snippet first tries to find Google Test binary either in MODULE or CONFIG modes.
# If neither succeeds it goes on to import Google Test into this build either from a system # source package (apt install googletest on Ubuntu 18.04 only) or GitHub and defines the MODULE # mode targets. Otherwise if MODULE or CONFIG succeeded, then it prints the result to the # console via a non-QUIET find_package call and if CONFIG succeeded, creates ALIAS targets # with the MODULE IMPORTED names. if(NOT DEPENDENCIES_FORCE_DOWNLOAD) if(WIN32) # Older versions of gtest on Windows does not support printing of 128-bit values, # Causing compilation errors. find_package(GTest 1.11.0 REQUIRED) else() find_package(GTest QUIET) endif() endif() if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest) option(BUILD_GTEST "Builds the googletest subproject" ON) option(BUILD_GMOCK "Builds the googlemock subproject" OFF) option(INSTALL_GTEST "Enable installation of googletest." OFF) if(EXISTS /usr/src/googletest AND NOT DEPENDENCIES_FORCE_DOWNLOAD) FetchContent_Declare( googletest SOURCE_DIR /usr/src/googletest ) else() message(STATUS "Google Test not found. Fetching...") FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git GIT_TAG e2239ee6043f73722e7aa812a459f54a28552929 # release-1.11.0 ) endif() FetchContent_MakeAvailable(googletest) add_library(GTest::GTest ALIAS gtest) add_library(GTest::Main ALIAS gtest_main) else() find_package(GTest REQUIRED) if(TARGET GTest::gtest_main AND NOT TARGET GTest::Main) add_library(GTest::GTest ALIAS GTest::gtest) add_library(GTest::Main ALIAS GTest::gtest_main) endif() endif() endif(BUILD_TEST) if(BUILD_BENCHMARK) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(benchmark CONFIG QUIET) endif() if(NOT TARGET benchmark::benchmark) message(STATUS "Google Benchmark not found. Fetching...") option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." OFF) option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark." 
OFF) FetchContent_Declare( googlebench GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG v1.8.0 ) set(HAVE_STD_REGEX ON) set(RUN_HAVE_STD_REGEX 1) FetchContent_MakeAvailable(googlebench) if(NOT TARGET benchmark::benchmark) add_library(benchmark::benchmark ALIAS benchmark) endif() else() find_package(benchmark CONFIG REQUIRED) endif() endif(BUILD_BENCHMARK) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(ROCM 0.11.0 CONFIG QUIET PATHS "${ROCM_ROOT}") # rocm-cmake endif() if(NOT ROCM_FOUND) message(STATUS "ROCm CMake not found. Fetching...") # We don't really want to consume the build and test targets of ROCm CMake. # CMake 3.18 allows omitting them, even though there's a CMakeLists.txt in source root. if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.18) set(SOURCE_SUBDIR_ARG SOURCE_SUBDIR "DISABLE ADDING TO BUILD") else() set(SOURCE_SUBDIR_ARG) endif() set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") FetchContent_Declare( rocm-cmake GIT_REPOSITORY https://github.com/ROCm/rocm-cmake.git GIT_TAG rocm-6.1.2 ${SOURCE_SUBDIR_ARG} ) FetchContent_GetProperties(rocm-cmake) if(NOT rocm-cmake_POPULATED) # rocm-cmake 0.12.0 and higher needs to built from source FetchContent_Populate(rocm-cmake) message("Populated: ${rocm-cmake_SOURCE_DIR}") execute_process( WORKING_DIRECTORY ${rocm-cmake_SOURCE_DIR} COMMAND ${CMAKE_COMMAND} ${rocm-cmake_SOURCE_DIR} -DCMAKE_INSTALL_PREFIX=. 
) execute_process( WORKING_DIRECTORY ${rocm-cmake_SOURCE_DIR} COMMAND ${CMAKE_COMMAND} --build ${rocm-cmake_SOURCE_DIR} --target install ) endif() FetchContent_MakeAvailable(rocm-cmake) find_package(ROCM CONFIG REQUIRED NO_DEFAULT_PATH PATHS "${rocm-cmake_SOURCE_DIR}") else() find_package(ROCM 0.11.0 CONFIG REQUIRED PATHS "${ROCM_ROOT}") endif() # rocRAND (https://github.com/ROCmSoftwarePlatform/rocRAND) if(WITH_ROCRAND) find_package(rocrand QUIET) endif() if(WITH_ROCRAND AND NOT rocrand_FOUND) message(STATUS "Downloading and building rocrand.") set(ROCRAND_ROOT ${CMAKE_CURRENT_BINARY_DIR}/deps/rocrand CACHE PATH "") set(EXTRA_CMAKE_ARGS "-DGPU_TARGETS=${GPU_TARGETS}") # CMAKE_ARGS of download_project (or ExternalProject_Add) can't contain ; so another separator # is needed and LIST_SEPARATOR is passed to download_project() string(REPLACE ";" "|" EXTRA_CMAKE_ARGS "${EXTRA_CMAKE_ARGS}") # Pass launcher so sccache can be used to speed up building rocRAND if(CMAKE_CXX_COMPILER_LAUNCHER) set(EXTRA_CMAKE_ARGS "${EXTRA_CMAKE_ARGS} -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}") endif() download_project( PROJ rocrand GIT_REPOSITORY https://github.com/ROCmSoftwarePlatform/rocRAND.git GIT_TAG develop GIT_SHALLOW TRUE INSTALL_DIR ${ROCRAND_ROOT} LIST_SEPARATOR | CMAKE_ARGS -DCMAKE_CXX_COMPILER=hipcc -DBUILD_TEST=OFF -DCMAKE_INSTALL_PREFIX= -DCMAKE_PREFIX_PATH=/opt/rocm ${EXTRA_CMAKE_ARGS} LOG_DOWNLOAD TRUE LOG_CONFIGURE TRUE LOG_BUILD TRUE LOG_INSTALL TRUE LOG_OUTPUT_ON_FAILURE TRUE BUILD_PROJECT TRUE UPDATE_DISCONNECTED TRUE ) find_package(rocrand REQUIRED CONFIG PATHS ${ROCRAND_ROOT}) endif() # Restore user global state set(CMAKE_CXX_FLAGS ${USER_CXX_FLAGS}) if(DEFINED USER_BUILD_SHARED_LIBS) set(BUILD_SHARED_LIBS ${USER_BUILD_SHARED_LIBS}) else() unset(BUILD_SHARED_LIBS CACHE ) endif() set(ROCM_WARN_TOOLCHAIN_VAR ${USER_ROCM_WARN_TOOLCHAIN_VAR} CACHE BOOL "") include(ROCMSetupVersion) include(ROCMCreatePackage) include(ROCMInstallTargets) 
include(ROCMPackageConfigHelpers) include(ROCMInstallSymlinks) include(ROCMCheckTargetIds) include(ROCMClients) if(BUILD_DOCS) include(ROCMSphinxDoc) endif() rocPRIM-rocm-7.1.0/cmake/DownloadProject.CMakeLists.cmake.in000066400000000000000000000020011506507210100235010ustar00rootroot00000000000000# Distributed under the OSI-approved MIT License. See accompanying # file LICENSE or https://github.com/Crascit/DownloadProject for details. cmake_minimum_required(VERSION 2.8.2) project(${DL_ARGS_PROJ}-download NONE) include(ExternalProject) if(${DL_ARGS_BUILD_PROJECT}) ExternalProject_Add(${DL_ARGS_PROJ}-download ${DL_ARGS_UNPARSED_ARGUMENTS} SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" BUILD_IN_SOURCE TRUE TEST_COMMAND "" ) else() ExternalProject_Add(${DL_ARGS_PROJ}-download ${DL_ARGS_UNPARSED_ARGUMENTS} SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" BUILD_IN_SOURCE TRUE TEST_COMMAND "" UPDATE_COMMAND "" CONFIGURE_COMMAND "" BUILD_COMMAND "" INSTALL_COMMAND "" ) endif() rocPRIM-rocm-7.1.0/cmake/DownloadProject.cmake000066400000000000000000000167451506507210100211220ustar00rootroot00000000000000# Distributed under the OSI-approved MIT License. See accompanying # file LICENSE or https://github.com/Crascit/DownloadProject for details. # # MODULE: DownloadProject # # PROVIDES: # download_project( PROJ projectName # [PREFIX prefixDir] # [DOWNLOAD_DIR downloadDir] # [SOURCE_DIR srcDir] # [BINARY_DIR binDir] # [QUIET] # ... # ) # # Provides the ability to download and unpack a tarball, zip file, git repository, # etc. at configure time (i.e. when the cmake command is run). How the downloaded # and unpacked contents are used is up to the caller, but the motivating case is # to download source code which can then be included directly in the build with # add_subdirectory() after the call to download_project(). Source and build # directories are set up with this in mind. # # The PROJ argument is required. 
The projectName value will be used to construct # the following variables upon exit (obviously replace projectName with its actual # value): # # projectName_SOURCE_DIR # projectName_BINARY_DIR # # The SOURCE_DIR and BINARY_DIR arguments are optional and would not typically # need to be provided. They can be specified if you want the downloaded source # and build directories to be located in a specific place. The contents of # projectName_SOURCE_DIR and projectName_BINARY_DIR will be populated with the # locations used whether you provide SOURCE_DIR/BINARY_DIR or not. # # The DOWNLOAD_DIR argument does not normally need to be set. It controls the # location of the temporary CMake build used to perform the download. # # The PREFIX argument can be provided to change the base location of the default # values of DOWNLOAD_DIR, SOURCE_DIR and BINARY_DIR. If all of those three arguments # are provided, then PREFIX will have no effect. The default value for PREFIX is # CMAKE_BINARY_DIR. # # The QUIET option can be given if you do not want to show the output associated # with downloading the specified project. # # In addition to the above, any other options are passed through unmodified to # ExternalProject_Add() to perform the actual download, patch and update steps. # # Only those ExternalProject_Add() arguments which relate to downloading, patching # and updating of the project sources are intended to be used. Also note that at # least one set of download-related arguments are required. # # If using CMake 3.2 or later, the UPDATE_DISCONNECTED option can be used to # prevent a check at the remote end for changes every time CMake is run # after the first successful download. See the documentation of the ExternalProject # module for more information. It is likely you will want to use this option if it # is available to you. 
Note, however, that the ExternalProject implementation contains # bugs which result in incorrect handling of the UPDATE_DISCONNECTED option when # using the URL download method or when specifying a SOURCE_DIR with no download # method. Fixes for these have been created, the last of which is scheduled for # inclusion in CMake 3.8.0. Details can be found here: # # https://gitlab.kitware.com/cmake/cmake/commit/bdca68388bd57f8302d3c1d83d691034b7ffa70c # https://gitlab.kitware.com/cmake/cmake/issues/16428 # # If you experience build errors related to the update step, consider avoiding # the use of UPDATE_DISCONNECTED. # # EXAMPLE USAGE: # # include(DownloadProject) # download_project(PROJ googletest # GIT_REPOSITORY https://github.com/google/googletest.git # GIT_TAG master # UPDATE_DISCONNECTED 1 # QUIET # ) # # add_subdirectory(${googletest_SOURCE_DIR} ${googletest_BINARY_DIR}) # #======================================================================================== set(_DownloadProjectDir "${CMAKE_CURRENT_LIST_DIR}") include(CMakeParseArguments) function(download_project) set(options QUIET) set(oneValueArgs PROJ PREFIX DOWNLOAD_DIR SOURCE_DIR BINARY_DIR BUILD_PROJECT ) set(multiValueArgs "") cmake_parse_arguments(DL_ARGS "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN}) # Hide output if requested if (DL_ARGS_QUIET) set(OUTPUT_QUIET "OUTPUT_QUIET") else() unset(OUTPUT_QUIET) message(STATUS "Downloading/updating ${DL_ARGS_PROJ}") endif() # Set up where we will put our temporary CMakeLists.txt file and also # the base point below which the default source and binary dirs will be. # The prefix must always be an absolute path. 
if (NOT DL_ARGS_PREFIX) set(DL_ARGS_PREFIX "${CMAKE_BINARY_DIR}") else() get_filename_component(DL_ARGS_PREFIX "${DL_ARGS_PREFIX}" ABSOLUTE BASE_DIR "${CMAKE_CURRENT_BINARY_DIR}") endif() if (NOT DL_ARGS_DOWNLOAD_DIR) set(DL_ARGS_DOWNLOAD_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-download") endif() # Ensure the caller can know where to find the source and build directories if (NOT DL_ARGS_SOURCE_DIR) set(DL_ARGS_SOURCE_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-src") endif() if (NOT DL_ARGS_BINARY_DIR) set(DL_ARGS_BINARY_DIR "${DL_ARGS_PREFIX}/${DL_ARGS_PROJ}-build") endif() set(${DL_ARGS_PROJ}_SOURCE_DIR "${DL_ARGS_SOURCE_DIR}" PARENT_SCOPE) set(${DL_ARGS_PROJ}_BINARY_DIR "${DL_ARGS_BINARY_DIR}" PARENT_SCOPE) # The way that CLion manages multiple configurations, it causes a copy of # the CMakeCache.txt to be copied across due to it not expecting there to # be a project within a project. This causes the hard-coded paths in the # cache to be copied and builds to fail. To mitigate this, we simply # remove the cache if it exists before we configure the new project. It # is safe to do so because it will be re-generated. Since this is only # executed at the configure step, it should not cause additional builds or # downloads. file(REMOVE "${DL_ARGS_DOWNLOAD_DIR}/CMakeCache.txt") # Create and build a separate CMake project to carry out the download. # If we've already previously done these steps, they will not cause # anything to be updated, so extra rebuilds of the project won't occur. # Make sure to pass through CMAKE_MAKE_PROGRAM in case the main project # has this set to something not findable on the PATH. configure_file("${_DownloadProjectDir}/DownloadProject.CMakeLists.cmake.in" "${DL_ARGS_DOWNLOAD_DIR}/CMakeLists.txt") execute_process(COMMAND ${CMAKE_COMMAND} -G "${CMAKE_GENERATOR}" -D "CMAKE_MAKE_PROGRAM:FILE=${CMAKE_MAKE_PROGRAM}" . 
RESULT_VARIABLE result ${OUTPUT_QUIET} WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}" ) if(result) message(FATAL_ERROR "CMake step for ${DL_ARGS_PROJ} failed: ${result}") endif() execute_process(COMMAND ${CMAKE_COMMAND} --build . RESULT_VARIABLE result ${OUTPUT_QUIET} WORKING_DIRECTORY "${DL_ARGS_DOWNLOAD_DIR}" ) if(result) message(FATAL_ERROR "Build step for ${DL_ARGS_PROJ} failed: ${result}") endif() endfunction() rocPRIM-rocm-7.1.0/cmake/GenerateResourceSpec.cmake000077500000000000000000000067141506507210100220770ustar00rootroot00000000000000#!/usr/bin/cmake -P find_program(ROCMINFO_EXECUTABLE rocminfo ) if(NOT ROCMINFO_EXECUTABLE) message(FATAL_ERROR "rocminfo not found") endif() execute_process( COMMAND ${ROCMINFO_EXECUTABLE} RESULT_VARIABLE ROCMINFO_EXIT_CODE OUTPUT_VARIABLE ROCMINFO_STDOUT ERROR_VARIABLE ROCMINFO_STDERR ) if(ROCMINFO_EXIT_CODE) message(SEND_ERROR "rocminfo exited with ${ROCMINFO_EXIT_CODE}") message(SEND_ERROR ${ROCMINFO_STDOUT}) message(FATAL_ERROR ${ROCMINFO_STDERR}) endif() string(REGEX MATCHALL [[--(gfx[0-9a-f]+)]] ROCMINFO_MATCHES ${ROCMINFO_STDOUT} ) # NOTE: Unfortunately we don't have structs in CMake, # neither do we have std::partition only list(SORT) # # Transform raw regex matches to pairs of gfx IP and device id # This will be our struct emulation. 
In C++ it would be # # struct device # { # std::string ip; # int id; # }; # # std::vector GFXIP_AND_ID{ {"gfx900",0},{"gfx803",1},{"gfx900",2} }; # std::sort(GFXIP_AND_ID.begin(), GFXIP_AND_ID.end(), # [](const device& lhs, const device& rhs) # { # return std::lexicographical_compare(lhs.ip.begin(), lhs.ip.end(), # rhs.ip.begin(), rhs.ip.end()); # }); # set(GFXIP_AND_ID) set(ID 0) foreach(ROCMINFO_MATCH IN LISTS ROCMINFO_MATCHES) string(REGEX REPLACE "--" "" ROCMINFO_MATCH ${ROCMINFO_MATCH} ) list(APPEND GFXIP_AND_ID "${ROCMINFO_MATCH}:${ID}") math(EXPR ID "${ID} + 1") endforeach() list(SORT GFXIP_AND_ID) # Now comes the tricky part: implementing the following C++ logic # # std::stringstream JSON_PAYLOAD; # auto it = GFXIP_AND_ID.begin(); # while (it != GFXIP_AND_ID.end()) # { # auto IT = std::find_if(it, GFXIP_AND_ID.end(), # [=](const device& ip_id){ return ip_id.ip.compare(it->ip) != 0; }); # JSON_PAYLOAD << "\n \"" << it->ip << "\": ["; # std::for_each(it, IT, [&](const device& ip_id) # { # JSON_PAYLOAD << # "\n {\n" << # " \"id\": \"" << ip_id.id << "\"\n" << # " },"; # }); # JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma # JSON_PAYLOAD << "\n ],"; # it = IT; # } # JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma # set(JSON_PAYLOAD) set(IT1 0) list(GET GFXIP_AND_ID ${IT1} I1) string(REGEX REPLACE ":[0-9a-f]+" "" IP1 ${I1}) list(LENGTH GFXIP_AND_ID COUNT) while(IT1 LESS COUNT) string(APPEND JSON_PAYLOAD "\n \"${IP1}\": [") set(IT2 ${IT1}) list(GET GFXIP_AND_ID ${IT2} I2) string(REGEX REPLACE [[:[0-9a-f]+$]] "" IP2 ${I2}) string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 ${I2}) while(${IP2} STREQUAL ${IP1} AND IT2 LESS COUNT) string(APPEND JSON_PAYLOAD "\n {\n" " \"id\": \"${ID2}\"\n" " }," ) math(EXPR IT2 "${IT2} + 1") if(IT2 LESS COUNT) list(GET GFXIP_AND_ID ${IT2} I2) string(REGEX REPLACE [[:[0-9a-f]+$]] "" IP2 ${I2}) string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 ${I2}) endif() endwhile() string(REGEX REPLACE [[,$]] 
"" JSON_PAYLOAD ${JSON_PAYLOAD}) string(APPEND JSON_PAYLOAD "\n ],") set(IT1 ${IT2}) set(IP1 ${IP2}) endwhile() string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD ${JSON_PAYLOAD}) set(JSON_HEAD [[{ "version": { "major": 1, "minor": 0 }, "local": [ {]] ) set(JSON_TAIL [[ } ] }]] ) file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/resources.json ${JSON_HEAD} ${JSON_PAYLOAD} ${JSON_TAIL} ) rocPRIM-rocm-7.1.0/cmake/Summary.cmake000066400000000000000000000111621506507210100174450ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2025 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
function(print_configuration_summary) find_package(Git) if(GIT_FOUND) execute_process( COMMAND ${GIT_EXECUTABLE} show --format=%H --no-patch WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} OUTPUT_VARIABLE COMMIT_HASH OUTPUT_STRIP_TRAILING_WHITESPACE ) execute_process( COMMAND ${GIT_EXECUTABLE} show --format=%s --no-patch WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} OUTPUT_VARIABLE COMMIT_SUBJECT OUTPUT_STRIP_TRAILING_WHITESPACE ) endif() execute_process( COMMAND ${CMAKE_CXX_COMPILER} --version WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} OUTPUT_VARIABLE CMAKE_CXX_COMPILER_VERBOSE_DETAILS OUTPUT_STRIP_TRAILING_WHITESPACE ) find_program(UNAME_EXECUTABLE uname) if(UNAME_EXECUTABLE) execute_process( COMMAND ${UNAME_EXECUTABLE} -a WORKING_DIRECTORY ${CMAKE_CURRENT_LIST_DIR} OUTPUT_VARIABLE LINUX_KERNEL_DETAILS OUTPUT_STRIP_TRAILING_WHITESPACE ) endif() string(REPLACE "\n" ";" CMAKE_CXX_COMPILER_VERBOSE_DETAILS "${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}") list(TRANSFORM CMAKE_CXX_COMPILER_VERBOSE_DETAILS PREPEND "-- ") string(REPLACE ";" "\n" CMAKE_CXX_COMPILER_VERBOSE_DETAILS "${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}") message(STATUS "") message(STATUS "******** Summary ********") message(STATUS "General:") message(STATUS " System : ${CMAKE_SYSTEM_NAME}") if(USE_HIPCXX) message(STATUS " HIP compiler : ${CMAKE_HIP_COMPILER}") message(STATUS " HIP compiler version : ${CMAKE_HIP_COMPILER_VERSION}") string(STRIP "${CMAKE_HIP_FLAGS}" CMAKE_HIP_FLAGS_STRIP) message(STATUS " HIP flags : ${CMAKE_HIP_FLAGS_STRIP}") else() message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS_STRIP) message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}") endif() get_property(GENERATOR_IS_MULTI_CONFIG GLOBAL PROPERTY GENERATOR_IS_MULTI_CONFIG) if(GENERATOR_IS_MULTI_CONFIG) message(STATUS " Build types : ${CMAKE_CONFIGURATION_TYPES}") else() message(STATUS " Build type : 
${CMAKE_BUILD_TYPE}") endif() message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}") if(USE_HIPCXX) message(STATUS " Device targets : ${CMAKE_HIP_ARCHITECTURES}") else() message(STATUS " Device targets : ${GPU_TARGETS}") endif() message(STATUS "") message(STATUS " ONLY_INSTALL : ${ONLY_INSTALL}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") message(STATUS " WITH_ROCRAND : ${WITH_ROCRAND}") message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}") message(STATUS " BUILD_NAIVE_BENCHMARK : ${BUILD_NAIVE_BENCHMARK}") message(STATUS " BUILD_EXAMPLE : ${BUILD_EXAMPLE}") message(STATUS " BUILD_DOCS : ${BUILD_DOCS}") message(STATUS " BUILD_OFFLOAD_COMPRESS : ${BUILD_OFFLOAD_COMPRESS}") message(STATUS " USE_SYSTEM_LIB : ${USE_SYSTEM_LIB}") message(STATUS "") message(STATUS "Detailed:") message(STATUS " C++ compiler details : \n${CMAKE_CXX_COMPILER_VERBOSE_DETAILS}") if(GIT_FOUND) message(STATUS " Commit : ${COMMIT_HASH}") message(STATUS " ${COMMIT_SUBJECT}") endif() if(UNAME_EXECUTABLE) message(STATUS " Unix name : ${LINUX_KERNEL_DETAILS}") endif() endfunction() rocPRIM-rocm-7.1.0/cmake/VerifyCompiler.cmake000066400000000000000000000033061506507210100207500ustar00rootroot00000000000000# MIT License # # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/llvm /opt/rocm/llvm /opt/rocm /opt/rocm/hip) find_package(hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm) if(NOT USE_HIPCXX) if(HIP_COMPILER STREQUAL "clang") if(NOT (HIP_CXX_COMPILER MATCHES ".*hipcc" OR HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")) message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.") endif() else() message(FATAL_ERROR "HIP_COMPILER must be 'clang' (AMD ROCm platform)") endif() endif() rocPRIM-rocm-7.1.0/common/000077500000000000000000000000001506507210100152155ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/common/README.md000066400000000000000000000016551506507210100165030ustar00rootroot00000000000000# Common utilities rocPRIM's tests and benchmarks employ numerous utilities that are common in implementation. This folder hosts these for an easier and less error-prone maintenance. ## When to add a common utility When adding a new test or benchmark that depends on a utility, the following cases must be considered: 1. If the utility is already implemented in some `common` header, then there's nothing to do except perhaps extending its functionality. 2. If the utility does not exist yet in any `common` header, then first it must be checked whether some `benchmark` or `test`[^1] utility header implements this functionality. If so, then it must be moved to the appropriate common header. 3.
If the utility does not exist yet in any `common` nor `test` nor `benchmark` utility header, then it must be added to the appropriate `test` or `benchmark` header. [^1]: When adding a new test check the `benchmark` utilities, and vice versa. rocPRIM-rocm-7.1.0/common/device_adjacent_difference.hpp000066400000000000000000000140761506507210100232000ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef COMMON_DEVICE_ADJACENT_DIFFERENCE_HPP_ #define COMMON_DEVICE_ADJACENT_DIFFERENCE_HPP_ #include #include #include #include #include namespace common { enum class api_variant { no_alias, alias, in_place }; template auto dispatch_adjacent_difference( std::true_type /*left*/, std::integral_constant /*aliasing*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&...
args) { return ::rocprim::adjacent_difference(temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference( std::false_type /*left*/, std::integral_constant /*aliasing*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) { return ::rocprim::adjacent_difference_right(temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference( std::true_type /*left*/, std::integral_constant /*aliasing*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... args) { return ::rocprim::adjacent_difference_inplace(temporary_storage, storage_size, input, std::forward(args)...); } template auto dispatch_adjacent_difference( std::false_type /*left*/, std::integral_constant /*aliasing*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... args) { return ::rocprim::adjacent_difference_right_inplace(temporary_storage, storage_size, input, std::forward(args)...); } template auto dispatch_adjacent_difference( std::true_type /*left*/, std::integral_constant /*aliasing*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) { return ::rocprim::adjacent_difference_inplace(temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference( std::false_type /*left*/, std::integral_constant /*aliasing*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... 
args) { return ::rocprim::adjacent_difference_right_inplace(temporary_storage, storage_size, input, output, std::forward(args)...); } } // namespace common #endif // COMMON_DEVICE_ADJACENT_DIFFERENCE_HPP_ rocPRIM-rocm-7.1.0/common/device_batch_memcpy.hpp000066400000000000000000000133541506507210100217060ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef COMMON_DEVICE_BATCH_MEMCPY_HPP_ #define COMMON_DEVICE_BATCH_MEMCPY_HPP_ #include #include #include #include #include #include #include #include #include namespace common { // Used for generating offsets. We generate a permutation map and then derive // offsets via a sum scan over the sizes in the order of the permutation. 
This // allows us to keep the order of buffers we pass to batch_memcpy, but still // have source and destinations mappings not be the identity function: // // batch_memcpy( // [&a0 , &b0 , &c0 , &d0 ], // from (note the order is still just a, b, c, d!) // [&a0', &b0', &c0', &d0'], // to (order is the same as above too!) // [3 , 2 , 1 , 2 ]) // size // // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │b0 │b1 │a0 │a1 │a2 │d0 │d1 │c0 │ buffer x contains buffers a, b, c, d // └───┴───┴───┴───┴───┴───┴───┴───┘ note that the order of buffers is shuffled! // ───┬─── ─────┬───── ───┬─── ─── // └─────────┼─────────┼───┐ // ┌───┘ ┌───┘ │ what batch_memcpy does // ▼ ▼ ▼ // ─── ─────────── ─────── ─────── // ┌───┬───┬───┬───┬───┬───┬───┬───┐ // │c0'│a0'│a1'│a2'│d0'│d1'│b0'│b1'│ buffer y contains buffers a', b', c', d' // └───┴───┴───┴───┴───┴───┴───┴───┘ template std::vector shuffled_exclusive_scan(const std::vector& input, RandomGenerator& rng) { const auto n = input.size(); assert(n > 0); std::vector result(n); std::vector permute(n); std::iota(permute.begin(), permute.end(), 0); std::shuffle(permute.begin(), permute.end(), rng); T sum = 0; for(size_t i = 0; i < n; ++i) { result[permute[i]] = sum; sum += input[permute[i]]; } return result; } template::type = 0> void init_input(ContainerMemCpy& h_input_for_memcpy, ContainerCopy& /*h_input_for_copy*/, std::mt19937_64& rng, byte_offset_type total_num_bytes) { std::independent_bits_engine bits_engine{rng}; const size_t num_ints = rocprim::detail::ceiling_div(total_num_bytes, sizeof(uint64_t)); h_input_for_memcpy = std::vector(num_ints * sizeof(uint64_t)); // generate_n for uninitialized memory, pragmatically use placement-new, since there are no // uint64_t objects alive yet in the storage. 
std::for_each( reinterpret_cast(h_input_for_memcpy.data()), reinterpret_cast(h_input_for_memcpy.data() + num_ints * sizeof(uint64_t)), [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); } template::type = 0> void init_input(ContainerMemCpy& /*h_input_for_memcpy*/, ContainerCopy& h_input_for_copy, std::mt19937_64& rng, byte_offset_type total_num_bytes) { using value_type = typename ContainerCopy::value_type; std::independent_bits_engine bits_engine{rng}; const size_t num_ints = rocprim::detail::ceiling_div(total_num_bytes, sizeof(uint64_t)); const size_t num_of_elements = rocprim::detail::ceiling_div(num_ints * sizeof(uint64_t), sizeof(value_type)); h_input_for_copy = std::vector(num_of_elements); // generate_n for uninitialized memory, pragmatically use placement-new, since there are no // uint64_t objects alive yet in the storage. std::for_each(reinterpret_cast(h_input_for_copy.data()), reinterpret_cast(h_input_for_copy.data()) + num_ints, [&bits_engine](uint64_t& elem) { ::new(&elem) uint64_t{bits_engine()}; }); } } // namespace common #endif // COMMON_DEVICE_BATCH_MEMCPY_HPP_ rocPRIM-rocm-7.1.0/common/predicate_iterator.hpp000066400000000000000000000027561506507210100216110ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef COMMON_PREDICATE_ITERATOR_HPP_ #define COMMON_PREDICATE_ITERATOR_HPP_ namespace common { template struct increment_by { template __host__ __device__ T constexpr operator()(const T& value) const { return value + T{V}; } }; } // namespace common #endif // COMMON_PREDICATE_ITERATOR_HPP_ rocPRIM-rocm-7.1.0/common/utils.hpp000066400000000000000000000104521506507210100170700ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef COMMON_UTILS_HPP_ #define COMMON_UTILS_HPP_ #include #ifdef USE_GTEST // GoogleTest-compatible HIP_CHECK macro. FAIL is called to log the Google Test trace. // The lambda is invoked immediately as assertions that generate a fatal failure can // only be used in void-returning functions. #define HIP_CHECK(condition) \ { \ hipError_t error = condition; \ if(error != hipSuccess) \ { \ [error]() \ { FAIL() << "HIP error " << error << ": " << hipGetErrorString(error); }(); \ exit(error); \ } \ } #else #define HIP_CHECK(condition) \ { \ hipError_t error = condition; \ if(error != hipSuccess) \ { \ std::cout << "HIP error: " << hipGetErrorString(error) << " file: " << __FILE__ \ << " line: " << __LINE__ << std::endl; \ exit(error); \ } \ } #endif namespace common { template __device__ constexpr bool device_test_enabled_for_warp_size_v = ::rocprim::arch::wavefront::max_size() >= LogicalWarpSize; inline char* __get_env(const char* name) { char* env; #ifdef _MSC_VER errno_t err = _dupenv_s(&env, nullptr, name); if(err) { return nullptr; } #else env = std::getenv(name); #endif return env; } inline void clean_env(char* env) { #ifdef _MSC_VER free(env); #endif (void)env; } inline bool use_hmm() { char* env = __get_env("ROCPRIM_USE_HMM"); const bool hmm = (env != nullptr) && (strcmp(env, "1") == 0); clean_env(env); return hmm; } // Helper for HMM allocations: HMM is requested through ROCPRIM_USE_HMM=1 environment variable template hipError_t hipMallocHelper(T** devPtr, size_t size) { if(use_hmm()) { return hipMallocManaged(reinterpret_cast(devPtr), size); } else { return hipMalloc(reinterpret_cast(devPtr), size); } return hipSuccess; } } // namespace common #endif // COMMON_UTILS_HPP_ 
rocPRIM-rocm-7.1.0/common/utils_custom_type.hpp000066400000000000000000000205211506507210100215210ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef COMMON_UTILS_CUSTOM_TYPE_HPP_ #define COMMON_UTILS_CUSTOM_TYPE_HPP_ #include #include #include #include namespace common { template struct custom_type { using first_type = T; using second_type = U; // value_type is valid if T == U using value_type = std::conditional_t::value, T, void>; T x; U y; // Non-zero values in default constructor for checking reduce and scan: // ensure that scan_op(custom_type(), value) != value ROCPRIM_HOST_DEVICE constexpr inline custom_type() : x(NonZero ? 12 : 0), y(NonZero ? 
34 : 0) {} ROCPRIM_HOST_DEVICE inline custom_type(T x, U y) : x(x), y(y) {} ROCPRIM_HOST_DEVICE inline custom_type(T xy) : x(xy), y(xy) {} template ROCPRIM_HOST_DEVICE inline custom_type(const custom_type& other) : x(static_cast(other.x)), y(static_cast(other.y)) {} ROCPRIM_HOST_DEVICE inline ~custom_type() = default; ROCPRIM_HOST_DEVICE inline custom_type operator+(const custom_type& other) const { rocprim::plus plus_T; rocprim::plus plus_U; return custom_type{plus_T(x, other.x), plus_U(y, other.y)}; } ROCPRIM_HOST_DEVICE inline custom_type operator-(const custom_type& other) const { rocprim::minus minus_T; rocprim::minus minus_U; return custom_type(minus_T(x, other.x), minus_U(y, other.y)); } ROCPRIM_HOST_DEVICE inline custom_type& operator=(const custom_type& other) { x = other.x; y = other.y; return *this; } ROCPRIM_HOST_DEVICE inline custom_type& operator+=(const custom_type& other) { x += other.x; y += other.y; return *this; } ROCPRIM_HOST_DEVICE inline bool operator<(const custom_type& other) const { rocprim::less less_T; rocprim::equal_to equal_to_T; rocprim::less less_U; return (less_T(x, other.x) || (equal_to_T(x, other.x) && less_U(y, other.y))); } ROCPRIM_HOST_DEVICE inline bool operator>(const custom_type& other) const { rocprim::greater greater_T; rocprim::equal_to equal_to_T; rocprim::greater greater_U; return (greater_T(x, other.x) || (equal_to_T(x, other.x) && greater_U(y, other.y))); } ROCPRIM_HOST_DEVICE inline bool operator==(const custom_type& other) const { rocprim::equal_to equal_to_T; rocprim::equal_to equal_to_U; return (equal_to_T(x, other.x) && equal_to_U(y, other.y)); } ROCPRIM_HOST_DEVICE inline bool operator!=(const custom_type& other) const { return !(*this == other); } friend inline std::ostream& operator<<(std::ostream& stream, const custom_type& value) { stream << "[" << value.x << "; " << value.y << "]"; return stream; } }; template struct custom_type_copyable { using first_type = T; using second_type = U; using value_type = 
std::conditional_t::value, T, void>; T x; U y; ROCPRIM_HOST_DEVICE constexpr inline custom_type_copyable() : x(NonZero ? 12 : 0), y(NonZero ? 34 : 0) {} ROCPRIM_HOST_DEVICE inline custom_type_copyable(T x, U y) : x(x), y(y) {} ROCPRIM_HOST_DEVICE inline custom_type_copyable(T xy) : x(xy), y(xy) {} template ROCPRIM_HOST_DEVICE inline custom_type_copyable( const custom_type_copyable& other) : x(static_cast(other.x)), y(static_cast(other.y)) {} ROCPRIM_HOST_DEVICE inline bool operator<(const custom_type_copyable& other) const { rocprim::less less_T; rocprim::equal_to equal_to_T; rocprim::less less_U; return (less_T(x, other.x) || (equal_to_T(x, other.x) && less_U(y, other.y))); } ROCPRIM_HOST_DEVICE inline bool operator>(const custom_type_copyable& other) const { rocprim::greater greater_T; rocprim::equal_to equal_to_T; rocprim::greater greater_U; return (greater_T(x, other.x) || (equal_to_T(x, other.x) && greater_U(y, other.y))); } ROCPRIM_HOST_DEVICE inline bool operator==(const custom_type_copyable& other) const { rocprim::equal_to equal_to_T; rocprim::equal_to equal_to_U; return (equal_to_T(x, other.x) && equal_to_U(y, other.y)); } ROCPRIM_HOST_DEVICE inline bool operator!=(const custom_type_copyable& other) const { return !(*this == other); } friend inline std::ostream& operator<<(std::ostream& stream, const custom_type_copyable& value) { stream << "[" << value.x << "; " << value.y << "]"; return stream; } }; static_assert(std::is_trivially_copyable>::value, "custom_type_copyable is not trivially copyable"); template struct is_custom_type_copyable : std::false_type {}; template struct is_custom_type_copyable> : std::true_type {}; template struct custom_huge_type : custom_type { static constexpr auto extra_bytes = Size - sizeof(T) - sizeof(U); std::uint8_t data[extra_bytes]; // Non-zero values in default constructor for checking reduce and scan: // ensure that scan_op(custom_type(), value) != value ROCPRIM_HOST_DEVICE constexpr inline custom_huge_type() : 
custom_type() {} ROCPRIM_HOST_DEVICE inline custom_huge_type(T x, U y) : custom_type(x, y) {} ROCPRIM_HOST_DEVICE inline custom_huge_type(T xy) : custom_type(xy) {} template ROCPRIM_HOST_DEVICE inline custom_huge_type(const custom_type& other) : custom_type(other) {} template ROCPRIM_HOST_DEVICE inline custom_huge_type( const custom_huge_type& other) : custom_type(static_cast(other.x), static_cast(other.y)) {} friend inline std::ostream& operator<<(std::ostream& stream, const custom_huge_type& value) { stream << "[" << value.x << "; " << value.y << "]"; return stream; } }; template struct is_custom_type : std::false_type {}; template struct is_custom_type> : std::true_type {}; template struct is_custom_type> : std::true_type {}; template struct is_custom_type> : std::true_type {}; } // namespace common #endif // COMMON_UTILS_CUSTOM_TYPE_HPP_ rocPRIM-rocm-7.1.0/common/utils_data_generation.hpp000066400000000000000000000140441506507210100222750ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef COMMON_UTILS_DATA_GENERATION_HPP_ #define COMMON_UTILS_DATA_GENERATION_HPP_ #include #include #include #include #include #include namespace common { // uniform_int_distribution is undefined for anything other than: // short, int, long, long long, rocprim::int128_t, unsigned short, unsigned int, unsigned long, unsigned long long, or rocprim::uint128_t template struct is_valid_for_int_distribution : std::integral_constant< bool, std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value> {}; // uniform_int_distribution is defined for supporting rocprim::int128_t and rocprim::uint128_t template class uniform_int_distribution { public: typedef IntType result_type; uniform_int_distribution() : uniform_int_distribution(0) {} explicit uniform_int_distribution(IntType _a, IntType _b = rocprim::numeric_limits::max()) : lower_bound{_a}, upper_bound{_b} {} void reset() {} result_type a() const { return lower_bound; } result_type b() const { return upper_bound; } result_type min() const { return a(); } result_type max() const { return b(); } template result_type operator()(Generator& urng) { rocprim::uint128_t range = upper_bound - lower_bound + 1; auto offset = helper(urng, range); return offset + lower_bound; } friend bool operator==(const uniform_int_distribution& d1, const uniform_int_distribution& d2) { return d1.lower_bound == d2.lower_bound && d1.upper_bound == d2.upper_bound; } friend bool operator!=(const uniform_int_distribution& d1, const uniform_int_distribution& d2) { return !(d1 == d2); } // third constructor, param(), 
operator<< and operator>> are not defined private: // Java approach in the reference below. // Returns an unbiased random number from urng downscaled to [0, range) template static rocprim::uint128_t helper(Generator& urng, const rocprim::uint128_t& range) { // reference: Fast Random Integer Geeneration in an Interval // ACM Transactions on Modeling and Computer Simulation 29 (1), 2019 // https://arxiv.org/abs/1805.10941 static std::uniform_int_distribution dists[2]; auto random_number = rocprim::uint128_t{dists[0](urng)} << 64 | dists[1](urng); if(!range) { return random_number; } auto result = random_number % range; auto threshold = rocprim::numeric_limits::max() - range + 1; while(random_number - result > threshold) { random_number = rocprim::uint128_t{dists[0](urng)} << 64 | dists[1](urng); result = random_number % range; } return result; } IntType lower_bound; IntType upper_bound; }; template class uniform_int_distribution< IntType, std::enable_if_t<(!(std::is_same::value || std::is_same::value))>> : public std::uniform_int_distribution { public: using std::uniform_int_distribution::uniform_int_distribution; }; template struct generate_limits { static inline T min() { return rocprim::numeric_limits::min(); } static inline T max() { return rocprim::numeric_limits::max(); } }; template struct generate_limits< T, std::enable_if_t().is_build_in() && rocprim::is_integral::value>> { static inline T min() { return rocprim::numeric_limits::min(); } static inline T max() { return rocprim::numeric_limits::max(); } }; template struct generate_limits().is_build_in() && rocprim::is_floating_point::value>> { static inline T min() { return T(-1000); } static inline T max() { return T(1000); } }; template using it_value_t = typename std::iterator_traits::value_type; } // namespace common #endif // COMMON_UTILS_DATA_GENERATION_HPP_ rocPRIM-rocm-7.1.0/common/utils_device_ptr.hpp000066400000000000000000000536601506507210100213040ustar00rootroot00000000000000// Copyright (c) 
2021-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef ROCPRIM_UTILS_DEVICE_PTR_HPP #define ROCPRIM_UTILS_DEVICE_PTR_HPP #include "utils.hpp" #include #include #include #include namespace common { /// \brief An RAII friendly class to manage the memory allocated on device. /// /// \tparam A Template type used by the class. template class device_ptr { public: using decay_type = std::decay_t; using size_type = std::size_t; using value_type = ValueType; private: // If value_type is void we want to emulate allocating bytes (uchar). using value_type_proxy = std::conditional_t::value, unsigned char, ValueType>; public: static constexpr size_t value_size = sizeof(value_type_proxy); device_ptr() : device_raw_ptr_(nullptr), number_of_ele_(0){}; /// \brief Construct with a pre-allocated memory space. 
device_ptr(size_type pre_alloc_number_of_ele) : device_raw_ptr_(nullptr), number_of_ele_(pre_alloc_number_of_ele) { size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); }; device_ptr(device_ptr const&) = delete; device_ptr(device_ptr&& other) noexcept : device_raw_ptr_(other.device_raw_ptr_), number_of_ele_(other.number_of_ele_) { other.leak(); }; /// \brief Construct by host vectors with the same sized value_type template explicit device_ptr(std::vector const& data) : device_raw_ptr_(nullptr), number_of_ele_(data.size()) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpy(device_raw_ptr_, data.data(), storage_size, hipMemcpyHostToDevice)); } template explicit device_ptr(std::vector const& data, hipStream_t stream) : device_raw_ptr_(nullptr), number_of_ele_(data.size()) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, data.data(), storage_size, hipMemcpyHostToDevice, stream)); } template explicit device_ptr(std::array const& data) : device_raw_ptr_(nullptr), number_of_ele_(Size) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = Size * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpy(device_raw_ptr_, data.data(), storage_size, hipMemcpyHostToDevice)); } template explicit device_ptr(std::array const& data, hipStream_t stream) : device_raw_ptr_(nullptr), number_of_ele_(Size) { 
static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = Size * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, data.data(), storage_size, hipMemcpyHostToDevice, stream)); } template explicit device_ptr(std::unique_ptr const& uptr, size_type size) : device_raw_ptr_(nullptr), number_of_ele_(size) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = size * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpy(device_raw_ptr_, uptr.get(), storage_size, hipMemcpyHostToDevice)); } template explicit device_ptr(std::unique_ptr const& uptr, size_type size, hipStream_t stream) : device_raw_ptr_(nullptr), number_of_ele_(size) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); size_type storage_size = size * value_size; HIP_CHECK(common::hipMallocHelper(&device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, uptr.get(), storage_size, hipMemcpyHostToDevice, stream)); } ~device_ptr() { free_manually(); }; device_ptr& operator=(device_ptr const&) = delete; device_ptr& operator=(device_ptr&& other) noexcept { free_manually(); device_raw_ptr_ = other.device_raw_ptr_; number_of_ele_ = other.number_of_ele_; other.leak(); return *this; }; /// \brief Do copy on the device. /// /// \return A new `device_ptr` rvalue. 
device_ptr duplicate() const { device_ptr ret; ret.number_of_ele_ = number_of_ele_; size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&ret.device_raw_ptr_, storage_size)); HIP_CHECK( hipMemcpy(ret.device_raw_ptr_, device_raw_ptr_, storage_size, hipMemcpyDeviceToDevice)); return ret; } device_ptr duplicate_async(hipStream_t stream) const { device_ptr ret; ret.number_of_ele_ = number_of_ele_; size_type storage_size = number_of_ele_ * value_size; HIP_CHECK(common::hipMallocHelper(&ret.device_raw_ptr_, storage_size)); HIP_CHECK(hipMemcpyAsync(ret.device_raw_ptr_, device_raw_ptr_, storage_size, hipMemcpyDeviceToDevice, stream)); return ret; } /// \brief Do type cast and move the ownership to the new `device_ptr`. /// /// \return A new `device_ptr` rvalue. template device_ptr move_cast() noexcept { using target_value_t = typename device_ptr::value_type; auto ret_deivce_raw_ptr_ = static_cast(static_cast(device_raw_ptr_)); auto ret_number_of_ele_ = value_size * number_of_ele_ / sizeof(target_value_t); leak(); return {ret_deivce_raw_ptr_, ret_number_of_ele_}; } /// \brief Get the device raw pointer value_type* get() const noexcept { return device_raw_ptr_; } /// \brief Clean every thing on this instance, which could lead to memory leak. 
Should call `get()` and free the raw pointer manually void leak() noexcept { device_raw_ptr_ = nullptr; number_of_ele_ = 0; } /// \brief Call this function to garbage the memory in advance void free_manually() { if(device_raw_ptr_ != nullptr) { HIP_CHECK(hipFree(device_raw_ptr_)); } leak(); } void resize(size_type new_number_of_ele) { if(new_number_of_ele == 0) { free_manually(); } else { value_type* device_temp_ptr = nullptr; HIP_CHECK(common::hipMallocHelper(&device_temp_ptr, new_number_of_ele * value_size)); HIP_CHECK(hipMemcpy(device_temp_ptr, device_raw_ptr_, std::min(new_number_of_ele, number_of_ele_) * value_size, hipMemcpyDeviceToDevice)); free_manually(); device_raw_ptr_ = device_temp_ptr; number_of_ele_ = new_number_of_ele; } } void resize_async(size_type new_number_of_ele, hipStream_t stream) { if(new_number_of_ele == 0) { free_manually(); } else { value_type* device_temp_ptr = nullptr; HIP_CHECK(common::hipMallocHelper(&device_temp_ptr, new_number_of_ele * value_size)); HIP_CHECK(hipMemcpyAsync(device_temp_ptr, device_raw_ptr_, std::min(new_number_of_ele, number_of_ele_) * value_size, hipMemcpyDeviceToDevice, stream)); free_manually(); device_raw_ptr_ = device_temp_ptr; number_of_ele_ = new_number_of_ele; } } // if got error hipErrorOutOfMemory` return false, else return `true` bool resize_with_memory_check(size_type new_number_of_ele) { if(new_number_of_ele == 0) { free_manually(); } else { value_type* device_temp_ptr = nullptr; const auto err = common::hipMallocHelper(&device_temp_ptr, new_number_of_ele * value_size); if(err == hipErrorOutOfMemory) { (void) hipGetLastError(); // reset internally recorded HIP error return false; } HIP_CHECK(err); HIP_CHECK(hipMemcpy(device_temp_ptr, device_raw_ptr_, std::min(new_number_of_ele, number_of_ele_) * value_size, hipMemcpyDeviceToDevice)); free_manually(); device_raw_ptr_ = device_temp_ptr; number_of_ele_ = new_number_of_ele; } return true; } bool resize_with_memory_check_async(size_type new_number_of_ele, 
hipStream_t stream) { if(new_number_of_ele == 0) { free_manually(); } else { value_type* device_temp_ptr = nullptr; const auto err = common::hipMallocHelper(&device_temp_ptr, new_number_of_ele * value_size); if(err == hipErrorOutOfMemory) { return false; } HIP_CHECK(err); HIP_CHECK(hipMemcpyAsync(device_temp_ptr, device_raw_ptr_, std::min(new_number_of_ele, number_of_ele_) * value_size, hipMemcpyDeviceToDevice, stream)); free_manually(); device_raw_ptr_ = device_temp_ptr; number_of_ele_ = new_number_of_ele; } return true; } /// \brief Get the size of this memory space size_type msize() const noexcept { return number_of_ele_ * value_size; } /// \brief Get the number of elements size_type size() const noexcept { return number_of_ele_; } /// \brief Copy from host to device template void store(std::vector const& host_vec, size_type offset = 0) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); if(host_vec.size() + offset > number_of_ele_) { resize(host_vec.size() + offset); } HIP_CHECK(hipMemcpy(device_raw_ptr_ + offset, host_vec.data(), host_vec.size() * value_size, hipMemcpyHostToDevice)); } template void store(std::array const& host_arr) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); if(Size > number_of_ele_) { resize(Size); } HIP_CHECK( hipMemcpy(device_raw_ptr_, host_arr.data(), Size * value_size, hipMemcpyHostToDevice)); } template void store(std::unique_ptr const& uptr, size_type offset, size_type number_of_ele) { static_assert( sizeof(InValueType) == value_size, "value_type of input unique_ptr must have the same size with device_ptr::value_type"); if(offset + number_of_ele > number_of_ele_) { resize(offset + number_of_ele); } HIP_CHECK(hipMemcpy(device_raw_ptr_ + offset, uptr.get(), number_of_ele * value_size, hipMemcpyHostToDevice)); } template void store_async(std::vector const& host_vec, hipStream_t 
stream) { static_assert( sizeof(InValueType) == value_size, "value_type of input vector must have the same size with device_ptr::value_type"); if(host_vec.size() > number_of_ele_) { resize(host_vec.size()); } HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, host_vec.data(), host_vec.size() * value_size, hipMemcpyHostToDevice, stream)); } template void store_async(std::array const& host_arr, hipStream_t stream) { static_assert(sizeof(InValueType) == value_size, "value_type of input must have the same size with device_ptr::value_type"); if(Size > number_of_ele_) { resize(Size); } HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, host_arr.data(), Size * value_size, hipMemcpyHostToDevice, stream)); } template void store_async(std::unique_ptr const& uptr, size_type offset, size_type number_of_ele, hipStream_t stream) { static_assert( sizeof(InValueType) == value_size, "value_type of input unique_ptr must have the same size with device_ptr::value_type"); if(offset + number_of_ele > number_of_ele_) { resize(offset + number_of_ele); } HIP_CHECK(hipMemcpyAsync(device_raw_ptr_ + offset, uptr.get(), number_of_ele * value_size, hipMemcpyHostToDevice, stream)); } // will not check the boundary void store_value_at(size_type pos, value_type_proxy const& value) { HIP_CHECK(hipMemcpy(device_raw_ptr_ + pos, &value, value_size, hipMemcpyHostToDevice)); } // will not check the boundary template void store_value_at_async(size_type pos, value_type_proxy const& value, hipStream_t stream) { HIP_CHECK( hipMemcpy(device_raw_ptr_ + pos, &value, value_size, hipMemcpyHostToDevice, stream)); } /// \brief Copy from device to device template void replace(device_ptr const& device_ptr) { static_assert(sizeof(InPtrValueType) == value_size, "sizeof(InPtrValueType) must equal to value_size"); if(device_ptr.number_of_ele_ > number_of_ele_) { resize(device_ptr.number_of_ele_); } HIP_CHECK(hipMemcpy(device_raw_ptr_, device_ptr.device_raw_ptr_, device_ptr.number_of_ele_ * value_size, hipMemcpyDeviceToDevice)); } 
template void replace_async(device_ptr const& device_ptr, hipStream_t stream) { static_assert(sizeof(InPtrValueType) == value_size, "sizeof(InPtrValueType) must equal to value_size"); if(device_ptr.number_of_ele_ > number_of_ele_) { resize(device_ptr.number_of_ele_); } HIP_CHECK(hipMemcpyAsync(device_raw_ptr_, device_ptr.device_raw_ptr_, device_ptr.number_of_ele_ * value_size, hipMemcpyDeviceToDevice, stream)); } void memset(size_type offset, int value, size_type size_bytes) { HIP_CHECK(hipMemset(reinterpret_cast(device_raw_ptr_) + offset, value, static_cast(size_bytes))); } void memset_async(size_type offset, int value, size_type size_bytes, hipStream_t stream) { HIP_CHECK(hipMemsetAsync(reinterpret_cast(device_raw_ptr_) + offset, value, static_cast(size_bytes), stream)); } /// \brief Copy from device to host /// This function will store loaded values into std::vector auto load() const { std::vector ret(number_of_ele_); HIP_CHECK(hipMemcpy(ret.data(), device_raw_ptr_, number_of_ele_ * value_size, hipMemcpyDeviceToHost)); return ret; } auto load_async(hipStream_t stream) const { std::vector ret(number_of_ele_); HIP_CHECK(hipMemcpyAsync(ret.data(), device_raw_ptr_, number_of_ele_ * value_size, hipMemcpyDeviceToHost, stream)); return ret; } template auto load_to_array() const { std::array ret; HIP_CHECK(hipMemcpy(ret.data(), device_raw_ptr_, std::min(number_of_ele_, Size) * value_size, hipMemcpyDeviceToHost)); return ret; } template auto load_to_array_async(hipStream_t stream) const { std::array ret; HIP_CHECK(hipMemcpyAsync(ret.data(), device_raw_ptr_, std::min(number_of_ele_, Size) * value_size, hipMemcpyDeviceToHost, stream)); return ret; } auto load_to_unique_ptr() const { std::unique_ptr ret(new value_type[number_of_ele_]); HIP_CHECK(hipMemcpy(ret.get(), device_raw_ptr_, number_of_ele_ * value_size, hipMemcpyDeviceToHost)); return ret; } auto load_to_unique_ptr_async(hipStream_t stream) const { std::unique_ptr ret(new value_type[number_of_ele_]); 
HIP_CHECK(hipMemcpyAsync(ret.get(), device_raw_ptr_, number_of_ele_ * value_size, hipMemcpyDeviceToHost, stream)); return ret; } auto load_value_at(size_type pos) const { value_type ret; HIP_CHECK(hipMemcpy(&ret, device_raw_ptr_ + pos, value_size, hipMemcpyDeviceToHost)); return ret; } auto load_value_at_async(size_type pos, hipStream_t stream) const { value_type ret; HIP_CHECK( hipMemcpyAsync(&ret, device_raw_ptr_ + pos, value_size, hipMemcpyDeviceToHost, stream)); return ret; } private: value_type* device_raw_ptr_; size_type number_of_ele_; }; } // namespace common #endif rocPRIM-rocm-7.1.0/common/utils_half.hpp000066400000000000000000000036701506507210100200660ustar00rootroot00000000000000// MIT License // // Copyright (c) 2021-2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef COMMON_UTILS_HALF_HPP_ #define COMMON_UTILS_HALF_HPP_ #include #include #include #include namespace common { // Support half operators on host side ROCPRIM_HOST inline rocprim::native_half half_to_native(const rocprim::half& x) { return *reinterpret_cast(&x); } ROCPRIM_HOST inline rocprim::half native_to_half(const rocprim::native_half& x) { return *reinterpret_cast(&x); } } // namespace common // For better Google Test reporting and debug output of half values inline std::ostream& operator<<(std::ostream& stream, const rocprim::half& value) { stream << static_cast(value); return stream; } #endif // COMMON_UTILS_HALF_HPP_ rocPRIM-rocm-7.1.0/common/warp_exchange.hpp000066400000000000000000000113571506507210100205500ustar00rootroot00000000000000// MIT License // // Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef COMMON_WARP_EXCHANGE_HPP_ #define COMMON_WARP_EXCHANGE_HPP_ #include namespace common { struct BlockedToStripedOp { template ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&input_data)[ItemsPerThread], T (&output_data)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) const { warp_exchange.blocked_to_striped(input_data, output_data, storage); } template ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&thread_data)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) const { warp_exchange.blocked_to_striped(thread_data, thread_data, storage); } }; struct BlockedToStripedShuffleOp { template ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&input_data)[ItemsPerThread], T (&output_data)[ItemsPerThread], typename warp_exchange_type::storage_type& /*storage*/) const { warp_exchange.blocked_to_striped_shuffle(input_data, output_data); } template ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&thread_data)[ItemsPerThread], typename warp_exchange_type::storage_type& /*storage*/) const { warp_exchange.blocked_to_striped_shuffle(thread_data, thread_data); } }; struct StripedToBlockedOp { template ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&input_data)[ItemsPerThread], T (&output_data)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) const { warp_exchange.striped_to_blocked(input_data, output_data, storage); } template ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&thread_data)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) const { warp_exchange.striped_to_blocked(thread_data, thread_data, storage); } }; struct StripedToBlockedShuffleOp { template ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&input_data)[ItemsPerThread], T 
(&output_data)[ItemsPerThread], typename warp_exchange_type::storage_type& /*storage*/) const { warp_exchange.striped_to_blocked_shuffle(input_data, output_data); } template ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&thread_data)[ItemsPerThread], typename warp_exchange_type::storage_type& /*storage*/) const { warp_exchange.striped_to_blocked_shuffle(thread_data, thread_data); } }; } // namespace common #endif // COMMON_WARP_EXCHANGE_HPP_ rocPRIM-rocm-7.1.0/conanfile.py000066400000000000000000000012051506507210100162330ustar00rootroot00000000000000# Copyright 2021 Advanced Micro Devices, Inc. # This conanfile is used to install development requirements, # e.g. # conan install -o clients=True -if build/deps . from conans import ConanFile, CMake class ConanPkgReqs(ConanFile): settings = "os", "compiler", "build_type", "arch" generators = "cmake_find_package" options = { "shared": [True, False], "clients": [True, False], } default_options = { "shared": True, "clients": False, } def requirements(self): if self.options.clients: self.requires("gtest/1.11.0") self.requires("benchmark/1.5.2") rocPRIM-rocm-7.1.0/custom.properties000066400000000000000000000001361506507210100173550ustar00rootroot00000000000000booktitle=rocPRIM API Guide spreadsheet.xml=docs/classification-map.xml document.locale=enusrocPRIM-rocm-7.1.0/docs/000077500000000000000000000000001506507210100146555ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/docs/.gitignore000066400000000000000000000001171506507210100166440ustar00rootroot00000000000000/_build/ /_doxygen/ /doxygen/html /doxygen/xml /doxygen/*.tag /sphinx/_toc.yml rocPRIM-rocm-7.1.0/docs/CMakeLists.txt000066400000000000000000000025701506507210100174210ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2024 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # rocPRIM documentation include(GNUInstallDirs) rocm_add_sphinx_doc( "${CMAKE_CURRENT_SOURCE_DIR}" BUILDER html OUTPUT_DIR html USES_DOXYGEN ) install( DIRECTORY "${CMAKE_CURRENT_BINARY_DIR}/html" DESTINATION ${CMAKE_INSTALL_DOCDIR} ) rocPRIM-rocm-7.1.0/docs/block_ops/000077500000000000000000000000001506507210100166305ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/docs/block_ops/data_mov_funcs.rst000066400000000000000000000047321506507210100223600ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _data_mov_funcs: ******************************************************************** Data movement functions ******************************************************************** Direct Blocked =============== Load ------ .. 
doxygenfunction:: rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_load_direct_blocked(unsigned int flat_id, InputIterator block_input, T (&items)[ItemsPerThread], unsigned int valid) .. doxygenfunction:: rocprim::block_load_direct_blocked (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds) Store ---------- .. doxygenfunction:: rocprim::block_store_direct_blocked (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_store_direct_blocked (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid) Direct Blocked Vectorized =========================== Load ------- .. doxygenfunction:: rocprim::block_load_direct_blocked_vectorized (unsigned int flat_id, T *block_input, U(&items)[ItemsPerThread]) Store ---------- .. doxygenfunction:: rocprim::block_store_direct_blocked_vectorized (unsigned int flat_id, T *block_output, U(&items)[ItemsPerThread]) Direct Striped ================== Load --------- .. doxygenfunction:: rocprim::block_load_direct_striped (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_load_direct_striped (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid) .. doxygenfunction:: rocprim::block_load_direct_striped (unsigned int flat_id, InputIterator block_input, T(&items)[ItemsPerThread], unsigned int valid, Default out_of_bounds) Store ---------- .. doxygenfunction:: rocprim::block_store_direct_striped (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread]) .. doxygenfunction:: rocprim::block_store_direct_striped (unsigned int flat_id, OutputIterator block_output, T(&items)[ItemsPerThread], unsigned int valid) Direct Warp Striped ==================== Load --------- .. 
doxygengroup:: blockmodule_warp_load_functions :content-only: Store ---------- .. doxygengroup:: blockmodule_warp_store_functions :content-only: rocPRIM-rocm-7.1.0/docs/block_ops/index.rst000066400000000000000000000011441506507210100204710ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _block-index: ******************************************************************** Block-Wide Operations ******************************************************************** * :ref:`class-index` * :ref:`blk-load` * :ref:`blk-store` * :ref:`blk-adjacent_difference` * :ref:`blk-discontinuity` * :ref:`blk-scan` * :ref:`blk-reduce` * :ref:`blk-shuffle` * :ref:`blk-exchange` * :ref:`blk-sort` * :ref:`blk-histogram` * :ref:`data_mov_funcs` rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/000077500000000000000000000000001506507210100211465ustar00rootroot00000000000000rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/adjacent_difference.rst000066400000000000000000000005721506507210100256270ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-adjacent_difference: ******************************************************************** Adjacent difference ******************************************************************** .. doxygenclass:: rocprim::block_adjacent_difference :members: rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/discontinuity.rst000066400000000000000000000005501506507210100246050ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-discontinuity: ******************************************************************** Discontinuity ******************************************************************** .. 
doxygenclass:: rocprim::block_discontinuity :members: rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/exchange.rst000066400000000000000000000005311506507210100234610ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-exchange: ******************************************************************** Exchange ******************************************************************** .. doxygenclass:: rocprim::block_exchange :members: rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/histogram.rst000066400000000000000000000006721506507210100237020ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-histogram: ******************************************************************** Histogram ******************************************************************** Class ========= .. doxygenclass:: rocprim::block_histogram :members: Algorithms =========== .. doxygenenum:: rocprim::block_histogram_algorithm rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/index.rst000066400000000000000000000010311506507210100230020ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _class-index: ******************************************************************** Operation classes ******************************************************************** * :ref:`blk-load` * :ref:`blk-store` * :ref:`blk-adjacent_difference` * :ref:`blk-discontinuity` * :ref:`blk-scan` * :ref:`blk-reduce` * :ref:`blk-shuffle` * :ref:`blk-exchange` * :ref:`blk-sort` * :ref:`blk-histogram` rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/load.rst000066400000000000000000000006471506507210100226260ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. 
_blk-load: ******************************************************************** Load ******************************************************************** Class ========== .. doxygenclass:: rocprim::block_load :members: Algorithms ============== .. doxygenenum:: rocprim::block_load_method rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/reduce.rst000066400000000000000000000006601506507210100231510ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-reduce: ******************************************************************** Reduce ******************************************************************** Class ========== .. doxygenclass:: rocprim::block_reduce :members: Algorithms ============ .. doxygenenum:: rocprim::block_reduce_algorithm rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/run_length_decode.rst000066400000000000000000000005641506507210100253550ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-run_length_decode: ******************************************************************** Run-length decode ******************************************************************** .. doxygenclass:: rocprim::block_run_length_decode :members: rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/scan.rst000066400000000000000000000006471506507210100226330ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-scan: ******************************************************************** Scan ******************************************************************** Class ======= .. doxygenclass:: rocprim::block_scan :members: Algorithms ============== .. 
doxygenenum:: rocprim::block_scan_algorithm rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/shuffle.rst000066400000000000000000000005261506507210100233370ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-shuffle: ******************************************************************** Shuffle ******************************************************************** .. doxygenclass:: rocprim::block_shuffle :members: rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/sort.rst000066400000000000000000000007661506507210100227000ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-sort: ******************************************************************** Sort ******************************************************************** Generic Block Sort ================== .. doxygenclass:: rocprim::block_sort :members: .. doxygenenum:: rocprim::block_sort_algorithm Radix sort =========== .. doxygenclass:: rocprim::block_radix_sort :members: rocPRIM-rocm-7.1.0/docs/block_ops/ops_classes/store.rst000066400000000000000000000006441506507210100230400ustar00rootroot00000000000000.. meta:: :description: rocPRIM documentation and API reference library :keywords: rocPRIM, ROCm, API, documentation .. _blk-store: ******************************************************************** Store ******************************************************************** Class ====== .. doxygenclass:: rocprim::block_store :members: Algorithms =========== .. 
doxygenenum:: rocprim::block_store_method rocPRIM-rocm-7.1.0/docs/classification-map.xml000066400000000000000000000206731506507210100211550ustar00rootroot00000000000000 Kanika Yadav (external) Microsoft Office User 2020-09-25T06:54:04Z 2021-12-22T19:07:50Z 16.00 true 2021-02-23T09:13:03Z Standard 90c2fedb-0da6-4717-8531-d16a1b9930f4 45597f60-6e37-4be7-acfb-4c9e23b261ea 0 true 2022-01-14T16:33:39Z Privileged AMD Official Use Only-AIP 2.0 3dd8961f-e488-4e60-8e11-a82d994e183d 3ab6c0f7-c658-4f6f-bd9d-6ef921551ff7 1 14235 32767 32767 32767 False False Filename Title Categories Version Doc Type MAP rocm;hip-sdk;hip;gpu;amd;prim;rocprim 4-5 apply-ALL default rocPRIM API Guide reference