pax_global_header00006660000000000000000000000064144620146670014524gustar00rootroot0000000000000052 comment=b54aaa79eaafd351d7ce3373468211eb42ecf31a rocPRIM-rocm-5.7.1/000077500000000000000000000000001446201466700137475ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/.clang-format000066400000000000000000000076451446201466700163360ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 UseCRLF: false # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: true AlignArrayOfStructures: Right AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: false AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: Never AllowShortCaseLabelsOnASingleLine: true AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: Yes AttributeMacros: ['ROCPRIM_DEVICE', 'ROCPRIM_HOST', 'ROCPRIM_HOST_DEVICE', 'ROCPRIM_SHARED_MEMORY', 'ROCPRIM_KERNEL', 'ROCPRIM_INLINE'] BinPackArguments: false BinPackParameters: false BitFieldColonSpacing: Both # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: AfterCaseLabel: true AfterClass: true AfterControlStatement: Always AfterEnum: true AfterFunction: true AfterNamespace: true AfterStruct: true AfterUnion: true 
BeforeCatch: true BeforeElse: true AfterExternBlock: false BeforeCatch: true BeforeElse: true BeforeLambdaBody: true BeforeWhile: true IndentBraces: false SplitEmptyFunction: false SplitEmptyRecord: false SplitEmptyNamespace: false BreakBeforeBinaryOperators: All BreakBeforeTernaryOperators: true BreakConstructorInitializers: BeforeComma BreakInheritanceList: BeforeComma BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' CompactNamespaces: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true DeriveLineEnding: false DerivePointerAlignment: false EmptyLineAfterAccessModifier: Never EmptyLineBeforeAccessModifier: Always ExperimentalAutoDetectBinPacking: false FixNamespaceComments: true ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IfMacros: [] IncludeBlocks: Preserve IndentAccessModifiers: false IndentCaseBlocks: true IndentCaseLabels: true IndentExternBlock: NoIndent IndentPPDirectives: BeforeHash IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true LambdaBodyIndentation: Signature MacroBlockBegin: '' MacroBlockEnd: '' MaxEmptyLinesToKeep: 1 NamespaceIndentation: None PPIndentWidth: -1 PackConstructorInitializers: NextLine PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left QualifierAlignment: Leave ReferenceAlignment: Pointer ReflowComments: false ShortNamespaceLines: 0 SortIncludes: CaseSensitive SortUsingDeclarations: true SpaceAfterCStyleCast: false SpaceAfterLogicalNot: false SpaceAfterTemplateKeyword: false SpaceAroundPointerQualifiers: Default SpaceBeforeAssignmentOperators: true SpaceBeforeCaseColon: false SpaceBeforeCpp11BracedList: false SpaceBeforeCtorInitializerColon: true SpaceBeforeInheritanceColon: true SpaceBeforeParens: Never SpaceBeforeRangeBasedForLoopColon: true SpaceBeforeSquareBrackets: false SpaceInEmptyBlock: 
false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: Never SpacesInCStyleCastParentheses: false SpacesInConditionalStatement: false SpacesInContainerLiterals: true SpacesInParentheses: false SpacesInSquareBrackets: false --- rocPRIM-rocm-5.7.1/.githooks/000077500000000000000000000000001446201466700156545ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/.githooks/install000077500000000000000000000002121446201466700172430ustar00rootroot00000000000000#!/bin/sh cd "$(git rev-parse --git-dir)" cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" rocPRIM-rocm-5.7.1/.githooks/pre-commit000077500000000000000000000005161446201466700176600ustar00rootroot00000000000000#!/bin/sh # Redirect output to stderr. exec 1>&2 # Do the code format check if ! "$(git rev-parse --show-toplevel)/scripts/code-format/check-format.sh" HEAD --cached 1>&2; then printf " Pre-commit check failed, please fix the reported errors. Note: Use '\033[33mgit commit --no-verify\033[0m' to bypass checks.\n" exit 1 fi rocPRIM-rocm-5.7.1/.github/000077500000000000000000000000001446201466700153075ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/.github/ISSUE_TEMPLATE/000077500000000000000000000000001446201466700174725ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/.github/ISSUE_TEMPLATE/bug_report.md000066400000000000000000000022571446201466700221720ustar00rootroot00000000000000--- name: Bug report about: Create a report to help us improve title: '' labels: '' assignees: '' --- **Describe the bug** A clear and concise description of what the bug is. **To Reproduce** Steps to reproduce the behavior: 1. Install '...' version '...' 2. Run '...' with data '...' 3. See error on logfile '...' **Expected behavior** A clear and concise description of what you expected to happen. **Log-files** Add *full* logfiles to help explain your problem. 
**Environment** Make sure that ROCm is correctly installed and run the following command: ``` printf '=== environment\n' > environment.txt && printf '\n\n=== date\n' >> environment.txt && date >> environment.txt && printf '\n\n=== Linux Kernel\n' >> environment.txt && uname -a >> environment.txt && printf '\n\n=== rocm-smi' >> environment.txt && rocm-smi >> environment.txt && printf '\n\n' >> environment.txt && hipconfig >> environment.txt && printf '\n\n=== rocminfo\n' >> environment.txt && rocminfo >> environment.txt && printf '\n\n=== lspci VGA\n' >> environment.txt && lspci | grep -i vga >> environment.txt ``` Attach `environment.txt` **Additional context** Add any other context about the problem here. rocPRIM-rocm-5.7.1/.github/ISSUE_TEMPLATE/feature_request.md000066400000000000000000000011231446201466700232140ustar00rootroot00000000000000--- name: Feature request about: Suggest an idea for this project title: '' labels: '' assignees: '' --- **Is your feature request related to a problem? Please describe.** A clear and concise description of what the problem is. Ex. I'm always frustrated when [...] **Describe the solution you'd like** A clear and concise description of what you want to happen. **Describe alternatives you've considered** A clear and concise description of any alternative solutions or features you've considered. **Additional context** Add any other context or screenshots about the feature request here. rocPRIM-rocm-5.7.1/.github/dependabot.yml000066400000000000000000000010421446201466700201340ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. 
# Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/.sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" rocPRIM-rocm-5.7.1/.github/workflows/000077500000000000000000000000001446201466700173445ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/.github/workflows/docs.yaml000066400000000000000000000045551446201466700211710ustar00rootroot00000000000000name: Upload to the upload server # Controls when the workflow will run on: push: branches: [develop, master] tags: - rocm-5.* release: types: [published] # Allows you to run this workflow manually from the Actions tab workflow_dispatch: # A workflow run is made up of one or more jobs that can run sequentially or in parallel jobs: # This workflow contains a single job called "build" build: # The type of runner that the job will run on runs-on: ubuntu-latest # Steps represent a sequence of tasks that will be executed as part of the job steps: # Checks-out your repository under $GITHUB_WORKSPACE, so your job can access it - uses: actions/checkout@v2 - name: getting branch name shell: bash run: echo "##[set-output name=branch;]$(echo ${GITHUB_REF#refs/heads/})" id: branch_name - name: getting tag name shell: bash run: echo "##[set-output name=tag;]$(echo ${GITHUB_REF_NAME})" id: tag_name - name: zipping files run: zip -r ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip . 
-x '*.git*' '*.idea*' - name: echo-step run: echo "${{ github.event.release.target_commitish }}" - name: uploading archive to prod if: ${{ steps.branch_name.outputs.branch == 'master' || github.event.release.target_commitish == 'master'}} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.PROD_UPLOAD_URL }}' args: '-o ConnectTimeout=5' - name: uploading archive to staging if: ${{ steps.branch_name.outputs.branch == 'develop' || github.event.release.target_commitish == 'develop' }} uses: wlixcc/SFTP-Deploy-Action@v1.0 with: username: ${{ secrets.USERNAME }} server: ${{ secrets.SERVER }} ssh_private_key: ${{ secrets.SSH_PRIVATE_KEY }} local_path: ${{ github.event.repository.name }}_${{ steps.tag_name.outputs.tag }}.zip remote_path: '${{ secrets.STG_UPLOAD_URL }}' args: '-o ConnectTimeout=5' rocPRIM-rocm-5.7.1/.gitignore000066400000000000000000000011201446201466700157310ustar00rootroot00000000000000### Build dirs ### build/ # Created by https://www.gitignore.io/api/c++,cmake ### C++ ### # Prerequisites *.d # Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app ### CMake ### CMakeCache.txt CMakeFiles CMakeScripts Testing Makefile cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake build # End of https://www.gitignore.io/api/c++,cmake # VS Code # .vscode rocPRIM-rocm-5.7.1/.gitignore.develop000066400000000000000000000012401446201466700173710ustar00rootroot00000000000000### Build dirs ### build/ ### Docs dirs ### doc/html/ doc/xml/ doc/latex/ doc/*.tag # Created by https://www.gitignore.io/api/c++,cmake ### C++ ### # Prerequisites *.d # 
Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod *.smod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app ### CMake ### CMakeCache.txt CMakeFiles CMakeScripts Testing Makefile cmake_install.cmake install_manifest.txt compile_commands.json CTestTestfile.cmake build ### Gtilab CI ### .gitlab-ci-gputest.yml # End of https://www.gitignore.io/api/c++,cmake rocPRIM-rocm-5.7.1/.gitlab-ci.yml000066400000000000000000000354361446201466700164160ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. 
include: - project: 'amd/ci-templates' ref: main file: - /defaults.yaml - /deps-cmake.yaml - /deps-docs.yaml - /deps-rocm.yaml - /deps-vcpkg.yaml - /gpus-rocm.yaml - /rules.yaml stages: - lint - autotune - build - test - benchmark variables: PACKAGE_DIR: $BUILD_DIR/package AUTOTUNE_CONFIG_DIR: ${CI_PROJECT_DIR}/autotune_config clang-format: extends: - .deps:rocm stage: lint needs: [] tags: - rocm-build variables: CLANG_FORMAT: "/opt/rocm/llvm/bin/clang-format" GIT_CLANG_FORMAT: "/opt/rocm/llvm/bin/git-clang-format" rules: - if: '$CI_PIPELINE_SOURCE == "merge_request_event"' script: - cd $CI_PROJECT_DIR - git config --global --add safe.directory $CI_PROJECT_DIR - scripts/code-format/check-format.sh $CI_MERGE_REQUEST_DIFF_BASE_SHA --binary "$CLANG_FORMAT" .cmake-minimum-vcpkg: extends: - .deps:rocm - .deps:cmake-minimum - .deps:vcpkg before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] - !reference [".deps:vcpkg", before_script] - $VCPKG_DIR/vcpkg install gtest benchmark .cmake-minimum-apt: extends: - .deps:rocm - .deps:cmake-minimum before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] - $SUDO_CMD apt-get install -y -qq libgtest-dev libbenchmark-dev .build:vcpkg-apt: stage: build tags: - rocm-build extends: - .gpus:rocm-gpus - .rules:build # Missing -Werror and other diagnostic flags due to rocm-terminal sporting an old googletest APT package (Ubuntu 18.04). 
# Here we're only testing the consumption logic, and we want to avoid new errors breaking logic testing script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release "$(if [ -n "$VCPKG_DIR" ]; then echo "-DCMAKE_TOOLCHAIN_FILE=$VCPKG_DIR/scripts/buildsystems/vcpkg.cmake"; fi)" -D BUILD_TEST=ON -D BUILD_EXAMPLE=ON -D BUILD_BENCHMARK=ON -D GPU_TARGETS=$GPU_TARGETS -D AMDGPU_TEST_TARGETS=$GPU_TARGETS -S $CI_PROJECT_DIR -B $BUILD_DIR - cmake --build $BUILD_DIR --target test_basic build:cmake-minimum-vcpkg: stage: build needs: [] extends: - .cmake-minimum-vcpkg - .build:vcpkg-apt build:cmake-minimum-apt: stage: build needs: [] extends: - .cmake-minimum-apt - .build:vcpkg-apt .cmake-latest: extends: - .deps:rocm - .deps:cmake-latest before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-latest", before_script] .cmake-minimum: extends: - .deps:rocm - .deps:cmake-minimum before_script: - !reference [".deps:rocm", before_script] - !reference [".deps:cmake-minimum", before_script] .build:common: stage: build tags: - rocm-build extends: - .gpus:rocm-gpus - .rules:build script: - mkdir -p $BUILD_DIR - cd $BUILD_DIR - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_CXX_FLAGS="-Wall -Wextra -Werror" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=ON -D BUILD_EXAMPLE=ON -D BUILD_BENCHMARK=OFF -D GPU_TARGETS=$GPU_TARGETS -D AMDGPU_TEST_TARGETS=$GPU_TARGETS -S $CI_PROJECT_DIR -B $BUILD_DIR - cmake --build $BUILD_DIR artifacts: paths: - $BUILD_DIR/test/test_* - $BUILD_DIR/test/rocprim/test_* - $BUILD_DIR/test/CTestTestfile.cmake - $BUILD_DIR/test/rocprim/CTestTestfile.cmake - $BUILD_DIR/gtest/ - $BUILD_DIR/CMakeCache.txt - $BUILD_DIR/.ninja_log - $BUILD_DIR/CTestTestfile.cmake expire_in: 2 weeks build:cmake-latest: stage: build needs: [] extends: - .cmake-latest - .build:common build:cmake-minimum: stage: build needs: [] extends: - .cmake-minimum - .build:common build:package: stage: build needs: [] tags: - rocm-build 
extends: - .cmake-minimum - .gpus:rocm-gpus - .rules:build script: - mkdir -p $PACKAGE_DIR - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -B $PACKAGE_DIR -S $CI_PROJECT_DIR - cd $PACKAGE_DIR - cpack -G "DEB;ZIP" artifacts: paths: - $PACKAGE_DIR/rocprim*.deb - $PACKAGE_DIR/rocprim*.zip expire_in: 2 weeks build:benchmark: stage: build needs: - job: "autotune:generate-config" optional: true tags: - rocm-build extends: - .cmake-minimum - .gpus:rocm-gpus - .rules:build script: # If we have a custom config created by autotune:create-config - "[ -d ${AUTOTUNE_CONFIG_DIR} ] && cp -r -f ${AUTOTUNE_CONFIG_DIR}/* ${CI_PROJECT_DIR}/" - mkdir -p $BUILD_DIR - cd $BUILD_DIR - cmake -B $BUILD_DIR -S $CI_PROJECT_DIR -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=OFF -D BUILD_EXAMPLE=OFF -D BUILD_BENCHMARK=ON -D GPU_TARGETS=$GPU_TARGETS - cmake --build . artifacts: paths: - $BUILD_DIR/benchmark/* - $BUILD_DIR/.ninja_log - $BUILD_DIR/deps/googlebenchmark/ expire_in: 2 weeks autotune:build: stage: autotune needs: [] tags: - rocm-build extends: - .cmake-minimum - .gpus:rocm-gpus - .rules:manual variables: BENCHMARK_TARGETS: benchmark_config_tuning script: - mkdir -p $BUILD_DIR - cd $BUILD_DIR - 'printf "Building benchmark targets: %s\n" "$BENCHMARK_TARGETS"' - cmake -B $BUILD_DIR -S $CI_PROJECT_DIR -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -D BUILD_TEST=OFF -D BUILD_EXAMPLE=OFF -D BUILD_BENCHMARK=ON -D BENCHMARK_CONFIG_TUNING=ON -D GPU_TARGETS=$GPU_TARGETS - cmake --build . 
--target $BENCHMARK_TARGETS - 'rm -rf $BUILD_DIR/benchmark/benchmark*.parallel' artifacts: paths: - $BUILD_DIR/benchmark/benchmark* - $BUILD_DIR/.ninja_log - $BUILD_DIR/deps/googlebenchmark/ expire_in: 1 week test: stage: test extends: - .cmake-minimum - .rules:test - .gpus:rocm needs: - build:cmake-minimum script: - cd $BUILD_DIR - cmake -D CMAKE_PREFIX_PATH=/opt/rocm -P $CI_PROJECT_DIR/cmake/GenerateResourceSpec.cmake - cat ./resources.json - ctest --output-on-failure --repeat-until-fail 2 --tests-regex "hip|$GPU_TARGET" --resource-spec-file ./resources.json --parallel $PARALLEL_JOBS .test-package: script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -D GPU_TARGETS=$GPU_TARGETS -S "$CI_PROJECT_DIR/test/extra" -B "$CI_PROJECT_DIR/package_test" - cmake --build "$CI_PROJECT_DIR/package_test" - "$CI_PROJECT_DIR/package_test/test_rocprim_package" - cd "$CI_PROJECT_DIR/package_test" - ctest --output-on-failure --repeat-until-fail 2 test:install: stage: test needs: [] tags: - rocm extends: - .cmake-minimum - .rules:test - .gpus:rocm-gpus script: - cmake -G Ninja -D CMAKE_CXX_COMPILER="$AMDCLANG" -D CMAKE_BUILD_TYPE=Release -B build -S $CI_PROJECT_DIR - $SUDO_CMD cmake --build build --target install - !reference [.test-package, script] test:deb: stage: test needs: - build:package tags: - rocm extends: - .cmake-minimum - .rules:test - .gpus:rocm-gpus script: - $SUDO_CMD dpkg -i $PACKAGE_DIR/rocprim*.deb - !reference [.test-package, script] test:docs: stage: test extends: - .rules:test - .build:docs .benchmark-base: stage: benchmark extends: - .rules:benchmark variables: BENCHMARK_RESULT_DIR: ${CI_PROJECT_DIR}/benchmark_results BENCHMARK_RESULT_CACHE_DIR: ${BENCHMARK_RESULT_DIR}_cache benchmark: needs: - build:benchmark extends: - .cmake-minimum - .gpus:rocm - .benchmark-base variables: BENCHMARK_FILENAME_REGEX: ^benchmark BENCHMARK_ALGORITHM_REGEX: "benchmark_device_merge_sort_block_sort" script: - 'printf "CI Variables used in 
benchmarks:\nBENCHMARK_RESULT_DIR: %s\nBENCHMARK_FILENAME_REGEX: %s\nBENCHMARK_ALGORITHM_REGEX: %s \n" "$BENCHMARK_RESULT_DIR" "$BENCHMARK_FILENAME_REGEX" "$BENCHMARK_ALGORITHM_REGEX"' - cd "${CI_PROJECT_DIR}" - mkdir -p "${BENCHMARK_RESULT_DIR}" - python3 .gitlab/run_benchmarks.py --benchmark_dir "${BUILD_DIR}/benchmark" --benchmark_gpu_architecture "${GPU_TARGET}" --benchmark_output_dir "${BENCHMARK_RESULT_DIR}" --benchmark_filename_regex "${BENCHMARK_FILENAME_REGEX}" --benchmark_filter_regex "${BENCHMARK_ALGORITHM_REGEX}" artifacts: paths: - ${BENCHMARK_RESULT_DIR} expire_in: 1 week benchmark:cache-or-report: needs: - benchmark extends: - .benchmark-base tags: - single-cache cache: key: benchmark-cache paths: - ${BENCHMARK_RESULT_CACHE_DIR} script: # If on MR branch, generate report, else cache results - > if [ ! -z "${CI_MERGE_REQUEST_SOURCE_BRANCH_NAME}" ]; then if [ ! -d "${BENCHMARK_RESULT_CACHE_DIR}" ]; then echo 'ERROR: Cache directory does not exist' exit 1 elif [ ! -d "${BENCHMARK_RESULT_DIR}" ]; then echo 'ERROR: Benchmark results directory does not exist' exit 1 else echo 'INFO: Files in cache (reference benchmarks):' ls -al ${BENCHMARK_RESULT_CACHE_DIR} echo 'INFO: Generating report...' python3 .gitlab/generate_report.py --old ${BENCHMARK_RESULT_CACHE_DIR} --new ${BENCHMARK_RESULT_DIR} fi elif [ "${CI_COMMIT_BRANCH}" == "${CI_DEFAULT_BRANCH}" ]; then echo 'INFO: Caching benchmark results...' 
mkdir -p ${BENCHMARK_RESULT_CACHE_DIR} cp -R ${BENCHMARK_RESULT_DIR}/*.json ${BENCHMARK_RESULT_CACHE_DIR} else echo 'ERROR: Neither on a merge-request branch or the default branch' exit 1 fi .autotune-base: stage: autotune extends: - .rules:manual variables: AUTOTUNE_RESULT_DIR: ${CI_PROJECT_DIR}/autotune_results autotune:execute-tuning: needs: - autotune:build extends: - .autotune-base - .cmake-minimum - .gpus:rocm variables: AUTOTUNE_FILENAME_REGEX: ^benchmark AUTOTUNE_ALGORITHM_REGEX: "" timeout: 8h artifacts: paths: - ${AUTOTUNE_RESULT_DIR}/*.json script: - 'printf "CI Variables used in benchmarks:\nAUTOTUNE_RESULT_DIR: %s\nAUTOTUNE_FILENAME_REGEX: %s\nAUTOTUNE_ALGORITHM_REGEX: %s \n" "$AUTOTUNE_RESULT_DIR" "$AUTOTUNE_FILENAME_REGEX" "$AUTOTUNE_ALGORITHM_REGEX"' - cd "${CI_PROJECT_DIR}" - mkdir -p "${AUTOTUNE_RESULT_DIR}" - python3 .gitlab/run_benchmarks.py --benchmark_dir "${BUILD_DIR}/benchmark" --benchmark_gpu_architecture "${GPU_TARGET}" --benchmark_output_dir "${AUTOTUNE_RESULT_DIR}" --benchmark_filename_regex "${AUTOTUNE_FILENAME_REGEX}" --benchmark_filter_regex "${AUTOTUNE_ALGORITHM_REGEX}" autotune:generate-config: image: python:3.10.5-buster needs: - job: "autotune:execute-tuning" optional: true extends: - .rules:manual - .autotune-base variables: AUTOTUNE_CONFIG_REPO_PATH: /rocprim/include/rocprim/device/detail/config AUTOTUNE_RESULT_CACHE_DIR: ${AUTOTUNE_RESULT_DIR}_cache tags: - single-cache cache: key: autotune-cache paths: - autotune_results_cache/ script: # Set cache dir variables depending on if this is a MR or not - > if [ ! 
-z "${CI_MERGE_REQUEST_TARGET_BRANCH_NAME}" ]; then AUTOTUNE_RESULT_CACHE_BRANCH_DIR="${AUTOTUNE_RESULT_CACHE_DIR}/${CI_MERGE_REQUEST_SOURCE_BRANCH_NAME}" AUTOTUNE_RESULT_CACHE_TARGET_BRANCH_DIR="${AUTOTUNE_RESULT_CACHE_DIR}/${CI_MERGE_REQUEST_TARGET_BRANCH_NAME}" else AUTOTUNE_RESULT_CACHE_BRANCH_DIR="${AUTOTUNE_RESULT_CACHE_DIR}/${CI_COMMIT_BRANCH}" fi # If the global cache dir does not exist, create it - mkdir -p $AUTOTUNE_RESULT_CACHE_DIR # If there are fresh results in the artifacts, cache them in the branch cache # If there are no fresh results, check branch cache # If there are no branch cache results, check TARGET branch cache # If there are TARGET branch cache results, cache them in the branch cache - > if [ -d "$AUTOTUNE_RESULT_DIR" ]; then mkdir -p $AUTOTUNE_RESULT_CACHE_BRANCH_DIR cp -R -u ${AUTOTUNE_RESULT_DIR}/*.json ${AUTOTUNE_RESULT_CACHE_BRANCH_DIR} elif [ -d "$AUTOTUNE_RESULT_CACHE_BRANCH_DIR" ]; then mkdir -p $AUTOTUNE_RESULT_DIR cp -R -u ${AUTOTUNE_RESULT_CACHE_BRANCH_DIR}/*.json ${AUTOTUNE_RESULT_DIR} elif [ -d "$AUTOTUNE_RESULT_CACHE_TARGET_BRANCH_DIR" ]; then mkdir -p $AUTOTUNE_RESULT_DIR cp -R -u ${AUTOTUNE_RESULT_CACHE_TARGET_BRANCH_DIR}/*.json ${AUTOTUNE_RESULT_DIR} mkdir -p $AUTOTUNE_RESULT_CACHE_BRANCH_DIR cp -R -u ${AUTOTUNE_RESULT_DIR}/*.json ${AUTOTUNE_RESULT_CACHE_BRANCH_DIR} else echo 'ERROR: No autotune results found in previous artifacts, the branch cache or the target branch cache...' 
exit 1 fi # List the final .json files to use for config generation - ls -al ${AUTOTUNE_RESULT_DIR} - cd "${CI_PROJECT_DIR}" - python3 -m pip install jinja2 - mkdir -p ${AUTOTUNE_CONFIG_DIR}${AUTOTUNE_CONFIG_REPO_PATH} - python3 scripts/autotune/create_optimization.py --benchmark_files ${AUTOTUNE_RESULT_DIR}/*.json --out_basedir "${AUTOTUNE_CONFIG_DIR}${AUTOTUNE_CONFIG_REPO_PATH}" artifacts: paths: - ${AUTOTUNE_CONFIG_DIR} scheduled-check-changes: stage: autotune extends: .rules:scheduled-check-changes rocPRIM-rocm-5.7.1/.gitlab/000077500000000000000000000000001446201466700152675ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/.gitlab/generate_report.py000066400000000000000000000122301446201466700210240ustar00rootroot00000000000000#!/usr/bin/env python3 # Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
import json import argparse import os import re import stat import sys class bcolors: OKGREEN = '\033[92m' WARNING = '\033[93m' FAIL = '\033[91m' ENDC = '\033[0m' def load_benchmarks(benchmark_dir): def is_benchmark_json(filename): if not re.match(r'.*\.json$', filename): return False path = os.path.join(benchmark_dir, filename) st_mode = os.stat(path).st_mode # we are not interested in permissions, just whether it is a regular file (S_IFREG) return (st_mode & stat.S_IFREG) def add_results(results, file_path: str): """ Adds a single file to the results. The file contains the results of benchmarks executed on a single architecture. The benchmarks within the file may belong to different algorithms. """ with open(file_path, "r+") as file_handle: # Fix Google Benchmark comma issue contents = file_handle.read() contents = re.sub(r"(\s*\"[^\"]*\"[^,])(^\s*\"[^\"]*\":)", "\\1,\\2", contents, 0, re.MULTILINE) file_handle.seek(0) file_handle.write(contents) file_handle.truncate() with open(file_path) as file_handle: benchmark_run_data = json.load(file_handle) try: arch = benchmark_run_data['context']['hdp_gcn_arch_name'].split(":")[0] results.setdefault(arch, {}) for single_benchmark in benchmark_run_data['benchmarks']: name = single_benchmark['name'].replace('/manual_time','') name = re.sub(r"(^device.*?)(,\s[A-z_]*_config.*>)$", "\\1>", name, 0, re.MULTILINE) results[arch][name] = single_benchmark['bytes_per_second'] except KeyError as err: print(f'KeyError: {err}, while reading file: {file_path}', file=sys.stderr, flush=True) benchmark_names = [name for name in os.listdir(benchmark_dir) if is_benchmark_json(name)] print('The following benchmark results will be reported:\n{}'.format('\n'.join(benchmark_names))) # Results is: {arch : {algorithm : bytes_per_second}, ...} results = {} for benchmark_name in benchmark_names: path = os.path.join(benchmark_dir, benchmark_name) add_results(results, path) return results def compare_results(old, new): results = [] incomparable = 0 
for (arch, names) in new.items(): if arch in old: for (name, value_new) in names.items(): if name in old[arch]: results.append((f'{name} ({arch})', ((value_new - old[arch][name]) / old[arch][name]) * 100)) else: incomparable = incomparable + 1 if(incomparable > 0): print(f'Could not compare {incomparable} benchmarks.') print(f'----------------------------------------') success = True results.sort(key = lambda x: x[0]) for (name, difference) in results: if difference < -10: success = False print(f'{bcolors.FAIL}X {bcolors.ENDC} {name}: {bcolors.FAIL}{difference:.0f}{bcolors.ENDC}%') elif difference < -2: success = False print(f'{bcolors.WARNING}! {bcolors.ENDC} {name}: {bcolors.WARNING}{difference:.0f}{bcolors.ENDC}%') else: print(f'{bcolors.OKGREEN}OK{bcolors.ENDC} {name}: {bcolors.OKGREEN}{difference:.0f}{bcolors.ENDC}%') return success def main(): parser = argparse.ArgumentParser() parser.add_argument('--old', help='The local directory that contains the old benchmark json files', required=True) parser.add_argument('--new', help='The local directory that contains the new benchmark json files', required=True) args = parser.parse_args() old_benchmarks = load_benchmarks(args.old) new_benchmarks = load_benchmarks(args.new) return compare_results(old_benchmarks, new_benchmarks) if __name__ == '__main__': success = main() if success: exit(0) else: exit(1) rocPRIM-rocm-5.7.1/.gitlab/run_benchmarks.py000077500000000000000000000105161446201466700206500ustar00rootroot00000000000000#!/usr/bin/env python3 # Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
import argparse from collections import namedtuple import os import re import stat import subprocess import sys BenchmarkContext = namedtuple('BenchmarkContext', ['gpu_architecture', 'benchmark_output_dir', 'benchmark_dir', 'benchmark_filename_regex', 'benchmark_filter_regex']) def run_benchmarks(benchmark_context): def is_benchmark_executable(filename): if not re.match(benchmark_context.benchmark_filename_regex, filename): return False path = os.path.join(benchmark_context.benchmark_dir, filename) st_mode = os.stat(path).st_mode # we are not interested in permissions, just whether there is any execution flag set # and it is a regular file (S_IFREG) return (st_mode & (stat.S_IXUSR | stat.S_IXGRP | stat.S_IXOTH)) and (st_mode & stat.S_IFREG) success = True benchmark_names = [name for name in os.listdir(benchmark_context.benchmark_dir) if is_benchmark_executable(name)] print('The following benchmarks will be ran:\n{}'.format('\n'.join(benchmark_names)), file=sys.stderr, flush=True) for benchmark_name in benchmark_names: results_json_name = f'{benchmark_name}_{benchmark_context.gpu_architecture}.json' benchmark_path = os.path.join(benchmark_context.benchmark_dir, benchmark_name) results_json_path = os.path.join(benchmark_context.benchmark_output_dir, results_json_name) args = [ benchmark_path, '--name_format', 'json', '--benchmark_out_format=json', f'--benchmark_out={results_json_path}', f'--benchmark_filter={benchmark_context.benchmark_filter_regex}' ] try: subprocess.check_call(args) except subprocess.CalledProcessError as error: print(f'Could not run benchmark at {benchmark_path}. 
Error: "{error}"', file=sys.stderr, flush=True) success = False return success def main(): parser = argparse.ArgumentParser() parser.add_argument('--benchmark_dir', help='The local directory that contains the benchmark executables', required=True) parser.add_argument('--benchmark_gpu_architecture', help='The architecture of the currently enabled GPU', required=True) parser.add_argument('--benchmark_output_dir', help='The directory to write the benchmarks to', required=True) parser.add_argument('--benchmark_filename_regex', help='Regular expression that controls the list of benchmark executables to run', default=r'^benchmark', required=False) parser.add_argument('--benchmark_filter_regex', help='Regular expression that controls the list of benchmarks to run in each benchmark executable', default='', required=False) args = parser.parse_args() benchmark_context = BenchmarkContext( args.benchmark_gpu_architecture, args.benchmark_output_dir, args.benchmark_dir, args.benchmark_filename_regex, args.benchmark_filter_regex) benchmark_run_successful = run_benchmarks(benchmark_context) return benchmark_run_successful if __name__ == '__main__': success = main() if success: exit(0) else: exit(1) rocPRIM-rocm-5.7.1/.jenkins/000077500000000000000000000000001446201466700154665ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/.jenkins/common.groovy000066400000000000000000000057251446201466700202360ustar00rootroot00000000000000// This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. def runCompileCommand(platform, project, jobName, boolean debug=false) { project.paths.construct_build_prefix() String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' String cmake = platform.jenkinsLabel.contains('centos') ? 
'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${amdgpuTargets} -DBUILD_TEST=ON -DBUILD_BENCHMARK=ON ../.. make -j\$(nproc) """ platform.runCommand(this, command) } def runTestCommand (platform, project) { String sudo = auxiliary.sudo(platform.jenkinsLabel) def testCommand = "ctest --output-on-failure " def testCommandExcludeRegex = /(rocprim.device_reduce_by_key|rocprim.device_radix_sort)/ def testCommandExclude = "--exclude-regex \"${testCommandExcludeRegex}\"" def hmmExcludeRegex = /(rocprim.device_scan|rocprim.device_reduce_by_key|rocprim.block_sort_bitonic|rocprim.device_merge|rocprim.device_merge_sort|rocprim.device_partition|rocprim.device_segmented_radix_sort|rocprim.device_segmented_scan)/ def hmmTestCommandExclude = "--exclude-regex \"${hmmExcludeRegex}\"" def hmmTestCommand = '' if (platform.jenkinsLabel.contains('gfx90a')) { echo("HMM TESTS DISABLED") /*hmmTestCommand = """ export HSA_XNACK=1 export ROCPRIM_USE_HMM=1 ${testCommand} ${hmmTestCommandExclude} """*/ } echo(env.JOB_NAME) if (env.JOB_NAME.contains('bleeding-edge')) { testCommand = '' testCommandExclude = '' hmmTestCommand = '' echo("TESTS DISABLED") } def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} cd ${project.testDirectory} ${testCommand} ${testCommandExclude} if (( \$? 
!= 0 )); then exit 1 fi ${hmmTestCommand} """ platform.runCommand(this, command) } def runPackageCommand(platform, project) { def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/release") platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) } return this rocPRIM-rocm-5.7.1/.jenkins/precheckin.groovy000066400000000000000000000043441446201466700210550ustar00rootroot00000000000000#!/usr/bin/env groovy @Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path; def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocPRIM', 'PreCheckin') prj.paths.build_command = './install -c' prj.timeout.compile = 600 def nodes = new dockerNodes(nodeDetails, jobName, prj) def commonGroovy boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList, 'rocPRIM') propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> 
if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu16:['gfx906']], urlJobName) } } } rocPRIM-rocm-5.7.1/.jenkins/staticlibrary.groovy000066400000000000000000000045561446201466700216230ustar00rootroot00000000000000#!/usr/bin/env groovy @Library('rocJenkins@pong') _ import com.amd.project.* import com.amd.docker.* import java.nio.file.Path; def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocPRIM', 'Static Library PreCheckin') def nodes = new dockerNodes(nodeDetails, jobName, prj) def commonGroovy boolean formatCheck = false def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName, false) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi":[pipelineTriggers([cron('0 1 * * 0')])], "compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])], "rocm-docker":[]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), "compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']]), "rocm-docker":([ubuntu16:['gfx900'],centos7:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if 
(urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu16:['gfx906']], urlJobName) } } }rocPRIM-rocm-5.7.1/.readthedocs.yaml000066400000000000000000000004171446201466700172000ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip] python: version: "3.8" install: - requirements: docs/.sphinx/requirements.txt rocPRIM-rocm-5.7.1/CHANGELOG.md000066400000000000000000000260421446201466700155640ustar00rootroot00000000000000# Change Log for rocPRIM Full documentation for rocPRIM is available at [https://rocprim.readthedocs.io/en/latest/](https://rocprim.readthedocs.io/en/latest/) ## [Unreleased rocPRIM-2.13.1 for ROCm 5.7.0] ### Changed - Deprecated configuration `radix_sort_config` for device-level radix sort as it no longer matches the algorithm's parameters. New configuration `radix_sort_config_v2` is preferred instead. - Removed erroneous implementation of device-level `inclusive_scan` and `exclusive_scan`. The prior default implementation using lookback-scan now is the only available implementation. - The benchmark metric indicating the bytes processed for `exclusive_scan_by_key` and `inclusive_scan_by_key` has been changed to incorporate the key type. Furthermore, the benchmark log has been changed such that these algorithms are reported as `scan` and `scan_by_key` instead of `scan_exclusive` and `scan_inclusive`. 
- Deprecated configurations `scan_config` and `scan_by_key_config` for device-level scans, as they no longer match the algorithm's parameters. New configurations `scan_config_v2` and `scan_by_key_config_v2` are preferred instead. ### Fixed - Fixed build issue caused by missing header in `thread/thread_search.hpp`. ## [rocPRIM-2.13.0 for ROCm 5.5.0] ### Added - New block level `radix_rank` primitive. - New block level `radix_rank_match` primitive. - Added a stable block sorting implementation. This can be used with `block_sort` by using the `block_sort_algorithm::stable_merge_sort` algorithm. ### Changed - Improved the performance of `block_radix_sort` and `device_radix_sort`. - Improved the performance of `device_merge_sort`. - Updated `docs` directory structure to match the standard of [rocm-docs-core](https://github.com/RadeonOpenCompute/rocm-docs-core). Contributed by: [v01dXYZ](https://github.com/v01dXYZ). ### Known Issues - Disabled GPU error messages relating to incorrect warp operation usage with Navi GPUs on Windows, due to GPU printf performance issues on Windows. - When `ROCPRIM_DISABLE_LOOKBACK_SCAN` is set, `device_scan` fails for input sizes bigger than `scan_config::size_limit`, which defaults to `std::numeric_limits<unsigned int>::max()`. ## [rocPRIM-2.12.0 for ROCm 5.4.0] ### Changed - `device_partition`, `device_unique`, and `device_reduce_by_key` now support problem sizes larger than 2^32 items. - Device algorithms now return `hipErrorInvalidValue` if the amount of passed temporary memory is insufficient. - Lists of sizes for tests are unified, restored scan/reduce tests for `half` and `bfloat16` values. ### Removed - `block_sort::sort()` overload for keys and values with a dynamic size. This overload was documented but the implementation is missing. To avoid further confusion the documentation is removed until a decision is made on implementing the function. ### Fixed - Fixed the compilation failure in `device_merge` if the two key iterators don't match.
## [rocPRIM-2.11.1 for ROCm 5.3.3] ### Fixed - Fixed the compilation failure in device_merge if the two key iterators don't match. ## [rocPRIM-2.11.0 for ROCm 5.3.2] ### Known Issue - device_merge no longer correctly supports using different types for `keys_input1` and `keys_input2` (starting from the 5.3.0 release). ## [rocPRIM-2.11.0 for ROCm 5.3.0] ### Added - New functions `subtract_left` and `subtract_right` in `block_adjacent_difference` to apply functions on pairs of adjacent items distributed between threads in a block. - New device level `adjacent_difference` primitives. - Added experimental tooling for automatic kernel configuration tuning for various architectures - Benchmarks collect and output more detailed system information - CMake functionality to improve build parallelism of the test suite that splits compilation units by function or by parameters. - Reverse iterator. - Support for problem sizes over `UINT_MAX` in device functions `inclusive_scan_by_key` and `exclusive_scan_by_key`. ### Changed - Improved the performance of warp primitives using the swizzle operation on Navi - Improved build parallelism of the test suite by splitting up large compilation units - `device_select` now supports problem sizes larger than 2^32 items. - `device_segmented_radix_sort` now partitions segments into groups of small, medium and large segments. Each segment group can be sorted by specialized kernels to improve throughput. - Improved performance of histogram for the case of highly uneven sample distribution. ## [rocPRIM-2.10.14 for ROCm 5.2.0] ### Added - Packages for tests and benchmark executable on all supported OSes using CPack. - Added File/Folder Reorg Changes and Enabled Backward compatibility support using wrapper headers.
## [Released rocPRIM-2.10.13 for ROCm 5.1.0] ### Fixed - Fixed radix sort int64_t bug introduced in [2.10.11] ### Added - Future value - Added device partition_three_way to partition input into three output iterators based on two predicates ### Changed - The reduce/scan algorithm precision issues in the tests have been resolved for half types. - The device radix sort algorithm supports indexing with 64 bit unsigned integers. - The indexer type is chosen based on the type argument of parameter `size`. - If `sizeof(size)` is not larger than 4 bytes, the indexer type is 32 bit unsigned int, - Else the indexer type is 64 bit unsigned int. - The maximum problem size is based on the compile time configuration of the algorithm according to the following formula: - `max_problem_size = (UINT_MAX + 1) * config::scan::block_size * config::scan::items_per_thread`. - The flags API of `block_adjacent_difference` is now deprecated and will be removed in a future version. ### Known issues - device_segmented_radix_sort unit test failing for HIP on Windows ## [Released rocPRIM-2.10.12 for ROCm 5.0.0] ### Fixed - Enable bfloat16 tests and reduce threshold for bfloat16 - Fix device scan limit_size feature - Non-optimized builds no longer trigger local memory limit errors ### Added - Added scan size limit feature - Added reduce size limit feature - Added transform size limit feature - Add block_load_striped and block_store_striped - Add gather_to_blocked to gather values from other threads into a blocked arrangement - The block sizes for device merge sort's initial block sort and its merge steps are now separate in its kernel config - the block sort step supports multiple items per thread ### Changed - size_limit for scan, reduce and transform can now be set in the config struct instead of a parameter - Device_scan and device_segmented_scan: `inclusive_scan` now uses the input-type as accumulator-type, `exclusive_scan` uses initial-value-type.
- This particularly changes behaviour of small-size input types with large-size output types (e.g. `short` input, `int` output). - And low-res input with high-res output (e.g. `float` input, `double` output) - Revert old Fiji workaround, because they solved the issue at compiler side - Update README cmake minimum version number - Block sort supports multiple items per thread - currently only powers of two block sizes, and items per thread are supported and only for full blocks - Bumped the minimum required version of CMake to 3.16 ### Known issues - Unit tests may soft hang on MI200 when running in hipMallocManaged mode. - device_segmented_radix_sort, device_scan unit tests failing for HIP on Windows - ReduceEmptyInput causes random failure with bfloat16 ## [rocPRIM-2.10.11 for ROCm 4.5.0] ### Added - Initial HIP on Windows support. See README for instructions on how to build and install. - bfloat16 support added. ### Changed - Packaging split into a runtime package called rocprim and a development package called rocprim-devel. The development package depends on runtime. The runtime package suggests the development package for all supported OSes except CentOS 7 to aid in the transition. The suggests feature in packaging is introduced as a deprecated feature and will be removed in a future rocm release. - As rocPRIM is a header-only library, the runtime package is an empty placeholder used to aid in the transition. This package is also a deprecated feature and will be removed in a future rocm release. ### Known issues - Unit tests may soft hang on MI200 when running in hipMallocManaged mode. ### Deprecated - The warp_size() function is now deprecated; please switch to host_warp_size() and device_warp_size() for host and device references respectively. ## [rocPRIM-2.10.11 for ROCm 4.4.0] ### Added - Code coverage tools build option - Address sanitizer build option - gfx1030 support added.
- Experimental [HIP-CPU](https://github.com/ROCm-Developer-Tools/HIP-CPU) support; build using GCC/Clang/MSVC on Win/Linux. It is work in progress, many algorithms still known to fail. ### Optimizations - Added single tile radix sort for smaller sizes. - Improved performance for radix sort for larger element sizes. ### Deprecated - The warp_size() function is now deprecated; please switch to host_warp_size() and device_warp_size() for host and device references respectively. ## [rocPRIM-2.10.10 for ROCm 4.3.0] ### Fixed - Bugfix & minor performance improvement for merge_sort when input and output storage are the same. ### Added - gfx90a support added. ### Deprecated - The warp_size() function is now deprecated; please switch to host_warp_size() and device_warp_size() for host and device references respectively. ## [rocPRIM-2.10.9 for ROCm 4.2.0] ### Fixed - Size zero inputs are now properly handled with newer ROCm builds that no longer allow zero-size kernel grid/block dimensions ### Changed - Minimum cmake version required is now 3.10.2 ### Known issues - Device scan unit test currently failing due to LLVM bug. ## [rocPRIM-2.10.8 for ROCm 4.1.0] ### Fixed - Texture cache iteration support has been re-enabled. - Benchmark builds have been re-enabled. - Unique operator no longer called on invalid elements. ### Known issues - Device scan unit test currently failing due to LLVM bug. ## [rocPRIM-2.10.7 for ROCm 4.0.0] ### Added - No new features ## [rocPRIM-2.10.6 for ROCm 3.10] ### Optimizations - Updates to DPP instructions for warp shuffle ### Known issues - Benchmark builds are disabled due to compiler bug. ## [rocPRIM-2.10.5 for ROCm 3.9.0] ### Added - Added HIP cmake dependency ### Optimizations - Updates to warp shuffle for gfx10 - Disable DPP functions on gfx10++ ### Known issues - Benchmark builds are disabled due to compiler bug. 
## [rocPRIM-2.10.4 for ROCm 3.8.0] ### Fixed - Fix for rocPRIM texture cache iterator ### Known issues - None ## [rocPRIM-2.10.3 for ROCm 3.7.0] ### Fixed - Package dependency correct to hip-rocclr ### Known issues - rocPRIM texture cache iterator functionality is broken in the runtime. It will be fixed in the next release. Please use the prior release if calling this function. ## [rocPRIM-2.10.2 for ROCm 3.6.0] ### Added - No new features ## [rocPRIM-2.10.1 for ROCm 3.5.1] ### Fixed - Point release with compilation fix. ## [rocPRIM-2.10.1 for ROCm 3.5.0] ### Added - Improved tests with fixed and random seeds for test data - Network interface improvements with API v3 ### Changed - Switched to hip-clang as default compiler - CMake searches for rocPRIM locally first; downloads from github if local search fails rocPRIM-rocm-5.7.1/CMakeLists.txt000066400000000000000000000155421446201466700165160ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. cmake_minimum_required(VERSION 3.16 FATAL_ERROR) cmake_policy(VERSION 3.16...3.21) # Install prefix set(CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories") # rocPRIM project project(rocprim LANGUAGES CXX) #Adding CMAKE_PREFIX_PATH list(APPEND CMAKE_PREFIX_PATH /opt/rocm) # Build options option(BUILD_TEST "Build tests (requires googletest)" OFF) option(BUILD_BENCHMARK "Build benchmarks" OFF) option(BUILD_EXAMPLE "Build examples" OFF) option(USE_HIP_CPU "Prefer HIP-CPU runtime instead of HW acceleration" OFF) # Disables building tests, benchmarks, examples option(ONLY_INSTALL "Only install" OFF) option(BUILD_CODE_COVERAGE "Build with code coverage enabled" OFF) # CMake modules list(APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ${ROCM_PATH}/lib/cmake/hip ${HIP_PATH}/cmake /opt/rocm/lib/cmake/hip /opt/rocm/hip/cmake # FindHIP.cmake ) # Set a default build type if none was specified if(NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES) message(STATUS "Setting build type to 'Release' as none was specified.") set(CMAKE_BUILD_TYPE "Release" CACHE STRING "Choose the type of build." 
FORCE) set_property(CACHE CMAKE_BUILD_TYPE PROPERTY STRINGS "" "Debug" "Release" "MinSizeRel" "RelWithDebInfo") endif() set(CMAKE_INSTALL_RPATH_USE_LINK_PATH TRUE CACHE BOOL "Add paths to linker search and installed rpath") # Set CXX flags set(CMAKE_CXX_STANDARD 14) set(CMAKE_CXX_STANDARD_REQUIRED ON) set(CMAKE_CXX_EXTENSIONS OFF) set(BUILD_SHARED_LIBS OFF) # don't build client dependencies as shared if(NOT USE_HIP_CPU) # Get dependencies (required here to get rocm-cmake) include(cmake/Dependencies.cmake) # Use target ID syntax if supported for GPU_TARGETS if (NOT DEFINED AMDGPU_TARGETS) set(GPU_TARGETS "all" CACHE STRING "GPU architectures to compile for") else() set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for") endif() set_property(CACHE GPU_TARGETS PROPERTY STRINGS "all") if(GPU_TARGETS STREQUAL "all") rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS TARGETS "gfx803;gfx900:xnack-;gfx906:xnack-;gfx908:xnack-;gfx90a:xnack-;gfx90a:xnack+;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102" ) set(GPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "GPU architectures to compile for" FORCE) endif() # TODO: Fix VerifyCompiler for HIP on Windows if (NOT WIN32) include(cmake/VerifyCompiler.cmake) endif() list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/llvm /opt/rocm/llvm /opt/rocm /opt/rocm/hip) find_package(hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm) endif() # FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON) if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32) rocm_wrap_header_dir( "${PROJECT_SOURCE_DIR}/rocprim/include/rocprim" WRAPPER_LOCATIONS rocprim/include/rocprim OUTPUT_LOCATIONS rocprim/wrapper/include/rocprim PATTERNS *.hpp ) endif() if(BUILD_CODE_COVERAGE) add_compile_options(-fprofile-arcs -ftest-coverage) 
add_link_options(--coverage) endif() if(USE_HIP_CPU) # Get dependencies include(cmake/Dependencies.cmake) endif() # Setup VERSION set(VERSION_STRING "2.13.1") rocm_setup_version(VERSION ${VERSION_STRING}) # Print configuration summary include(cmake/Summary.cmake) print_configuration_summary() # rocPRIM library add_subdirectory(rocprim) if(NOT ONLY_INSTALL AND (BUILD_TEST OR BUILD_BENCHMARK)) rocm_package_setup_component(clients) endif() # Tests if(BUILD_TEST AND NOT ONLY_INSTALL) rocm_package_setup_client_component(tests) enable_testing() add_subdirectory(test) endif() # Benchmarks if(BUILD_BENCHMARK AND NOT ONLY_INSTALL) rocm_package_setup_client_component(benchmarks) add_subdirectory(benchmark) endif() # Examples if(BUILD_EXAMPLE AND NOT ONLY_INSTALL) add_subdirectory(example) endif() # Package set(BUILD_SHARED_LIBS ON) # Build as though shared library for naming rocm_package_add_dependencies(DEPENDS "hip-rocclr >= 3.5.0") set(CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.txt") set(CPACK_RPM_PACKAGE_LICENSE "MIT") set(CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" ) rocm_create_package( NAME rocprim DESCRIPTION "Radeon Open Compute Parallel Primitives Library" MAINTAINER "rocPRIM Maintainer " HEADER_ONLY ) # # ADDITIONAL TARGETS FOR CODE COVERAGE # if(BUILD_CODE_COVERAGE) # # > make coverage_cleanup (clean coverage related files.) # > # run your tests # > make coverage (generate html documentation) # # # Prepare coverage output # This little script is generated because the option '--gcov-tool ' of lcov cannot take arguments. # add_custom_target(coverage DEPENDS rocprim COMMAND mkdir -p lcoverage COMMAND echo "\\#!/bin/bash" > llvm-gcov.sh COMMAND echo "\\# THIS FILE HAS BEEN GENERATED" >> llvm-gcov.sh COMMAND printf "exec /opt/rocm/llvm/bin/llvm-cov gcov $$\\@" >> llvm-gcov.sh COMMAND chmod +x llvm-gcov.sh ) # # Generate coverage output. # add_custom_command(TARGET coverage COMMAND lcov --directory . 
--base-directory . --gcov-tool ${CMAKE_BINARY_DIR}/llvm-gcov.sh --capture -o lcoverage/raw_main_coverage.info COMMAND lcov --remove lcoverage/raw_main_coverage.info "'/opt/*'" "'/usr/*'" -o lcoverage/main_coverage.info COMMAND genhtml lcoverage/main_coverage.info --output-directory lcoverage ) # # Coverage cleanup # add_custom_target(coverage_cleanup COMMAND find ${CMAKE_BINARY_DIR} -name *.gcda -delete WORKING_DIRECTORY ${CMAKE_BINARY_DIR} ) endif() rocPRIM-rocm-5.7.1/LICENSE.txt000066400000000000000000000021261446201466700155730ustar00rootroot00000000000000MIT License Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.rocPRIM-rocm-5.7.1/NOTICES.txt000066400000000000000000000066161446201466700156250ustar00rootroot00000000000000Notices and Licenses file ______________________________________________________________________________ AMD copyrighted code (MIT) Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. ROCmSoftwarePlatform-rocPRIM v2.5.0 (MIT) Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. florianrappl-cmdparser v-u (MIT) Copyright (c) 2015 - 2016 Florian Rappl Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. rocPRIM-rocm-5.7.1/README.md000066400000000000000000000232211446201466700152260ustar00rootroot00000000000000# rocPRIM The rocPRIM is a header-only library providing HIP parallel primitives for developing performant GPU-accelerated code on AMD ROCm platform. 
## Requirements * Git * CMake (3.16 or later) * AMD [ROCm](https://rocm.github.io/install.html) platform (1.8.2 or later) * Including [HIP-clang](https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#hip-clang) compiler * C++14 * Python 3.6 or higher (HIP on Windows only, required only for install script) * Visual Studio 2019 with clang support (HIP on Windows only) * Strawberry Perl (HIP on Windows only) Optional: * [GTest](https://github.com/google/googletest) * Required only for tests. Building tests is enabled by default. * It will be automatically downloaded and built by cmake script. * [Google Benchmark](https://github.com/google/benchmark) * Required only for benchmarks. Building benchmarks is off by default. * It will be automatically downloaded and built by cmake script. ## Build and Install ### Linux ```shell git clone https://github.com/ROCmSoftwarePlatform/rocPRIM.git # Go to rocPRIM directory, create and go to the build directory. cd rocPRIM; mkdir build; cd build # Configure rocPRIM, set up options for your system. # Build options: # ONLY_INSTALL - OFF by default. If this flag is on, the build ignores the BUILD_* flags. # BUILD_TEST - OFF by default, # BUILD_EXAMPLE - OFF by default, # BUILD_BENCHMARK - OFF by default. # BENCHMARK_CONFIG_TUNING - OFF by default. The purpose of this flag is to find the best kernel config parameters. # When ON, the compilation time can increase significantly. # AMDGPU_TARGETS - list of AMD architectures, default: gfx803;gfx900;gfx906;gfx908. # You can make compilation faster if you want to test/benchmark only on one architecture, # for example, add -DAMDGPU_TARGETS=gfx906 to 'cmake' parameters. # AMDGPU_TEST_TARGETS - list of AMD architectures, default: "" (default system device) # If you want to detect failures on a per GFX IP basis, setting it to some set of ips will create # separate tests with the ip name embedded into the test name.
Building for all, but selecting # tests only of a specific architecture is possible, e.g.: ctest -R "gfx803|gfx900" # # ! IMPORTANT ! # Set C++ compiler to HIP-clang. You can do it by adding 'CXX=<path-to-compiler>' # before 'cmake' or setting cmake option 'CMAKE_CXX_COMPILER' to path to the compiler. # Using HIP-clang: [CXX=hipcc] cmake -DBUILD_BENCHMARK=ON ../. # # ! EXPERIMENTAL ! # Alternatively one may build using the experimental (and highly incomplete) HIP-CPU back-end for host-side # execution using any C++17 conforming compiler (supported by HIP-CPU). AMDGPU_* options are unavailable in this case. # USE_HIP_CPU - OFF by default # Build make -j4 # Optionally, run tests if they're enabled. ctest --output-on-failure # Install [sudo] make install ``` ### Windows Initial support for HIP on Windows has been added. To install, use the provided rmake.py python script: ```shell git clone https://github.com/ROCmSoftwarePlatform/rocPRIM.git cd rocPRIM # the -i option will install rocPRIM to C:\hipSDK by default python rmake.py -i # the -c option will build all clients including unit tests python rmake.py -c ``` ### Using rocPRIM Include the `<rocprim/rocprim.hpp>` header: ```cpp #include <rocprim/rocprim.hpp> ``` The recommended way of including rocPRIM into a CMake project is by using its package configuration files. The rocPRIM package name is `rocprim`. ```cmake # "/opt/rocm" - default install prefix find_package(rocprim REQUIRED CONFIG PATHS "/opt/rocm/rocprim") ... # Includes only rocPRIM headers, HIP libraries have # to be linked manually by user target_link_libraries(<your_target> roc::rocprim) # Includes rocPRIM headers and required HIP dependencies target_link_libraries(<your_target> roc::rocprim_hip) ``` ## Running Unit Tests Unit tests are implemented in terms of Google Test and collections of tests are wrapped to be invoked from CTest for convenience.
```shell # Go to rocPRIM build directory cd rocPRIM; cd build # List available tests ctest --show-only # To run all tests ctest # Run specific test(s) ctest -R <regex> # To run the Google Test manually ./test/rocprim/test_<unit-test-name> ``` ### Using multiple GPUs concurrently for testing This feature requires CMake 3.16+ to be used for building / testing. _(Prior versions of CMake cannot assign ids to tests when running in parallel. Assigning tests to distinct devices could only be done at the cost of extreme complexity.)_ The unit tests can make use of the [CTest Resource Allocation](https://cmake.org/cmake/help/latest/manual/ctest.1.html#resource-allocation) feature, which enables distributing tests across multiple GPUs in an intelligent manner. The feature can accelerate testing when multiple GPUs of the same family are in a system as well as test multiple families of products from one invocation without having to resort to the `HIP_VISIBLE_DEVICES` environment variable. The feature relies on the presence of a resource spec file. > IMPORTANT: trying to use `RESOURCE_GROUPS` and `--resource-spec-file` with CMake/CTest respectively of versions prior to 3.16 omits the feature silently. No warnings are issued about unknown properties or command-line arguments. Make sure that the `cmake`/`ctest` invoked are sufficiently recent. #### Auto resource spec generation There is a utility script in the repo that may be called independently: ```shell # Go to rocPRIM build directory cd rocPRIM; cd build # Invoke directly or use CMake script mode via cmake -P ../cmake/GenerateResourceSpec.cmake # Assuming you have 2 compatible GPUs in the system ctest --resource-spec-file ./resources.json --parallel 2 ``` #### Manual Assuming the user has 2 GPUs from the gfx900 family and they are the first devices enumerated by the system, one may specify during configuration `-D AMDGPU_TEST_TARGETS=gfx900`, stating only one family will be tested. Leaving this var empty (default) results in targeting the default device in the system.
To let CMake know there are 2 GPUs that should be targeted, one has to feed CTest a JSON file via the `--resource-spec-file <path_to_file>` flag. For example: ```json { "version": { "major": 1, "minor": 0 }, "local": [ { "gfx900": [ { "id": "0" }, { "id": "1" } ] } ] } ``` Invoking CTest as `ctest --resource-spec-file <path_to_file> --parallel 2` will allow two tests to run concurrently, distributed among the two GPUs. ### Using custom seeds for the tests Go to the `rocPRIM/test/rocprim/test_seed.hpp` file. ```cpp //(1) static constexpr int random_seeds_count = 10; //(2) static constexpr unsigned int seeds [] = {0, 2, 10, 1000}; //(3) static constexpr size_t seed_size = sizeof(seeds) / sizeof(seeds[0]); ``` (1) defines a constant that sets how many passes over the tests will be done with runtime-generated seeds. Modify at will. (2) defines the user-generated seeds. Each of the elements of the array will be used as a seed for all tests. Modify at will. If no static seeds are desired, the array should be left empty. ```cpp static constexpr unsigned int seeds [] = {}; ``` (3) this line should never be modified. ## Running Benchmarks ```shell # Go to rocPRIM build directory cd rocPRIM; cd build # To run benchmark for warp functions: # Further options can be found using --help # [] Fields are optional ./benchmark/benchmark_warp_<function_name> [--size <size>] [--trials <trials>] # To run benchmark for block functions: # Further options can be found using --help # [] Fields are optional ./benchmark/benchmark_block_<function_name> [--size <size>] [--trials <trials>] # To run benchmark for device functions: # Further options can be found using --help # [] Fields are optional ./benchmark/benchmark_device_<function_name> [--size <size>] [--trials <trials>] ``` ### Performance configuration Most of the device-wide primitives provided by rocPRIM can be tuned for different AMD devices, different types or different operations using compile-time configuration structures passed to them as a template parameter.
Main "knobs" are usually size of the block and number of items processed by a single thread. rocPRIM has built-in default configurations for each of its primitives. In order to use included configurations user should define macro `ROCPRIM_TARGET_ARCH` to `803` if algorithms should be optimized for gfx803 GCN version, or to `900` for gfx900. ## Documentation The latest rocPRIM documentation and API description can be found [here](https://rocprim.readthedocs.io/en/latest/). It can also be built using the following commands: ```shell # Go to rocPRIM docs directory cd rocPRIM; cd docs # Install Python dependencies python3 -m pip install -r .sphinx/requirements.txt # Build the documentation python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html # For e.g. serve the HTML docs locally cd _build/html python3 -m http.server ``` ## hipCUB [hipCUB](https://github.com/ROCmSoftwarePlatform/hipCUB/) is a thin wrapper library on top of [rocPRIM](https://github.com/ROCmSoftwarePlatform/rocPRIM) or [CUB](https://github.com/NVlabs/cub). It enables developers to port project that uses CUB library to the [HIP](https://github.com/ROCm-Developer-Tools/HIP) layer and to run them on AMD hardware. In [ROCm](https://rocm.github.io/) environment hipCUB uses rocPRIM library as the backend, however, on CUDA platforms it uses CUB instead. ## Support Bugs and feature requests can be reported through [the issue tracker](https://github.com/ROCmSoftwarePlatform/rocPRIM/issues). ## Contributions and License Contributions of any kind are most welcome! More details are found at [CONTRIBUTING](./CONTRIBUTING.md) and [LICENSE](./LICENSE.txt). rocPRIM-rocm-5.7.1/benchmark/000077500000000000000000000000001446201466700157015ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/benchmark/CMakeLists.txt000066400000000000000000000145441446201466700204510ustar00rootroot00000000000000# MIT License # # Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

option(BENCHMARK_CONFIG_TUNING "Benchmark device-level functions using various configs" OFF)
include(../cmake/ConfigAutotune.cmake)
include(ConfigAutotuneSettings.cmake)

# Umbrella target that every config-tuning benchmark is attached to.
if(BENCHMARK_CONFIG_TUNING)
  add_custom_target("benchmark_config_tuning")
endif()

# add_rocprim_benchmark(<source.cpp>)
#
# Creates one benchmark executable named after the source file (without extension).
#
# * BENCHMARK_CONFIG_TUNING OFF: every source becomes a plain executable.
# * BENCHMARK_CONFIG_TUNING ON : only sources that have a matching
#   "<target>.parallel.cpp.in" template AND an entry in ConfigAutotuneSettings.cmake
#   are built; their configurations are generated through add_matrix(). All other
#   sources are skipped.
function(add_rocprim_benchmark BENCHMARK_SOURCE)
  get_filename_component(BENCHMARK_TARGET ${BENCHMARK_SOURCE} NAME_WE)

  if(BENCHMARK_CONFIG_TUNING)
    if(EXISTS "${CMAKE_CURRENT_SOURCE_DIR}/${BENCHMARK_TARGET}.parallel.cpp.in")
      message(STATUS "found ${BENCHMARK_TARGET}.parallel.cpp.in file, compiling in parallel.")
      read_config_autotune_settings(${BENCHMARK_TARGET} list_across_names list_across output_pattern_suffix)
      # Make sure that variables are not empty, i.e. there actually is an entry for
      # that benchmark in benchmark/ConfigAutotuneSettings.cmake
      if(list_across_names)
        add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE})
        target_compile_definitions(${BENCHMARK_TARGET} PRIVATE BENCHMARK_CONFIG_TUNING)
        add_matrix(TARGET ${BENCHMARK_TARGET}
                   SHARDS 1
                   CURRENT_SHARD 0
                   INPUT "${BENCHMARK_TARGET}.parallel.cpp.in"
                   OUTPUT_PATTERN "${BENCHMARK_TARGET}_${output_pattern_suffix}"
                   NAMES ${list_across_names}
                   LISTS ${list_across})
        add_dependencies(benchmark_config_tuning ${BENCHMARK_TARGET})
      else()
        message(WARNING "No config-tuning entry in benchmark/ConfigAutotuneSettings.cmake for ${BENCHMARK_TARGET}!")
        return()
      endif()
    else()
      # Do nothing if BENCHMARK_CONFIG_TUNING is ON but no
      # ${BENCHMARK_TARGET}.parallel.cpp.in exists
      return()
    endif()
  else()
    add_executable(${BENCHMARK_TARGET} ${BENCHMARK_SOURCE})
  endif()

  target_link_libraries(${BENCHMARK_TARGET}
    PRIVATE
      rocprim
      benchmark::benchmark
  )
  if(NOT USE_HIP_CPU)
    target_link_libraries(${BENCHMARK_TARGET}
      PRIVATE
        rocprim_hip
    )
  else()
    target_link_libraries(${BENCHMARK_TARGET}
      PRIVATE
        Threads::Threads
        hip_cpu_rt::hip_cpu_rt
    )
    if(STL_DEPENDS_ON_TBB)
      target_link_libraries(${BENCHMARK_TARGET}
        PRIVATE
          TBB::tbb
      )
    endif()
  endif()

  # MSVC only: number of sections exceeded object file format limit, compile with /bigobj
  target_compile_options(${BENCHMARK_TARGET}
    PRIVATE
      $<$<CXX_COMPILER_ID:MSVC>:/bigobj>
  )
  set_target_properties(${BENCHMARK_TARGET}
    PROPERTIES
      RUNTIME_OUTPUT_DIRECTORY "${CMAKE_BINARY_DIR}/benchmark"
  )
  rocm_install(TARGETS ${BENCHMARK_TARGET} COMPONENT benchmarks)

  # The DLLs only have to be copied once; the guard variable is published to the
  # calling (directory) scope so later add_rocprim_benchmark() calls skip this part.
  if (WIN32 AND NOT DEFINED DLLS_COPIED)
    set(DLLS_COPIED "YES")
    set(DLLS_COPIED ${DLLS_COPIED} PARENT_SCOPE)
    # for now adding in all .dll as dependency chain is not cmake based on win32
    file( GLOB third_party_dlls
      LIST_DIRECTORIES ON
      CONFIGURE_DEPENDS
      ${HIP_DIR}/bin/*.dll
      ${CMAKE_SOURCE_DIR}/rtest.*
    )
    foreach( file_i ${third_party_dlls})
      # Copy next to the executable itself, so the destination always matches the
      # target's actual runtime output directory.
      add_custom_command( TARGET ${BENCHMARK_TARGET}
        POST_BUILD
        COMMAND ${CMAKE_COMMAND}
        ARGS -E copy_if_different ${file_i} $<TARGET_FILE_DIR:${BENCHMARK_TARGET}>
      )
    endforeach( file_i )
  endif()
endfunction()

# ****************************************************************************
# Benchmarks
# ****************************************************************************
add_rocprim_benchmark(benchmark_block_adjacent_difference.cpp)
add_rocprim_benchmark(benchmark_block_discontinuity.cpp)
add_rocprim_benchmark(benchmark_block_exchange.cpp)
add_rocprim_benchmark(benchmark_block_histogram.cpp)
add_rocprim_benchmark(benchmark_block_radix_sort.cpp)
add_rocprim_benchmark(benchmark_block_radix_rank.cpp)
add_rocprim_benchmark(benchmark_block_reduce.cpp)
add_rocprim_benchmark(benchmark_block_scan.cpp)
add_rocprim_benchmark(benchmark_block_sort.cpp)
add_rocprim_benchmark(benchmark_config_dispatch.cpp)
add_rocprim_benchmark(benchmark_device_adjacent_difference.cpp)
add_rocprim_benchmark(benchmark_device_binary_search.cpp)
add_rocprim_benchmark(benchmark_device_histogram.cpp)
add_rocprim_benchmark(benchmark_device_merge.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort_block_sort.cpp)
add_rocprim_benchmark(benchmark_device_merge_sort_block_merge.cpp)
add_rocprim_benchmark(benchmark_device_partition.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort_block_sort.cpp)
add_rocprim_benchmark(benchmark_device_radix_sort_onesweep.cpp)
add_rocprim_benchmark(benchmark_device_reduce_by_key.cpp)
add_rocprim_benchmark(benchmark_device_reduce.cpp)
add_rocprim_benchmark(benchmark_device_run_length_encode.cpp)
add_rocprim_benchmark(benchmark_device_scan.cpp)
add_rocprim_benchmark(benchmark_device_scan_by_key.cpp)
add_rocprim_benchmark(benchmark_device_select.cpp)
add_rocprim_benchmark(benchmark_device_segmented_radix_sort.cpp)
add_rocprim_benchmark(benchmark_device_segmented_reduce.cpp)
add_rocprim_benchmark(benchmark_device_transform.cpp)
add_rocprim_benchmark(benchmark_warp_exchange.cpp)
add_rocprim_benchmark(benchmark_warp_reduce.cpp) add_rocprim_benchmark(benchmark_warp_scan.cpp) add_rocprim_benchmark(benchmark_warp_sort.cpp) add_rocprim_benchmark(benchmark_device_memory.cpp) rocPRIM-rocm-5.7.1/benchmark/ConfigAutotuneSettings.cmake000066400000000000000000000112421446201466700233560ustar00rootroot00000000000000# MIT License # # Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # All default fallback types as listed in scripts/autotune/fallback_config.json set(TUNING_TYPES "int64_t int short int8_t double float rocprim::half") # If config selection happens based on two types, the second type has limited fallbacks. The selection is based # on the size and it is ignored whether the type is floating-point or integral. The autotuning script uses the # benchmarks for the integral types as fallback, hence tuning for the floating-point types is not needed. 
set(LIMITED_TUNING_TYPES "int64_t int short int8_t")

# read_config_autotune_settings(<benchmark> <names_var> <lists_var> <suffix_var>)
#
# Looks up the config-tuning matrix of the benchmark target named by `file`.
#
#   file                  - benchmark target name, e.g. "benchmark_device_reduce"
#   list_across_names     - name of the caller variable receiving the ';'-separated
#                           parameter names
#   list_across           - name of the caller variable receiving, per parameter, a
#                           space-separated list of values (parameters separated by ';')
#   output_pattern_suffix - name of the caller variable receiving the "@Name@" pattern
#                           that is appended to the generated file names
#
# All three output variables are set to the empty string when there is no entry for
# `file`, so the caller can test the result with if(<names_var>) and never sees values
# left over from an earlier call.
function(read_config_autotune_settings file list_across_names list_across output_pattern_suffix)
  if(file STREQUAL "benchmark_device_adjacent_difference")
    set(${list_across_names} "DataType;Left;InPlace;BlockSize;ItemsPerThread" PARENT_SCOPE)
    set(${list_across} "${TUNING_TYPES};true false;true false;64 128;1 2 4 8 16" PARENT_SCOPE)
    set(${output_pattern_suffix} "@DataType@_@Left@_@InPlace@_@BlockSize@_@ItemsPerThread@" PARENT_SCOPE)
  elseif(file STREQUAL "benchmark_device_histogram")
    set(${list_across_names} "DataType;BlockSize" PARENT_SCOPE)
    set(${list_across} "${TUNING_TYPES};64 128 256" PARENT_SCOPE)
    set(${output_pattern_suffix} "@DataType@_@BlockSize@" PARENT_SCOPE)
  elseif(file STREQUAL "benchmark_device_merge_sort_block_merge")
    set(${list_across_names} "KeyType;ValueType;BlockSize;UseMergePath" PARENT_SCOPE)
    set(${list_across} "\
${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES} custom_type;\
128 256 512 1024;true" PARENT_SCOPE)
    set(${output_pattern_suffix} "@KeyType@_@ValueType@_@BlockSize@_@UseMergePath@" PARENT_SCOPE)
  elseif(file STREQUAL "benchmark_device_merge_sort_block_sort")
    set(${list_across_names} "KeyType;ValueType;BlockSize;BlockSortMethod" PARENT_SCOPE)
    set(${list_across} "\
${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES} custom_type;\
256 512 1024;rocprim::block_sort_algorithm::stable_merge_sort" PARENT_SCOPE)
    set(${output_pattern_suffix} "@KeyType@_@ValueType@_@BlockSize@_@BlockSortMethod@" PARENT_SCOPE)
  elseif(file STREQUAL "benchmark_device_radix_sort_block_sort")
    set(${list_across_names} "KeyType;ValueType;BlockSize" PARENT_SCOPE)
    set(${list_across} "\
${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};\
64 128 256 512 1024" PARENT_SCOPE)
    set(${output_pattern_suffix} "@KeyType@_@ValueType@_@BlockSize@" PARENT_SCOPE)
  elseif(file STREQUAL "benchmark_device_radix_sort_onesweep")
    set(${list_across_names} "KeyType;ValueType;BlockSize;RadixBits" PARENT_SCOPE)
    set(${list_across} "\
${TUNING_TYPES};rocprim::empty_type ${LIMITED_TUNING_TYPES};\
128 256 512 1024;4 5 6 7 8" PARENT_SCOPE)
    set(${output_pattern_suffix} "@KeyType@_@ValueType@_@BlockSize@_@RadixBits@" PARENT_SCOPE)
  elseif(file STREQUAL "benchmark_device_reduce")
    set(${list_across_names} "DataType;BlockSize;ItemsPerThread" PARENT_SCOPE)
    set(${list_across} "${TUNING_TYPES};64 128 256;1 2 4 8 16" PARENT_SCOPE)
    set(${output_pattern_suffix} "@DataType@_@BlockSize@_@ItemsPerThread@" PARENT_SCOPE)
  elseif(file STREQUAL "benchmark_device_scan")
    set(${list_across_names} "DataType;Algo" PARENT_SCOPE)
    set(${list_across} "${TUNING_TYPES};using_warp_scan reduce_then_scan" PARENT_SCOPE)
    set(${output_pattern_suffix} "@DataType@_@Algo@" PARENT_SCOPE)
  elseif(file STREQUAL "benchmark_device_scan_by_key")
    set(${list_across_names} "KeyType;ValueType;Algo" PARENT_SCOPE)
    set(${list_across} "\
${TUNING_TYPES};${LIMITED_TUNING_TYPES};using_warp_scan reduce_then_scan" PARENT_SCOPE)
    set(${output_pattern_suffix} "@KeyType@_@ValueType@_@Algo@" PARENT_SCOPE)
  else()
    # Unknown benchmark: report "no entry" explicitly instead of leaving the caller's
    # variables in whatever state they were in before the call.
    set(${list_across_names} "" PARENT_SCOPE)
    set(${list_across} "" PARENT_SCOPE)
    set(${output_pattern_suffix} "" PARENT_SCOPE)
  endif()
endfunction()
rocPRIM-rocm-5.7.1/benchmark/benchmark_block_adjacent_difference.cpp000066400000000000000000000403171446201466700255210ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software.
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include #include #include #include #include #include #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif namespace rp = rocprim; template __global__ __launch_bounds__(BlockSize) void kernel(Args ...args) { Benchmark::template run(args...); } struct subtract_left { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rp::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_diff_t().subtract_left(input, output, rp::minus<>{}, T(123), storage); } else { adjacent_diff_t().subtract_left(input, output, rp::minus<>{}, storage); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct subtract_left_partial { template __device__ static void run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int 
block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rp::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; unsigned int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_diff_t().subtract_left_partial(input, output, rp::minus<>{}, T(123), tile_size, storage); } else { adjacent_diff_t().subtract_left_partial(input, output, rp::minus<>{}, tile_size, storage); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct subtract_right { template __device__ static void run(const T* d_input, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rp::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; if(WithTile) { adjacent_diff_t().subtract_right(input, output, rp::minus<>{}, T(123), storage); } else { adjacent_diff_t().subtract_right(input, output, rp::minus<>{}, storage); } for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct subtract_right_partial { template 
__device__ static void run(const T* d_input, const unsigned int* tile_sizes, T* d_output, unsigned int trials) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); using adjacent_diff_t = rp::block_adjacent_difference; __shared__ typename adjacent_diff_t::storage_type storage; unsigned int tile_size = tile_sizes[blockIdx.x]; // Try to evenly distribute the length of tile_sizes between all the trials const auto tile_size_diff = (BlockSize * ItemsPerThread) / trials + 1; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < trials; trial++) { T output[ItemsPerThread]; adjacent_diff_t().subtract_right_partial(input, output, rp::minus<>{}, tile_size, storage); for(unsigned int i = 0; i < ItemsPerThread; ++i) { input[i] += output[i]; } // Change the tile_size to even out the distribution tile_size = (tile_size + tile_size_diff) % (BlockSize * ItemsPerThread); rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -> std::enable_if_t::value && !std::is_same::value> { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const std::vector input = get_random_data(size, T(0), T(10)); T* d_input; T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice ) ); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, 
stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(num_blocks), dim3(BlockSize), 0, stream, d_input, d_output, Trials ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template auto run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) -> std::enable_if_t::value || std::is_same::value> { static constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto num_blocks = (N + items_per_block - 1) / items_per_block; // Round up size to the next multiple of items_per_block const auto size = num_blocks * items_per_block; const std::vector input = get_random_data(size, T(0), T(10)); const std::vector tile_sizes = get_random_data(num_blocks, 0, items_per_block); T* d_input; unsigned int* d_tile_sizes; T* d_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMalloc(&d_tile_sizes, tile_sizes.size() * sizeof(tile_sizes[0]))); HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(input[0]))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_tile_sizes, tile_sizes.data(), tile_sizes.size() * sizeof(tile_sizes[0]), hipMemcpyHostToDevice ) ); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(num_blocks), 
dim3(BlockSize), 0, stream, d_input, d_tile_sizes, d_output, Trials ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_tile_sizes)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:adjacent_difference,subalgo:" + name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ ",with_tile:" #WITH_TILE "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block, with_tile) \ CREATE_BENCHMARK(type, block, 1, with_tile), \ CREATE_BENCHMARK(type, block, 3, with_tile), \ CREATE_BENCHMARK(type, block, 4, with_tile), \ CREATE_BENCHMARK(type, block, 8, with_tile), \ CREATE_BENCHMARK(type, block, 16, with_tile), \ CREATE_BENCHMARK(type, block, 32, with_tile) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(float, 256, false), BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(rocprim::half, 256, false), BENCHMARK_TYPE(long long, 256, false), BENCHMARK_TYPE(double, 256, false) }; if(!std::is_same::value) { bs.insert(bs.end(), { BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(float, 256, true), BENCHMARK_TYPE(int8_t, 256, true), BENCHMARK_TYPE(rocprim::half, 256, true), BENCHMARK_TYPE(long long, 256, true), BENCHMARK_TYPE(double, 256, true) }); } benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } 
int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_benchmarks("subtract_left", benchmarks, stream, size); add_benchmarks("subtract_right", benchmarks, stream, size); add_benchmarks("subtract_left_partial", benchmarks, stream, size); add_benchmarks("subtract_right_partial", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_block_discontinuity.cpp000066400000000000000000000256121446201466700245040ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T * d_input, T * d_output) { Runner::template run(d_input, d_output); } struct flag_heads { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __device__ static void run(const T * d_input, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.flag_heads(head_flags, T(123), input, rp::equal_to()); } else { bdiscontinuity.flag_heads(head_flags, input, rp::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += head_flags[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct flag_tails { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __device__ static void run(const T * d_input, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; bool tail_flags[ItemsPerThread]; if(WithTile) { 
bdiscontinuity.flag_tails(tail_flags, T(123), input, rp::equal_to()); } else { bdiscontinuity.flag_tails(tail_flags, input, rp::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += tail_flags[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct flag_heads_and_tails { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials > __device__ static void run(const T * d_input, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_discontinuity bdiscontinuity; bool head_flags[ItemsPerThread]; bool tail_flags[ItemsPerThread]; if(WithTile) { bdiscontinuity.flag_heads_and_tails(head_flags, T(123), tail_flags, T(234), input, rp::equal_to()); } else { bdiscontinuity.flag_heads_and_tails(head_flags, tail_flags, input, rp::equal_to()); } for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] += head_flags[i]; input[i] += tail_flags[i]; } rp::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, bool WithTile, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); std::vector input = get_random_data(size, T(0), T(10)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); 
HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, WITH_TILE) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:discontinuity,subalgo:" + name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT \ ",with_tile:" #WITH_TILE "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block, bool) \ CREATE_BENCHMARK(type, block, 1, bool), \ CREATE_BENCHMARK(type, block, 2, bool), \ CREATE_BENCHMARK(type, block, 3, bool), \ CREATE_BENCHMARK(type, block, 4, bool), \ CREATE_BENCHMARK(type, block, 8, bool) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 256, false), BENCHMARK_TYPE(int, 256, true), BENCHMARK_TYPE(int8_t, 256, false), BENCHMARK_TYPE(int8_t, 256, true), BENCHMARK_TYPE(uint8_t, 256, false), BENCHMARK_TYPE(uint8_t, 256, true), BENCHMARK_TYPE(rocprim::half, 256, false), BENCHMARK_TYPE(rocprim::half, 256, true), BENCHMARK_TYPE(long long, 256, false), BENCHMARK_TYPE(long long, 256, true), }; 
benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_benchmarks("flag_heads", benchmarks, stream, size); add_benchmarks("flag_tails", benchmarks, stream, size); add_benchmarks("flag_heads_and_tails", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_block_exchange.cpp000066400000000000000000000330251446201466700233560ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T * d_input, const unsigned int * d_ranks, T * d_output) { Runner::template run(d_input, d_ranks, d_output); } struct blocked_to_striped { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.blocked_to_striped(input, input); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct striped_to_blocked { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.striped_to_blocked(input, input); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct blocked_to_warp_striped { template< class T, unsigned int BlockSize, unsigned int 
ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.blocked_to_warp_striped(input, input); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct warp_striped_to_blocked { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int *, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.warp_striped_to_blocked(input, input); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; struct scatter_to_blocked { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_ranks + block_offset, ranks); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.scatter_to_blocked(input, input, ranks); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + 
block_offset, input); } }; struct scatter_to_striped { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T * d_input, const unsigned int * d_ranks, T * d_output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T input[ItemsPerThread]; unsigned int ranks[ItemsPerThread]; rp::block_load_direct_striped(lid, d_input + block_offset, input); rp::block_load_direct_striped(lid, d_ranks + block_offset, ranks); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_exchange exchange; exchange.scatter_to_striped(input, input, ranks); ::rocprim::syncthreads(); } rp::block_store_direct_striped(lid, d_output + block_offset, input); } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); std::vector input(size); // Fill input for(size_t i = 0; i < size; i++) { input[i] = T(i); } std::vector ranks(size); // Fill ranks (for scatter operations) std::mt19937 gen; for(size_t bi = 0; bi < size / items_per_block; bi++) { auto block_ranks = ranks.begin() + bi * items_per_block; std::iota(block_ranks, block_ranks + items_per_block, 0); std::shuffle(block_ranks, block_ranks + items_per_block, gen); } T * d_input; unsigned int * d_ranks; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_ranks), size * sizeof(unsigned int))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_ranks, ranks.data(), size * sizeof(unsigned int), 
hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_ranks, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_ranks)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:exchange,subalgo:" + name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 7), \ CREATE_BENCHMARK(type, block, 8) template void add_benchmarks(const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector bs = { BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(rocprim::half, 256), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(custom_float2, 256), BENCHMARK_TYPE(float2, 256), BENCHMARK_TYPE(custom_double2, 256), BENCHMARK_TYPE(double2, 256), BENCHMARK_TYPE(float4, 256), }; 
benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_benchmarks("blocked_to_striped", benchmarks, stream, size); add_benchmarks("striped_to_blocked", benchmarks, stream, size); add_benchmarks("blocked_to_warp_striped", benchmarks, stream, size); add_benchmarks("warp_striped_to_blocked", benchmarks, stream, size); add_benchmarks("scatter_to_blocked", benchmarks, stream, size); add_benchmarks("scatter_to_striped", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_block_histogram.cpp000066400000000000000000000213041446201466700235660ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int BinSize, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct histogram { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int BinSize, unsigned int Trials > __device__ static void run(const T* input, T* output) { // TODO: Move global_offset into final loop const unsigned int index = ((blockIdx.x * BlockSize) + threadIdx.x) * ItemsPerThread; unsigned int global_offset = blockIdx.x * BinSize; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[index + k]; } using bhistogram_t = rp::block_histogram; __shared__ T histogram[BinSize]; __shared__ typename bhistogram_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bhistogram_t().histogram(values, histogram, storage); } ROCPRIM_UNROLL for (unsigned int offset = 0; offset < BinSize; offset += BlockSize) { if(offset + threadIdx.x < BinSize) { output[global_offset + threadIdx.x] = histogram[offset + threadIdx.x]; global_offset += BlockSize; } } } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int BinSize = BlockSize, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); const auto bin_size = BinSize * ((N + 
items_per_block - 1)/items_per_block); // Allocate and fill memory std::vector input(size, 0.0f); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), bin_size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:histogram,key_type:" #T ",cfg:{bs:" #BS \ ",ipt:" #IPT ",method:" \ + method_name + "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, hipStream_t stream, size_t size) { std::vector new_benchmarks = { BENCHMARK_TYPE(int, 
256), BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(unsigned long long, 256), BENCHMARK_TYPE(unsigned long long, 320) }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; // using_atomic using histogram_a_t = histogram; add_benchmarks(benchmarks, "using_atomic", stream, size); // using_sort using histogram_s_t = histogram; add_benchmarks(benchmarks, "using_sort", stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_block_radix_rank.cpp000066400000000000000000000223031446201466700237130ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "benchmark_utils.hpp" #include "cmdparser.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif namespace rp = rocprim; template __global__ __launch_bounds__(BlockSize) void rank_kernel(const T* keys_input, unsigned int* ranks_output) { using rank_type = rp::block_radix_rank; const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; rp::block_load_direct_striped(lid, keys_input + block_offset, keys); unsigned int ranks[ItemsPerThread]; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { ROCPRIM_SHARED_MEMORY typename rank_type::storage_type storage; unsigned int begin_bit = 0; const unsigned int end_bit = sizeof(T) * 8; while(begin_bit < end_bit) { const unsigned pass_bits = min(RadixBits, end_bit - begin_bit); if ROCPRIM_IF_CONSTEXPR(Descending) { rank_type().rank_keys_desc(keys, ranks, storage, begin_bit, pass_bits); } else { rank_type().rank_keys(keys, ranks, storage, begin_bit, pass_bits); } begin_bit += RadixBits; } } rp::block_store_direct_striped(lid, ranks_output + block_offset, ranks); } template void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int grid_size = ((N + items_per_block - 1) / items_per_block); const unsigned int size = items_per_block * grid_size; std::vector input; if ROCPRIM_IF_CONSTEXPR(std::is_floating_point::value) { input = get_random_data(size, static_cast(-1000), static_cast(1000)); } else { input = get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } T* d_input; unsigned int* d_output; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_output, size * sizeof(unsigned int))); HIP_CHECK(hipMemcpy(d_input, input.data(), 
size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); for(auto _ : state) { auto start = std::chrono::high_resolution_clock::now(); hipLaunchKernelGGL(HIP_KERNEL_NAME(rank_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output); HIP_CHECK(hipPeekAtLastError()); HIP_CHECK(hipDeviceSynchronize()); auto end = std::chrono::high_resolution_clock::now(); auto elapsed_seconds = std::chrono::duration_cast>(end - start); state.SetIterationTime(elapsed_seconds.count()); } state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT, KIND) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:radix_rank,key_type:" #T ",cfg:{bs:" #BS \ ",ipt:" #IPT ",method:" #KIND "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) // clang-format off #define CREATE_BENCHMARK_KINDS(type, block, ipt) \ CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::basic), \ CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::basic_memoize), \ CREATE_BENCHMARK(type, block, ipt, rp::block_radix_rank_algorithm::match) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK_KINDS(type, block, 1), \ CREATE_BENCHMARK_KINDS(type, block, 4), \ CREATE_BENCHMARK_KINDS(type, block, 8), \ CREATE_BENCHMARK_KINDS(type, block, 12), \ CREATE_BENCHMARK_KINDS(type, block, 16), \ CREATE_BENCHMARK_KINDS(type, block, 20) // clang-format on void add_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(uint8_t, 128), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(uint8_t, 512), BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 512), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int 
main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_block_radix_sort.cpp000066400000000000000000000234111446201466700237500ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif enum class benchmark_kinds { sort_keys, sort_pairs }; namespace rp = rocprim; template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void sort_keys_kernel(const T * input, T * output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; rp::block_load_direct_striped(lid, input + block_offset, keys); ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { rp::block_radix_sort sort; sort.sort(keys); } rp::block_store_direct_striped(lid, output + block_offset, keys); } template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void sort_pairs_kernel(const T * input, T * output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T keys[ItemsPerThread]; T values[ItemsPerThread]; rp::block_load_direct_striped(lid, input + block_offset, keys); for(unsigned int i = 0; i < ItemsPerThread; i++) { values[i] = keys[i] + T(1); } ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { 
rp::block_radix_sort sort; sort.sort(keys, values); } for(unsigned int i = 0; i < ItemsPerThread; i++) { keys[i] += values[i]; } rp::block_store_direct_striped(lid, output + block_offset, keys); } template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials = 10 > void run_benchmark(benchmark::State& state, benchmark_kinds benchmark_kind, hipStream_t stream, size_t N) { constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); std::vector input; if(std::is_floating_point::value) { input = get_random_data(size, (T)-1000, (T)+1000); } else { input = get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); if(benchmark_kind == benchmark_kinds::sort_keys) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_keys_kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); } else if(benchmark_kind == benchmark_kinds::sort_pairs) { hipLaunchKernelGGL( HIP_KERNEL_NAME(sort_pairs_kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); } HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); 
HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:radix_sort,key_type:" #T ",subalgo:" + name \ + ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ benchmark_kind, \ stream, \ size) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8) void add_benchmarks(benchmark_kinds benchmark_kind, const std::string& name, std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(int, 128), BENCHMARK_TYPE(int, 192), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(int, 320), BENCHMARK_TYPE(int, 512), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(int8_t, 128), BENCHMARK_TYPE(int8_t, 192), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(int8_t, 320), BENCHMARK_TYPE(int8_t, 512), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(uint8_t, 128), BENCHMARK_TYPE(uint8_t, 192), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(uint8_t, 320), BENCHMARK_TYPE(uint8_t, 512), BENCHMARK_TYPE(rocprim::half, 64), BENCHMARK_TYPE(rocprim::half, 128), BENCHMARK_TYPE(rocprim::half, 192), BENCHMARK_TYPE(rocprim::half, 256), BENCHMARK_TYPE(rocprim::half, 320), BENCHMARK_TYPE(rocprim::half, 512), BENCHMARK_TYPE(long long, 64), BENCHMARK_TYPE(long long, 128), BENCHMARK_TYPE(long long, 192), BENCHMARK_TYPE(long long, 256), BENCHMARK_TYPE(long long, 320), BENCHMARK_TYPE(long long, 512), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); 
parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmark_kinds::sort_keys, "keys", benchmarks, stream, size); add_benchmarks(benchmark_kinds::sort_pairs, "pairs", benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_block_reduce.cpp000066400000000000000000000227361446201466700230520ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct reduce { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; T reduced_value; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using breduce_t = rp::block_reduce; __shared__ typename breduce_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { breduce_t().reduce(values, reduced_value, storage); values[0] = reduced_value; } if(threadIdx.x == 0) { output[blockIdx.x] = reduced_value; } } }; template< class Benchmark, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * 
ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); // Allocate and fill memory std::vector input(size, T(1)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:block,algo:reduce,key_type:" #T \ ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \ + method_name + "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 11), \ CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, 
hipStream_t stream, size_t size) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector new_benchmarks = { // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(rocprim::half, 64), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(float, 256), BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(rocprim::half, 256), CREATE_BENCHMARK(custom_float2, 256, 1), CREATE_BENCHMARK(custom_float2, 256, 4), CREATE_BENCHMARK(custom_float2, 256, 8), CREATE_BENCHMARK(float2, 256, 1), CREATE_BENCHMARK(float2, 256, 4), CREATE_BENCHMARK(float2, 256, 8), CREATE_BENCHMARK(custom_double2, 256, 1), CREATE_BENCHMARK(custom_double2, 256, 4), CREATE_BENCHMARK(custom_double2, 256, 8), CREATE_BENCHMARK(double2, 256, 1), CREATE_BENCHMARK(double2, 256, 4), CREATE_BENCHMARK(double2, 256, 8), CREATE_BENCHMARK(float4, 256, 1), CREATE_BENCHMARK(float4, 256, 4), CREATE_BENCHMARK(float4, 256, 8), }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; // using_warp_scan using reduce_uwr_t = reduce; add_benchmarks(benchmarks, "using_warp_reduce", stream, size); // 
reduce then scan using reduce_rr_t = reduce; add_benchmarks(benchmarks, "raking_reduce", stream, size); // reduce commutative only using reduce_rrco_t = reduce; add_benchmarks(benchmarks, "raking_reduce_commutative_only", stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_block_scan.cpp000066400000000000000000000255521446201466700225260ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; template< class Runner, class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __global__ __launch_bounds__(BlockSize) void kernel(const T* input, T* output) { Runner::template run(input, output); } template struct inclusive_scan { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; T values[ItemsPerThread]; for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().inclusive_scan(values, values, storage); } for(unsigned int k = 0; k < ItemsPerThread; k++) { output[i * ItemsPerThread + k] = values[k]; } } }; template struct exclusive_scan { template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials > __device__ static void run(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; using U = typename std::remove_reference::type; T values[ItemsPerThread]; U init = U(100); for(unsigned int k = 0; k < ItemsPerThread; k++) { values[k] = input[i * ItemsPerThread + k]; } using bscan_t = rp::block_scan; __shared__ typename bscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { bscan_t().exclusive_scan(values, values, init, storage); } for(unsigned int k = 0; k < ItemsPerThread; k++) { output[i * ItemsPerThread + k] = values[k]; } } }; template< class Benchmark, class T, 
unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { // Make sure size is a multiple of BlockSize constexpr auto items_per_block = BlockSize * ItemsPerThread; const auto size = items_per_block * ((N + items_per_block - 1)/items_per_block); // Allocate and fill memory std::vector input(size, T(1)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input, d_output ); HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } // IPT - items per thread #define CREATE_BENCHMARK(T, BS, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:block,algo:scan,subalgo:" + algorithm_name \ + ",key_type:" #T ",cfg:{bs:" #BS ",ipt:" #IPT ",method:" \ + method_name + "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type, block) \ CREATE_BENCHMARK(type, block, 1), \ CREATE_BENCHMARK(type, 
block, 2), \ CREATE_BENCHMARK(type, block, 3), \ CREATE_BENCHMARK(type, block, 4), \ CREATE_BENCHMARK(type, block, 8), \ CREATE_BENCHMARK(type, block, 11), \ CREATE_BENCHMARK(type, block, 16) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, const std::string& algorithm_name, hipStream_t stream, size_t size) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector new_benchmarks = { // When block size is less than or equal to warp size BENCHMARK_TYPE(int, 64), BENCHMARK_TYPE(float, 64), BENCHMARK_TYPE(double, 64), BENCHMARK_TYPE(int8_t, 64), BENCHMARK_TYPE(uint8_t, 64), BENCHMARK_TYPE(rocprim::half, 64), BENCHMARK_TYPE(int, 256), BENCHMARK_TYPE(float, 256), BENCHMARK_TYPE(double, 256), BENCHMARK_TYPE(int8_t, 256), BENCHMARK_TYPE(uint8_t, 256), BENCHMARK_TYPE(rocprim::half, 256), CREATE_BENCHMARK(custom_float2, 256, 1), CREATE_BENCHMARK(custom_float2, 256, 4), CREATE_BENCHMARK(custom_float2, 256, 8), CREATE_BENCHMARK(float2, 256, 1), CREATE_BENCHMARK(float2, 256, 4), CREATE_BENCHMARK(float2, 256, 8), CREATE_BENCHMARK(custom_double2, 256, 1), CREATE_BENCHMARK(custom_double2, 256, 4), CREATE_BENCHMARK(custom_double2, 256, 8), CREATE_BENCHMARK(double2, 256, 1), CREATE_BENCHMARK(double2, 256, 4), CREATE_BENCHMARK(double2, 256, 8), CREATE_BENCHMARK(float4, 256, 1), CREATE_BENCHMARK(float4, 256, 4), CREATE_BENCHMARK(float4, 256, 8), }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); 
bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; // inclusive_scan using_warp_scan using inclusive_scan_uws_t = inclusive_scan; add_benchmarks( benchmarks, "inclusive_scan", "using_warp_scan", stream, size ); // exclusive_scan using_warp_scan using exclusive_scan_uws_t = exclusive_scan; add_benchmarks( benchmarks, "exclusive_scan", "using_warp_scan", stream, size ); // inclusive_scan reduce then scan using inclusive_scan_rts_t = inclusive_scan; add_benchmarks( benchmarks, "inclusive_scan", "reduce_then_scan", stream, size ); // exclusive_scan reduce then scan using exclusive_scan_rts_t = exclusive_scan; add_benchmarks( benchmarks, "exclusive_scan", "reduce_then_scan", stream, size ); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_block_sort.cpp000066400000000000000000000146541446201466700225720ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include // Google Benchmark #include // HIP API #include // CmdParser #include "cmdparser.hpp" #include "benchmark_block_sort.parallel.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif #define CREATE_BENCHMARK_IPT(K, V, BS, IPT) \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); \ config_autotune_register::create< \ block_sort_benchmark>(); #define CREATE_BENCHMARK(K, V, BS) \ CREATE_BENCHMARK_IPT(K, V, BS, 1) \ CREATE_BENCHMARK_IPT(K, V, BS, 4) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP const hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // If we are NOT config tuning run a selection of benchmarks // 
Block sizes as large as possible ar most relevant #ifndef BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(float, rocprim::empty_type, 256) CREATE_BENCHMARK(double, rocprim::empty_type, 256) CREATE_BENCHMARK(rocprim::half, rocprim::empty_type, 256) CREATE_BENCHMARK(uint8_t, rocprim::empty_type, 256) CREATE_BENCHMARK(int, rocprim::empty_type, 256) CREATE_BENCHMARK(int, rocprim::empty_type, 512) CREATE_BENCHMARK(double, rocprim::empty_type, 512) CREATE_BENCHMARK(int, int, 512) CREATE_BENCHMARK(float, double, 512) CREATE_BENCHMARK(double, int64_t, 512) CREATE_BENCHMARK(rocprim::half, int16_t, 512) CREATE_BENCHMARK(uint8_t, uint32_t, 512) #endif std::vector benchmarks = {}; config_autotune_register::register_benchmark_subset(benchmarks, 0, 1, size, stream); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_block_sort.parallel.hpp000066400000000000000000000266511446201466700243720ustar00rootroot00000000000000// MIT License // // Copyright (c) 2019-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_BLOCK_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_BLOCK_SORT_PARALLEL_HPP_ #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include "benchmark_utils.hpp" template::value, bool> = true> __global__ __launch_bounds__(BlockSize) void sort_kernel(const KeyType* input, KeyType* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; KeyType keys[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); rocprim::block_sort bsort; bsort.sort(keys); rocprim::block_store_direct_blocked(lid, output + block_offset, keys); } template::value, bool> = true> __global__ __launch_bounds__(BlockSize) void sort_kernel(const KeyType* input, KeyType* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; KeyType keys[ItemsPerThread]; ValueType values[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { values[item] = block_offset + lid * ItemsPerThread + item; } rocprim::block_sort bsort; bsort.sort(keys, values); ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { keys[item] = keys[item] + static_cast(values[item]); } rocprim::block_store_direct_blocked(lid, output + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void 
stable_sort_kernel(const KeyType* input, KeyType* output) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; KeyType keys[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, keys); using stable_key_type = rocprim::tuple; stable_key_type stable_keys[ItemsPerThread]; ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { stable_keys[item] = rocprim::make_tuple(keys[item], ItemsPerThread * lid + item); } // Special comparison that preserves relative order of equal keys auto stable_compare_function = [](const stable_key_type& a, const stable_key_type& b) mutable -> bool { const bool ab = rocprim::less{}(rocprim::get<0>(a), rocprim::get<0>(b)); return ab || (!rocprim::less{}(rocprim::get<0>(b), rocprim::get<0>(a)) && (rocprim::get<1>(a) < rocprim::get<1>(b))); }; rocprim::block_sort bsort; bsort.sort(stable_keys, stable_compare_function); ROCPRIM_UNROLL for(unsigned int item = 0; item < ItemsPerThread; ++item) { keys[item] = rocprim::get<0>(stable_keys[item]); } rocprim::block_store_direct_blocked(lid, output + block_offset, keys); } template struct block_sort_benchmark : public config_autotune_interface { private: static constexpr bool with_values = !std::is_same::value; static constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; static const char* get_block_sort_method_name(rocprim::block_sort_algorithm alg) { switch(alg) { case rocprim::block_sort_algorithm::merge_sort: return "merge_sort"; case rocprim::block_sort_algorithm::stable_merge_sort: return "stable_merge_sort"; case rocprim::block_sort_algorithm::bitonic_sort: return "bitonic_sort"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } public: std::string sort_key() const override { using namespace std::string_literals; return std::string((with_values ? "_pairs"s : "_keys"s) + (stable ? 
"_stable"s : ""s) + pad_string(std::to_string(items_per_block), 5) + ", " + name()); } std::string name() const override { return bench_naming::format_name( "{lvl:block,algo:sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",stable:" + (stable ? "true" : "false") + ",cfg:{bs:" + std::to_string(BlockSize) + ",ipt:" + std::to_string(ItemsPerThread) + ",method:" + std::string(get_block_sort_method_name(block_sort_algorithm)) + "}}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; static constexpr bool debug_synchronous = false; auto dispatch_block_sort(std::false_type /*stable_sort*/, size_t size, const hipStream_t stream, KeyType* d_input, KeyType* d_output) const { hipLaunchKernelGGL( HIP_KERNEL_NAME( sort_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); } auto dispatch_block_sort(std::true_type /*stable_sort*/, size_t size, const hipStream_t stream, KeyType* d_input, KeyType* d_output) const { hipLaunchKernelGGL(HIP_KERNEL_NAME(stable_sort_kernel), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_input, d_output); } void run(benchmark::State& state, const std::size_t N, const hipStream_t stream) const override { const auto size = items_per_block * ((N + items_per_block - 1) / items_per_block); std::vector input; if(std::is_floating_point::value) { input = get_random_data(size, (KeyType)-1000, (KeyType) + 1000); } else { input = get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } KeyType* d_input; KeyType* d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(KeyType))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(KeyType))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(KeyType), hipMemcpyHostToDevice)); HIP_CHECK(hipDeviceSynchronize()); static constexpr auto stable_tag = rocprim::detail::bool_constant{}; // HIP events creation hipEvent_t start, 
stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); // Run for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { dispatch_block_sort(stable_tag, size, stream, d_input, d_output); } HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(KeyType)); state.SetItemsProcessed(state.iterations() * batch_size * size); state.counters["sorted_size"] = benchmark::Counter(BlockSize * ItemsPerThread, benchmark::Counter::kDefaults, benchmark::Counter::OneK::kIs1024); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } }; #endif // ROCPRIM_BENCHMARK_BLOCK_SORT_PARALLEL_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_config_dispatch.cpp000066400000000000000000000041641446201466700235500ustar00rootroot00000000000000#include #include #include #include #include "benchmark_utils.hpp" enum class stream_kind { default_stream, per_thread_stream, explicit_stream, async_stream }; static void BM_host_target_arch(benchmark::State& state, const stream_kind stream_kind) { const hipStream_t stream = [stream_kind]() -> hipStream_t { hipStream_t stream = 0; switch(stream_kind) { case stream_kind::default_stream: return stream; case stream_kind::per_thread_stream: return hipStreamPerThread; case stream_kind::explicit_stream: HIP_CHECK(hipStreamCreate(&stream)); return stream; case stream_kind::async_stream: HIP_CHECK(hipStreamCreateWithFlags(&stream, hipStreamNonBlocking)); return stream; } }(); for(auto _ : state) { rocprim::detail::target_arch target_arch; 
HIP_CHECK(rocprim::detail::host_target_arch(stream, target_arch)); benchmark::DoNotOptimize(target_arch); } if(stream_kind != stream_kind::default_stream && stream_kind != stream_kind::per_thread_stream) { HIP_CHECK(hipStreamDestroy(stream)); } } __global__ void empty_kernel() {} // An empty kernel launch for baseline static void BM_kernel_launch(benchmark::State& state) { static constexpr hipStream_t stream = 0; for(auto _ : state) { hipLaunchKernelGGL(empty_kernel, dim3(1), dim3(1), 0, stream); HIP_CHECK(hipGetLastError()); } hipStreamSynchronize(stream); } BENCHMARK_CAPTURE(BM_host_target_arch, default_stream, stream_kind::default_stream); BENCHMARK_CAPTURE(BM_host_target_arch, per_thread_stream, stream_kind::per_thread_stream); BENCHMARK_CAPTURE(BM_host_target_arch, explicit_stream, stream_kind::explicit_stream); BENCHMARK_CAPTURE(BM_host_target_arch, async_stream, stream_kind::async_stream); BENCHMARK(BM_kernel_launch); int main(int argc, char** argv) { benchmark::Initialize(&argc, argv); add_common_benchmark_info(); benchmark::RunSpecifiedBenchmarks(); }rocPRIM-rocm-5.7.1/benchmark/benchmark_device_adjacent_difference.cpp000066400000000000000000000117001446201466700256600ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include // CmdParser #include "cmdparser.hpp" #include "benchmark_device_adjacent_difference.parallel.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N constexpr std::size_t DEFAULT_N = 1024 * 1024 * 128; #endif #define CREATE_BENCHMARK(T, left, in_place) \ { \ const device_adjacent_difference_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, size, stream, instance); \ } // clang-format off #define CREATE_BENCHMARKS(T) \ CREATE_BENCHMARK(T, true, false) \ CREATE_BENCHMARK(T, true, true) \ CREATE_BENCHMARK(T, false, false) \ CREATE_BENCHMARK(T, false, true) // clang-format on int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); 
bench_naming::set_format(parser.get("name_format")); // HIP const hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, stream); #else // BENCHMARK_CONFIG_TUNING using custom_float2 = custom_type; using custom_double2 = custom_type; // Add benchmarks CREATE_BENCHMARKS(int) CREATE_BENCHMARKS(std::int64_t) CREATE_BENCHMARKS(uint8_t) CREATE_BENCHMARKS(rocprim::half) CREATE_BENCHMARKS(float) CREATE_BENCHMARKS(double) CREATE_BENCHMARKS(custom_float2) CREATE_BENCHMARKS(custom_double2) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_adjacent_difference.parallel.cpp.in000066400000000000000000000027631446201466700300710ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_utils.hpp" #include "benchmark_device_adjacent_difference.parallel.hpp" namespace { auto benchmarks = config_autotune_register::create>>(); } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_adjacent_difference.parallel.hpp000066400000000000000000000217051446201466700274660ustar00rootroot00000000000000// MIT License // // Copyright (c) 2019-2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #ifndef ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include "benchmark_utils.hpp" template> struct device_adjacent_difference_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:adjacent_difference" + (left ? ""s : "_right"s) + (in_place ? 
"_inplace"s : ""s) + ",key_type:" + std::string(Traits::name()) + ",cfg:{bs:" + std::to_string(Config::block_size) + ",ipt:" + std::to_string(Config::items_per_thread) + "}}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; template auto dispatch_adjacent_difference(std::true_type /*left*/, std::false_type /*in_place*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) const { return ::rocprim::adjacent_difference(temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::false_type /*in_place*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt output, Args&&... args) const { return ::rocprim::adjacent_difference_right(temporary_storage, storage_size, input, output, std::forward(args)...); } template auto dispatch_adjacent_difference(std::true_type /*left*/, std::true_type /*in_place*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... args) const { return ::rocprim::adjacent_difference_inplace(temporary_storage, storage_size, input, std::forward(args)...); } template auto dispatch_adjacent_difference(std::false_type /*left*/, std::true_type /*in_place*/, void* const temporary_storage, std::size_t& storage_size, const InputIt input, const OutputIt /*output*/, Args&&... 
args) const { return ::rocprim::adjacent_difference_right_inplace(temporary_storage, storage_size, input, std::forward(args)...); } void run(benchmark::State& state, const std::size_t size, const hipStream_t stream) const override { using output_type = T; static constexpr bool debug_synchronous = false; // Generate data const std::vector input = get_random_data(size, 1, 100); T* d_input; output_type* d_output = nullptr; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0]))); HIP_CHECK(hipMemcpy(d_input, input.data(), input.size() * sizeof(input[0]), hipMemcpyHostToDevice)); if(!in_place) { HIP_CHECK(hipMalloc(&d_output, size * sizeof(output_type))); } static constexpr auto left_tag = rocprim::detail::bool_constant{}; static constexpr auto in_place_tag = rocprim::detail::bool_constant{}; // Allocate temporary storage std::size_t temp_storage_size; void* d_temp_storage = nullptr; const auto launch = [&] { return dispatch_adjacent_difference(left_tag, in_place_tag, d_temp_storage, temp_storage_size, d_input, d_output, size, rocprim::plus<>{}, stream, debug_synchronous); }; HIP_CHECK(launch()); HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(launch()); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); // Run for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(launch()); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); 
state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); if(!in_place) { hipFree(d_output); } hipFree(d_temp_storage); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_ADJACENT_DIFFERENCE_PARALLEL_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_device_binary_search.cpp000066400000000000000000000174661446201466700245650ustar00rootroot00000000000000// MIT License // // Copyright (c) 2019 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_lower_bound_benchmark(benchmark::State& state, hipStream_t stream, size_t haystack_size, size_t needles_size, bool sorted_needles) { using haystack_type = T; using needle_type = T; using output_type = size_t; using compare_op_type = typename std::conditional::value, half_less, rocprim::less>::type; compare_op_type compare_op; // Generate data std::vector haystack(haystack_size); std::iota(haystack.begin(), haystack.end(), 0); std::vector needles = get_random_data( needles_size, needle_type(0), needle_type(haystack_size) ); if(sorted_needles) { std::sort(needles.begin(), needles.end(), compare_op); } haystack_type * d_haystack; needle_type * d_needles; output_type * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_haystack), haystack_size * sizeof(haystack_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_needles), needles_size * sizeof(needle_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), needles_size * sizeof(output_type))); HIP_CHECK( hipMemcpy( d_haystack, haystack.data(), haystack_size * sizeof(haystack_type), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_needles, needles.data(), needles_size * sizeof(needle_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes; HIP_CHECK( rocprim::lower_bound( d_temporary_storage, temporary_storage_bytes, d_haystack, d_needles, d_output, haystack_size, needles_size, compare_op, stream ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( rocprim::lower_bound( d_temporary_storage, temporary_storage_bytes, 
d_haystack, d_needles, d_output, haystack_size, needles_size, compare_op, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( rocprim::lower_bound( d_temporary_storage, temporary_storage_bytes, d_haystack, d_needles, d_output, haystack_size, needles_size, compare_op, stream ) ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * needles_size * sizeof(needle_type)); state.SetItemsProcessed(state.iterations() * batch_size * needles_size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_haystack)); HIP_CHECK(hipFree(d_needles)); HIP_CHECK(hipFree(d_output)); } #define CREATE_LOWER_BOUND_BENCHMARK(T, K, SORTED) \ benchmark::RegisterBenchmark( \ bench_naming::format_name( \ "{lvl:device,algo:binary_search,key_type:" #T ",subalgo:" #K "_percent_" \ + std::string(SORTED ? 
"sorted" : "random") + "_needles,cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { run_lower_bound_benchmark(state, stream, size, size * K / 100, SORTED); }) #define BENCHMARK_TYPE(type) \ CREATE_LOWER_BOUND_BENCHMARK(type, 10, false), \ CREATE_LOWER_BOUND_BENCHMARK(type, 10, true) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); using custom_float2 = custom_type; using custom_double2 = custom_type; // Add benchmarks std::vector benchmarks = { BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half), BENCHMARK_TYPE(custom_float2), BENCHMARK_TYPE(custom_double2) }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_histogram.cpp000066400000000000000000000744031446201466700237430ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" // HIP API #include // rocPRIM #include #include "benchmark_device_histogram.parallel.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; int get_entropy_percents(int entropy_reduction) { switch(entropy_reduction) { case 0: return 100; case 1: return 81; case 2: return 54; case 3: return 33; case 4: return 20; default: return 0; } } const int entropy_reductions[] = {0, 2, 4, 6}; template void run_even_benchmark(benchmark::State& state, size_t bins, size_t scale, int entropy_reduction, hipStream_t stream, size_t size) { using counter_type = unsigned int; using level_type = typename std::conditional_t::value && sizeof(T) < sizeof(int), int, T>; const level_type lower_level = 0; const level_type upper_level = bins * scale; // Generate data std::vector input = generate(size, entropy_reduction, lower_level, upper_level); T* d_input; counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::histogram_even(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, bins + 1, lower_level, upper_level, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::histogram_even(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, bins + 1, lower_level, upper_level, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; 
HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::histogram_even(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, bins + 1, lower_level, upper_level, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_histogram)); } template void run_multi_even_benchmark(benchmark::State& state, size_t bins, size_t scale, int entropy_reduction, hipStream_t stream, size_t size) { using counter_type = unsigned int; using level_type = typename std::conditional_t::value && sizeof(T) < sizeof(int), int, T>; unsigned int num_levels[ActiveChannels]; level_type lower_level[ActiveChannels]; level_type upper_level[ActiveChannels]; for(unsigned int channel = 0; channel < ActiveChannels; channel++) { lower_level[channel] = 0; upper_level[channel] = bins * scale; num_levels[channel] = bins + 1; } // Generate data std::vector input = generate(size * Channels, entropy_reduction, lower_level[0], upper_level[0]); T* d_input; counter_type* d_histogram[ActiveChannels]; HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_histogram[channel], bins * sizeof(counter_type))); } HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice)); void* 
d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK((rp::multi_histogram_even(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, num_levels, lower_level, upper_level, stream, false))); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK((rp::multi_histogram_even(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, num_levels, lower_level, upper_level, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((rp::multi_histogram_even(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, num_levels, lower_level, upper_level, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipFree(d_histogram[channel])); } } template void run_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) { using counter_type = unsigned int; using level_type = typename std::conditional_t::value && sizeof(T) < sizeof(int), int, T>; // Generate data std::vector input = 
get_random_data(size, 0, bins); std::vector levels(bins + 1); for(size_t i = 0; i < levels.size(); i++) { levels[i] = static_cast(i); } T* d_input; level_type* d_levels; counter_type* d_histogram; HIP_CHECK(hipMalloc(&d_input, size * sizeof(T))); HIP_CHECK(hipMalloc(&d_levels, (bins + 1) * sizeof(level_type))); HIP_CHECK(hipMalloc(&d_histogram, size * sizeof(counter_type))); HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice)); HIP_CHECK( hipMemcpy(d_levels, levels.data(), (bins + 1) * sizeof(level_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::histogram_range(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, bins + 1, d_levels, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::histogram_range(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, bins + 1, d_levels, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::histogram_range(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, bins + 1, d_levels, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * 
size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_levels)); HIP_CHECK(hipFree(d_histogram)); } template void run_multi_range_benchmark(benchmark::State& state, size_t bins, hipStream_t stream, size_t size) { using counter_type = unsigned int; using level_type = typename std::conditional_t::value && sizeof(T) < sizeof(int), int, T>; const int num_levels_channel = bins + 1; unsigned int num_levels[ActiveChannels]; std::vector levels[ActiveChannels]; for(unsigned int channel = 0; channel < ActiveChannels; channel++) { levels[channel].resize(num_levels_channel); for(size_t i = 0; i < levels[channel].size(); i++) { levels[channel][i] = static_cast(i); } num_levels[channel] = num_levels_channel; } // Generate data std::vector input = get_random_data(size * Channels, 0, bins); T* d_input; level_type* d_levels[ActiveChannels]; counter_type* d_histogram[ActiveChannels]; HIP_CHECK(hipMalloc(&d_input, size * Channels * sizeof(T))); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_levels[channel], num_levels_channel * sizeof(level_type))); HIP_CHECK(hipMalloc(&d_histogram[channel], size * sizeof(counter_type))); } HIP_CHECK(hipMemcpy(d_input, input.data(), size * Channels * sizeof(T), hipMemcpyHostToDevice)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMemcpy(d_levels[channel], levels[channel].data(), num_levels_channel * sizeof(level_type), hipMemcpyHostToDevice)); } void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK((rp::multi_histogram_range(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, num_levels, d_levels, stream, false))); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK((rp::multi_histogram_range(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, 
num_levels, d_levels, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((rp::multi_histogram_range(d_temporary_storage, temporary_storage_bytes, d_input, size, d_histogram, num_levels, d_levels, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * Channels * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size * Channels); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipFree(d_levels[channel])); HIP_CHECK(hipFree(d_histogram[channel])); } } #define CREATE_EVEN_BENCHMARK(VECTOR, T, BINS, SCALE) \ VECTOR.push_back(benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:histogram_even,value_type:" #T ",entropy:" \ + std::to_string(get_entropy_percents(entropy_reduction)) \ + ",bins:" + std::to_string(BINS) + ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { run_even_benchmark(state, BINS, SCALE, entropy_reduction, stream, size); })); #define BENCHMARK_EVEN_TYPE(VECTOR, T, S) \ CREATE_EVEN_BENCHMARK(VECTOR, T, 10, S); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 100, S); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 1000, S); \ CREATE_EVEN_BENCHMARK(VECTOR, T, 10000, S); void add_even_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { for(int entropy_reduction : 
entropy_reductions) { BENCHMARK_EVEN_TYPE(benchmarks, long long, 12345); BENCHMARK_EVEN_TYPE(benchmarks, int, 1234); BENCHMARK_EVEN_TYPE(benchmarks, short, 5); CREATE_EVEN_BENCHMARK(benchmarks, unsigned char, 16, 16); CREATE_EVEN_BENCHMARK(benchmarks, unsigned char, 256, 1); BENCHMARK_EVEN_TYPE(benchmarks, double, 1234); BENCHMARK_EVEN_TYPE(benchmarks, float, 1234); BENCHMARK_EVEN_TYPE(benchmarks, rocprim::half, 5); }; } #define CREATE_MULTI_EVEN_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS, SCALE) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:multi_histogram_even,value_type:" #T \ ",channels:" #CHANNELS ",active_channels:" #ACTIVE_CHANNELS \ ",entropy:" \ + std::to_string(get_entropy_percents(entropy_reduction)) \ + ",bins:" + std::to_string(BINS) + ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { \ run_multi_even_benchmark(state, \ BINS, \ SCALE, \ entropy_reduction, \ stream, \ size); \ }) #define BENCHMARK_MULTI_EVEN_TYPE(C, A, T, S) \ CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 10, S), CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 100, S), \ CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 1000, S), \ CREATE_MULTI_EVEN_BENCHMARK(C, A, T, 10000, S) void add_multi_even_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { for(int entropy_reduction : entropy_reductions) { std::vector bs = { BENCHMARK_MULTI_EVEN_TYPE(4, 4, int, 1234), BENCHMARK_MULTI_EVEN_TYPE(4, 3, short, 5), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 16, 16), CREATE_MULTI_EVEN_BENCHMARK(4, 3, unsigned char, 256, 1), BENCHMARK_MULTI_EVEN_TYPE(3, 3, float, 1234), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); }; } #define CREATE_RANGE_BENCHMARK(T, BINS) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:histogram_range,value_type:" #T ",bins:" \ + std::to_string(BINS) + ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) { run_range_benchmark(state, BINS, stream, size); }) #define 
BENCHMARK_RANGE_TYPE(T) \ CREATE_RANGE_BENCHMARK(T, 10), CREATE_RANGE_BENCHMARK(T, 100), \ CREATE_RANGE_BENCHMARK(T, 1000), CREATE_RANGE_BENCHMARK(T, 10000) void add_range_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_RANGE_TYPE(long long), BENCHMARK_RANGE_TYPE(int), BENCHMARK_RANGE_TYPE(short), CREATE_RANGE_BENCHMARK(unsigned char, 16), CREATE_RANGE_BENCHMARK(unsigned char, 256), BENCHMARK_RANGE_TYPE(double), BENCHMARK_RANGE_TYPE(float), BENCHMARK_RANGE_TYPE(rocprim::half), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_MULTI_RANGE_BENCHMARK(CHANNELS, ACTIVE_CHANNELS, T, BINS) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:multi_histogram_range,value_type:" #T \ ",channels:" #CHANNELS ",active_channels:" #ACTIVE_CHANNELS \ ",bins:" \ + std::to_string(BINS) + ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { run_multi_range_benchmark(state, BINS, stream, size); }) #define BENCHMARK_MULTI_RANGE_TYPE(C, A, T) \ CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 10), CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 100), \ CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 1000), CREATE_MULTI_RANGE_BENCHMARK(C, A, T, 10000) void add_multi_range_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_MULTI_RANGE_TYPE(4, 4, int), BENCHMARK_MULTI_RANGE_TYPE(4, 3, short), CREATE_MULTI_RANGE_BENCHMARK(4, 3, unsigned char, 16), CREATE_MULTI_RANGE_BENCHMARK(4, 3, unsigned char, 256), BENCHMARK_MULTI_RANGE_TYPE(3, 3, float), BENCHMARK_MULTI_RANGE_TYPE(2, 2, double), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); #ifdef 
BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, stream); #else // BENCHMARK_CONFIG_TUNING add_even_benchmarks(benchmarks, stream, size); add_multi_even_benchmarks(benchmarks, stream, size); add_range_benchmarks(benchmarks, stream, size); add_multi_range_benchmarks(benchmarks, stream, size); #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_histogram.parallel.cpp.in000066400000000000000000000026201446201466700261330ustar00rootroot00000000000000// MIT License // // Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_utils.hpp" #include "benchmark_device_histogram.parallel.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_histogram_benchmark_generator<@DataType@, @BlockSize@>::create); } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_histogram.parallel.hpp000066400000000000000000000363621446201466700255450ustar00rootroot00000000000000// MIT License // // Copyright (c) 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_HISTOGRAM_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_HISTOGRAM_PARALLEL_HPP_ #include #include #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include #include "benchmark_utils.hpp" template std::vector generate(size_t size, int entropy_reduction, int lower_level, int upper_level) { if(entropy_reduction >= 5) { return std::vector(size, static_cast((lower_level + upper_level) / 2)); } const size_t max_random_size = 1024 * 1024 + 4321; const unsigned int seed = 123; std::default_random_engine gen(seed); std::vector data(size); std::generate(data.begin(), data.begin() + std::min(size, max_random_size), [&]() { // Reduce enthropy by applying bitwise AND to random bits // "An Improved Supercomputer Sorting Benchmark", 1992 // Kurt Thearling & Stephen Smith auto v = gen(); for(int e = 0; e < entropy_reduction; e++) { v &= gen(); } return T(lower_level + v % (upper_level - lower_level)); }); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } // Cache for input data when multiple cases must be benchmarked with various configurations and // same inputs can be used for consecutive benchmarks. // It must be used as a singleton. class input_cache { public: ~input_cache() { clear(); } void clear() { for(auto& i : cache) { HIP_CHECK(hipFree(i.second)); } cache.clear(); } // The function returns an exisitng buffer if main_key matches and there is additional_key // in the cache or generates a new buffer using gen(). // If main_key does not match, it frees all device buffers and resets the cache. 
template T* get_or_generate(const std::string& main_key, const std::string& additional_key, size_t size, F gen) { if(this->main_key != main_key) { // The main key (for example, data type) has been changed, clear the cache clear(); this->main_key = main_key; } auto result = cache.find(additional_key); if(result != cache.end()) { return reinterpret_cast(result->second); } // Generate a new buffer std::vector data = gen(); T* d_buffer; HIP_CHECK(hipMalloc(&d_buffer, size * sizeof(T))); HIP_CHECK(hipMemcpy(d_buffer, data.data(), size * sizeof(T), hipMemcpyHostToDevice)); cache[additional_key] = d_buffer; return d_buffer; } static input_cache& instance() { static input_cache instance; return instance; } private: std::string main_key; std::map cache; }; template std::string config_name() { const rocprim::detail::histogram_config_params config = Config(); return "{bs:" + std::to_string(config.histogram_config.block_size) + ",ipt:" + std::to_string(config.histogram_config.items_per_thread) + ",max_grid_size:" + std::to_string(config.max_grid_size) + ",shared_impl_max_bins:" + std::to_string(config.shared_impl_max_bins) + ",shared_impl_histograms:" + std::to_string(config.shared_impl_histograms) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_histogram_benchmark : public config_autotune_interface { std::vector cases; device_histogram_benchmark(const std::vector& cases) : cases(cases) {} std::string name() const override { using namespace std::string_literals; return bench_naming::format_name( "{lvl:device,algo:histogram,value_type:" + std::string(Traits::name()) + ",channels:" + std::to_string(Channels) + ",active_channels:" + std::to_string(ActiveChannels) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 3; static constexpr unsigned int warmup_size = 5; void run(benchmark::State& state, const std::size_t full_size, const hipStream_t stream) const override { using counter_type = unsigned 
int; using level_type = typename std:: conditional_t::value && sizeof(T) < sizeof(int), int, T>; struct case_data { level_type lower_level[ActiveChannels]; level_type upper_level[ActiveChannels]; unsigned int num_levels[ActiveChannels]; T* d_input; }; const std::size_t size = full_size / Channels; size_t temporary_storage_bytes = 0; void* d_temporary_storage = nullptr; counter_type* d_histogram[ActiveChannels]; unsigned int max_bins = 0; std::vector cases_data; for(auto& bins : cases) { for(int entropy_reduction : {0, 2, 4, 6}) { case_data data; // Reuse inputs for the same sample type. This autotune uses multipe inputs for all // combinations of bins and entropy, but the inputs do not depend on autotuned // params (bs, ipt, shared_impl_max_bins) and can be reused saving time needed for // generating and copying to device. data.d_input = input_cache::instance().get_or_generate( std::string(Traits::name()), std::to_string(bins) + "_" + std::to_string(entropy_reduction), full_size, [&]() { return generate(full_size, entropy_reduction, 0, bins); }); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { data.lower_level[channel] = 0; data.upper_level[channel] = bins; data.num_levels[channel] = bins + 1; } cases_data.push_back(data); size_t current_temporary_storage_bytes = 0; HIP_CHECK((rocprim::multi_histogram_even( d_temporary_storage, current_temporary_storage_bytes, data.d_input, size, d_histogram, data.num_levels, data.lower_level, data.upper_level, stream, false))); temporary_storage_bytes = std::max(temporary_storage_bytes, current_temporary_storage_bytes); max_bins = std::max(max_bins, bins); } } HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipMalloc(&d_histogram[channel], max_bins * sizeof(counter_type))); } HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { for(auto& data : cases_data) { 
HIP_CHECK((rocprim::multi_histogram_even( d_temporary_storage, temporary_storage_bytes, data.d_input, size, d_histogram, data.num_levels, data.lower_level, data.upper_level, stream, false))); } } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(auto& data : cases_data) { for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((rocprim::multi_histogram_even( d_temporary_storage, temporary_storage_bytes, data.d_input, size, d_histogram, data.num_levels, data.lower_level, data.upper_level, stream, false))); } } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * cases_data.size() * batch_size * size * Channels * sizeof(T)); state.SetItemsProcessed(state.iterations() * cases_data.size() * batch_size * size * Channels); HIP_CHECK(hipFree(d_temporary_storage)); for(unsigned int channel = 0; channel < ActiveChannels; channel++) { HIP_CHECK(hipFree(d_histogram[channel])); } } }; template struct device_histogram_benchmark_generator { static constexpr unsigned int min_items_per_thread = 1; static constexpr unsigned int max_items_per_thread = 16; static constexpr unsigned int min_shared_impl_histograms = 2; static constexpr unsigned int max_shared_impl_histograms = 4; template struct create_ipt { template struct create_shared_impl_histograms { using generated_config = rocprim::histogram_config, 2048, 2048, SharedImplHistograms>; template auto create(std::vector>& storage, const std::vector& cases) -> typename std::enable_if<(items_per_thread * 
Channels <= max_items_per_thread), void>::type { storage.emplace_back( std::make_unique< device_histogram_benchmark>( cases)); } template auto create(std::vector>& storage, const std::vector& cases) -> typename std::enable_if::type {} void operator()(std::vector>& storage, const std::vector& cases) { // Tune histograms for single-channel data (histogram_even) create<1, 1>(storage, cases); // and some multi-channel configurations (multi_histogram_even) create<2, 2>(storage, cases); create<3, 3>(storage, cases); create<4, 4>(storage, cases); create<4, 3>(storage, cases); } }; void operator()(std::vector>& storage, const std::vector& cases) { static_for_each, create_shared_impl_histograms>(storage, cases); } }; static void create(std::vector>& storage) { // Benchmark multiple cases (with various sample distributions) and use sum of all cases // as a measurement for autotuning std::vector cases; if(std::is_same::value) { cases = {16, 127}; } else { cases = { 10, 100, 1000, 10000 // Multiple bins to trigger a global memory implementation }; } static_for_each, create_ipt>(storage, cases); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_HISTOGRAM_PARALLEL_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_device_memory.cpp000066400000000000000000001544661446201466700232660ustar00rootroot00000000000000// MIT License // // Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" // rocPRIM #include #include "benchmark_utils.hpp" enum memory_operation_method { block_primitives_transpose, striped, vectorized, block_primitive_direct, }; enum kernel_operation { no_operation, block_scan, custom_operation, atomics_no_collision, atomics_inter_block_collision, atomics_inter_warp_collision, }; template< kernel_operation Operation, class T, unsigned int ItemsPerThread, unsigned int BlockSize = 0 > struct operation; // no operation template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&)[ItemsPerThread], void* = nullptr, unsigned int = 0, T* = nullptr) { // No operation } }; #define repeats 30 // custom operation template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) { (void) shared_storage; (void) shared_storage_size; (void) global_mem_output; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { input[i] = input[i] + 666; ROCPRIM_UNROLL for(unsigned int j = 0; j < repeats; j++) { input[i] = input[i] * (input[j % ItemsPerThread]); } } } }; // block scan template struct operation { ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread], void* shared_storage = nullptr, unsigned int shared_storage_size = 0, T* global_mem_output = nullptr) { (void) 
global_mem_output;

        using block_scan_type = typename rocprim::block_scan<
            T, BlockSize,
            rocprim::block_scan_algorithm::using_warp_scan>;
        block_scan_type bscan;

        // when using vectorized or striped functions
        // NOTE: This is not safe but it is the easiest way to prevent code repetition
        // If the caller passed no storage, or storage smaller than the scan needs, fall back
        // to a __shared__ buffer declared right here.
        if(shared_storage == nullptr
           || shared_storage_size < sizeof(typename block_scan_type::storage_type))
        {
            __shared__ typename block_scan_type::storage_type storage;
            shared_storage = &storage;
        }

        // In-place inclusive scan over this thread's items.
        bscan.inclusive_scan(
            input, input,
            *(reinterpret_cast(shared_storage))
        );
        __syncthreads();
    }
};

// atomics_no_collision
// Each thread adds to ItemsPerThread consecutive output slots. The start index combines
// threadIdx.x with a per-block base (blockIdx.x * blockDim.x * ItemsPerThread), so no two
// threads of a 1D launch are given the same slot.
template struct operation
{
    ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread],
                                               void*        shared_storage      = nullptr,
                                               unsigned int shared_storage_size = 0,
                                               T*           global_mem_output   = nullptr)
    {
        (void) shared_storage;
        (void) shared_storage_size;
        (void) input;
        unsigned int index = threadIdx.x * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread;
        ROCPRIM_UNROLL
        for(unsigned int i = 0; i < ItemsPerThread; i++)
        {
            atomicAdd(&global_mem_output[index + i], T(666));
        }
    }
};

// atomics_inter_warp_collision
// NOTE(review): this header previously read "atomics_inter_block_collision", the same as the
// specialization that follows. The index below is built from threadIdx.x % warpSize plus a
// per-block base, so threads of one block whose threadIdx.x values are congruent modulo
// warpSize add to the same slots, while different blocks use different bases - presumably this
// is the inter-warp variant; confirm against the specialization arguments.
template struct operation
{
    ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread],
                                               void*        shared_storage      = nullptr,
                                               unsigned int shared_storage_size = 0,
                                               T*           global_mem_output   = nullptr)
    {
        (void) shared_storage;
        (void) shared_storage_size;
        (void) input;
        unsigned int index = (threadIdx.x % warpSize) * ItemsPerThread + blockIdx.x * blockDim.x * ItemsPerThread;
        ROCPRIM_UNROLL
        for(unsigned int i = 0; i < ItemsPerThread; i++)
        {
            atomicAdd(&global_mem_output[index + i], T(666));
        }
    }
};

// atomics_inter_block_collision
// The index depends on threadIdx.x only (no blockIdx.x term), so threads with the same
// threadIdx.x in different blocks add to the same output slots.
template struct operation
{
    ROCPRIM_HOST_DEVICE inline void operator()(T (&input)[ItemsPerThread],
                                               void*        shared_storage      = nullptr,
                                               unsigned int shared_storage_size = 0,
                                               T*           global_mem_output   = nullptr)
    {
        (void) shared_storage;
        (void) shared_storage_size;
        (void) input;
        unsigned int index = threadIdx.x * ItemsPerThread;
        ROCPRIM_UNROLL
        for(unsigned int i
= 0; i < ItemsPerThread; i++) { atomicAdd(&global_mem_output[index + i], T(666)); } } }; // block_primitive_direct method base kernel template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, memory_operation_method MemOp, class CustomOp = typename operation::value_type, typename std::enable_if::type = 0 > __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; using block_load_type = typename rocprim::block_load< T, BlockSize, ItemsPerThread, rocprim::block_load_method::block_load_direct>; using block_store_type = typename rocprim::block_store< T, BlockSize, ItemsPerThread, rocprim::block_store_method::block_store_direct>; block_load_type load; block_store_type store; __shared__ union { typename block_load_type::storage_type load; typename block_store_type::storage_type store; } storage; int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); __syncthreads(); op(items, &storage, sizeof(storage), output); store.store(output + offset, items, storage.store); } // vectorized method base kernel template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, memory_operation_method MemOp, class CustomOp = typename operation::value_type, typename std::enable_if::type = 0 > __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; rocprim::block_load_direct_blocked_vectorized (threadIdx.x, input + offset, items); __syncthreads(); op(items, nullptr, 0, output); rocprim::block_store_direct_blocked_vectorized (threadIdx.x, output + offset, items); } // striped method base kernel template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, memory_operation_method MemOp, class CustomOp = typename 
operation::value_type, typename std::enable_if::type = 0 > __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { const unsigned int lid = threadIdx.x; const unsigned int block_offset = blockIdx.x * ItemsPerThread * BlockSize; T items[ItemsPerThread]; rocprim::block_load_direct_striped(lid, input + block_offset, items); op(items, nullptr, 0, output); rocprim::block_store_direct_striped(lid, output + block_offset, items); } // block_primitives_transpose method base kernel template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, memory_operation_method MemOp, class CustomOp = typename operation::value_type, typename std::enable_if::type = 0 > __global__ __launch_bounds__(BlockSize) void operation_kernel(T* input, T* output, CustomOp op) { constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; using block_load_type = typename rocprim::block_load< T, BlockSize, ItemsPerThread, rocprim::block_load_method::block_load_transpose>; using block_store_type = typename rocprim::block_store< T, BlockSize, ItemsPerThread, rocprim::block_store_method::block_store_transpose>; block_load_type load; block_store_type store; __shared__ union { typename block_load_type::storage_type load; typename block_store_type::storage_type store; } storage; int offset = blockIdx.x * items_per_block; T items[ItemsPerThread]; load.load(input + offset, items, storage.load); __syncthreads(); op(items, &storage, sizeof(storage), output); store.store(output + offset, items, storage.store); } template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, memory_operation_method MemOp, kernel_operation KernelOp = no_operation > void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream) { const size_t grid_size = size / (BlockSize * ItemsPerThread); std::vector input; if(std::is_floating_point::value) { input = get_random_data(size, (T)-1000, (T)+1000); } else { input = get_random_data( size, 
std::numeric_limits::min(), std::numeric_limits::max() ); } T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); operation selected_operation; // Warm-up for(size_t i = 0; i < 10; i++) { hipLaunchKernelGGL( HIP_KERNEL_NAME(operation_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, selected_operation ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { hipLaunchKernelGGL( HIP_KERNEL_NAME(operation_kernel), dim3(grid_size), dim3(BlockSize), 0, stream, d_input, d_output, selected_operation ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } template void run_benchmark_memcpy(benchmark::State& state, size_t size, const hipStream_t stream) { std::vector input; if(std::is_floating_point::value) { input = get_random_data(size, (T)-1000, (T)+1000); } else { input = get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); 
HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(hipMemcpy(d_output, d_input, size * sizeof(T), hipMemcpyDeviceToDevice)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(METHOD, OPERATION, T, SIZE, BLOCK_SIZE, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:memory,subalgo:" #METHOD \ ",operation:" #OPERATION ",key_type:" #T ",size:" #SIZE \ ",cfg:{bs:" #BLOCK_SIZE ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ SIZE, \ stream) #define CREATE_BENCHMARK_MEMCPY(T, SIZE) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:memory,subalgo:copy,key_type:" #T \ ",size:" #SIZE ",cfg:default_config}") \ .c_str(), \ run_benchmark_memcpy, \ SIZE, \ stream) template constexpr unsigned int megabytes(unsigned int size) { return(size * (1024 * 1024 / sizeof(T))); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("trials", "trials", -1, "number of iterations"); 
parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); // Add benchmarks std::vector benchmarks = { // simple memory copy not running kernel CREATE_BENCHMARK_MEMCPY(int, megabytes(128)), // simple memory copy CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, 
megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, no_operation, uint64_t, megabytes(128), 1024, 4), // simple memory copy using vector type CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 4), 
CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, no_operation, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, 
no_operation, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, no_operation, uint64_t, megabytes(128), 1024, 8), // simple memory copy using striped CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 1), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 2), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 4), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 8), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 128, 16), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 1), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 2), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 4), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 8), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 256, 16), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 512, 1), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 512, 2), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 512, 4), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 512, 8), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(striped, no_operation, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(striped, 
no_operation, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 1024, 4), CREATE_BENCHMARK(striped, no_operation, uint64_t, megabytes(128), 1024, 8), // block_scan CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 128, 32), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 4), 
CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, float, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, 
block_scan, float, megabytes(128), 1024, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, double, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, 
block_scan, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, block_scan, uint64_t, megabytes(128), 1024, 4), // vectorized - block_scan CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, 
block_scan, int, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, block_scan, int, megabytes(128), 1024, 8), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, block_scan, float, megabytes(128), 1024, 8), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, block_scan, double, 
megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 512, 2), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, block_scan, double, megabytes(128), 1024, 4), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 512, 2), 
CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 1024, 2), CREATE_BENCHMARK(vectorized, block_scan, uint64_t, megabytes(128), 1024, 4), // custom_op CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, int, megabytes(128), 1024, 4), 
CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, float, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 
8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, double, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, 
custom_operation, uint64_t, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, custom_operation, uint64_t, megabytes(128), 1024, 2), // block_primitives_transpose - atomics no collision CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 2), 
CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_no_collision, int, megabytes(128), 1024, 8), // block_primitives_transpose - atomics inter block collision CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, 
int, megabytes(128), 512, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_block_collision, int, megabytes(128), 1024, 8), // block_primitives_transpose - atomics inter warp collision CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 128, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 256, 16), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 4), 
CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 512, 8), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 1), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 2), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 4), CREATE_BENCHMARK(block_primitives_transpose, atomics_inter_warp_collision, int, megabytes(128), 1024, 8), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_merge.cpp000066400000000000000000000316741446201466700230500ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_merge_keys_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using key_type = Key; using compare_op_type = typename std::conditional::value, half_less, rocprim::less>::type; const size_t size1 = size / 2; const size_t size2 = size - size1; compare_op_type compare_op; // Generate data std::vector keys_input1 = get_random_data(size1, 0, size); std::vector keys_input2 = get_random_data(size2, 0, size); std::sort(keys_input1.begin(), keys_input1.end(), compare_op); std::sort(keys_input2.begin(), keys_input2.end(), compare_op); key_type * d_keys_input1; key_type * d_keys_input2; key_type * d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input1), size1 * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input2), size2 * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input1, keys_input1.data(), size1 * sizeof(key_type), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_keys_input2, keys_input2.data(), size2 * sizeof(key_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( rp::merge( d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, size1, size2, compare_op, stream, false ) ); 
HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( rp::merge( d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, size1, size2, compare_op, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( rp::merge( d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, size1, size2, compare_op, stream, false ) ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input1)); HIP_CHECK(hipFree(d_keys_input2)); HIP_CHECK(hipFree(d_keys_output)); } template void run_merge_pairs_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { using key_type = Key; using value_type = Value; using compare_op_type = typename std::conditional::value, half_less, rocprim::less>::type; const size_t size1 = size / 2; const size_t size2 = size - size1; compare_op_type compare_op; // Generate data std::vector keys_input1 = get_random_data(size1, 0, size); std::vector keys_input2 = get_random_data(size2, 0, size); std::sort(keys_input1.begin(), keys_input1.end(), compare_op); std::sort(keys_input2.begin(), keys_input2.end(), compare_op); std::vector 
values_input1(size1); std::vector values_input2(size2); std::iota(values_input1.begin(), values_input1.end(), 0); std::iota(values_input2.begin(), values_input2.end(), size1); key_type * d_keys_input1; key_type * d_keys_input2; key_type * d_keys_output; value_type * d_values_input1; value_type * d_values_input2; value_type * d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input1), size1 * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input2), size2 * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input1), size1 * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input2), size2 * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_keys_input1, keys_input1.data(), size1 * sizeof(key_type), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_keys_input2, keys_input2.data(), size2 * sizeof(key_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( rp::merge( d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, d_values_input1, d_values_input2, d_values_output, size1, size2, compare_op, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( rp::merge( d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, d_values_input1, d_values_input2, d_values_output, size1, size2, compare_op, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { 
HIP_CHECK( rp::merge( d_temporary_storage, temporary_storage_bytes, d_keys_input1, d_keys_input2, d_keys_output, d_values_input1, d_values_input2, d_values_output, size1, size2, compare_op, stream, false ) ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input1)); HIP_CHECK(hipFree(d_keys_input2)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input1)); HIP_CHECK(hipFree(d_values_input2)); HIP_CHECK(hipFree(d_values_output)); } #define CREATE_MERGE_KEYS_BENCHMARK(Key) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:merge,key_type:" #Key ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) { run_merge_keys_benchmark(state, stream, size); }) #define CREATE_MERGE_PAIRS_BENCHMARK(Key, Value) \ benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:device,algo:merge,key_type:" #Key \ ",value_type:" #Value \ ",cfg:default_config}") \ .c_str(), \ [=](benchmark::State& state) \ { run_merge_pairs_benchmark(state, stream, size); }) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = 
parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); using custom_int2 = custom_type; using custom_double2 = custom_type; // Add benchmarks std::vector benchmarks = { CREATE_MERGE_KEYS_BENCHMARK(int), CREATE_MERGE_KEYS_BENCHMARK(long long), CREATE_MERGE_KEYS_BENCHMARK(int8_t), CREATE_MERGE_KEYS_BENCHMARK(uint8_t), CREATE_MERGE_KEYS_BENCHMARK(rocprim::half), CREATE_MERGE_KEYS_BENCHMARK(short), CREATE_MERGE_KEYS_BENCHMARK(custom_int2), CREATE_MERGE_KEYS_BENCHMARK(custom_double2), CREATE_MERGE_PAIRS_BENCHMARK(int, int), CREATE_MERGE_PAIRS_BENCHMARK(long long, long long), CREATE_MERGE_PAIRS_BENCHMARK(int8_t, int8_t), CREATE_MERGE_PAIRS_BENCHMARK(uint8_t, uint8_t), CREATE_MERGE_PAIRS_BENCHMARK(rocprim::half, rocprim::half), CREATE_MERGE_PAIRS_BENCHMARK(short, short), CREATE_MERGE_PAIRS_BENCHMARK(custom_int2, custom_int2), CREATE_MERGE_PAIRS_BENCHMARK(custom_double2, custom_double2), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_merge_sort.cpp000066400000000000000000000101601446201466700241020ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include // CmdParser #include "cmdparser.hpp" #include "benchmark_device_merge_sort.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif #define CREATE_BENCHMARK(...) 
\ { \ const device_merge_sort_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, size, stream, instance); \ } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks = {}; CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; // used by ssbk benchmark using custom_longlong_double = custom_type; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_char_double) CREATE_BENCHMARK(custom_int2, custom_longlong_double) // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } 
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_merge_sort.hpp000066400000000000000000000274031446201466700241170ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_MERGE_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_MERGE_SORT_PARALLEL_HPP_ #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include "benchmark_utils.hpp" namespace rp = rocprim; template struct device_merge_sort_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:merge_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:default_config}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // keys benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; // Generate data std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data(size, static_cast(-1000), static_cast(1000)); } else { keys_input = get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation 
hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; using value_type = Value; // Generate data std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data(size, static_cast(-1000), static_cast(1000)); } else { keys_input = get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); 
HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::merge_sort(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t size, 
hipStream_t stream) const override { do_run(state, size, stream); } }; #endif // ROCPRIM_BENCHMARK_DEVICE_MERGE_SORT_PARALLEL_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_device_merge_sort_block_merge.cpp000066400000000000000000000123621446201466700264410ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include // CmdParser #include "cmdparser.hpp" #include "benchmark_device_merge_sort_block_merge.parallel.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif #define CREATE_BENCHMARK(...) 
\ { \ const device_merge_sort_block_merge_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, size, stream, instance); \ } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, stream); #else // BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using custom_longlong_double = custom_type; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) 
CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_char_double) CREATE_BENCHMARK(custom_int2, custom_longlong_double) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_merge_sort_block_merge.parallel.cpp.in000066400000000000000000000027071446201466700306430ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// NOTE(review): the first #include below has no target in this copy of the file - restore it
// from the upstream source before building.
#include
#include "benchmark_device_merge_sort_block_merge.parallel.hpp"
#include "benchmark_utils.hpp"

namespace
{
// Namespace-scope object initialized with the result of config_autotune_register::create_bulk,
// which is handed the generator's static `create` function.
// NOTE(review): the @BlockSize@, @UseMergePath@, @KeyType@ and @ValueType@ tokens look like
// placeholders substituted by the build system before compilation - confirm against the build
// scripts.
auto benchmarks = config_autotune_register::create_bulk(
    device_merge_sort_block_merge_benchmark_generator<@BlockSize@, @UseMergePath@, @KeyType@, @ValueType@>::create);
}
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_merge_sort_block_merge.parallel.hpp000066400000000000000000000466011446201466700302440ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#ifndef ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_MERGE_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_MERGE_PARALLEL_HPP_ #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include "benchmark_utils.hpp" namespace rp = rocprim; template std::string config_name() { const rocprim::detail::merge_sort_block_merge_config_params config = Config(); return "{oddeven_bs:" + std::to_string(config.merge_oddeven_config.block_size) + ",oddeven_ipt:" + std::to_string(config.merge_oddeven_config.items_per_thread) + ",oddeven_size_limit:" + std::to_string(config.merge_oddeven_config.size_limit) + ",mergepath_partition_bs:" + std::to_string(config.merge_mergepath_partition_config.block_size) + ",mergepath_bs:" + std::to_string(config.merge_mergepath_config.block_size) + ",mergepath_ipt:" + std::to_string(config.merge_mergepath_config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_merge_sort_block_merge_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:merge_sort_block_merge,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // Because merge_sort_block_merge expects partially sorted input: using block_sort_config = rocprim::default_config; // keys benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; // Generate data std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data(size, static_cast(-1000), static_cast(1000)); } else { keys_input = get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } 
key_type* d_keys_input; key_type* d_keys; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); hipDeviceSynchronize(); ::rocprim::less lesser_op; rocprim::empty_type* values_ptr = nullptr; // Merge_sort_block_merge algorithm expects partially sorted input: unsigned int sorted_block_size; HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_input, values_ptr, values_ptr, size, sorted_block_size, lesser_op, stream, false)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, values_ptr, size, sorted_block_size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); hipError_t err; // Warm-up for(size_t i = 0; i < warmup_size; i++) { err = rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, values_ptr, size, sorted_block_size, lesser_op, stream, false); } if(err == hipError_t::hipErrorAssert) { state.SkipWithError("SKIPPING: block_sort_items_per_block >= " "block_merge_items_per_block does not hold"); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys)); return; } else if(err != hipSuccess) { std::cout << "HIP error: " << err << " line: " << __LINE__ << std::endl; exit(err); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event hipMemcpyAsync(d_keys, d_keys_input, size * sizeof(key_type), hipMemcpyDeviceToDevice, stream); HIP_CHECK(hipEventRecord(start, stream)); HIP_CHECK(rp::detail::merge_sort_block_merge(d_temporary_storage, 
temporary_storage_bytes, d_keys, values_ptr, size, sorted_block_size, lesser_op, stream, false)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; using value_type = Value; // Generate data std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data(size, static_cast(-1000), static_cast(1000)); } else { keys_input = get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); key_type* d_keys_input; key_type* d_keys; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); hipDeviceSynchronize(); ::rocprim::less lesser_op; // Merge_sort_block_merge algorithm expects partially sorted input: 
unsigned int sorted_block_size; HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_input, d_values_input, d_values_input, size, sorted_block_size, lesser_op, stream, false)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, d_values, size, sorted_block_size, lesser_op, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); hipError_t err; // Warm-up for(size_t i = 0; i < warmup_size; i++) { err = rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, d_values, size, sorted_block_size, lesser_op, stream, false); } if(err == hipError_t::hipErrorAssert) { state.SkipWithError("SKIPPING: block_sort_items_per_block >= " "block_merge_items_per_block does not hold"); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values)); return; } else if(err != hipSuccess) { std::cout << "HIP error: " << err << " line: " << __LINE__ << std::endl; exit(err); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event hipMemcpyAsync(d_keys, d_keys_input, size * sizeof(key_type), hipMemcpyDeviceToDevice, stream); hipMemcpyAsync(d_values, d_values_input, size * sizeof(key_type), hipMemcpyDeviceToDevice, stream); HIP_CHECK(hipEventRecord(start, stream)); HIP_CHECK(rp::detail::merge_sort_block_merge(d_temporary_storage, temporary_storage_bytes, d_keys, d_values, size, sorted_block_size, lesser_op, stream, false)); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, 
start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values)); } void run(benchmark::State& state, size_t size, hipStream_t stream) const override { do_run(state, size, stream); } }; template struct device_merge_sort_block_merge_benchmark_generator { static constexpr unsigned int get_limit() { return use_mergepath ? 0 : UINT32_MAX; } template struct create_ipt { static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExponent; using generated_config = rocprim::detail::merge_sort_block_merge_config; using benchmark_struct = device_merge_sort_block_merge_benchmark; void operator()(std::vector>& storage) { storage.emplace_back(std::make_unique()); } }; static void create(std::vector>& storage) { static constexpr unsigned int min_items_per_thread_exponent = 0u; // Very large block sizes don't work with large items_per_thread since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = sizeof(Key) + sizeof(Value); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * max_size_per_element); static constexpr unsigned int max_items_per_thread_exponent = rocprim::Log2::VALUE - 1; static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_MERGE_PARALLEL_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_device_merge_sort_block_sort.cpp000066400000000000000000000125341446201466700263320ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, 
Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include // CmdParser #include "cmdparser.hpp" #include "benchmark_device_merge_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif #define CREATE_BENCHMARK(...) 
\ { \ const device_merge_sort_block_sort_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, size, stream, instance); \ } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, stream); #else // BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using custom_longlong_double = custom_type; using custom_char_short = custom_type; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) 
CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) CREATE_BENCHMARK(long long, custom_double2) CREATE_BENCHMARK(custom_double2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_double2) CREATE_BENCHMARK(custom_int2, custom_char_double) CREATE_BENCHMARK(custom_int2, custom_longlong_double) CREATE_BENCHMARK(int, custom_char_short) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_merge_sort_block_sort.parallel.cpp.in000066400000000000000000000027101446201466700305250ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_merge_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_merge_sort_block_sort_benchmark_generator<@BlockSize@, @BlockSortMethod@, @KeyType@, @ValueType@>::create); } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_merge_sort_block_sort.parallel.hpp000066400000000000000000000345131446201466700301330ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_SORT_PARALLEL_HPP_ #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include "benchmark_utils.hpp" namespace rp = rocprim; constexpr const char* get_block_sort_method_name(rocprim::block_sort_algorithm alg) { switch(alg) { case rocprim::block_sort_algorithm::merge_sort: return "merge_sort"; case rocprim::block_sort_algorithm::bitonic_sort: return "bitonic_sort"; case rocprim::block_sort_algorithm::stable_merge_sort: return "stable_merge_sort"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } template std::string config_name() { const rocprim::detail::merge_sort_block_sort_config_params config = Config(); return "{bs:" + std::to_string(config.block_sort_config.block_size) + ",ipt:" + std::to_string(config.block_sort_config.items_per_thread) + ",method:" + std::string(get_block_sort_method_name(config.block_sort_method)) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_merge_sort_block_sort_benchmark : public config_autotune_interface { std::string name() const override { using namespace std::string_literals; return bench_naming::format_name("{lvl:device,algo:merge_sort_block_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // keys benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; // Generate data std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data(size, static_cast(-1000), static_cast(1000)); } else { keys_input = get_random_data(size, 
std::numeric_limits::min(), std::numeric_limits::max()); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; rocprim::empty_type* values_ptr = nullptr; unsigned int items_per_block; // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_output, values_ptr, values_ptr, size, items_per_block, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_output, values_ptr, values_ptr, size, items_per_block, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; using value_type = Value; // Generate data std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data(size, 
static_cast(-1000), static_cast(1000)); } else { keys_input = get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); ::rocprim::less lesser_op; unsigned int items_per_block; HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_output, d_values_input, d_values_output, size, items_per_block, lesser_op, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::detail::merge_sort_block_sort(d_keys_input, d_keys_output, d_values_input, d_values_output, size, items_per_block, lesser_op, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); 
state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t size, hipStream_t stream) const override { do_run(state, size, stream); } }; template struct device_merge_sort_block_sort_benchmark_generator { template struct create_ipt { static constexpr unsigned int items_per_thread = 1u << ItemsPerThreadExponent; using generated_config = rocprim::detail::merge_sort_block_sort_config; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique< device_merge_sort_block_sort_benchmark>()); } }; static void create(std::vector>& storage) { // Sort_items_per_block must be equal or larger than merge_items_per_block, so make // the items_per_thread at least as large so the sort_items_per_block // would be atleast 1024. static constexpr unsigned int min_items_per_thread_exponent = rocprim::Log2<(1024 / BlockSize)>::VALUE; // Very large block sizes don't work with large items_per_blocks since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX; static constexpr unsigned int max_size_per_element = std::max(sizeof(Key) + sizeof(unsigned int), sizeof(Value)); static constexpr unsigned int max_items_per_thread = max_shared_memory / (BlockSize * max_size_per_element); static constexpr unsigned int max_items_per_thread_exponent = rocprim::Log2::VALUE - 1; static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_MERGE_SORT_BLOCK_SORT_PARALLEL_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_device_partition.cpp000066400000000000000000000436221446201466700237560ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template void run_flagged_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float true_probability) { size = (size * sizeof(int)) / sizeof(T); std::vector input; std::vector flags = get_random_data01(size, true_probability); if(std::is_floating_point::value) { input = get_random_data(size, T(-1000), T(1000)); } else { input = get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } T * d_input; FlagType * d_flags; T * d_output; unsigned int * d_selected_count_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), flags.size() * sizeof(FlagType))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK(rocprim::partition( nullptr, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream )); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(rocprim::partition( d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), 
stream )); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rocprim::partition( d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream )); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_flags); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); } template void run_if_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float true_probability) { auto select_op = [true_probability] __device__ (const T& value) -> bool { if(value < T(127 * true_probability)) return true; return false; }; std::vector input = get_random_data(size, T(0), T(127)); T * d_input; T * d_output; unsigned int * d_selected_count_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK(rocprim::partition( nullptr, 
temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream )); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(rocprim::partition( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream )); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rocprim::partition( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream )); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); } template void run_three_way_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float first_probability, float second_probability) { auto first_select_op = [first_probability] __device__ (const T& value) { return value < T(127 * first_probability); }; auto second_select_op = [second_probability] __device__ (const T& value) { return value < T(127 * second_probability); }; std::vector input = 
get_random_data(size, T(0), T(127)); T * d_input; T * d_output_first; T * d_output_second; T * d_output_unselected; unsigned int * d_selected_count_output; HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output_first, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output_second, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_output_unselected, input.size() * sizeof(T))); HIP_CHECK(hipMalloc(&d_selected_count_output, 2 * sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage HIP_CHECK(rocprim::partition_three_way( nullptr, temp_storage_size_bytes, d_input, d_output_first, d_output_second, d_output_unselected, d_selected_count_output, input.size(), first_select_op, second_select_op, stream )); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK(rocprim::partition_three_way( d_temp_storage, temp_storage_size_bytes, d_input, d_output_first, d_output_second, d_output_unselected, d_selected_count_output, input.size(), first_select_op, second_select_op, stream )); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rocprim::partition_three_way( d_temp_storage, temp_storage_size_bytes, d_input, d_output_first, d_output_second, d_output_unselected, d_selected_count_output, input.size(), first_select_op, second_select_op, stream )); } // Record stop event and wait until it 
completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_output_first); hipFree(d_output_second); hipFree(d_output_unselected); hipFree(d_selected_count_output); hipFree(d_temp_storage); } #define CREATE_PARTITION_FLAGGED_BENCHMARK(T, F, p) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:partition,key_type:" #T \ ",subalgo:flags,flag_type:" #F ",probability:" #p \ ",cfg:default_config}") \ .c_str(), \ run_flagged_benchmark, \ size, \ stream, \ p) #define CREATE_PARTITION_IF_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:partition,key_type:" #T \ ",subalgo:if,probability:" #p ",cfg:default_config}") \ .c_str(), \ run_if_benchmark, \ size, \ stream, \ p) #define CREATE_PARTITION_THREE_WAY_BENCHMARK(T, p1, p2) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:partition,key_type:" #T \ ",subalgo:three_way,probability1:" #p1 ",probability2:" #p2 \ ",cfg:default_config}") \ .c_str(), \ run_three_way_benchmark, \ size, \ stream, \ p1, \ p2) #define BENCHMARK_FLAGGED_TYPE(type, value) \ CREATE_PARTITION_FLAGGED_BENCHMARK(type, value, 0.05f), \ CREATE_PARTITION_FLAGGED_BENCHMARK(type, value, 0.25f), \ CREATE_PARTITION_FLAGGED_BENCHMARK(type, value, 0.5f), \ CREATE_PARTITION_FLAGGED_BENCHMARK(type, value, 0.75f) #define BENCHMARK_IF_TYPE(type) \ CREATE_PARTITION_IF_BENCHMARK(type, 0.05f), \ CREATE_PARTITION_IF_BENCHMARK(type, 0.25f), \ CREATE_PARTITION_IF_BENCHMARK(type, 0.5f), \ CREATE_PARTITION_IF_BENCHMARK(type, 0.75f) #define 
BENCHMARK_THREE_WAY_TYPE(type) \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, 0.05f, 0.25f), \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, 0.25f, 0.5f), \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, 0.5f, 0.75f), \ CREATE_PARTITION_THREE_WAY_BENCHMARK(type, 0.75f, 1.f) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); using custom_double2 = custom_type; using custom_int_double = custom_type; // Add benchmarks std::vector benchmarks = { BENCHMARK_FLAGGED_TYPE(int, unsigned char), BENCHMARK_FLAGGED_TYPE(float, unsigned char), BENCHMARK_FLAGGED_TYPE(double, unsigned char), BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), BENCHMARK_FLAGGED_TYPE(rocprim::half, int8_t), BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), BENCHMARK_IF_TYPE(int), BENCHMARK_IF_TYPE(float), BENCHMARK_IF_TYPE(double), BENCHMARK_IF_TYPE(uint8_t), BENCHMARK_IF_TYPE(int8_t), BENCHMARK_IF_TYPE(rocprim::half), BENCHMARK_IF_TYPE(custom_int_double), BENCHMARK_THREE_WAY_TYPE(int), BENCHMARK_THREE_WAY_TYPE(float), BENCHMARK_THREE_WAY_TYPE(double), BENCHMARK_THREE_WAY_TYPE(uint8_t), BENCHMARK_THREE_WAY_TYPE(int8_t), BENCHMARK_THREE_WAY_TYPE(rocprim::half), BENCHMARK_THREE_WAY_TYPE(custom_int_double) }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { 
b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_radix_sort.cpp000066400000000000000000000055761446201466700241310ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include // Google Benchmark #include // HIP API #include // CmdParser #include "cmdparser.hpp" #include "benchmark_device_radix_sort.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks = {}; add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_radix_sort.hpp000066400000000000000000000336411446201466700241300ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_PARALLEL_HPP_ #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include "benchmark_utils.hpp" namespace rp = rocprim; template struct device_radix_sort_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name( "{lvl:device,algo:radix_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg: default_config}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; static std::vector generate_keys(size_t size) { using key_type = Key; if(std::is_floating_point::value) { return get_random_data(size, static_cast(-1000), static_cast(1000), size); } else { return get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max(), size); } } // keys benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { auto keys_input = generate_keys(size); using key_type = Key; key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::radix_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, 0, sizeof(key_type) * 8, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::radix_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, 0, sizeof(key_type) * 8, 
stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::radix_sort_keys(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, 0, sizeof(key_type) * 8, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { auto keys_input = generate_keys(size); using key_type = Key; using value_type = Value; std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, 
values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, 0, sizeof(key_type) * 8, stream, false)); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, 0, sizeof(key_type) * 8, stream, false)); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK(rp::radix_sort_pairs(d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, 0, sizeof(key_type) * 8, stream, false)); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t size, hipStream_t stream) const override { do_run(state, size, stream); } }; #define 
CREATE_RADIX_SORT_BENCHMARK(...) \ { \ const device_radix_sort_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, size, stream, instance); \ } inline void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { CREATE_RADIX_SORT_BENCHMARK(int) CREATE_RADIX_SORT_BENCHMARK(float) CREATE_RADIX_SORT_BENCHMARK(long long) CREATE_RADIX_SORT_BENCHMARK(int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half) CREATE_RADIX_SORT_BENCHMARK(short) } inline void add_sort_pairs_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_RADIX_SORT_BENCHMARK(int, float) CREATE_RADIX_SORT_BENCHMARK(int, double) CREATE_RADIX_SORT_BENCHMARK(int, float2) CREATE_RADIX_SORT_BENCHMARK(int, custom_float2) CREATE_RADIX_SORT_BENCHMARK(int, double2) CREATE_RADIX_SORT_BENCHMARK(int, custom_double2) CREATE_RADIX_SORT_BENCHMARK(long long, float) CREATE_RADIX_SORT_BENCHMARK(long long, double) CREATE_RADIX_SORT_BENCHMARK(long long, float2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_float2) CREATE_RADIX_SORT_BENCHMARK(long long, double2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_double2) CREATE_RADIX_SORT_BENCHMARK(int8_t, int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t, uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half, rocprim::half) } #endif // ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_PARALLEL_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_device_radix_sort_block_sort.cpp000066400000000000000000000116731446201466700263450ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include // CmdParser #include "cmdparser.hpp" #include "benchmark_device_radix_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif #define CREATE_BENCHMARK(...) 
\ { \ const device_radix_sort_block_sort_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, size, stream, instance); \ } int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, stream); #else // BENCHMARK_CONFIG_TUNING CREATE_BENCHMARK(int) CREATE_BENCHMARK(long long) CREATE_BENCHMARK(int8_t) CREATE_BENCHMARK(uint8_t) CREATE_BENCHMARK(rocprim::half) CREATE_BENCHMARK(short) using custom_float2 = custom_type; using custom_double2 = custom_type; using custom_char_double = custom_type; CREATE_BENCHMARK(int, float) CREATE_BENCHMARK(long long, double) CREATE_BENCHMARK(int8_t, int8_t) CREATE_BENCHMARK(uint8_t, uint8_t) CREATE_BENCHMARK(rocprim::half, rocprim::half) CREATE_BENCHMARK(short, short) CREATE_BENCHMARK(int, custom_float2) 
CREATE_BENCHMARK(int, custom_char_double) CREATE_BENCHMARK(long long, custom_double2) #endif // BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_radix_sort_block_sort.parallel.cpp.in000066400000000000000000000026651446201466700305460ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include "benchmark_device_radix_sort_block_sort.parallel.hpp" #include "benchmark_utils.hpp"

namespace
{
// Namespace-scope object inside an unnamed namespace: it is private to this translation
// unit, and its initializer - the call to config_autotune_register::create_bulk - runs when
// this translation unit's namespace-scope objects are initialized.
// The argument is the `create` member of device_radix_sort_block_sort_benchmark_generator,
// instantiated with the @BlockSize@, @KeyType@ and @ValueType@ placeholders.
// NOTE(review): the @...@ tokens look like build-time substitution placeholders (this text
// sits under a `.cpp.in` name) - confirm against the build scripts.
// NOTE(review): presumably create_bulk records the generated benchmarks so the tuning
// driver can register them later; its definition is not visible here.
auto benchmarks = config_autotune_register::create_bulk(
    device_radix_sort_block_sort_benchmark_generator<@BlockSize@, @KeyType@, @ValueType@>::create);
}
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_radix_sort_block_sort.parallel.hpp000066400000000000000000000333251446201466700301430ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE.
#ifndef ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_RADIX_SORT_BLOCK_SORT_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_RADIX_SORT_BLOCK_SORT_PARALLEL_HPP_ #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include "benchmark_utils.hpp" namespace rp = rocprim; template std::string config_name() { const rocprim::detail::kernel_config_params config = Config(); return "{bs:" + std::to_string(config.block_size) + ",ipt:" + std::to_string(config.items_per_thread) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_radix_sort_block_sort_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:radix_sort_block_sort,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; // keys benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; // Generate data std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data(size, static_cast(-1000), static_cast(1000)); } else { keys_input = get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); rocprim::empty_type* values_ptr = nullptr; unsigned int items_per_block; // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK((rp::detail::radix_sort_block_sort(d_keys_input, d_keys_output, values_ptr, values_ptr, 
size, items_per_block, 0, sizeof(key_type) * 8, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((rp::detail::radix_sort_block_sort(d_keys_input, d_keys_output, values_ptr, values_ptr, size, items_per_block, 0, sizeof(key_type) * 8, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { using key_type = Key; using value_type = Value; // Generate data std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data(size, static_cast(-1000), static_cast(1000)); } else { keys_input = get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max()); } std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; 
value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); unsigned int items_per_block; HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK((rp::detail::radix_sort_block_sort(d_keys_input, d_keys_output, d_values_input, d_values_output, size, items_per_block, 0, sizeof(key_type) * 8, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK((rp::detail::radix_sort_block_sort(d_keys_input, d_keys_output, d_values_input, d_values_output, size, items_per_block, 0, sizeof(key_type) * 8, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t size, hipStream_t stream) const override { do_run(state, size, stream); } }; template struct device_radix_sort_block_sort_benchmark_generator { template struct create_ipt { using generated_config = rocprim::kernel_config; void 
operator()(std::vector>& storage) { storage.emplace_back( std::make_unique< device_radix_sort_block_sort_benchmark>()); } }; static void create(std::vector>& storage) { // Sort_items_per_block must be equal or larger than radix_items_per_block, so make // the items_per_thread at least as large so the sort_items_per_block // would be atleast 1024. static constexpr unsigned int min_items_per_thread = 1024 / BlockSize; // Very large block sizes don't work with large items_per_blocks since // shared memory is limited static constexpr unsigned int max_shared_memory = TUNING_SHARED_MEMORY_MAX - 2000; static constexpr unsigned int max_size_per_element = std::max(sizeof(Key), sizeof(Value)); static constexpr unsigned int max_items_per_thread = std::min(32u, max_shared_memory / (BlockSize * max_size_per_element)); static_for_each, create_ipt>(storage); } }; #endif // ROCPRIM_BENCHMARK_DETAIL_BENCHMARK_DEVICE_RADIX_SORT_BLOCK_SORT_PARALLEL_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_device_radix_sort_onesweep.cpp000066400000000000000000000077471446201466700260400ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include // Google Benchmark #include // HIP API #include // CmdParser #include "cmdparser.hpp" #include "benchmark_device_radix_sort_onesweep.parallel.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, stream); #else // BENCHMARK_CONFIG_TUNING add_sort_keys_benchmarks(benchmarks, stream, size); add_sort_pairs_benchmarks(benchmarks, stream, size); #endif // 
BENCHMARK_CONFIG_TUNING // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_radix_sort_onesweep.parallel.cpp.in000066400000000000000000000027021446201466700302220ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include "benchmark_utils.hpp" #include "benchmark_device_radix_sort_onesweep.parallel.hpp"

namespace
{
// Namespace-scope object inside an unnamed namespace: it is private to this translation
// unit, and its initializer - the call to config_autotune_register::create_bulk - runs when
// this translation unit's namespace-scope objects are initialized.
// The argument is the `create` member of device_radix_sort_onesweep_benchmark_generator,
// instantiated with the @BlockSize@, @RadixBits@, @KeyType@ and @ValueType@ placeholders.
// NOTE(review): the @...@ tokens look like build-time substitution placeholders (this text
// sits under a `.cpp.in` name) - confirm against the build scripts.
// NOTE(review): presumably create_bulk records the generated benchmarks so the tuning
// driver can register them later; its definition is not visible here.
auto benchmarks = config_autotune_register::create_bulk(
    device_radix_sort_onesweep_benchmark_generator<@BlockSize@, @RadixBits@, @KeyType@, @ValueType@>::create);
}
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_radix_sort_onesweep.parallel.hpp000066400000000000000000000543001446201466700276230ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE.
#ifndef ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_ONESWEEP_PARALLEL_HPP_ #define ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_ONESWEEP_PARALLEL_HPP_ #include #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include #include "benchmark_utils.hpp" namespace rp = rocprim; constexpr const char* radix_rank_algorithm_name(rp::block_radix_rank_algorithm algorithm) { switch(algorithm) { case rp::block_radix_rank_algorithm::basic: return "block_radix_rank_algorithm::basic"; case rp::block_radix_rank_algorithm::basic_memoize: return "block_radix_rank_algorithm::basic_memoize"; case rp::block_radix_rank_algorithm::match: return "block_radix_rank_algorithm::match"; } } template std::string config_name() { constexpr rocprim::detail::radix_sort_onesweep_config_params params = Config(); return "{histogram:{bs:" + std::to_string(params.histogram.block_size) + ",ipt:" + std::to_string(params.histogram.items_per_thread) + "},sort:{" + "bs:" + std::to_string(params.sort.block_size) + ",ipt:" + std::to_string(params.sort.items_per_thread) + "},bits_per_place:" + std::to_string(params.radix_bits_per_place) + ",algorithm:" + radix_rank_algorithm_name(params.radix_rank_algorithm) + "}"; } template<> inline std::string config_name() { return "default_config"; } template struct device_radix_sort_onesweep_benchmark : public config_autotune_interface { std::string name() const override { return bench_naming::format_name("{lvl:device,algo:radix_sort_onesweep,key_type:" + std::string(Traits::name()) + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}"); } static constexpr unsigned int batch_size = 10; static constexpr unsigned int warmup_size = 5; static std::vector generate_keys(size_t size) { using key_type = Key; if(std::is_floating_point::value) { return get_random_data(size, static_cast(-1000), static_cast(1000), size); } else { return get_random_data(size, std::numeric_limits::min(), std::numeric_limits::max(), size); } } // keys benchmark 
template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { auto keys_input = generate_keys(size); using key_type = Key; key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; bool is_result_in_output = true; rocprim::empty_type* d_values_ptr = nullptr; HIP_CHECK((rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_ptr, nullptr, d_values_ptr, size, is_result_in_output, 0, sizeof(key_type) * 8, stream, false))); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK((rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_ptr, nullptr, d_values_ptr, size, is_result_in_output, 0, sizeof(key_type) * 8, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( (rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_ptr, nullptr, d_values_ptr, size, is_result_in_output, 0, sizeof(key_type) * 8, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; 
HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } // pairs benchmark template auto do_run(benchmark::State& state, size_t size, const hipStream_t stream) const -> typename std::enable_if::value, void>::type { auto keys_input = generate_keys(size); using key_type = Key; using value_type = Value; std::vector values_input(size); for(size_t i = 0; i < size; i++) { values_input[i] = value_type(i); } key_type* d_keys_input; key_type* d_keys_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_output), size * sizeof(key_type))); HIP_CHECK(hipMemcpy(d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice)); value_type* d_values_input; value_type* d_values_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_output), size * sizeof(value_type))); HIP_CHECK(hipMemcpy(d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice)); void* d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; bool is_result_in_output = true; HIP_CHECK((rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_input, nullptr, d_values_output, size, is_result_in_output, 0, sizeof(key_type) * 8, stream, false))); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { 
HIP_CHECK((rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_input, nullptr, d_values_output, size, is_result_in_output, 0, sizeof(key_type) * 8, stream, false))); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( (rp::detail::radix_sort_onesweep_impl(d_temporary_storage, temporary_storage_bytes, d_keys_input, nullptr, d_keys_output, d_values_input, nullptr, d_values_output, size, is_result_in_output, 0, sizeof(key_type) * 8, stream, false))); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type))); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } void run(benchmark::State& state, size_t size, hipStream_t stream) const override { do_run(state, size, stream); } }; #ifdef BENCHMARK_CONFIG_TUNING template struct device_radix_sort_onesweep_benchmark_generator { template static constexpr bool is_buildable() { using sharedmem_storage = typename rp::detail::onesweep_iteration_helper::storage_type; return sizeof(sharedmem_storage) < TUNING_SHARED_MEMORY_MAX; } template struct create_ipt; template struct create_ipt())>> { using generated_config = 
rocprim::radix_sort_onesweep_config, rocprim::kernel_config, RadixBits, RadixRankAlgorithm>; void operator()(std::vector>& storage) { storage.emplace_back( std::make_unique< device_radix_sort_onesweep_benchmark>()); } }; template struct create_ipt())>> { void operator()(std::vector>&) {} }; template static void create_algo(std::vector>& storage) { create_ipt<1u, RadixRankAlgorithm>()(storage); create_ipt<4u, RadixRankAlgorithm>()(storage); create_ipt<6u, RadixRankAlgorithm>()(storage); create_ipt<8u, RadixRankAlgorithm>()(storage); create_ipt<12u, RadixRankAlgorithm>()(storage); create_ipt<16u, RadixRankAlgorithm>()(storage); create_ipt<18u, RadixRankAlgorithm>()(storage); create_ipt<22u, RadixRankAlgorithm>()(storage); } static void create(std::vector>& storage) { create_algo(storage); create_algo(storage); } }; #else // BENCHMARK_CONFIG_TUNING #define CREATE_RADIX_SORT_BENCHMARK(...) \ { \ const device_radix_sort_onesweep_benchmark<__VA_ARGS__> instance; \ REGISTER_BENCHMARK(benchmarks, size, stream, instance); \ } inline void add_sort_keys_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { CREATE_RADIX_SORT_BENCHMARK(int) CREATE_RADIX_SORT_BENCHMARK(float) CREATE_RADIX_SORT_BENCHMARK(long long) CREATE_RADIX_SORT_BENCHMARK(int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half) CREATE_RADIX_SORT_BENCHMARK(short) } inline void add_sort_pairs_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_RADIX_SORT_BENCHMARK(int, float) CREATE_RADIX_SORT_BENCHMARK(int, double) CREATE_RADIX_SORT_BENCHMARK(int, float2) CREATE_RADIX_SORT_BENCHMARK(int, custom_float2) CREATE_RADIX_SORT_BENCHMARK(int, double2) CREATE_RADIX_SORT_BENCHMARK(int, custom_double2) CREATE_RADIX_SORT_BENCHMARK(long long, float) CREATE_RADIX_SORT_BENCHMARK(long long, double) CREATE_RADIX_SORT_BENCHMARK(long long, float2) CREATE_RADIX_SORT_BENCHMARK(long long, 
custom_float2) CREATE_RADIX_SORT_BENCHMARK(long long, double2) CREATE_RADIX_SORT_BENCHMARK(long long, custom_double2) CREATE_RADIX_SORT_BENCHMARK(int8_t, int8_t) CREATE_RADIX_SORT_BENCHMARK(uint8_t, uint8_t) CREATE_RADIX_SORT_BENCHMARK(rocprim::half, rocprim::half) } #endif // BENCHMARK_CONFIG_TUNING #endif // ROCPRIM_BENCHMARK_DEVICE_RADIX_SORT_ONESWEEP_PARALLEL_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_device_reduce.cpp000066400000000000000000000113471446201466700232130ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// NOTE(review): in this copy the <...> parts of #include directives and template
// argument lists are missing (e.g. bare `#include`, `parser.get("size")`,
// `rocprim::plus`). Restore them from upstream before compiling.
#include
#include

// Google Benchmark
#include

// HIP API
#include
// rocPRIM HIP API
#include

// CmdParser
#include "cmdparser.hpp"

#include "benchmark_utils.hpp"
#include "benchmark_device_reduce.parallel.hpp"

// Default number of input values; can be overridden by defining DEFAULT_N at build time.
#ifndef DEFAULT_N
const size_t DEFAULT_N = 1024 * 1024 * 128;
#endif

// Instantiates one device_reduce_benchmark and registers it.
// Expects `benchmarks`, `size` and `stream` to be in scope at the expansion site.
#define CREATE_BENCHMARK(T, REDUCE_OP) \
    { \
        const device_reduce_benchmark instance; \
        REGISTER_BENCHMARK(benchmarks, size, stream, instance); \
    }

// Entry point of the device reduce benchmark executable: parses the command
// line, registers the benchmarks, switches them to manual timing reported in
// milliseconds and runs them.
int main(int argc, char *argv[])
{
    cli::Parser parser(argc, argv);
    parser.set_optional("size", "size", DEFAULT_N, "number of values");
    parser.set_optional("trials", "trials", -1, "number of iterations");
    parser.set_optional("name_format", "name_format", "human", "either: json,human,txt");
#ifdef BENCHMARK_CONFIG_TUNING
    // optionally run an evenly split subset of benchmarks, when making multiple program invocations
    parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index");
    parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances");
#endif
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const size_t size = parser.get("size");
    const int trials = parser.get("trials");
    bench_naming::set_format(parser.get("name_format"));

    // HIP
    hipStream_t stream = 0; // default

    // Benchmark info
    add_common_benchmark_info();
    benchmark::AddCustomContext("size", std::to_string(size));

    // Add benchmarks
    std::vector benchmarks = {};
#ifdef BENCHMARK_CONFIG_TUNING
    // Tuning build: register only the subset selected by the two parallel_* options.
    const int parallel_instance = parser.get("parallel_instance");
    const int parallel_instances = parser.get("parallel_instances");
    config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, stream);
#else
    // Regular build: a fixed list of value types, all using rocprim::plus.
    using custom_float2 = custom_type;
    using custom_double2 = custom_type;

    CREATE_BENCHMARK(int, rocprim::plus)
    CREATE_BENCHMARK(long long, rocprim::plus)
    CREATE_BENCHMARK(float, rocprim::plus)
    CREATE_BENCHMARK(double, rocprim::plus)
    CREATE_BENCHMARK(int8_t, rocprim::plus)
    CREATE_BENCHMARK(uint8_t, rocprim::plus)
    CREATE_BENCHMARK(rocprim::half, rocprim::plus)
    CREATE_BENCHMARK(custom_float2, rocprim::plus)
    CREATE_BENCHMARK(custom_double2, rocprim::plus)
#endif

    // Use manual timing
    for(auto& b : benchmarks)
    {
        b->UseManualTime();
        b->Unit(benchmark::kMillisecond);
    }

    // Force number of iterations
    // (only when a positive "trials" value was given; the default is -1)
    if(trials > 0)
    {
        for(auto& b : benchmarks)
        {
            b->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_reduce.parallel.cpp.in000066400000000000000000000027701446201466700254130ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// NOTE(review): the first #include below has no target and the template
// argument list after `create` is incomplete in this copy (note the dangling
// `,` and the unbalanced `>>>`). Restore both from upstream before building.
#include
#include "benchmark_utils.hpp"
#include "benchmark_device_reduce.parallel.hpp"

namespace
{
// Namespace-scope object initialised by config_autotune_register::create for a
// reduce_config using block_reduce_algorithm::using_warp_reduce.
// @BlockSize@ and @ItemsPerThread@ are not valid C++; presumably they are
// placeholders filled in when this .in template is configured - TODO confirm.
auto benchmark = config_autotune_register::create, rocprim::reduce_config<@BlockSize@u, @ItemsPerThread@u, rocprim::block_reduce_algorithm::using_warp_reduce>>>();
}
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_reduce.parallel.hpp000066400000000000000000000137631446201466700250170ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
#ifndef ROCPRIM_BENCHMARK_DEVICE_REDUCE_PARALLEL_HPP_
#define ROCPRIM_BENCHMARK_DEVICE_REDUCE_PARALLEL_HPP_

// NOTE(review): in this copy the <...> parts of #include directives, template
// parameter lists and template argument lists are missing (e.g. bare `#include`,
// `template std::string config_name()`, `reinterpret_cast(&d_input)`).
// Restore them from upstream before compiling.
#include
#include
#include

// Google Benchmark
#include

// HIP API
#include

// rocPRIM HIP API
#include

#include "benchmark_utils.hpp"

/// \brief Maps a block_reduce_algorithm value to the text used in benchmark names.
/// \return a string literal naming the enumerator, or "unknown_algorithm" when
///         the value matches none of the cases.
constexpr const char* get_reduce_method_name(rocprim::block_reduce_algorithm alg)
{
    switch(alg)
    {
        case rocprim::block_reduce_algorithm::raking_reduce:
            return "raking_reduce";
        case rocprim::block_reduce_algorithm::raking_reduce_commutative_only:
            return "raking_reduce_commutative_only";
        case rocprim::block_reduce_algorithm::using_warp_reduce:
            return "using_warp_reduce";
            // Not using `default: ...` because it kills effectiveness of -Wswitch
    }
    return "unknown_algorithm";
}

/// \brief Builds the "cfg" part of a benchmark name (block size, items per
/// thread and block reduce method) from a reduce configuration.
template std::string config_name()
{
    const rocprim::detail::reduce_config_params config = Config();
    return "{bs:" + std::to_string(config.reduce_config.block_size)
           + ",ipt:" + std::to_string(config.reduce_config.items_per_thread)
           + ",method:" + std::string(get_reduce_method_name(config.block_reduce_method)) + "}";
}

/// \brief Specialization that reports the fixed name "default_config".
template<> inline std::string config_name()
{
    return "default_config";
}

/// \brief Benchmark of rocprim::reduce exposed through config_autotune_interface.
template, typename Config = rocprim::default_config> struct device_reduce_benchmark : public config_autotune_interface
{
    /// \brief Returns the formatted benchmark name (level, algorithm, key type, configuration).
    std::string name() const override
    {
        return bench_naming::format_name("{lvl:device,algo:reduce,key_type:"
                                         + std::string(Traits::name())
                                         + ",cfg:" + config_name() + "}");
    }

    // Number of reduce invocations timed together in one benchmark iteration.
    static constexpr unsigned int batch_size = 10;
    // Number of untimed invocations issued before measuring.
    static constexpr unsigned int warmup_size = 5;

    /// \brief Reduces `size` random values (drawn between T(0) and T(1000)) into
    /// a single output element and reports per-batch times to `state`.
    /// \param state  Google Benchmark state; receives manual iteration times and counters.
    /// \param size   number of input values.
    /// \param stream HIP stream on which events are recorded and the reduction is issued.
    void run(benchmark::State& state, size_t size, const hipStream_t stream) const override
    {
        BinaryFunction reduce_op{};
        std::vector input = get_random_data(size, T(0), T(1000));

        T * d_input;
        T * d_output;
        HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T)));
        HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), sizeof(T)));
        HIP_CHECK(
            hipMemcpy(
                d_input, input.data(),
                size * sizeof(T),
                hipMemcpyHostToDevice
            )
        );
        HIP_CHECK(hipDeviceSynchronize());

        // Allocate temporary storage memory
        // NOTE(review): temp_storage_size_bytes is left uninitialised here and is
        // only given a value by the size-query call below; other benchmarks in
        // this directory initialise the equivalent variable to 0.
        size_t temp_storage_size_bytes;
        void * d_temp_storage = nullptr;
        // Get size of d_temp_storage
        HIP_CHECK(
            rocprim::reduce(
                d_temp_storage,
                temp_storage_size_bytes,
                d_input,
                d_output,
                T(),
                size,
                reduce_op,
                stream
            )
        );
        HIP_CHECK(hipMalloc(&d_temp_storage,temp_storage_size_bytes));
        HIP_CHECK(hipDeviceSynchronize());

        // Warm-up
        for(size_t i = 0; i < warmup_size; i++)
        {
            HIP_CHECK(
                rocprim::reduce(
                    d_temp_storage,
                    temp_storage_size_bytes,
                    d_input,
                    d_output,
                    T(),
                    size,
                    reduce_op,
                    stream
                )
            );
        }
        HIP_CHECK(hipDeviceSynchronize());

        // HIP events creation
        hipEvent_t start, stop;
        HIP_CHECK(hipEventCreate(&start));
        HIP_CHECK(hipEventCreate(&stop));

        for(auto _ : state)
        {
            // Record start event
            HIP_CHECK(hipEventRecord(start, stream));

            for(size_t i = 0; i < batch_size; i++)
            {
                HIP_CHECK(
                    rocprim::reduce(
                        d_temp_storage,
                        temp_storage_size_bytes,
                        d_input,
                        d_output,
                        T(),
                        size,
                        reduce_op,
                        stream
                    )
                );
            }
            // NOTE(review): this host-side stream synchronisation happens before
            // the stop event is recorded, so it sits inside the measured
            // interval; the sibling benchmarks record the stop event directly.
            // Confirm whether this is intentional.
            HIP_CHECK(hipStreamSynchronize(stream));

            // Record stop event and wait until it completes
            HIP_CHECK(hipEventRecord(stop, stream));
            HIP_CHECK(hipEventSynchronize(stop));

            float elapsed_mseconds;
            HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
            // Milliseconds -> seconds for the manual-time report.
            state.SetIterationTime(elapsed_mseconds / 1000);
        }

        // Destroy HIP events
        HIP_CHECK(hipEventDestroy(start));
        HIP_CHECK(hipEventDestroy(stop));

        state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
        state.SetItemsProcessed(state.iterations() * batch_size * size);

        HIP_CHECK(hipFree(d_input));
        HIP_CHECK(hipFree(d_output));
        HIP_CHECK(hipFree(d_temp_storage));
    }
};

#endif
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_reduce_by_key.cpp000066400000000000000000000221431446201466700245510ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
// NOTE(review): in this copy the <...> parts of #include directives, template
// parameter lists and template argument lists are missing (e.g. bare `#include`,
// `template void run_benchmark`, `std::vector keys_input`). Restore them from
// upstream before compiling.
#include
#include
#include
#include
#include

// Google Benchmark
#include "benchmark/benchmark.h"

// CmdParser
#include "cmdparser.hpp"
#include "benchmark_utils.hpp"

// HIP API
#include

// rocPRIM
#include

// Default number of input values; can be overridden by defining DEFAULT_N at build time.
#ifndef DEFAULT_N
const size_t DEFAULT_N = 1024 * 1024 * 32;
#endif

namespace rp = rocprim;

// Number of reduce_by_key invocations timed together in one benchmark iteration.
const unsigned int batch_size = 10;
// Number of untimed invocations issued before measuring.
const unsigned int warmup_size = 5;

/// \brief Benchmarks rp::reduce_by_key on `size` key/value pairs.
///
/// Keys are generated as consecutive runs: run number k is filled with the value
/// k and its length is taken from a table of 100000 random lengths between 1 and
/// max_length (reused cyclically). Values are 0, 1, 2, ...
///
/// \param state      Google Benchmark state; receives manual iteration times and counters.
/// \param max_length upper bound passed to the run-length generator.
/// \param stream     HIP stream on which events are recorded and the algorithm is issued.
/// \param size       number of key/value pairs.
template void run_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size)
{
    using key_type = Key;
    using value_type = Value;

    // Generate data
    std::vector keys_input(size);

    unsigned int unique_count = 0;
    std::vector key_counts = get_random_data(100000, 1, max_length);
    size_t offset = 0;
    while(offset < size)
    {
        const size_t key_count = key_counts[unique_count % key_counts.size()];
        // Clamp the last run so it does not write past the end of keys_input.
        const size_t end = std::min(size, offset + key_count);
        for(size_t i = offset; i < end; i++)
        {
            keys_input[i] = unique_count;
        }

        unique_count++;
        offset += key_count;
    }

    std::vector values_input(size);
    std::iota(values_input.begin(), values_input.end(), 0);

    key_type * d_keys_input;
    HIP_CHECK(hipMalloc(reinterpret_cast(&d_keys_input), size * sizeof(key_type)));
    HIP_CHECK(
        hipMemcpy(
            d_keys_input, keys_input.data(),
            size * sizeof(key_type),
            hipMemcpyHostToDevice
        )
    );

    value_type * d_values_input;
    HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type)));
    HIP_CHECK(
        hipMemcpy(
            d_values_input, values_input.data(),
            size * sizeof(value_type),
            hipMemcpyHostToDevice
        )
    );

    // Output buffers are sized by the number of runs generated above.
    key_type * d_unique_output;
    value_type * d_aggregates_output;
    unsigned int * d_unique_count_output;
    HIP_CHECK(hipMalloc(reinterpret_cast(&d_unique_output), unique_count * sizeof(key_type)));
    HIP_CHECK(hipMalloc(reinterpret_cast(&d_aggregates_output), unique_count * sizeof(value_type)));
    HIP_CHECK(hipMalloc(reinterpret_cast(&d_unique_count_output), sizeof(unsigned int)));

    void * d_temporary_storage = nullptr;
    size_t temporary_storage_bytes = 0;

    rp::plus reduce_op;
    rp::equal_to key_compare_op;

    // First call passes nullptr as temporary storage; its result is used only
    // through temporary_storage_bytes for the allocation below.
    HIP_CHECK(
        rp::reduce_by_key(
            nullptr, temporary_storage_bytes,
            d_keys_input, d_values_input, size,
            d_unique_output, d_aggregates_output,
            d_unique_count_output,
            reduce_op, key_compare_op,
            stream
        )
    );

    HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes));
    HIP_CHECK(hipDeviceSynchronize());

    // Warm-up
    for(size_t i = 0; i < warmup_size; i++)
    {
        HIP_CHECK(
            rp::reduce_by_key(
                d_temporary_storage, temporary_storage_bytes,
                d_keys_input, d_values_input, size,
                d_unique_output, d_aggregates_output,
                d_unique_count_output,
                reduce_op, key_compare_op,
                stream
            )
        );
    }
    HIP_CHECK(hipDeviceSynchronize());

    // HIP events creation
    hipEvent_t start, stop;
    HIP_CHECK(hipEventCreate(&start));
    HIP_CHECK(hipEventCreate(&stop));

    for (auto _ : state)
    {
        // Record start event
        HIP_CHECK(hipEventRecord(start, stream));

        for(size_t i = 0; i < batch_size; i++)
        {
            HIP_CHECK(
                rp::reduce_by_key(
                    d_temporary_storage, temporary_storage_bytes,
                    d_keys_input, d_values_input, size,
                    d_unique_output, d_aggregates_output,
                    d_unique_count_output,
                    reduce_op, key_compare_op,
                    stream
                )
            );
        }

        // Record stop event and wait until it completes
        HIP_CHECK(hipEventRecord(stop, stream));
        HIP_CHECK(hipEventSynchronize(stop));

        float elapsed_mseconds;
        HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
        // Milliseconds -> seconds for the manual-time report.
        state.SetIterationTime(elapsed_mseconds / 1000);
    }

    // Destroy HIP events
    HIP_CHECK(hipEventDestroy(start));
    HIP_CHECK(hipEventDestroy(stop));

    state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)));
    state.SetItemsProcessed(state.iterations() * batch_size * size);

    HIP_CHECK(hipFree(d_temporary_storage));
    HIP_CHECK(hipFree(d_keys_input));
    HIP_CHECK(hipFree(d_values_input));
    HIP_CHECK(hipFree(d_unique_output));
    HIP_CHECK(hipFree(d_aggregates_output));
    HIP_CHECK(hipFree(d_unique_count_output));
}

// Registers run_benchmark under a formatted name built from the stringized
// Key/Value arguments and max_length.
// Expects `max_length`, `stream` and `size` to be in scope at the expansion site.
#define CREATE_BENCHMARK(Key, Value) \
    benchmark::RegisterBenchmark( \
        bench_naming::format_name("{lvl:device,algo:reduce_by_key,key_type:" #Key \
                                  ",value_type:" #Value ",keys_max_length:" \
                                  + std::to_string(max_length) + ",cfg:default_config}") \
            .c_str(), \
        run_benchmark, \
        max_length, \
        stream, \
        size)

/// \brief Appends one reduce_by_key benchmark per key/value type pair to `benchmarks`.
/// \param max_length forwarded to run_benchmark and included in each benchmark name.
void add_benchmarks(size_t max_length, std::vector& benchmarks, hipStream_t stream, size_t size)
{
    using custom_float2 = custom_type;
    using custom_double2 = custom_type;

    std::vector bs =
    {
        CREATE_BENCHMARK(int, float),
        CREATE_BENCHMARK(int, double),
        CREATE_BENCHMARK(int, custom_float2),
        CREATE_BENCHMARK(int, custom_double2),

        CREATE_BENCHMARK(int8_t, int8_t),
        CREATE_BENCHMARK(uint8_t, uint8_t),
        CREATE_BENCHMARK(rocprim::half, rocprim::half),

        CREATE_BENCHMARK(long long, float),
        CREATE_BENCHMARK(long long, double),
        CREATE_BENCHMARK(long long, custom_float2),
        CREATE_BENCHMARK(long long, custom_double2),
    };

    benchmarks.insert(benchmarks.end(), bs.begin(), bs.end());
}

// Entry point of the reduce_by_key benchmark executable: parses the command
// line, registers the benchmarks for max_length 1000 and 10, switches them to
// manual timing reported in milliseconds and runs them.
int main(int argc, char *argv[])
{
    cli::Parser parser(argc, argv);
    parser.set_optional("size", "size", DEFAULT_N, "number of values");
    parser.set_optional("trials", "trials", -1, "number of iterations");
    parser.set_optional("name_format", "name_format", "human", "either: json,human,txt");
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const size_t size = parser.get("size");
    const int trials = parser.get("trials");
    bench_naming::set_format(parser.get("name_format"));

    // HIP
    hipStream_t stream = 0; // default

    // Benchmark info
    add_common_benchmark_info();
    benchmark::AddCustomContext("size", std::to_string(size));

    // Add benchmarks
    std::vector benchmarks;
    add_benchmarks(1000, benchmarks, stream, size);
    add_benchmarks(10, benchmarks, stream, size);

    // Use manual timing
    for(auto& b : benchmarks)
    {
        b->UseManualTime();
        b->Unit(benchmark::kMillisecond);
    }

    // Force number of iterations
    // (only when a positive "trials" value was given; the default is -1)
    if(trials > 0)
    {
        for(auto& b : benchmarks)
        {
            b->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_run_length_encode.cpp000066400000000000000000000330351446201466700254240ustar00rootroot00000000000000//
MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; template void run_encode_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) { using key_type = T; using count_type = unsigned int; // Generate data std::vector input(size); unsigned int runs_count = 0; std::vector key_counts = get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; } runs_count++; offset += key_count; } key_type * d_input; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); key_type * d_unique_output; count_type * d_counts_output; count_type * d_runs_count_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_unique_output), runs_count * sizeof(key_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_counts_output), runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_runs_count_output), sizeof(count_type))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( rp::run_length_encode( nullptr, temporary_storage_bytes, d_input, size, d_unique_output, d_counts_output, d_runs_count_output, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK( rp::run_length_encode( d_temporary_storage, temporary_storage_bytes, d_input, size, d_unique_output, d_counts_output, d_runs_count_output, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); 
// HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { rp::run_length_encode( d_temporary_storage, temporary_storage_bytes, d_input, size, d_unique_output, d_counts_output, d_runs_count_output, stream, false ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_unique_output)); HIP_CHECK(hipFree(d_counts_output)); HIP_CHECK(hipFree(d_runs_count_output)); } template void run_non_trivial_runs_benchmark(benchmark::State& state, size_t max_length, hipStream_t stream, size_t size) { using key_type = T; using offset_type = unsigned int; using count_type = unsigned int; // Generate data std::vector input(size); unsigned int runs_count = 0; std::vector key_counts = get_random_data(100000, 1, max_length); size_t offset = 0; while(offset < size) { const size_t key_count = key_counts[runs_count % key_counts.size()]; const size_t end = std::min(size, offset + key_count); for(size_t i = offset; i < end; i++) { input[i] = runs_count; } runs_count++; offset += key_count; } key_type * d_input; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); offset_type * d_offsets_output; count_type * d_counts_output; 
count_type * d_runs_count_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets_output), runs_count * sizeof(offset_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_counts_output), runs_count * sizeof(count_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_runs_count_output), sizeof(count_type))); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( rp::run_length_encode_non_trivial_runs( nullptr, temporary_storage_bytes, d_input, size, d_offsets_output, d_counts_output, d_runs_count_output, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { HIP_CHECK( rp::run_length_encode_non_trivial_runs( d_temporary_storage, temporary_storage_bytes, d_input, size, d_offsets_output, d_counts_output, d_runs_count_output, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { rp::run_length_encode_non_trivial_runs( d_temporary_storage, temporary_storage_bytes, d_input, size, d_offsets_output, d_counts_output, d_runs_count_output, stream, false ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_offsets_output)); 
HIP_CHECK(hipFree(d_counts_output)); HIP_CHECK(hipFree(d_runs_count_output)); } #define CREATE_ENCODE_BENCHMARK(T) \ benchmark::RegisterBenchmark( \ bench_naming::format_name( \ "{lvl:device,algo:run_length_encode,subalgo:trivial,key_type:" #T ",keys_max_length:" \ + std::to_string(max_length) + ",cfg:default_config}") \ .c_str(), \ run_encode_benchmark, \ max_length, \ stream, \ size) void add_encode_benchmarks(size_t max_length, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector bs = { CREATE_ENCODE_BENCHMARK(int), CREATE_ENCODE_BENCHMARK(long long), CREATE_ENCODE_BENCHMARK(int8_t), CREATE_ENCODE_BENCHMARK(uint8_t), CREATE_ENCODE_BENCHMARK(rocprim::half), CREATE_ENCODE_BENCHMARK(custom_float2), CREATE_ENCODE_BENCHMARK(custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } #define CREATE_NON_TRIVIAL_RUNS_BENCHMARK(T) \ benchmark::RegisterBenchmark( \ bench_naming::format_name( \ "{lvl:device,algo:run_length_encode,subalgo:non_trivial,key_type:" #T \ ",keys_max_length:" \ + std::to_string(max_length) + ",cfg:default_config}") \ .c_str(), \ run_non_trivial_runs_benchmark, \ max_length, \ stream, \ size) void add_non_trivial_runs_benchmarks(size_t max_length, std::vector& benchmarks, hipStream_t stream, size_t size) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector bs = { CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(long long), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(int8_t), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(uint8_t), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(rocprim::half), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_float2), CREATE_NON_TRIVIAL_RUNS_BENCHMARK(custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", 
"trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_encode_benchmarks(1000, benchmarks, stream, size); add_encode_benchmarks(10, benchmarks, stream, size); add_non_trivial_runs_benchmarks(1000, benchmarks, stream, size); add_non_trivial_runs_benchmarks(10, benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_scan.cpp000066400000000000000000000117441446201466700226710ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include // Google Benchmark #include // HIP API #include // rocPRIM #include // CmdParser #include "cmdparser.hpp" #include "benchmark_device_scan.parallel.hpp" #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif #define CREATE_EXCL_INCL_BENCHMARK(EXCL, T, SCAN_OP) \ { \ const device_scan_benchmark instance; \ REGISTER_BENCHMARK(benchmarks, size, stream, instance); \ } #define CREATE_BENCHMARK(T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(false, T, SCAN_OP) \ CREATE_EXCL_INCL_BENCHMARK(true, T, SCAN_OP) int main(int argc, char* argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); #ifdef BENCHMARK_CONFIG_TUNING // optionally run an evenly split subset of benchmarks, when making multiple program invocations parser.set_optional("parallel_instance", "parallel_instance", 0, "parallel instance index"); parser.set_optional("parallel_instances", "parallel_instances", 1, "total parallel instances"); #endif parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); 
benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks = {}; #ifdef BENCHMARK_CONFIG_TUNING const int parallel_instance = parser.get("parallel_instance"); const int parallel_instances = parser.get("parallel_instances"); config_autotune_register::register_benchmark_subset(benchmarks, parallel_instance, parallel_instances, size, stream); #else using custom_float2 = custom_type; using custom_double2 = custom_type; CREATE_BENCHMARK(int, rocprim::plus) CREATE_BENCHMARK(float, rocprim::plus) CREATE_BENCHMARK(double, rocprim::plus) CREATE_BENCHMARK(long long, rocprim::plus) CREATE_BENCHMARK(float2, rocprim::plus) CREATE_BENCHMARK(custom_float2, rocprim::plus) CREATE_BENCHMARK(double2, rocprim::plus) CREATE_BENCHMARK(custom_double2, rocprim::plus) CREATE_BENCHMARK(int8_t, rocprim::plus) CREATE_BENCHMARK(uint8_t, rocprim::plus) CREATE_BENCHMARK(rocprim::half, rocprim::plus) #endif // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_scan.parallel.cpp.in000066400000000000000000000026521446201466700250670ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_scan.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_scan_benchmark_generator<@DataType@, rocprim::block_scan_algorithm::@Algo@>::create); } // namespace rocPRIM-rocm-5.7.1/benchmark/benchmark_device_scan.parallel.hpp000066400000000000000000000247371446201466700244770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#ifndef ROCPRIM_BENCHMARK_DEVICE_SCAN_PARALLEL_HPP_
#define ROCPRIM_BENCHMARK_DEVICE_SCAN_PARALLEL_HPP_

// NOTE(review): in this copy of the file the header names after #include and
// most template parameter/argument lists (e.g. "template std::string
// config_name()") are missing - confirm every such spot against the upstream
// source before relying on the code below.
#include
#include
#include

// Google Benchmark
#include

// HIP API
#include

// rocPRIM
#include

#include "benchmark_utils.hpp"

// Builds the configuration part of a benchmark name from a value of type
// Config: block size, items per thread and block scan method name.
template std::string config_name()
{
    const rocprim::detail::scan_config_params config = Config();
    return "{bs:" + std::to_string(config.kernel_config.block_size)
           + ",ipt:" + std::to_string(config.kernel_config.items_per_thread)
           + ",method:" + std::string(get_block_scan_method_name(config.block_scan_method))
           + "}";
}

// Explicit specialization that returns the fixed name "default_config".
template<>
inline std::string config_name()
{
    return "default_config";
}

// Benchmark of the rocPRIM device-level scan, registered through the
// config_autotune_interface. The struct uses the names Exclusive, T, ScanOp
// and Config from its (partly missing) template parameter list.
template, class Config = rocprim::default_config>
struct device_scan_benchmark : public config_autotune_interface
{
    // Returns the formatted benchmark name: algorithm, exclusive flag,
    // value type name and configuration name.
    std::string name() const override
    {
        using namespace std::string_literals;
        return bench_naming::format_name(
            "{lvl:device,algo:scan,exclusive:" + (Exclusive ? "true"s : "false"s)
            + ",value_type:" + std::string(Traits::name()) + ",cfg:" + config_name() + "}");
    }

    // Overload that forwards to rocprim::exclusive_scan, including
    // initial_value. Selected through std::enable_if - presumably when the scan
    // is exclusive; the condition itself is missing in this copy (TODO confirm).
    template
    auto run_device_scan(void*             temporary_storage,
                         size_t&           storage_size,
                         T*                input,
                         T*                output,
                         const T           initial_value,
                         const size_t      input_size,
                         ScanOp            scan_op,
                         const hipStream_t stream,
                         const bool        debug = false) const ->
        typename std::enable_if::type
    {
        return rocprim::exclusive_scan(temporary_storage,
                                       storage_size,
                                       input,
                                       output,
                                       initial_value,
                                       input_size,
                                       scan_op,
                                       stream,
                                       debug);
    }

    // Overload that forwards to rocprim::inclusive_scan; initial_value is
    // accepted for a uniform call signature but not used.
    template
    auto run_device_scan(void*             temporary_storage,
                         size_t&           storage_size,
                         T*                input,
                         T*                output,
                         const T           initial_value,
                         const size_t      input_size,
                         ScanOp            scan_op,
                         const hipStream_t stream,
                         const bool        debug = false) const ->
        typename std::enable_if::type
    {
        (void)initial_value;
        return rocprim::inclusive_scan(temporary_storage,
                                       storage_size,
                                       input,
                                       output,
                                       input_size,
                                       scan_op,
                                       stream,
                                       debug);
    }

    // Runs the scan on `size` elements obtained from get_random_data and
    // reports manual times to `state`: 5 warm-up calls, then batches of 10
    // calls per benchmark iteration, timed with HIP events on `stream`.
    void run_benchmark(benchmark::State& state,
                       size_t            size,
                       const hipStream_t stream,
                       ScanOp            scan_op) const
    {
        std::vector input         = get_random_data(size, T(0), T(1000));
        T           initial_value = T(123);
        T*          d_input;
        T*          d_output;
        HIP_CHECK(hipMalloc(&d_input, size * sizeof(T)));
        HIP_CHECK(hipMalloc(&d_output, size * sizeof(T)));
        HIP_CHECK(hipMemcpy(d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice));
        HIP_CHECK(hipDeviceSynchronize());

        // Allocate temporary storage memory
        size_t temp_storage_size_bytes;
        void*  d_temp_storage = nullptr;
        // Get size of d_temp_storage
        // (d_temp_storage is still nullptr for this first call)
        HIP_CHECK((run_device_scan(d_temp_storage,
                                   temp_storage_size_bytes,
                                   d_input,
                                   d_output,
                                   initial_value,
                                   size,
                                   scan_op,
                                   stream)));
        HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));
        HIP_CHECK(hipDeviceSynchronize());

        // Warm-up
        for(size_t i = 0; i < 5; i++)
        {
            HIP_CHECK((run_device_scan(d_temp_storage,
                                       temp_storage_size_bytes,
                                       d_input,
                                       d_output,
                                       initial_value,
                                       size,
                                       scan_op,
                                       stream)));
        }
        HIP_CHECK(hipDeviceSynchronize());

        // HIP events creation
        hipEvent_t start, stop;
        HIP_CHECK(hipEventCreate(&start));
        HIP_CHECK(hipEventCreate(&stop));

        const unsigned int batch_size = 10;
        for(auto _ : state)
        {
            // Record start event
            HIP_CHECK(hipEventRecord(start, stream));

            for(size_t i = 0; i < batch_size; i++)
            {
                HIP_CHECK((run_device_scan(d_temp_storage,
                                           temp_storage_size_bytes,
                                           d_input,
                                           d_output,
                                           initial_value,
                                           size,
                                           scan_op,
                                           stream)));
            }

            // Record stop event and wait until it completes
            HIP_CHECK(hipEventRecord(stop, stream));
            HIP_CHECK(hipEventSynchronize(stop));

            float elapsed_mseconds;
            HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
            // Milliseconds are converted to seconds for SetIterationTime
            state.SetIterationTime(elapsed_mseconds / 1000);
        }

        // Destroy HIP events
        HIP_CHECK(hipEventDestroy(start));
        HIP_CHECK(hipEventDestroy(stop));

        state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T));
        state.SetItemsProcessed(state.iterations() * batch_size * size);

        HIP_CHECK(hipFree(d_input));
        HIP_CHECK(hipFree(d_output));
        HIP_CHECK(hipFree(d_temp_storage));
    }

    // config_autotune_interface entry point: runs the benchmark with a
    // default-constructed ScanOp.
    void run(benchmark::State& state, size_t size, hipStream_t stream) const override
    {
        run_benchmark(state, size, stream, ScanOp());
    }
};

#ifdef BENCHMARK_CONFIG_TUNING

// Generates device_scan_benchmark instances for combinations of block size and
// items per thread (tuning builds only). The nested structs are function
// objects that append benchmarks to `storage`.
template
struct device_scan_benchmark_generator
{
    template
    struct create_block_scan_algorithm
    {
        template
        struct create_block_size
        {
            // Appends one benchmark that uses rocprim::scan_config_v2 with
            // block_size, ItemsPerThread, transposing block load/store and
            // BlockScanAlgorithm.
            template
            struct create_ipt
            {
                void operator()(std::vector>& storage)
                {
                    storage.emplace_back(std::make_unique, rocprim::scan_config_v2<
                                                 block_size,
                                                 ItemsPerThread,
                                                 rocprim::block_load_method::block_load_transpose,
                                                 rocprim::block_store_method::block_store_transpose,
                                                 BlockScanAlgorithm>>>());
                }
            };

            void operator()(std::vector>& storage)
            {
                // Limit items per thread to not over-use shared memory
                static constexpr unsigned int max_items_per_thread
                    = ::rocprim::min(65536 / (block_size * sizeof(T)), 24);
                static_for_each, create_ipt>(storage);
            }

            // Block size is a power of two given by its exponent
            static constexpr unsigned int block_size = 1u << BlockSizeExponent;
        };

        static void create(std::vector>& storage)
        {
            static_for_each(storage);
        }
    };

    // Entry point handed to config_autotune_register::create_bulk by the
    // generated .parallel.cpp files.
    static void create(std::vector>& storage)
    {
        // Block sizes 64, 128, 256
        create_block_scan_algorithm>::create(storage);
    }
};

#endif // BENCHMARK_CONFIG_TUNING

#endif // ROCPRIM_BENCHMARK_DEVICE_SCAN_PARALLEL_HPP_
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_scan_by_key.cpp000066400000000000000000000133741446201466700242340ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.
// NOTE(review): the header names after #include and the template argument
// lists are missing in this copy of the file - confirm against the upstream
// source.
#include
#include

// Google Benchmark
#include

// HIP API
#include

// rocPRIM
#include

// CmdParser
#include "cmdparser.hpp"

#include "benchmark_device_scan_by_key.parallel.hpp"
#include "benchmark_utils.hpp"

#ifndef DEFAULT_N
const size_t DEFAULT_N = 1024 * 1024 * 32;
#endif

// Instantiates one device_scan_by_key_benchmark and registers it. Relies on
// `benchmarks`, `size` and `stream` being visible where the macro is expanded.
// NOTE(review): the leading template arguments of device_scan_by_key_benchmark
// are missing here; only MAX_SEGMENT_LENGTH is left - confirm.
#define CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, MAX_SEGMENT_LENGTH) \
    {                                                                 \
        const device_scan_by_key_benchmark,                           \
                                           MAX_SEGMENT_LENGTH>        \
            instance;                                                 \
        REGISTER_BENCHMARK(benchmarks, size, stream, instance);       \
    }

// Registers one benchmark per maximum segment length: 1, 16, 256, 4096, 65536.
#define CREATE_EXCL_INCL_BENCHMARK(EXCL, T, SCAN_OP) \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 1)     \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 16)    \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 256)   \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 4096)  \
    CREATE_BY_KEY_BENCHMARK(EXCL, T, SCAN_OP, 65536)

// Registers the inclusive (false) and then the exclusive (true) variants.
#define CREATE_BENCHMARK(T, SCAN_OP)              \
    CREATE_EXCL_INCL_BENCHMARK(false, T, SCAN_OP) \
    CREATE_EXCL_INCL_BENCHMARK(true, T, SCAN_OP)

// Entry point of the device scan-by-key benchmark: reads the command-line
// options, registers the benchmarks (the default set, or a tuning subset when
// BENCHMARK_CONFIG_TUNING is defined) and runs them with manual timing.
int main(int argc, char* argv[])
{
    cli::Parser parser(argc, argv);
    parser.set_optional("size", "size", DEFAULT_N, "number of values");
    parser.set_optional("trials", "trials", -1, "number of iterations");
    parser.set_optional("name_format",
                        "name_format",
                        "human",
                        "either: json,human,txt");
#ifdef BENCHMARK_CONFIG_TUNING
    // optionally run an evenly split subset of benchmarks, when making multiple program invocations
    parser.set_optional("parallel_instance",
                        "parallel_instance",
                        0,
                        "parallel instance index");
    parser.set_optional("parallel_instances",
                        "parallel_instances",
                        1,
                        "total parallel instances");
#endif
    parser.run_and_exit_if_error();

    // Parse argv
    benchmark::Initialize(&argc, argv);
    const size_t size   = parser.get("size");
    const int    trials = parser.get("trials");
    bench_naming::set_format(parser.get("name_format"));

    // HIP
    hipStream_t stream = 0; // default

    // Benchmark info
    add_common_benchmark_info();
    benchmark::AddCustomContext("size", std::to_string(size));

    // Add benchmarks
    std::vector benchmarks = {};
#ifdef BENCHMARK_CONFIG_TUNING
    const int parallel_instance  = parser.get("parallel_instance");
    const int parallel_instances = parser.get("parallel_instances");
    config_autotune_register::register_benchmark_subset(benchmarks,
                                                        parallel_instance,
                                                        parallel_instances,
                                                        size,
                                                        stream);
#else
    using custom_float2  = custom_type;
    using custom_double2 = custom_type;

    CREATE_BENCHMARK(int, rocprim::plus)
    CREATE_BENCHMARK(float, rocprim::plus)
    CREATE_BENCHMARK(double, rocprim::plus)
    CREATE_BENCHMARK(long long, rocprim::plus)
    CREATE_BENCHMARK(float2, rocprim::plus)
    CREATE_BENCHMARK(custom_float2, rocprim::plus)
    CREATE_BENCHMARK(double2, rocprim::plus)
    CREATE_BENCHMARK(custom_double2, rocprim::plus)
    CREATE_BENCHMARK(int8_t, rocprim::plus)
    CREATE_BENCHMARK(uint8_t, rocprim::plus)
    CREATE_BENCHMARK(rocprim::half, rocprim::plus)
#endif

    // Use manual timing
    for(auto& b : benchmarks)
    {
        b->UseManualTime();
        b->Unit(benchmark::kMillisecond);
    }

    // Force number of iterations
    // (only when a positive "trials" value was given; the default is -1)
    if(trials > 0)
    {
        for(auto& b : benchmarks)
        {
            b->Iterations(trials);
        }
    }

    // Run benchmarks
    benchmark::RunSpecifiedBenchmarks();
    return 0;
}
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_scan_by_key.parallel.cpp.in000066400000000000000000000027041446201466700264270ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2022-2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include "benchmark_device_scan_by_key.parallel.hpp" #include "benchmark_utils.hpp" namespace { auto benchmarks = config_autotune_register::create_bulk( device_scan_by_key_benchmark_generator<@KeyType@, @ValueType@, rocprim::block_scan_algorithm::@Algo@>::create); } // namespace rocPRIM-rocm-5.7.1/benchmark/benchmark_device_scan_by_key.parallel.hpp000066400000000000000000000313671446201466700260360ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
// SOFTWARE.

#ifndef ROCPRIM_BENCHMARK_DEVICE_SCAN_BY_KEY_PARALLEL_HPP_
#define ROCPRIM_BENCHMARK_DEVICE_SCAN_BY_KEY_PARALLEL_HPP_

// NOTE(review): in this copy of the file the header names after #include and
// most template parameter/argument lists (e.g. "template std::string
// config_name()") are missing - confirm every such spot against the upstream
// source before relying on the code below.
#include
#include
#include

// Google Benchmark
#include

// HIP API
#include

// rocPRIM
#include

#include "benchmark_utils.hpp"

// Builds the configuration part of a benchmark name from a value of type
// Config: block size, items per thread and block scan method name.
template std::string config_name()
{
    const rocprim::detail::scan_by_key_config_params config = Config();
    return "{bs:" + std::to_string(config.kernel_config.block_size)
           + ",ipt:" + std::to_string(config.kernel_config.items_per_thread)
           + ",method:" + std::string(get_block_scan_method_name(config.block_scan_method))
           + "}";
}

// Explicit specialization that returns the fixed name "default_config".
template<>
inline std::string config_name()
{
    return "default_config";
}

// Benchmark of the rocPRIM device-level scan-by-key, registered through the
// config_autotune_interface. The struct uses the names Exclusive, Key, Value,
// ScanOp, CompareOp, MaxSegmentLength and Config from its (partly missing)
// template parameter list.
template, class CompareOp = rocprim::equal_to, unsigned int MaxSegmentLength = 1024, class Config = rocprim::default_config>
struct device_scan_by_key_benchmark : public config_autotune_interface
{
    // Returns the formatted benchmark name: algorithm, exclusive flag, key and
    // value type names, maximum segment length and configuration name.
    std::string name() const override
    {
        using namespace std::string_literals;
        return bench_naming::format_name(
            "{lvl:device,algo:scan_by_key,exclusive:" + (Exclusive ? "true"s : "false"s)
            + ",key_type:" + std::string(Traits::name())
            + ",value_type:" + std::string(Traits::name()) + ",max_segment_length:"
            + std::to_string(MaxSegmentLength) + ",cfg:" + config_name() + "}");
    }

    // Overload that forwards to rocprim::exclusive_scan_by_key, including
    // initial_value. Selected through std::enable_if - presumably when the scan
    // is exclusive; the condition itself is missing in this copy (TODO confirm).
    template
    auto run_device_scan_by_key(void*             temporary_storage,
                                size_t&           storage_size,
                                const Key*        keys,
                                const Value*      input,
                                Value*            output,
                                const Value       initial_value,
                                const size_t      input_size,
                                const ScanOp      scan_op,
                                const CompareOp   compare_op,
                                const hipStream_t stream,
                                const bool        debug = false) const ->
        typename std::enable_if::type
    {
        return rocprim::exclusive_scan_by_key(temporary_storage,
                                              storage_size,
                                              keys,
                                              input,
                                              output,
                                              initial_value,
                                              input_size,
                                              scan_op,
                                              compare_op,
                                              stream,
                                              debug);
    }

    // Overload that forwards to rocprim::inclusive_scan_by_key; the unnamed
    // initial-value parameter keeps the call signature uniform and is ignored.
    template
    auto run_device_scan_by_key(void*             temporary_storage,
                                size_t&           storage_size,
                                const Key*        keys,
                                const Value*      input,
                                Value*            output,
                                const Value /*initial_value*/,
                                const size_t      input_size,
                                const ScanOp      scan_op,
                                const CompareOp   compare_op,
                                const hipStream_t stream,
                                const bool        debug = false) const ->
        typename std::enable_if::type
    {
        return rocprim::inclusive_scan_by_key(temporary_storage,
                                              storage_size,
                                              keys,
                                              input,
                                              output,
                                              input_size,
                                              scan_op,
                                              compare_op,
                                              stream,
                                              debug);
    }

    // Runs the scan-by-key on `size` elements and reports manual times to
    // `state`: 5 warm-up calls, then batches of 10 calls per benchmark
    // iteration, timed with HIP events on `stream`.
    void run(benchmark::State& state, size_t size, hipStream_t stream) const override
    {
        constexpr bool debug = false;

        // Keys come from get_random_segments, seeded from std::random_device,
        // so the key data differs between program runs.
        const std::vector keys
            = get_random_segments(size, MaxSegmentLength, std::random_device{}());

        const std::vector input = get_random_data(size, Value(0), Value(1000));

        ScanOp    scan_op{};
        CompareOp compare_op{};

        Value  initial_value = Value(123);
        Value* d_input;
        Key*   d_keys;
        Value* d_output;
        HIP_CHECK(hipMalloc(&d_input, input.size() * sizeof(input[0])));
        HIP_CHECK(hipMalloc(&d_keys, keys.size() * sizeof(keys[0])));
        HIP_CHECK(hipMalloc(&d_output, input.size() * sizeof(input[0])));
        HIP_CHECK(hipMemcpy(d_input,
                            input.data(),
                            input.size() * sizeof(input[0]),
                            hipMemcpyHostToDevice));
        HIP_CHECK(
            hipMemcpy(d_keys, keys.data(), keys.size() * sizeof(keys[0]), hipMemcpyHostToDevice));

        // Allocate temporary storage memory
        size_t temp_storage_size_bytes;
        void*  d_temp_storage = nullptr;
        // Get size of d_temp_storage
        // (d_temp_storage is still nullptr for this first call)
        HIP_CHECK((run_device_scan_by_key(d_temp_storage,
                                          temp_storage_size_bytes,
                                          d_keys,
                                          d_input,
                                          d_output,
                                          initial_value,
                                          size,
                                          scan_op,
                                          compare_op,
                                          stream,
                                          debug)));
        HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes));

        // Warm-up
        for(size_t i = 0; i < 5; i++)
        {
            HIP_CHECK((run_device_scan_by_key(d_temp_storage,
                                              temp_storage_size_bytes,
                                              d_keys,
                                              d_input,
                                              d_output,
                                              initial_value,
                                              size,
                                              scan_op,
                                              compare_op,
                                              stream,
                                              debug)));
        }
        HIP_CHECK(hipDeviceSynchronize());

        // HIP events creation
        hipEvent_t start, stop;
        HIP_CHECK(hipEventCreate(&start));
        HIP_CHECK(hipEventCreate(&stop));

        const unsigned int batch_size = 10;
        for(auto _ : state)
        {
            // Record start event
            HIP_CHECK(hipEventRecord(start, stream));

            for(size_t i = 0; i < batch_size; i++)
            {
                HIP_CHECK((run_device_scan_by_key(d_temp_storage,
                                                  temp_storage_size_bytes,
                                                  d_keys,
                                                  d_input,
                                                  d_output,
                                                  initial_value,
                                                  size,
                                                  scan_op,
                                                  compare_op,
                                                  stream,
                                                  debug)));
            }

            // Record stop event and wait until it completes
            HIP_CHECK(hipEventRecord(stop, stream));
            HIP_CHECK(hipEventSynchronize(stop));

            float elapsed_mseconds;
            HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop));
            // Milliseconds are converted to seconds for SetIterationTime
            state.SetIterationTime(elapsed_mseconds / 1000);
        }

        // Destroy HIP events
        HIP_CHECK(hipEventDestroy(start));
        HIP_CHECK(hipEventDestroy(stop));

        // Bytes processed counts one key and one value per item
        state.SetBytesProcessed(state.iterations() * batch_size * size
                                * (sizeof(Key) + sizeof(Value)));
        state.SetItemsProcessed(state.iterations() * batch_size * size);

        HIP_CHECK(hipFree(d_input));
        HIP_CHECK(hipFree(d_keys));
        HIP_CHECK(hipFree(d_output));
        HIP_CHECK(hipFree(d_temp_storage));
    }
};

#ifdef BENCHMARK_CONFIG_TUNING

// Generates device_scan_by_key_benchmark instances for combinations of block
// size and items per thread (tuning builds only). The nested structs are
// function objects that append benchmarks to `storage`.
template
struct device_scan_by_key_benchmark_generator
{
    template
    struct create_block_scan_algorithm
    {
        template
        struct create_block_size
        {
            // Appends one benchmark with maximum segment length 1024 that uses
            // rocprim::scan_by_key_config_v2 with block_size, ItemsPerThread,
            // transposing block load/store and BlockScanAlgorithm.
            template
            struct create_ipt
            {
                void operator()(std::vector>& storage)
                {
                    storage.emplace_back(std::make_unique, rocprim::equal_to, 1024, rocprim::scan_by_key_config_v2<
                                                 block_size,
                                                 ItemsPerThread,
                                                 rocprim::block_load_method::block_load_transpose,
                                                 rocprim::block_store_method::block_store_transpose,
                                                 BlockScanAlgorithm>>>());
                }
            };

            void operator()(std::vector>& storage)
            {
                // Limit items per thread to not over-use shared memory
                static constexpr unsigned int max_items_per_thread = ::rocprim::min(
                    65536 / (block_size * (sizeof(KeyType) + sizeof(ValueType))),
                    24);
                static_for_each, create_ipt>(storage);
            }

            // Block size is a power of two given by its exponent
            static constexpr unsigned int block_size = 1u << BlockSizeExponent;
        };

        static void create(std::vector>& storage)
        {
            static_for_each(storage);
        }
    };

    // Entry point handed to config_autotune_register::create_bulk by the
    // generated .parallel.cpp files.
    static void create(std::vector>& storage)
    {
        // Block sizes 64, 128, 256
        create_block_scan_algorithm>::create(storage);
    }
};

#endif // BENCHMARK_CONFIG_TUNING

#endif // ROCPRIM_BENCHMARK_DEVICE_SCAN_BY_KEY_PARALLEL_HPP_
rocPRIM-rocm-5.7.1/benchmark/benchmark_device_segmented_radix_sort.cpp000066400000000000000000000431361446201466700261560ustar00rootroot00000000000000// MIT License
//
// Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in all
// copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; namespace { constexpr unsigned int warmup_size = 2; constexpr size_t min_size = 30000; constexpr std::array segment_counts{ 10, 100, 1000, 2500, 5000, 7500, 10000, 100000 }; constexpr std::array segment_lengths{30, 256, 3000, 300000}; } template void run_sort_keys_benchmark(benchmark::State& state, size_t num_segments, size_t mean_segment_length, size_t target_size, hipStream_t stream) { using offset_type = int; using key_type = Key; std::vector offsets; offsets.push_back(0); static constexpr int seed = 716; std::default_random_engine gen(seed); std::normal_distribution segment_length_dis(static_cast(mean_segment_length), 0.1 * mean_segment_length); size_t offset = 0; for(size_t segment_index = 0; segment_index < num_segments;) { const double segment_length_candidate = std::round(segment_length_dis(gen)); if (segment_length_candidate < 0) { continue; } const offset_type segment_length = static_cast(segment_length_candidate); offset += segment_length; offsets.push_back(offset); ++segment_index; } const size_t size = offset; const size_t segments_count = offsets.size() - 1; std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data( size, static_cast(-1000), static_cast(1000) ); } else { keys_input = get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } size_t batch_size = 1; if(size < target_size) { batch_size = (target_size + size - 1) / size; } 
offset_type * d_offsets; HIP_CHECK(hipMalloc(&d_offsets, offsets.size() * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), offsets.size() * sizeof(offset_type), hipMemcpyHostToDevice ) ); key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( rp::segmented_radix_sort_keys( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( rp::segmented_radix_sort_keys( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( rp::segmented_radix_sort_keys( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); 
state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(key_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); } template void run_sort_pairs_benchmark(benchmark::State& state, size_t num_segments, size_t mean_segment_length, size_t target_size, hipStream_t stream) { using offset_type = int; using key_type = Key; using value_type = Value; // Generate data std::vector offsets; offsets.push_back(0); static constexpr int seed = 716; std::default_random_engine gen(seed); std::normal_distribution segment_length_dis(static_cast(mean_segment_length), 0.1 * mean_segment_length); size_t offset = 0; for(size_t segment_index = 0; segment_index < num_segments;) { const double segment_length_candidate = std::round(segment_length_dis(gen)); if (segment_length_candidate < 0) { continue; } const offset_type segment_length = static_cast(segment_length_candidate); offset += segment_length; offsets.push_back(offset); ++segment_index; } const size_t size = offset; const size_t segments_count = offsets.size() - 1; std::vector keys_input; if(std::is_floating_point::value) { keys_input = get_random_data( size, static_cast(-1000), static_cast(1000) ); } else { keys_input = get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } size_t batch_size = 1; if(size < target_size) { batch_size = (target_size + size - 1) / size; } std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); offset_type * d_offsets; HIP_CHECK(hipMalloc(&d_offsets, (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice ) ); key_type * d_keys_input; key_type * d_keys_output; HIP_CHECK(hipMalloc(&d_keys_input, size * sizeof(key_type))); HIP_CHECK(hipMalloc(&d_keys_output, size * 
sizeof(key_type))); HIP_CHECK( hipMemcpy( d_keys_input, keys_input.data(), size * sizeof(key_type), hipMemcpyHostToDevice ) ); value_type * d_values_input; value_type * d_values_output; HIP_CHECK(hipMalloc(&d_values_input, size * sizeof(value_type))); HIP_CHECK(hipMalloc(&d_values_output, size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice ) ); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( rp::segmented_radix_sort_pairs( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( rp::segmented_radix_sort_pairs( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( rp::segmented_radix_sort_pairs( d_temporary_storage, temporary_storage_bytes, d_keys_input, d_keys_output, d_values_input, d_values_output, size, segments_count, d_offsets, d_offsets + 1, 0, sizeof(key_type) * 8, stream, false ) ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); 
HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed( state.iterations() * batch_size * size * (sizeof(key_type) + sizeof(value_type)) ); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_keys_input)); HIP_CHECK(hipFree(d_keys_output)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_values_output)); } template void add_sort_keys_benchmarks(std::vector &benchmarks, hipStream_t stream, size_t max_size, size_t min_size, size_t target_size) { std::string key_name = Traits::name(); std::string value_name = Traits::name(); for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { const auto number_of_elements = segment_count * segment_length; if(number_of_elements > max_size || number_of_elements < min_size) { continue; } benchmarks.push_back(benchmark::RegisterBenchmark( bench_naming::format_name( "{lvl:device,algo:radix_sort_segmented,key_type:" + key_name + ",value_type:" + value_name + ",segment_count:" + std::to_string(segment_count) + ",segment_length:" + std::to_string(segment_length) + ",cfg:default_config}") .c_str(), [=](benchmark::State& state) { run_sort_keys_benchmark(state, segment_count, segment_length, target_size, stream); })); } } } template void add_sort_pairs_benchmarks(std::vector &benchmarks, hipStream_t stream, size_t max_size, size_t min_size, size_t target_size) { std::string key_name = Traits::name(); std::string value_name = Traits::name(); for(const auto segment_count : segment_counts) { for(const auto segment_length : segment_lengths) { const auto number_of_elements = segment_count * segment_length; if(number_of_elements > max_size || number_of_elements < min_size) { continue; } benchmarks.push_back(benchmark::RegisterBenchmark( bench_naming::format_name( "{lvl:device,algo:radix_sort_segmented,key_type:" + key_name + ",value_type:" + value_name + ",segment_count:" + 
std::to_string(segment_count) + ",segment_length:" + std::to_string(segment_length) + ",cfg:default_config}") .c_str(), [=](benchmark::State& state) { run_sort_pairs_benchmark(state, segment_count, segment_length, target_size, stream); })); } } } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_sort_keys_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_keys_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_keys_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_keys_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_keys_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_keys_benchmarks(benchmarks, stream, size, min_size, size / 2); using custom_float2 = custom_type; using custom_double2 = custom_type; add_sort_pairs_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_pairs_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_pairs_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_pairs_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_pairs_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_pairs_benchmarks(benchmarks, stream, size, min_size, size / 2); add_sort_pairs_benchmarks(benchmarks, stream, size, min_size, size / 2); // Use manual timing for(auto& b : 
benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_segmented_reduce.cpp000066400000000000000000000211251446201466700252410ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template void run_benchmark(benchmark::State& state, size_t desired_segments, hipStream_t stream, size_t size) { using offset_type = int; using value_type = T; // Generate data const unsigned int seed = 123; std::default_random_engine gen(seed); const double avg_segment_length = static_cast(size) / desired_segments; std::uniform_real_distribution segment_length_dis(0, avg_segment_length * 2); std::vector offsets; unsigned int segments_count = 0; size_t offset = 0; while(offset < size) { const size_t segment_length = std::round(segment_length_dis(gen)); offsets.push_back(offset); segments_count++; offset += segment_length; } offsets.push_back(size); std::vector values_input(size); std::iota(values_input.begin(), values_input.end(), 0); offset_type * d_offsets; HIP_CHECK(hipMalloc(reinterpret_cast(&d_offsets), (segments_count + 1) * sizeof(offset_type))); HIP_CHECK( hipMemcpy( d_offsets, offsets.data(), (segments_count + 1) * sizeof(offset_type), hipMemcpyHostToDevice ) ); value_type * d_values_input; HIP_CHECK(hipMalloc(reinterpret_cast(&d_values_input), size * sizeof(value_type))); HIP_CHECK( hipMemcpy( d_values_input, values_input.data(), size * sizeof(value_type), hipMemcpyHostToDevice ) ); value_type * d_aggregates_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_aggregates_output), segments_count * sizeof(value_type))); rocprim::plus reduce_op; value_type init(0); void * d_temporary_storage = nullptr; size_t temporary_storage_bytes = 0; HIP_CHECK( rp::segmented_reduce( d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, 
reduce_op, init, stream ) ); HIP_CHECK(hipMalloc(&d_temporary_storage, temporary_storage_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( rp::segmented_reduce( d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, reduce_op, init, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( rp::segmented_reduce( d_temporary_storage, temporary_storage_bytes, d_values_input, d_aggregates_output, segments_count, d_offsets, d_offsets + 1, reduce_op, init, stream ) ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(value_type)); state.SetItemsProcessed(state.iterations() * batch_size * size); HIP_CHECK(hipFree(d_temporary_storage)); HIP_CHECK(hipFree(d_offsets)); HIP_CHECK(hipFree(d_values_input)); HIP_CHECK(hipFree(d_aggregates_output)); } #define CREATE_BENCHMARK(T, SEGMENTS) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:reduce_segmented,key_type:" #T \ ",segment_count:" \ + std::to_string(SEGMENTS) + ",cfg:default_config}") \ .c_str(), \ run_benchmark, \ SEGMENTS, \ stream, \ size) #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 1), \ CREATE_BENCHMARK(type, 10), \ CREATE_BENCHMARK(type, 100), \ CREATE_BENCHMARK(type, 1000), \ CREATE_BENCHMARK(type, 10000) void add_benchmarks(std::vector& 
benchmarks, hipStream_t stream, size_t size) { using custom_float2 = custom_type; using custom_double2 = custom_type; std::vector bs = { BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half), BENCHMARK_TYPE(int), BENCHMARK_TYPE(custom_float2), BENCHMARK_TYPE(custom_double2), }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_select.cpp000066400000000000000000000570511446201466700232250ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template void run_flagged_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float true_probability) { std::vector input; std::vector flags = get_random_data01(size, true_probability); std::vector selected_count_output(1); if(std::is_floating_point::value) { input = get_random_data(size, T(-1000), T(1000)); } else { input = get_random_data( size, std::numeric_limits::min(), std::numeric_limits::max() ); } T * d_input; FlagType * d_flags; T * d_output; unsigned int * d_selected_count_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), flags.size() * sizeof(FlagType))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_flags, flags.data(), flags.size() * sizeof(FlagType), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage rocprim::select( nullptr, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream ); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { rocprim::select( d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream ); } 
HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { rocprim::select( d_temp_storage, temp_storage_size_bytes, d_input, d_flags, d_output, d_selected_count_output, input.size(), stream ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_flags); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); HIP_CHECK(hipDeviceSynchronize()); } template void run_selectop_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float true_probability) { std::vector input = get_random_data(size, T(0), T(1000)); std::vector selected_count_output(1); auto select_op = [true_probability] __device__ (const T& value) -> bool { if(value < T(1000 * true_probability)) return true; return false; }; T * d_input; T * d_output; unsigned int * d_selected_count_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of 
d_temp_storage rocprim::select( nullptr, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream ); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { rocprim::select( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { rocprim::select( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), select_op, stream ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); HIP_CHECK(hipDeviceSynchronize()); } template void run_unique_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float discontinuity_probability) { using op_type = typename std::conditional::value, half_plus, rocprim::plus>::type; op_type op; std::vector input(size); { auto input01 = get_random_data01(size, discontinuity_probability); auto acc = input01[0]; input[0] = acc; for(size_t i = 1; i < 
input01.size(); i++) { input[i] = op(acc, input01[i]); } } std::vector selected_count_output(1); auto equality_op = rocprim::equal_to(); T * d_input; T * d_output; unsigned int * d_selected_count_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), input.size() * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_selected_count_output), sizeof(unsigned int))); HIP_CHECK( hipMemcpy( d_input, input.data(), input.size() * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage rocprim::unique( nullptr, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), equality_op, stream ); HIP_CHECK(hipDeviceSynchronize()); // allocate temporary storage void * d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { rocprim::unique( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), equality_op, stream ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { rocprim::unique( d_temp_storage, temp_storage_size_bytes, d_input, d_output, d_selected_count_output, input.size(), equality_op, stream ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); 
HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_input); hipFree(d_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); } template void run_unique_by_key_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, float discontinuity_probability) { using op_type = typename std:: conditional_t::value, half_plus, rocprim::plus>; op_type op; std::vector input_keys(size); { auto input01 = get_random_data01(size, discontinuity_probability); auto acc = input01[0]; input_keys[0] = acc; for(size_t i = 1; i < input01.size(); i++) { input_keys[i] = op(acc, input01[i]); } } const auto input_values = get_random_data(size, -1000, 1000); std::vector selected_count_output(1); auto equality_op = rocprim::equal_to(); Key* d_keys_input; Value* d_values_input; Key* d_keys_output; Value* d_values_output; unsigned int* d_selected_count_output; HIP_CHECK(hipMalloc(&d_keys_input, input_keys.size() * sizeof(input_keys[0]))); HIP_CHECK(hipMalloc(&d_values_input, input_values.size() * sizeof(input_values[0]))); HIP_CHECK(hipMalloc(&d_keys_output, input_keys.size() * sizeof(input_keys[0]))); HIP_CHECK(hipMalloc(&d_values_output, input_values.size() * sizeof(input_values[0]))); HIP_CHECK(hipMalloc(&d_selected_count_output, sizeof(selected_count_output[0]))); HIP_CHECK(hipMemcpy(d_keys_input, input_keys.data(), input_keys.size() * sizeof(input_keys[0]), hipMemcpyHostToDevice)); HIP_CHECK(hipMemcpy(d_values_input, input_values.data(), input_values.size() * sizeof(input_values[0]), hipMemcpyHostToDevice)); // Allocate temporary storage memory size_t temp_storage_size_bytes; // Get size of d_temp_storage rocprim::unique_by_key(nullptr, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, input_keys.size(), equality_op, stream); HIP_CHECK(hipDeviceSynchronize()); // allocate 
temporary storage void* d_temp_storage = nullptr; HIP_CHECK(hipMalloc(&d_temp_storage, temp_storage_size_bytes)); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < 10; i++) { rocprim::unique_by_key(d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, input_keys.size(), equality_op, stream); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); const unsigned int batch_size = 10; for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { rocprim::unique_by_key(d_temp_storage, temp_storage_size_bytes, d_keys_input, d_values_input, d_keys_output, d_values_output, d_selected_count_output, input_keys.size(), equality_op, stream); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * (sizeof(Key) + sizeof(Value))); state.SetItemsProcessed(state.iterations() * batch_size * size); hipFree(d_keys_input); hipFree(d_values_input); hipFree(d_keys_output); hipFree(d_values_output); hipFree(d_selected_count_output); hipFree(d_temp_storage); } #define CREATE_SELECT_FLAGGED_BENCHMARK(T, F, p) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:select,subalgo:flagged,key_type:" #T \ ",flag_type:" #F ",probability:" #p ",cfg:default_config}") \ .c_str(), \ run_flagged_benchmark, \ size, \ stream, \ p) #define CREATE_SELECT_IF_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ 
bench_naming::format_name("{lvl:device,algo:select,subalgo:if,key_type:" #T \ ",probability:" #p ",cfg:default_config}") \ .c_str(), \ run_selectop_benchmark, \ size, \ stream, \ p) #define CREATE_UNIQUE_BENCHMARK(T, p) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:unique,key_type:" #T ",value_type:" \ + std::string(Traits::name()) \ + ",probability:" #p ",cfg:default_config}") \ .c_str(), \ run_unique_benchmark, \ size, \ stream, \ p) #define CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, p) \ benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:device,algo:unique,key_type:" #K \ ",value_type:" #V ",probability:" #p \ ",cfg:default_config}") \ .c_str(), \ run_unique_by_key_benchmark, \ size, \ stream, \ p) #define BENCHMARK_FLAGGED_TYPE(type, value) \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.05f), \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.25f), \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.5f), \ CREATE_SELECT_FLAGGED_BENCHMARK(type, value, 0.75f) #define BENCHMARK_IF_TYPE(type) \ CREATE_SELECT_IF_BENCHMARK(type, 0.05f), \ CREATE_SELECT_IF_BENCHMARK(type, 0.25f), \ CREATE_SELECT_IF_BENCHMARK(type, 0.5f), \ CREATE_SELECT_IF_BENCHMARK(type, 0.75f) #define BENCHMARK_UNIQUE_TYPE(type) \ CREATE_UNIQUE_BENCHMARK(type, 0.05f), \ CREATE_UNIQUE_BENCHMARK(type, 0.25f), \ CREATE_UNIQUE_BENCHMARK(type, 0.5f), \ CREATE_UNIQUE_BENCHMARK(type, 0.75f) #define BENCHMARK_UNIQUE_BY_KEY_TYPE(K, V) \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, 0.05f), \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, 0.25f), \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, 0.5f), \ CREATE_UNIQUE_BY_KEY_BENCHMARK(K, V, 0.75f) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, 
argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); using custom_double2 = custom_type; using custom_int_double = custom_type; // Add benchmarks std::vector benchmarks = { BENCHMARK_FLAGGED_TYPE(int, unsigned char), BENCHMARK_FLAGGED_TYPE(float, unsigned char), BENCHMARK_FLAGGED_TYPE(double, unsigned char), BENCHMARK_FLAGGED_TYPE(uint8_t, uint8_t), BENCHMARK_FLAGGED_TYPE(int8_t, int8_t), BENCHMARK_FLAGGED_TYPE(rocprim::half, int8_t), BENCHMARK_FLAGGED_TYPE(custom_double2, unsigned char), BENCHMARK_IF_TYPE(int), BENCHMARK_IF_TYPE(float), BENCHMARK_IF_TYPE(double), BENCHMARK_IF_TYPE(uint8_t), BENCHMARK_IF_TYPE(int8_t), BENCHMARK_IF_TYPE(rocprim::half), BENCHMARK_IF_TYPE(custom_int_double), BENCHMARK_UNIQUE_TYPE(int), BENCHMARK_UNIQUE_TYPE(float), BENCHMARK_UNIQUE_TYPE(double), BENCHMARK_UNIQUE_TYPE(uint8_t), BENCHMARK_UNIQUE_TYPE(int8_t), BENCHMARK_UNIQUE_TYPE(rocprim::half), BENCHMARK_UNIQUE_TYPE(custom_int_double), BENCHMARK_UNIQUE_BY_KEY_TYPE(int, int), BENCHMARK_UNIQUE_BY_KEY_TYPE(float, double), BENCHMARK_UNIQUE_BY_KEY_TYPE(double, custom_double2), BENCHMARK_UNIQUE_BY_KEY_TYPE(uint8_t, uint8_t), BENCHMARK_UNIQUE_BY_KEY_TYPE(int8_t, double), BENCHMARK_UNIQUE_BY_KEY_TYPE(rocprim::half, rocprim::half), BENCHMARK_UNIQUE_BY_KEY_TYPE(custom_int_double, custom_int_double) }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_device_transform.cpp000066400000000000000000000147041446201466700237570ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced 
Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 128; #endif const unsigned int batch_size = 10; const unsigned int warmup_size = 5; template struct transform { __device__ __host__ constexpr T operator()(const T& a) const { return a + T(5); } }; template< class T, class BinaryFunction > void run_benchmark(benchmark::State& state, size_t size, const hipStream_t stream, BinaryFunction transform_op) { std::vector input = get_random_data(size, T(0), T(1000)); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // Warm-up for(size_t i = 0; i < warmup_size; i++) { HIP_CHECK( rocprim::transform( d_input, d_output, size, transform_op, stream ) ); } HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); for(size_t i = 0; i < batch_size; i++) { HIP_CHECK( rocprim::transform( d_input, d_output, size, transform_op, stream ) ); } // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * batch_size * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * batch_size * size); 
HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, TRANSFORM_OP) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:device,algo:transform,key_type:" #T \ ",transform_op:" #TRANSFORM_OP ",cfg:default_config}") \ .c_str(), \ run_benchmark, \ size, \ stream, \ TRANSFORM_OP()) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); using custom_float2 = custom_type; using custom_double2 = custom_type; // Add benchmarks std::vector benchmarks = { CREATE_BENCHMARK(int, transform), CREATE_BENCHMARK(long long, transform), CREATE_BENCHMARK(int8_t, transform), CREATE_BENCHMARK(uint8_t, transform), CREATE_BENCHMARK(rocprim::half, transform), CREATE_BENCHMARK(float, transform), CREATE_BENCHMARK(double, transform), CREATE_BENCHMARK(custom_float2, transform), CREATE_BENCHMARK(custom_double2, transform), }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_utils.hpp000066400000000000000000000717111446201466700215730ustar00rootroot00000000000000// Copyright (c) 2017-2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ROCPRIM_BENCHMARK_UTILS_HPP_ #define ROCPRIM_BENCHMARK_UTILS_HPP_ #include #include #include #include #include #include #include #include #include #ifdef WIN32 #include #endif #include "benchmark/benchmark.h" #include #define HIP_CHECK(condition) \ { \ hipError_t error = condition; \ if(error != hipSuccess) \ { \ std::cout << "HIP error: " << hipGetErrorString(error) << " file: " << __FILE__ \ << " line: " << __LINE__ << std::endl; \ exit(error); \ } \ } #define TUNING_SHARED_MEMORY_MAX 65536u // Support half operators on host side ROCPRIM_HOST inline rocprim::native_half half_to_native(const rocprim::half& x) { return *reinterpret_cast(&x); } ROCPRIM_HOST inline rocprim::half native_to_half(const rocprim::native_half& x) { return *reinterpret_cast(&x); } struct half_less { ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, const rocprim::half& b) const { #if __HIP_DEVICE_COMPILE__ return a < b; #else return half_to_native(a) < half_to_native(b); #endif } }; struct half_plus { ROCPRIM_HOST_DEVICE inline rocprim::half operator()(const rocprim::half& a, const rocprim::half& b) const { #if __HIP_DEVICE_COMPILE__ return a + b; #else return native_to_half(half_to_native(a) + half_to_native(b)); #endif } }; struct half_equal_to { ROCPRIM_HOST_DEVICE inline bool operator()(const rocprim::half& a, const rocprim::half& b) const { #if __HIP_DEVICE_COMPILE__ return a == b; #else return half_to_native(a) == half_to_native(b); #endif } }; // std::uniform_int_distribution is undefined for anything other than: // short, int, long, long long, unsigned short, unsigned int, unsigned long, or unsigned long long template struct is_valid_for_int_distribution : std::integral_constant::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value || std::is_same::value > {}; using engine_type = std::default_random_engine; // get_random_data() generates only part of sequence and 
replicates it, // because benchmarks usually do not need "true" random sequence. template inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type { engine_type gen{std::random_device{}()}; using dis_type = typename std::conditional< is_valid_for_int_distribution::value, T, typename std::conditional::value, int, unsigned int>::type >::type; std::uniform_int_distribution distribution((T)min, (T)max); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); } ); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template inline auto get_random_data(size_t size, U min, V max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type { engine_type gen{std::random_device{}()}; // Generate floats when T is half using dis_type = typename std::conditional::value, float, T>::type; std::uniform_real_distribution distribution((dis_type)min, (dis_type)max); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); } ); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template inline std::vector get_random_data01(size_t size, float p, size_t max_random_size = 1024 * 1024) { engine_type gen{std::random_device{}()}; std::bernoulli_distribution distribution(p); std::vector data(size); std::generate( data.begin(), data.begin() + std::min(size, max_random_size), [&]() { return distribution(gen); } ); for(size_t i = max_random_size; i < size; i += max_random_size) { std::copy_n(data.begin(), std::min(size - i, max_random_size), data.begin() + i); } return data; } template inline T 
get_random_value(T min, T max) { return get_random_data(1, min, max)[0]; } template struct custom_type { using first_type = T; using second_type = U; T x; U y; ROCPRIM_HOST_DEVICE inline custom_type(T xx = 0, U yy = 0) : x(xx), y(yy) { } ROCPRIM_HOST_DEVICE inline ~custom_type() = default; ROCPRIM_HOST_DEVICE inline custom_type operator+(const custom_type& rhs) const { return custom_type(x + rhs.x, y + rhs.y); } ROCPRIM_HOST_DEVICE inline bool operator<(const custom_type& rhs) const { // intentionally suboptimal choice for short-circuting, // required to generate more performant device code return ((x == rhs.x && y < rhs.y) || x < rhs.x); } ROCPRIM_HOST_DEVICE inline bool operator==(const custom_type& rhs) const { return x == rhs.x && y == rhs.y; } }; template struct is_custom_type : std::false_type {}; template struct is_custom_type> : std::true_type {}; template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value, std::vector>::type { using first_type = typename T::first_type; using second_type = typename T::second_type; std::vector data(size); auto fdata = get_random_data(size, min.x, max.x, max_random_size); auto sdata = get_random_data(size, min.y, max.y, max_random_size); for(size_t i = 0; i < size; i++) { data[i] = T(fdata[i], sdata[i]); } return data; } template inline auto get_random_data(size_t size, T min, T max, size_t max_random_size = 1024 * 1024) -> typename std::enable_if::value && !std::is_same::value, std::vector>::type { // NOTE 1: post-increment operator required, because HIP has different typedefs for vector field types // when using HCC or HIP-Clang. Using HIP-Clang members are accessed as fields of a struct via // a union, but in HCC mode they are proxy types (Scalar_accessor). STL algorithms don't // always tolerate proxies. Unfortunately, Scalar_accessor doesn't have any member typedefs to // conveniently obtain the inner stored type. 
All operations on it (operator+, operator+=, // CTOR, etc.) return a reference to an accessor, it is only the post-increment operator that // returns a copy of the stored type, hence we take the decltype of that. // // NOTE 2: decltype() is unevaluated context. We don't really modify max, just compute the type of the // expression if we were to actually call it. using field_type = decltype(max.x++); std::vector data(size); auto field_data = get_random_data(size, min.x, max.x, max_random_size); for(size_t i = 0; i < size; i++) { data[i] = T(field_data[i]); } return data; } inline bool is_warp_size_supported(const unsigned int required_warp_size) { return ::rocprim::host_warp_size() >= required_warp_size; } template struct DeviceSelectWarpSize { static constexpr unsigned int value = ::rocprim::device_warp_size() >= LogicalWarpSize ? LogicalWarpSize : ::rocprim::device_warp_size(); }; template std::vector get_random_segments(const size_t size, const size_t max_segment_length, const int seed_value) { static_assert(rocprim::is_arithmetic::value, "Key type must be arithmetic"); std::default_random_engine prng(seed_value); std::uniform_int_distribution segment_length_distribution(max_segment_length); // std::uniform_real_distribution cannot handle rocprim::half, use float instead using dis_type = typename std::conditional::value, float, T>::type; using key_distribution_type = std::conditional_t::value, std::uniform_int_distribution, std::uniform_real_distribution>; key_distribution_type key_distribution(std::numeric_limits::max()); std::vector keys(size); size_t keys_start_index = 0; while(keys_start_index < size) { const size_t new_segment_length = segment_length_distribution(prng); const size_t new_segment_end = std::min(size, keys_start_index + new_segment_length); const T key = key_distribution(prng); std::fill(keys.begin() + keys_start_index, keys.begin() + new_segment_end, key); keys_start_index += new_segment_length; } return keys; } template struct 
make_index_range_impl; template struct make_index_range_impl> { using type = std::integer_sequence; }; // make a std::integer_sequence with values from Start to End inclusive template using make_index_range = typename make_index_range_impl>::type; template class Function, T... I, typename... Args> void static_for_each_impl(std::integer_sequence, Args&&... args) { int a[] = {(Function{}(std::forward(args)...), 0)...}; static_cast(a); } // call the supplied template with all values of the std::integer_sequence Indices template class Function, typename... Args> void static_for_each(Args&&... args) { static_for_each_impl(Indices{}, std::forward(args)...); } #define REGISTER_BENCHMARK(benchmarks, size, stream, instance) \ benchmark::internal::Benchmark* benchmark = benchmark::RegisterBenchmark( \ instance.name().c_str(), \ [instance](benchmark::State& state, size_t size, const hipStream_t stream) \ { instance.run(state, size, stream); }, \ size, \ stream); \ benchmarks.emplace_back(benchmark) struct config_autotune_interface { virtual std::string name() const = 0; virtual std::string sort_key() const { return name(); }; virtual ~config_autotune_interface() = default; virtual void run(benchmark::State&, size_t, hipStream_t) const = 0; }; struct config_autotune_register { static std::vector>& vector() { static std::vector> storage; return storage; } template static config_autotune_register create() { vector().push_back(std::make_unique()); return config_autotune_register(); } template static config_autotune_register create_bulk(BulkCreateFunction&& f) { std::forward(f)(vector()); return config_autotune_register(); } // Register a subset of all created benchmarks for the current parallel instance and add to vector. 
static void register_benchmark_subset(std::vector& benchmarks, int parallel_instance_index, int parallel_instance_count, size_t size, const hipStream_t stream) { std::vector>& configs = vector(); // sorting to get a consistent order because order of initialization of static variables is undefined by the C++ standard. std::sort(configs.begin(), configs.end(), [](const auto& l, const auto& r) { return l->sort_key() < r->sort_key(); }); size_t configs_per_instance = (configs.size() + parallel_instance_count - 1) / parallel_instance_count; size_t start = std::min(parallel_instance_index * configs_per_instance, configs.size()); size_t end = std::min((parallel_instance_index + 1) * configs_per_instance, configs.size()); for(size_t i = start; i < end; i++) { std::unique_ptr& uniq_ptr = configs.at(i); config_autotune_interface* tuning_benchmark = uniq_ptr.get(); benchmark::internal::Benchmark* benchmark = benchmark::RegisterBenchmark( tuning_benchmark->name().c_str(), [tuning_benchmark](benchmark::State& state, size_t size, const hipStream_t stream) { tuning_benchmark->run(state, size, stream); }, size, stream); benchmarks.emplace_back(benchmark); } } }; // Inserts spaces at beginning of string if string shorter than specified length. 
inline std::string pad_string(std::string str, const size_t len) { if(len > str.size()) { str.insert(str.begin(), len - str.size(), ' '); } return str; } struct bench_naming { public: enum format { json, human, txt }; static format& get_format() { static format storage = human; return storage; } static void set_format(std::string argument) { format result = human; if(argument == "json") { result = json; } else if(argument == "txt") { result = txt; } get_format() = result; } private: static std::string matches_as_json(std::sregex_iterator& matches) { std::stringstream result; int brackets_count = 1; result << "{"; bool insert_comma = false; for(std::sregex_iterator i = matches; i != std::sregex_iterator(); ++i) { std::smatch m = *i; if(insert_comma) { result << ","; } else { insert_comma = true; } result << "\"" << m[1].str() << "\":"; if(m[2].length() > 0) { if(m[2].str().find_first_not_of("0123456789") == std::string::npos) { result << m[2].str(); } else { result << "\"" << m[2].str() << "\""; } if(m[3].length() > 0 && brackets_count > 0) { int n = std::min(brackets_count, static_cast(m[3].length())); brackets_count -= n; for(int c = 0; c < n; c++) { result << "}"; } } } else { brackets_count++; result << "{"; insert_comma = false; } } while(brackets_count > 0) { brackets_count--; result << "}"; } return result.str(); } static std::string matches_as_human(std::sregex_iterator& matches) { std::stringstream result; int brackets_count = 0; bool insert_comma = false; for(std::sregex_iterator i = matches; i != std::sregex_iterator(); ++i) { std::smatch m = *i; if(insert_comma) { result << ","; } else { insert_comma = true; } if(m[2].length() > 0) { result << m[2].str(); if(m[3].length() > 0 && brackets_count > 0) { int n = std::min(brackets_count, static_cast(m[3].length())); brackets_count -= n; for(int c = 0; c < n; c++) { result << ">"; } } } else { brackets_count++; result << "<"; insert_comma = false; } } while(brackets_count > 0) { brackets_count--; result << 
">"; } return result.str(); } public: static std::string format_name(std::string string) { format format = get_format(); std::regex r("([A-z0-9]*):\\s*((?:custom_type<[A-z0-9,]*>)|[A-z:\\(\\)\\.<>\\s0-9]*)(\\}*)"); // First we perform some checks bool checks[4] = {false}; for(std::sregex_iterator i = std::sregex_iterator(string.begin(), string.end(), r); i != std::sregex_iterator(); ++i) { std::smatch m = *i; if(m[1].str() == "lvl") { checks[0] = true; } else if(m[1].str() == "algo") { checks[1] = true; } else if(m[1].str() == "cfg") { checks[2] = true; } } std::string string_substitute = std::regex_replace(string, r, ""); checks[3] = string_substitute.find_first_not_of(" ,{}") == std::string::npos; for(bool check_name_format : checks) { if(!check_name_format) { std::cout << "Benchmark name \"" << string << "\" not in the correct format (e.g. " "{lvl:block,algo:reduce,cfg:default_config} )" << std::endl; exit(1); } } // Now we generate the desired format std::sregex_iterator matches = std::sregex_iterator(string.begin(), string.end(), r); switch(format) { case format::json: return matches_as_json(matches); case format::human: return matches_as_human(matches); case format::txt: return string; } return string; } }; template struct Traits { //static inline method instead of static inline attribute because that's only supported from C++17 onwards static inline const char* name(){ static_assert(sizeof(T) == 0, "Traits::name() unknown"); return "unknown"; } }; // Explicit definitions template <> inline const char* Traits::name() { return "int"; } template <> inline const char* Traits::name() { return "short"; } template <> inline const char* Traits::name() { return "int8_t"; } template <> inline const char* Traits::name() { return "uint8_t"; } template<> inline const char* Traits::name() { return "uint16_t"; } template<> inline const char* Traits::name() { return "uint32_t"; } template<> inline const char* Traits::name() { return "rocprim::half"; } template<> inline 
const char* Traits::name() { return "int64_t"; } // On MSVC `int64_t` and `long long` are the same, leading to multiple definition errors #ifndef WIN32 template <> inline const char* Traits::name() { return "int64_t"; } #endif template <> inline const char* Traits::name() { return "float"; } template <> inline const char* Traits::name() { return "double"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits>::name() { return "custom_type"; } template<> inline const char* Traits::name() { return "empty_type"; } template<> inline const char* Traits>::name() { return "float2"; } template<> inline const char* Traits>::name() { return "double2"; } inline void add_common_benchmark_info() { hipDeviceProp_t devProp; int device_id = 0; HIP_CHECK(hipGetDevice(&device_id)); HIP_CHECK(hipGetDeviceProperties(&devProp, device_id)); auto str = [](const std::string& name, const std::string& val) { benchmark::AddCustomContext(name, val); }; auto num = [](const std::string& name, const auto& value) { benchmark::AddCustomContext(name, std::to_string(value)); }; auto dim2 = [num](const std::string& name, const auto* values) { num(name + "_x", values[0]); num(name + "_y", values[1]); }; auto dim3 = [num, dim2](const std::string& name, const auto* values) { dim2(name, values); num(name + "_z", values[2]); }; str("hdp_name", devProp.name); num("hdp_total_global_mem", devProp.totalGlobalMem); num("hdp_shared_mem_per_block", devProp.sharedMemPerBlock); num("hdp_regs_per_block", devProp.regsPerBlock); num("hdp_warp_size", devProp.warpSize); num("hdp_max_threads_per_block", 
devProp.maxThreadsPerBlock); dim3("hdp_max_threads_dim", devProp.maxThreadsDim); dim3("hdp_max_grid_size", devProp.maxGridSize); num("hdp_clock_rate", devProp.clockRate); num("hdp_memory_clock_rate", devProp.memoryClockRate); num("hdp_memory_bus_width", devProp.memoryBusWidth); num("hdp_total_const_mem", devProp.totalConstMem); num("hdp_major", devProp.major); num("hdp_minor", devProp.minor); num("hdp_multi_processor_count", devProp.multiProcessorCount); num("hdp_l2_cache_size", devProp.l2CacheSize); num("hdp_max_threads_per_multiprocessor", devProp.maxThreadsPerMultiProcessor); num("hdp_compute_mode", devProp.computeMode); num("hdp_clock_instruction_rate", devProp.clockInstructionRate); num("hdp_concurrent_kernels", devProp.concurrentKernels); num("hdp_pci_domain_id", devProp.pciDomainID); num("hdp_pci_bus_id", devProp.pciBusID); num("hdp_pci_device_id", devProp.pciDeviceID); num("hdp_max_shared_memory_per_multi_processor", devProp.maxSharedMemoryPerMultiProcessor); num("hdp_is_multi_gpu_board", devProp.isMultiGpuBoard); num("hdp_can_map_host_memory", devProp.canMapHostMemory); str("hdp_gcn_arch_name", devProp.gcnArchName); num("hdp_integrated", devProp.integrated); num("hdp_cooperative_launch", devProp.cooperativeLaunch); num("hdp_cooperative_multi_device_launch", devProp.cooperativeMultiDeviceLaunch); num("hdp_max_texture_1d_linear", devProp.maxTexture1DLinear); num("hdp_max_texture_1d", devProp.maxTexture1D); dim2("hdp_max_texture_2d", devProp.maxTexture2D); dim3("hdp_max_texture_3d", devProp.maxTexture3D); num("hdp_mem_pitch", devProp.memPitch); num("hdp_texture_alignment", devProp.textureAlignment); num("hdp_texture_pitch_alignment", devProp.texturePitchAlignment); num("hdp_kernel_exec_timeout_enabled", devProp.kernelExecTimeoutEnabled); num("hdp_ecc_enabled", devProp.ECCEnabled); num("hdp_tcc_driver", devProp.tccDriver); num("hdp_cooperative_multi_device_unmatched_func", devProp.cooperativeMultiDeviceUnmatchedFunc); 
num("hdp_cooperative_multi_device_unmatched_grid_dim", devProp.cooperativeMultiDeviceUnmatchedGridDim); num("hdp_cooperative_multi_device_unmatched_block_dim", devProp.cooperativeMultiDeviceUnmatchedBlockDim); num("hdp_cooperative_multi_device_unmatched_shared_mem", devProp.cooperativeMultiDeviceUnmatchedSharedMem); num("hdp_is_large_bar", devProp.isLargeBar); num("hdp_asic_revision", devProp.asicRevision); num("hdp_managed_memory", devProp.managedMemory); num("hdp_direct_managed_mem_access_from_host", devProp.directManagedMemAccessFromHost); num("hdp_concurrent_managed_access", devProp.concurrentManagedAccess); num("hdp_pageable_memory_access", devProp.pageableMemoryAccess); num("hdp_pageable_memory_access_uses_host_page_tables", devProp.pageableMemoryAccessUsesHostPageTables); const auto arch = devProp.arch; num("hdp_arch_has_global_int32_atomics", arch.hasGlobalInt32Atomics); num("hdp_arch_has_global_float_atomic_exch", arch.hasGlobalFloatAtomicExch); num("hdp_arch_has_shared_int32_atomics", arch.hasSharedInt32Atomics); num("hdp_arch_has_shared_float_atomic_exch", arch.hasSharedFloatAtomicExch); num("hdp_arch_has_float_atomic_add", arch.hasFloatAtomicAdd); num("hdp_arch_has_global_int64_atomics", arch.hasGlobalInt64Atomics); num("hdp_arch_has_shared_int64_atomics", arch.hasSharedInt64Atomics); num("hdp_arch_has_doubles", arch.hasDoubles); num("hdp_arch_has_warp_vote", arch.hasWarpVote); num("hdp_arch_has_warp_ballot", arch.hasWarpBallot); num("hdp_arch_has_warp_shuffle", arch.hasWarpShuffle); num("hdp_arch_has_funnel_shift", arch.hasFunnelShift); num("hdp_arch_has_thread_fence_system", arch.hasThreadFenceSystem); num("hdp_arch_has_sync_threads_ext", arch.hasSyncThreadsExt); num("hdp_arch_has_surface_funcs", arch.hasSurfaceFuncs); num("hdp_arch_has_3d_grid", arch.has3dGrid); num("hdp_arch_has_dynamic_parallelism", arch.hasDynamicParallelism); } inline const char* get_block_scan_method_name(rocprim::block_scan_algorithm alg) { switch(alg) { case 
rocprim::block_scan_algorithm::using_warp_scan: return "block_scan_algorithm::using_warp_scan"; case rocprim::block_scan_algorithm::reduce_then_scan: return "block_scan_algorithm::reduce_then_scan"; // Not using `default: ...` because it kills effectiveness of -Wswitch } return "unknown_algorithm"; } #endif // ROCPRIM_BENCHMARK_UTILS_HPP_ rocPRIM-rocm-5.7.1/benchmark/benchmark_warp_exchange.cpp000066400000000000000000000335721446201466700232440ustar00rootroot00000000000000// MIT License // // Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif struct BlockedToStripedOp { template< class warp_exchange_type, class T, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void operator()(warp_exchange_type warp_exchange, T (&items)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) { warp_exchange.blocked_to_striped(items, items, storage); } }; struct StripedToBlockedOp { template< class warp_exchange_type, class T, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void operator()(warp_exchange_type warp_exchange, T (&items)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) { warp_exchange.striped_to_blocked(items, items, storage); } }; struct BlockedToStripedShuffleOp { template< class warp_exchange_type, class T, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void operator()(warp_exchange_type warp_exchange, T (&items)[ItemsPerThread], typename warp_exchange_type::storage_type& /*storage*/) { warp_exchange.blocked_to_striped_shuffle(items, items); } }; struct StripedToBlockedShuffleOp { template< class warp_exchange_type, class T, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE void operator()(warp_exchange_type warp_exchange, T (&items)[ItemsPerThread], typename warp_exchange_type::storage_type& /*storage*/) { warp_exchange.striped_to_blocked_shuffle(items, items); } }; struct ScatterToStripedOp { template< class T, class OffsetT, class warp_exchange_type, unsigned int ItemsPerThread > ROCPRIM_DEVICE ROCPRIM_INLINE void operator()(warp_exchange_type warp_exchange, T (&thread_data)[ItemsPerThread], const OffsetT (&ranks)[ItemsPerThread], typename warp_exchange_type::storage_type& storage) { warp_exchange.scatter_to_striped(thread_data, 
thread_data, ranks, storage); } }; template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int LogicalWarpSize, class Op > __global__ __launch_bounds__(BlockSize) auto warp_exchange_kernel(T* d_output, unsigned int trials) -> typename std::enable_if::value, void>::type { T thread_data[ItemsPerThread]; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { // generate unique value each data-element thread_data[i] = static_cast(hipThreadIdx_x*ItemsPerThread+i); } using warp_exchange_type = ::rocprim::warp_exchange< T, ItemsPerThread, DeviceSelectWarpSize::value >; constexpr unsigned int warps_in_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = hipThreadIdx_x / LogicalWarpSize; ROCPRIM_SHARED_MEMORY typename warp_exchange_type::storage_type storage[warps_in_block]; ROCPRIM_NO_UNROLL for(unsigned int i = 0; i < trials; i++) { Op{}(warp_exchange_type(), thread_data, storage[warp_id]); ::rocprim::wave_barrier(); } ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { const unsigned int global_idx = (BlockSize * hipBlockIdx_x + hipThreadIdx_x) * ItemsPerThread + i; d_output[global_idx] = thread_data[i]; } } template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int LogicalWarpSize, class Op > __global__ __launch_bounds__(BlockSize) auto warp_exchange_kernel(T* d_output, unsigned int trials) -> typename std::enable_if::value, void>::type { T thread_data[ItemsPerThread]; unsigned int thread_ranks[ItemsPerThread]; constexpr unsigned int warps_in_block = BlockSize / LogicalWarpSize; const unsigned int warp_id = hipThreadIdx_x / LogicalWarpSize; const unsigned int lane_id = hipThreadIdx_x % LogicalWarpSize; ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { // generate unique value each data-element thread_data[i] = static_cast(hipThreadIdx_x*ItemsPerThread+i); // generate unique destination location for each data-element const unsigned int s_lane_id = i % 2 == 0 ? 
LogicalWarpSize - 1 - lane_id : lane_id; thread_ranks[i] = s_lane_id*ItemsPerThread+i; // scatter values in warp across whole storage } using warp_exchange_type = ::rocprim::warp_exchange< T, ItemsPerThread, DeviceSelectWarpSize::value >; ROCPRIM_SHARED_MEMORY typename warp_exchange_type::storage_type storage[warps_in_block]; ROCPRIM_NO_UNROLL for(unsigned int i = 0; i < trials; i++) { Op{}(warp_exchange_type(), thread_data, thread_ranks, storage[warp_id]); ::rocprim::wave_barrier(); } ROCPRIM_UNROLL for(unsigned int i = 0; i < ItemsPerThread; i++) { const unsigned int global_idx = (BlockSize * hipBlockIdx_x + hipThreadIdx_x) * ItemsPerThread + i; d_output[global_idx] = thread_data[i]; } } template< class T, unsigned int BlockSize, unsigned int ItemsPerThread, unsigned int LogicalWarpSize, class Op > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { constexpr unsigned int trials = 200; constexpr unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int size = items_per_block * ((N + items_per_block - 1) / items_per_block); T * d_output; HIP_CHECK(hipMalloc(&d_output, size * sizeof(T))); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_exchange_kernel< T, BlockSize, ItemsPerThread, LogicalWarpSize, Op > ), dim3(size / items_per_block), dim3(BlockSize), 0, stream, d_output, trials ); HIP_CHECK(hipPeekAtLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * trials * size * 
sizeof(T)); state.SetItemsProcessed(state.iterations() * trials * size); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, IT, WS, OP) \ benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:warp,algo:exchange,key_type:" #T \ ",operation:" #OP ",ws:" #WS \ ",cfg:{bs:" #BS ",ipt:" #IT "}}") \ .c_str(), \ &run_benchmark, \ stream, \ size) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks{ CREATE_BENCHMARK(int, 256, 1, 16, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 1, 32, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 4, 16, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 4, 32, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 16, 16, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 16, 32, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 1, 16, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 1, 32, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 4, 16, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 4, 32, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 16, 16, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 16, 32, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 1, 16, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 32, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 16, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 32, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 
16, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 32, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 16, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 32, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 16, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 32, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 16, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 32, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 16, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 1, 32, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 4, 16, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 4, 32, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 16, 16, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 16, 32, ScatterToStripedOp) }; if(is_warp_size_supported(64)) { std::vector additional_benchmarks{ CREATE_BENCHMARK(int, 256, 1, 64, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 4, 64, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 16, 64, BlockedToStripedOp), CREATE_BENCHMARK(int, 256, 1, 64, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 4, 64, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 16, 64, StripedToBlockedOp), CREATE_BENCHMARK(int, 256, 1, 64, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 64, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 64, BlockedToStripedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 64, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 4, 64, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 16, 64, StripedToBlockedShuffleOp), CREATE_BENCHMARK(int, 256, 1, 64, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 4, 64, ScatterToStripedOp), CREATE_BENCHMARK(int, 256, 16, 64, ScatterToStripedOp) }; benchmarks.insert( benchmarks.end(), additional_benchmarks.begin(), additional_benchmarks.end() ); } // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { 
for (auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_warp_reduce.cpp000066400000000000000000000227321446201466700227250ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" #include "benchmark_utils.hpp" // HIP API #include // rocPRIM #include #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif template< bool AllReduce, class T, unsigned int WarpSize, unsigned int Trials > __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_reduce_kernel(const T * d_input, T * d_output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wreduce_t().reduce(value, value, storage); } d_output[i] = value; } template< class T, class Flag, unsigned int WarpSize, unsigned int Trials > __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void segmented_warp_reduce_kernel(const T* d_input, Flag* d_flags, T* d_output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = d_input[i]; auto flag = d_flags[i]; using wreduce_t = rocprim::warp_reduce; __shared__ typename wreduce_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wreduce_t().head_segmented_reduce(value, value, flag, storage); } d_output[i] = value; } template< bool AllReduce, bool Segmented, unsigned int WarpSize, unsigned int BlockSize, unsigned int Trials, class T, class Flag > inline auto execute_warp_reduce_kernel(T* input, T* output, Flag* /* flags */, size_t size, hipStream_t stream) -> typename std::enable_if::type { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_reduce_kernel), dim3(size/BlockSize), dim3(BlockSize), 0, stream, input, output ); HIP_CHECK(hipGetLastError()); } template< bool AllReduce, bool Segmented, unsigned int WarpSize, unsigned int BlockSize, unsigned int Trials, class T, class Flag > inline auto execute_warp_reduce_kernel(T* 
input, T* output, Flag* flags, size_t size, hipStream_t stream) -> typename std::enable_if::type { hipLaunchKernelGGL( HIP_KERNEL_NAME(segmented_warp_reduce_kernel), dim3(size/BlockSize), dim3(BlockSize), 0, stream, input, flags, output ); HIP_CHECK(hipGetLastError()); } template< bool AllReduce, bool Segmented, class T, unsigned int WarpSize, unsigned int BlockSize, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t N) { using flag_type = unsigned char; const auto size = BlockSize * ((N + BlockSize - 1)/BlockSize); std::vector input = get_random_data(size, T(0), T(10)); std::vector flags = get_random_data(size, 0, 1); T * d_input; flag_type * d_flags; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_flags), size * sizeof(flag_type))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK( hipMemcpy( d_flags, flags.data(), size * sizeof(flag_type), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); execute_warp_reduce_kernel(d_input, d_output, d_flags, size, stream); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * Trials * size * sizeof(T)); state.SetItemsProcessed(state.iterations() * Trials * size); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); 
HIP_CHECK(hipFree(d_flags)); } #define CREATE_BENCHMARK(T, WS, BS) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:warp,algo:reduce,key_type:" #T ",broadcast_result:" \ + std::string(AllReduce ? "true" : "false") \ + ",segmented:" + std::string(Segmented ? "true" : "false") \ + ",ws:" #WS ",cfg:{bs:" #BS "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 32, 64), \ CREATE_BENCHMARK(type, 37, 64), \ CREATE_BENCHMARK(type, 61, 64), \ CREATE_BENCHMARK(type, 64, 64) template void add_benchmarks(std::vector& benchmarks, hipStream_t stream, size_t size) { std::vector bs = { BENCHMARK_TYPE(int), BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half) }; benchmarks.insert(benchmarks.end(), bs.begin(), bs.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, stream, size); add_benchmarks(benchmarks, stream, size); add_benchmarks(benchmarks, stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } 
rocPRIM-rocm-5.7.1/benchmark/benchmark_warp_scan.cpp000066400000000000000000000202631446201466700223770ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" // HIP API #include // rocPRIM #include #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; template __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_inclusive_scan_kernel(const T* input, T* output) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().inclusive_scan(value, value, storage); } output[i] = value; } template __global__ __launch_bounds__(ROCPRIM_DEFAULT_MAX_BLOCK_SIZE) void warp_exclusive_scan_kernel(const T* input, T* output, const T init) { const unsigned int i = blockIdx.x * blockDim.x + threadIdx.x; auto value = input[i]; using wscan_t = rp::warp_scan; __shared__ typename wscan_t::storage_type storage; ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; trial++) { wscan_t().exclusive_scan(value, value, init, storage); } output[i] = value; } template< class T, unsigned int BlockSize, unsigned int WarpSize, bool Inclusive = true, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { // Make sure size is a multiple of BlockSize size = BlockSize * ((size + BlockSize - 1)/BlockSize); // Allocate and fill memory std::vector input(size, (T)1); T * d_input; T * d_output; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input), size * sizeof(T))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output), size * sizeof(T))); HIP_CHECK( hipMemcpy( d_input, input.data(), size * sizeof(T), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for (auto _ : state) { // 
Record start event HIP_CHECK(hipEventRecord(start, stream)); if(Inclusive) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_inclusive_scan_kernel), dim3(size/BlockSize), dim3(BlockSize), 0, stream, d_input, d_output ); } else { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_exclusive_scan_kernel), dim3(size/BlockSize), dim3(BlockSize), 0, stream, d_input, d_output, input[0] ); } HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); state.SetBytesProcessed(state.iterations() * size * sizeof(T) * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input)); HIP_CHECK(hipFree(d_output)); } #define CREATE_BENCHMARK(T, BS, WS, INCLUSIVE) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:warp,algo:scan,key_type:" #T ",subalgo:" \ + std::string(Inclusive ? 
"inclusive" : "exclusive") \ + ",ws:" #WS ",cfg:{bs:" #BS "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type) \ CREATE_BENCHMARK(type, 64, 64, Inclusive), \ CREATE_BENCHMARK(type, 128, 64, Inclusive), \ CREATE_BENCHMARK(type, 256, 64, Inclusive), \ CREATE_BENCHMARK(type, 256, 32, Inclusive), \ CREATE_BENCHMARK(type, 256, 16, Inclusive), \ CREATE_BENCHMARK(type, 63, 63, Inclusive), \ CREATE_BENCHMARK(type, 62, 31, Inclusive), \ CREATE_BENCHMARK(type, 60, 15, Inclusive) template void add_benchmarks(std::vector& benchmarks, const std::string& method_name, hipStream_t stream, size_t size) { using custom_double2 = custom_type; using custom_int_double = custom_type; std::vector new_benchmarks = { BENCHMARK_TYPE(int), BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half), BENCHMARK_TYPE(custom_double2), BENCHMARK_TYPE(custom_int_double) }; benchmarks.insert(benchmarks.end(), new_benchmarks.begin(), new_benchmarks.end()); } int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); // Add benchmarks std::vector benchmarks; add_benchmarks(benchmarks, "inclusive", stream, size); add_benchmarks(benchmarks, "exclusive", stream, size); // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& 
b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } rocPRIM-rocm-5.7.1/benchmark/benchmark_warp_sort.cpp000066400000000000000000000272411446201466700224450ustar00rootroot00000000000000// MIT License // // Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. 
#include #include #include #include #include #include // Google Benchmark #include "benchmark/benchmark.h" // CmdParser #include "cmdparser.hpp" // HIP API #include // rocPRIM #include #include "benchmark_utils.hpp" #ifndef DEFAULT_N const size_t DEFAULT_N = 1024 * 1024 * 32; #endif namespace rp = rocprim; template __global__ __launch_bounds__(BlockSize) void warp_sort_kernel(K* input_keys, K* output_keys) { const unsigned int flat_tid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int block_offset = blockIdx.x * items_per_block; K keys[ItemsPerThread]; rp::block_load_direct_striped(flat_tid, input_keys + block_offset, keys); rp::warp_sort wsort; wsort.sort(keys); rp::block_store_direct_blocked(flat_tid, output_keys + block_offset, keys); } template __global__ __launch_bounds__(BlockSize) void warp_sort_by_key_kernel(K* input_keys, V* input_values, K* output_keys, V* output_values) { const unsigned int flat_tid = threadIdx.x; const unsigned int items_per_block = BlockSize * ItemsPerThread; const unsigned int block_offset = blockIdx.x * items_per_block; K keys[ItemsPerThread]; V values[ItemsPerThread]; rp::block_load_direct_striped(flat_tid, input_keys + block_offset, keys); rp::block_load_direct_striped(flat_tid, input_values + block_offset, values); rp::warp_sort wsort; wsort.sort(keys, values); rp::block_store_direct_blocked(flat_tid, output_keys + block_offset, keys); rp::block_store_direct_blocked(flat_tid, output_values + block_offset, values); } template Value get_max_value() { return Value(10000); } template<> char get_max_value() { return std::numeric_limits::max(); } template<> custom_type get_max_value() { return custom_type(std::numeric_limits::max()); } template< class Key, unsigned int BlockSize, unsigned int WarpSize, unsigned int ItemsPerThread = 1, class Value = Key, bool SortByKey = false, unsigned int Trials = 100 > void run_benchmark(benchmark::State& state, hipStream_t stream, size_t size) { // 
Make sure size is a multiple of items_per_block constexpr auto items_per_block = BlockSize * ItemsPerThread; size = BlockSize * ((size + items_per_block - 1) / items_per_block); // Allocate and fill memory std::vector input_key = get_random_data(size, 0, get_max_value()); std::vector input_value(size_t(1)); if(SortByKey) input_value = get_random_data(size, 0, get_max_value()); Key * d_input_key = nullptr; Key * d_output_key = nullptr; Value * d_input_value = nullptr; Value * d_output_value = nullptr; HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_key), size * sizeof(Key))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output_key), size * sizeof(Key))); if(SortByKey) { HIP_CHECK(hipMalloc(reinterpret_cast(&d_input_value), size * sizeof(Value))); HIP_CHECK(hipMalloc(reinterpret_cast(&d_output_value), size * sizeof(Value))); } HIP_CHECK( hipMemcpy( d_input_key, input_key.data(), size * sizeof(Key), hipMemcpyHostToDevice ) ); if(SortByKey) HIP_CHECK( hipMemcpy( d_input_value, input_value.data(), size * sizeof(Value), hipMemcpyHostToDevice ) ); HIP_CHECK(hipDeviceSynchronize()); // HIP events creation hipEvent_t start, stop; HIP_CHECK(hipEventCreate(&start)); HIP_CHECK(hipEventCreate(&stop)); for(auto _ : state) { // Record start event HIP_CHECK(hipEventRecord(start, stream)); if(SortByKey) { ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_sort_by_key_kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input_key, d_input_value, d_output_key, d_output_value ); } } else { ROCPRIM_NO_UNROLL for(unsigned int trial = 0; trial < Trials; ++trial) { hipLaunchKernelGGL( HIP_KERNEL_NAME(warp_sort_kernel), dim3(size/items_per_block), dim3(BlockSize), 0, stream, d_input_key, d_output_key ); } } HIP_CHECK(hipGetLastError()); // Record stop event and wait until it completes HIP_CHECK(hipEventRecord(stop, stream)); HIP_CHECK(hipEventSynchronize(stop)); float elapsed_mseconds; 
HIP_CHECK(hipEventElapsedTime(&elapsed_mseconds, start, stop)); state.SetIterationTime(elapsed_mseconds / 1000); } // Destroy HIP events HIP_CHECK(hipEventDestroy(start)); HIP_CHECK(hipEventDestroy(stop)); // SortByKey also transfers values auto sorted_type_size = sizeof(Key); if(SortByKey) sorted_type_size += sizeof(Value); state.SetBytesProcessed(state.iterations() * size * sorted_type_size * Trials); state.SetItemsProcessed(state.iterations() * size * Trials); HIP_CHECK(hipFree(d_input_key)); HIP_CHECK(hipFree(d_output_key)); HIP_CHECK(hipFree(d_input_value)); HIP_CHECK(hipFree(d_output_value)); } #define CREATE_SORT_BENCHMARK(K, BS, WS, IPT) \ benchmark::RegisterBenchmark( \ bench_naming::format_name("{lvl:warp,algo:sort,key_type:" #K ",value_type:" \ + std::string(Traits::name()) \ + ",ws:" #WS ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define CREATE_SORTBYKEY_BENCHMARK(K, V, BS, WS, IPT) \ benchmark::RegisterBenchmark(bench_naming::format_name("{lvl:warp,algo:sort,key_type:" #K \ ",value_type:" #V ",ws:" #WS \ ",cfg:{bs:" #BS ",ipt:" #IPT "}}") \ .c_str(), \ run_benchmark, \ stream, \ size) #define BENCHMARK_TYPE(type) \ CREATE_SORT_BENCHMARK(type, 64, 64, 1), \ CREATE_SORT_BENCHMARK(type, 64, 64, 2), \ CREATE_SORT_BENCHMARK(type, 64, 64, 4), \ CREATE_SORT_BENCHMARK(type, 128, 64, 1), \ CREATE_SORT_BENCHMARK(type, 128, 64, 2), \ CREATE_SORT_BENCHMARK(type, 128, 64, 4), \ CREATE_SORT_BENCHMARK(type, 256, 64, 1), \ CREATE_SORT_BENCHMARK(type, 256, 64, 2), \ CREATE_SORT_BENCHMARK(type, 256, 64, 4), \ CREATE_SORT_BENCHMARK(type, 64, 32, 1), \ CREATE_SORT_BENCHMARK(type, 64, 32, 2), \ CREATE_SORT_BENCHMARK(type, 64, 16, 1), \ CREATE_SORT_BENCHMARK(type, 64, 16, 2), \ CREATE_SORT_BENCHMARK(type, 64, 16, 4) #define BENCHMARK_KEY_TYPE(type, value) \ CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 1), \ CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 2), \ CREATE_SORTBYKEY_BENCHMARK(type, value, 64, 64, 4), \ 
CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 1), \ CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 2), \ CREATE_SORTBYKEY_BENCHMARK(type, value, 256, 64, 4) int main(int argc, char *argv[]) { cli::Parser parser(argc, argv); parser.set_optional("size", "size", DEFAULT_N, "number of values"); parser.set_optional("trials", "trials", -1, "number of iterations"); parser.set_optional("name_format", "name_format", "human", "either: json,human,txt"); parser.run_and_exit_if_error(); // Parse argv benchmark::Initialize(&argc, argv); const size_t size = parser.get("size"); const int trials = parser.get("trials"); bench_naming::set_format(parser.get("name_format")); // HIP hipStream_t stream = 0; // default // Benchmark info add_common_benchmark_info(); benchmark::AddCustomContext("size", std::to_string(size)); using custom_double2 = custom_type; using custom_int_double = custom_type; using custom_int2 = custom_type; using custom_char_double = custom_type; using custom_longlong_double = custom_type; std::vector benchmarks = { BENCHMARK_TYPE(int), BENCHMARK_TYPE(float), BENCHMARK_TYPE(double), BENCHMARK_TYPE(int8_t), BENCHMARK_TYPE(uint8_t), BENCHMARK_TYPE(rocprim::half), BENCHMARK_KEY_TYPE(float, float), BENCHMARK_KEY_TYPE(unsigned int, int), BENCHMARK_KEY_TYPE(int, custom_double2), BENCHMARK_KEY_TYPE(int, custom_int_double), BENCHMARK_KEY_TYPE(custom_int2, custom_double2), BENCHMARK_KEY_TYPE(custom_int2, custom_char_double), BENCHMARK_KEY_TYPE(custom_int2, custom_longlong_double), BENCHMARK_KEY_TYPE(int8_t, int8_t), BENCHMARK_KEY_TYPE(uint8_t, uint8_t), BENCHMARK_KEY_TYPE(rocprim::half, rocprim::half) }; // Use manual timing for(auto& b : benchmarks) { b->UseManualTime(); b->Unit(benchmark::kMillisecond); } // Force number of iterations if(trials > 0) { for(auto& b : benchmarks) { b->Iterations(trials); } } // Run benchmarks benchmark::RunSpecifiedBenchmarks(); return 0; } 
rocPRIM-rocm-5.7.1/benchmark/cmdparser.hpp000066400000000000000000000417661446201466700204100ustar00rootroot00000000000000// The MIT License (MIT) // // Copyright (c) 2015 - 2016 Florian Rappl // Modifications Copyright (c) 2019, Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in all // copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE // SOFTWARE. /* This file is part of the C++ CmdParser utility. Copyright (c) 2015 - 2016 Florian Rappl */ #pragma once #include #include #include #include #include #include namespace cli { struct CallbackArgs { const std::vector& arguments; std::ostream& output; std::ostream& error; }; class Parser { private: class CmdBase { public: explicit CmdBase(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant, bool variadic) : name(name), command(name.size() > 0 ? "-" + name : ""), alternative(alternative.size() > 0 ? 
"--" + alternative : ""), description(description), required(required), handled(false), arguments({}), dominant(dominant), variadic(variadic) { } virtual ~CmdBase() { } std::string name; std::string command; std::string alternative; std::string description; bool required; bool handled; std::vector arguments; bool const dominant; bool const variadic; virtual std::string print_value() const = 0; virtual bool parse(std::ostream& output, std::ostream& error) = 0; bool is(const std::string& given) const { return given == command || given == alternative; } }; template struct ArgumentCountChecker { static constexpr bool Variadic = false; }; template struct ArgumentCountChecker> { static constexpr bool Variadic = true; }; template class CmdFunction final : public CmdBase { public: explicit CmdFunction(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { } virtual bool parse(std::ostream& output, std::ostream& error) { try { CallbackArgs args { arguments, output, error }; value = callback(args); return true; } catch (...) { return false; } } virtual std::string print_value() const { return ""; } std::function callback; T value; }; template class CmdArgument final : public CmdBase { public: explicit CmdArgument(const std::string& name, const std::string& alternative, const std::string& description, bool required, bool dominant) : CmdBase(name, alternative, description, required, dominant, ArgumentCountChecker::Variadic) { } virtual bool parse(std::ostream&, std::ostream&) { try { value = Parser::parse(arguments, value); return true; } catch (...) 
{ return false; } } virtual std::string print_value() const { return stringify(value); } T value; }; static int parse(const std::vector& elements, const int&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoi(elements[0]); } static bool parse(const std::vector& elements, const bool& defval) { if (elements.size() != 0) throw std::runtime_error("A boolean command line parameter cannot have any arguments."); return !defval; } static double parse(const std::vector& elements, const double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stod(elements[0]); } static float parse(const std::vector& elements, const float&) { if (elements.size() != 1) throw std::bad_cast(); return std::stof(elements[0]); } static long double parse(const std::vector& elements, const long double&) { if (elements.size() != 1) throw std::bad_cast(); return std::stold(elements[0]); } static unsigned int parse(const std::vector& elements, const unsigned int&) { if (elements.size() != 1) throw std::bad_cast(); return static_cast(std::stoul(elements[0])); } static unsigned long parse(const std::vector& elements, const unsigned long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoul(elements[0]); } static unsigned long long parse(const std::vector& elements, const unsigned long long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stoull(elements[0]); } static long parse(const std::vector& elements, const long&) { if (elements.size() != 1) throw std::bad_cast(); return std::stol(elements[0]); } static std::string parse(const std::vector& elements, const std::string&) { if (elements.size() != 1) throw std::bad_cast(); return elements[0]; } template static std::vector parse(const std::vector& elements, const std::vector&) { const T defval = T(); std::vector values { }; std::vector buffer(1); for (const auto& element : elements) { buffer[0] = element; values.push_back(parse(buffer, defval)); } return values; } template static 
std::string stringify(const T& value) { return std::to_string(value); } template static std::string stringify(const std::vector& values) { std::stringstream ss { }; ss << "[ "; for (const auto& value : values) { ss << stringify(value) << " "; } ss << "]"; return ss.str(); } static std::string stringify(const std::string& str) { return str; } public: explicit Parser(int argc, const char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } explicit Parser(int argc, char** argv) : _appname(argv[0]) { for (int i = 1; i < argc; ++i) { _arguments.push_back(argv[i]); } enable_help(); } ~Parser() { for (int i = 0, n = _commands.size(); i < n; ++i) { delete _commands[i]; } } bool has_help() const { for (const auto command : _commands) { if (command->name == "h" && command->alternative == "--help") { return true; } } return false; } void enable_help() { set_callback("h", "help", std::function([this](CallbackArgs& args){ args.output << this->usage(); /*exit(0);*/ return false; }), "", true); } void disable_help() { for (auto command = _commands.begin(); command != _commands.end(); ++command) { if ((*command)->name == "h" && (*command)->alternative == "--help") { _commands.erase(command); break; } } } template void set_default(bool is_required, const std::string& description = "") { auto command = new CmdArgument { "", "", description, is_required, false }; _commands.push_back(command); } template void set_required(const std::string& name, const std::string& alternative, const std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, true, dominant }; _commands.push_back(command); } template void set_optional(const std::string& name, const std::string& alternative, T defaultValue, const std::string& description = "", bool dominant = false) { auto command = new CmdArgument { name, alternative, description, false, dominant }; command->value = defaultValue; 
_commands.push_back(command); } template void set_callback(const std::string& name, const std::string& alternative, std::function callback, const std::string& description = "", bool dominant = false) { auto command = new CmdFunction { name, alternative, description, false, dominant }; command->callback = callback; _commands.push_back(command); } inline void run_and_exit_if_error() { if (run() == false) { exit(1); } } inline bool run() { return run(std::cout, std::cerr); } inline bool run(std::ostream& output) { return run(output, std::cerr); } bool run(std::ostream& output, std::ostream& error) { if (_arguments.size() > 0) { auto current = find_default(); for (int i = 0, n = _arguments.size(); i < n; ++i) { auto isarg = _arguments[i].size() > 0 && _arguments[i][0] == '-'; auto associated = isarg ? find(_arguments[i]) : nullptr; if (associated != nullptr) { current = associated; associated->handled = true; } else if (current == nullptr) { current = find(_arguments[i]); // Code was commented out so cmdparser can ignore unknown options // error << no_default(); // return false; } else { current->arguments.push_back(_arguments[i]); current->handled = true; if (!current->variadic) { // If the current command is not variadic, then no more arguments // should be added to it. In this case, switch back to the default // command. current = find_default(); } } } } // First, parse dominant arguments since they succeed even if required // arguments are missing. for (auto command : _commands) { if (command->handled && command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } // Next, check for any missing arguments. for (auto command : _commands) { if (command->required && !command->handled) { error << howto_required(command); return false; } } // Finally, parse all remaining arguments. 
for (auto command : _commands) { if (command->handled && !command->dominant && !command->parse(output, error)) { error << howto_use(command); return false; } } return true; } template T get(const std::string& name) const { for (const auto& command : _commands) { if (command->name == name) { auto cmd = dynamic_cast*>(command); if (cmd == nullptr) { throw std::runtime_error("Invalid usage of the parameter " + name + " detected."); } return cmd->value; } } throw std::runtime_error("The parameter " + name + " could not be found."); } template T get_if(const std::string& name, std::function callback) const { auto value = get(name); return callback(value); } int requirements() const { int count = 0; for (const auto& command : _commands) { if (command->required) { ++count; } } return count; } int commands() const { return static_cast(_commands.size()); } inline const std::string& app_name() const { return _appname; } protected: CmdBase* find(const std::string& name) { for (auto command : _commands) { if (command->is(name)) { return command; } } return nullptr; } CmdBase* find_default() { for (auto command : _commands) { if (command->name == "") { return command; } } return nullptr; } std::string usage() const { std::stringstream ss { }; ss << "Available parameters:\n\n"; for (const auto& command : _commands) { ss << " " << command->command << "\t" << command->alternative; if (command->required == true) { ss << "\t(required)"; } ss << "\n " << command->description; if (command->required == false) { ss << "\n " << "This parameter is optional. 
The default value is '" + command->print_value() << "'."; } ss << "\n\n"; } return ss.str(); } void print_help(std::stringstream& ss) const { if (has_help()) { ss << "For more help use --help or -h.\n"; } } std::string howto_required(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " is required.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string howto_use(CmdBase* command) const { std::stringstream ss { }; ss << "The parameter " << command->name << " has invalid arguments.\n"; ss << command->description << '\n'; print_help(ss); return ss.str(); } std::string no_default() const { std::stringstream ss { }; ss << "No default parameter has been specified.\n"; ss << "The given argument must be used with a parameter.\n"; print_help(ss); return ss.str(); } private: const std::string _appname; std::vector _arguments; std::vector _commands; }; } rocPRIM-rocm-5.7.1/cmake/000077500000000000000000000000001446201466700150275ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/cmake/ConfigAutotune.cmake000066400000000000000000000104501446201466700207630ustar00rootroot00000000000000# MIT License # # Copyright (c) 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.

# add_configured_source(TARGET <target> INPUT <template> OUTPUT_PATTERN <pattern>
#                       NAMES <name>... VALUES <value>...)
#
# Binds each entry of NAMES to the VALUES entry at the same position, expands
# OUTPUT_PATTERN and the INPUT template with those bindings (@VAR@ syntax only)
# and adds the generated "<target>.parallel/<output>.cpp" file to <target>.
# NAMES and VALUES must have the same length.
function(add_configured_source)
  cmake_parse_arguments(PARSE_ARGV 0 ARG "" "INPUT;TARGET;OUTPUT_PATTERN" "NAMES;VALUES")

  list(LENGTH ARG_NAMES NAMES_LEN)
  list(LENGTH ARG_VALUES VALS_LEN)
  if (NOT NAMES_LEN EQUAL VALS_LEN)
    message("NAMES_LEN: ${NAMES_LEN}, VALS_LEN: ${VALS_LEN}")
    message(FATAL_ERROR "The same number of names and values must be provided!")
  endif()

  # Define one variable per name so that the CONFIGURE steps below can
  # substitute it. With no names at all the loop must be skipped, because
  # "foreach(RANGE -1)" is not a valid range.
  if(VALS_LEN GREATER 0)
    math(EXPR max "${VALS_LEN} - 1")
    foreach(i RANGE ${max})
      list(GET ARG_NAMES ${i} curr_name)
      list(GET ARG_VALUES ${i} "${curr_name}")
    endforeach()
  endif()

  string(CONFIGURE "${ARG_OUTPUT_PATTERN}" output @ONLY)
  string(MAKE_C_IDENTIFIER ${output} output)
  configure_file("${ARG_INPUT}" "${ARG_TARGET}.parallel/${output}.cpp" @ONLY)
  set_property(DIRECTORY APPEND PROPERTY ADDITIONAL_CLEAN_FILES "${ARG_TARGET}.parallel")
  target_sources("${ARG_TARGET}" PRIVATE "${ARG_TARGET}.parallel/${output}.cpp")
  target_include_directories("${ARG_TARGET}" PRIVATE "../benchmark")

  # Cmake configuration needs to be rerun if the input template changes
  set_property(DIRECTORY APPEND PROPERTY CMAKE_CONFIGURE_DEPENDS "${ARG_INPUT}")
endfunction()

# div_round_up(<dividend> <divisor> <result_var>)
#
# Stores ceil(dividend / divisor), computed with integer arithmetic, in
# <result_var> in the caller's scope.
function(div_round_up dividend divisor result_var)
  math(EXPR result "(${dividend} + ${divisor} - 1) / ${divisor}")
  set("${result_var}" "${result}" PARENT_SCOPE)
endfunction()

# add_matrix(TARGET <target> INPUT <template> OUTPUT_PATTERN <pattern>
#            [SHARDS <count>] [CURRENT_SHARD <index>]
#            NAMES <name>... LISTS <list>...)
#
# Calls add_configured_source() once for every permutation of the values in
# LISTS. Each LISTS entry is a space-separated string of values belonging to
# the NAMES entry at the same position. The permutations are split into SHARDS
# consecutive slices (default 1) and only the slice selected by CURRENT_SHARD
# (default 0) is generated.
function(add_matrix)
  set(single_value_args "TARGET" "INPUT" "OUTPUT_PATTERN" "SHARDS" "CURRENT_SHARD")
  cmake_parse_arguments(PARSE_ARGV 0 ARG "" "${single_value_args}" "NAMES;LISTS")

  list(LENGTH ARG_NAMES NAMES_LEN)
  list(LENGTH ARG_LISTS LISTS_LEN)
  if (NOT NAMES_LEN EQUAL LISTS_LEN)
    message("NAMES_LEN: ${NAMES_LEN}, LISTS_LEN: ${LISTS_LEN}")
    message(FATAL_ERROR "The same number of names and lists must be provided!")
  endif()

  # Calculate the total number of permutations
  set(total_len 1)
  foreach(LIST IN LISTS ARG_LISTS)
    string(REPLACE " " ";" list ${LIST})
    list(LENGTH list LIST_LEN)
    math(EXPR total_len "${total_len} * ${LIST_LEN}")
  endforeach()

  if(NOT DEFINED ARG_SHARDS)
    set(ARG_SHARDS 1)
  endif()
  # Without a default the start computation below would be a malformed
  # expression whenever CURRENT_SHARD is omitted.
  if(NOT DEFINED ARG_CURRENT_SHARD)
    set(ARG_CURRENT_SHARD 0)
  endif()

  div_round_up("${total_len}" "${ARG_SHARDS}" per_shard)
  message("per_shard: ${per_shard}")

  math(EXPR start "${ARG_CURRENT_SHARD} * ${per_shard}")
  math(EXPR stop "${start} + ${per_shard} - 1")

  # Nothing to generate: either one of the lists is empty or this shard lies
  # completely behind the last permutation.
  if(start GREATER_EQUAL total_len)
    message("start: ${start}, stop: ${stop} (no permutations in this shard)")
    return()
  endif()

  # per_shard is rounded up, so the last shard may reach past the final
  # permutation. The index arithmetic below is modular and would silently wrap
  # around and regenerate permutations that belong to the first shard.
  math(EXPR last "${total_len} - 1")
  if(stop GREATER last)
    set(stop ${last})
  endif()
  message("start: ${start}, stop: ${stop}")

  # Run for each permutation of input parameters
  foreach(i RANGE ${start} ${stop})
    # Decode the permutation number as a mixed-radix number: one digit per
    # list, with the first list being the fastest-changing one.
    set(index ${i})
    set(values "")
    foreach(input_list IN LISTS ARG_LISTS)
      string(REPLACE " " ";" curr_list ${input_list})
      list(LENGTH curr_list curr_length)
      math(EXPR curr_index "${index} % ${curr_length}")
      list(GET curr_list ${curr_index} curr_item)
      list(APPEND values "${curr_item}")
      math(EXPR index "${index} / ${curr_length}")
    endforeach()

    add_configured_source(TARGET "${ARG_TARGET}"
                          INPUT "${ARG_INPUT}"
                          OUTPUT_PATTERN "${ARG_OUTPUT_PATTERN}"
                          NAMES ${ARG_NAMES}
                          VALUES ${values})
  endforeach()
endfunction()

# example of a FILTER rule
# Sets <RESULT> in the caller's scope to ON when BlockSize is even and to OFF
# when it is odd.
function(reject_odd_blocksize RESULT BlockSize)
  math(EXPR res "${BlockSize} % 2")
  if(res EQUAL 0)
    set("${RESULT}" ON PARENT_SCOPE)
  else()
    set("${RESULT}" OFF PARENT_SCOPE)
  endif()
endfunction()
rocPRIM-rocm-5.7.1/cmake/Dependencies.cmake000066400000000000000000000211021446201466700204130ustar00rootroot00000000000000# MIT License
#
# Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. # ########################### # rocPRIM dependencies # ########################### # NOTE1: the reason we don't scope global state meddling using add_subdirectory # is because CMake < 3.24 lacks CMAKE_FIND_PACKAGE_TARGETS_GLOBAL which # would promote IMPORTED targets of find_package(CONFIG) to be visible # by other parts of the build. So we save and restore global state. # # NOTE2: We disable the ROCMChecks.cmake warning noting that we meddle with # global state. This is a consequence of abusing the CMake CXX language # which HIP piggybacks on top of. This kind of HIP support has one chance # at observing the global flags, at the find_package(HIP) invocation. # The device compiler won't be able to pick up changes after that, hence # the warning. 
# Remember the user's global settings before they are modified for the
# dependency builds below.
set(USER_CXX_FLAGS ${CMAKE_CXX_FLAGS})
if(DEFINED BUILD_SHARED_LIBS)
  set(USER_BUILD_SHARED_LIBS ${BUILD_SHARED_LIBS})
endif()
set(USER_ROCM_WARN_TOOLCHAIN_VAR ${ROCM_WARN_TOOLCHAIN_VAR})

set(ROCM_WARN_TOOLCHAIN_VAR OFF CACHE BOOL "")
# Turn off warnings and errors for all warnings in dependencies
separate_arguments(CXX_FLAGS_LIST NATIVE_COMMAND ${CMAKE_CXX_FLAGS})
# REMOVE_ITEM only drops exact matches, so the flag has to be spelled exactly
# as the compiler accepts it (-Werror=pedantic).
list(REMOVE_ITEM CXX_FLAGS_LIST /WX -Werror -Werror=pedantic -pedantic-errors)
if(MSVC)
  list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "/[Ww]([0-4]?)(all)?") # Remove MSVC warning flags
  list(APPEND CXX_FLAGS_LIST /w)
else()
  list(FILTER CXX_FLAGS_LIST EXCLUDE REGEX "-W(all|extra|everything)") # Remove GCC/LLVM flags
  list(APPEND CXX_FLAGS_LIST -w)
endif()
list(JOIN CXX_FLAGS_LIST " " CMAKE_CXX_FLAGS)
# Don't build client dependencies as shared
set(BUILD_SHARED_LIBS OFF CACHE BOOL "Global flag to cause add_library() to create shared libraries if on." FORCE)

# HIP dependency is handled earlier in the project cmake file
# when VerifyCompiler.cmake is included (when not using HIP-CPU).

include(FetchContent)

if(USE_HIP_CPU)
  # Prefer an installed HIP-CPU runtime unless a download is forced.
  if(NOT DEPENDENCIES_FORCE_DOWNLOAD)
    find_package(hip_cpu_rt QUIET)
  endif()

  if(NOT TARGET hip_cpu_rt::hip_cpu_rt)
    message(STATUS "HIP-CPU runtime not found. 
Fetching...") FetchContent_Declare( hip-cpu GIT_REPOSITORY https://github.com/ROCm-Developer-Tools/HIP-CPU.git GIT_TAG 56f559c93be210bb300dad3673c06d2bb0119d13 # master@2022.07.01 ) FetchContent_MakeAvailable(hip-cpu) if(NOT TARGET hip_cpu_rt::hip_cpu_rt) add_library(hip_cpu_rt::hip_cpu_rt ALIAS hip_cpu_rt) endif() else() find_package(hip_cpu_rt REQUIRED) # If we found HIP-CPU as binary, search for transitive dependencies find_package(Threads REQUIRED) set(CMAKE_REQUIRED_FLAGS "-std=c++17") include(CheckCXXSymbolExists) check_cxx_symbol_exists(__GLIBCXX__ "cstddef" STL_IS_GLIBCXX) set(STL_DEPENDS_ON_TBB ${STL_IS_GLIBCXX}) if(STL_DEPENDS_ON_TBB) find_package(TBB QUIET) if(NOT TARGET TBB::tbb AND NOT TARGET tbb) message(STATUS "Thread Building Blocks not found. Fetching...") FetchContent_Declare( thread-building-blocks GIT_REPOSITORY https://github.com/oneapi-src/oneTBB.git GIT_TAG 3df08fe234f23e732a122809b40eb129ae22733f # v2021.5.0 ) FetchContent_MakeAvailable(thread-building-blocks) else() find_package(TBB REQUIRED) endif() endif(STL_DEPENDS_ON_TBB) endif() endif(USE_HIP_CPU) # Test dependencies if(BUILD_TEST) # NOTE1: Google Test has created a mess with legacy FindGTest.cmake and newer GTestConfig.cmake # # FindGTest.cmake defines: GTest::GTest, GTest::Main, GTEST_FOUND # # GTestConfig.cmake defines: GTest::gtest, GTest::gtest_main, GTest::gmock, GTest::gmock_main # # NOTE2: Finding GTest in MODULE mode, one cannot invoke find_package in CONFIG mode, because targets # will be duplicately defined. # # NOTE3: The following snippet first tries to find Google Test binary either in MODULE or CONFIG modes. # If neither succeeds it goes on to import Google Test into this build either from a system # source package (apt install googletest on Ubuntu 18.04 only) or GitHub and defines the MODULE # mode targets. 
Otherwise if MODULE or CONFIG succeeded, then it prints the result to the # console via a non-QUIET find_package call and if CONFIG succeeded, creates ALIAS targets # with the MODULE IMPORTED names. if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(GTest QUIET) endif() if(NOT TARGET GTest::GTest AND NOT TARGET GTest::gtest) option(BUILD_GTEST "Builds the googletest subproject" ON) option(BUILD_GMOCK "Builds the googlemock subproject" OFF) option(INSTALL_GTEST "Enable installation of googletest." OFF) if(EXISTS /usr/src/googletest AND NOT DEPENDENCIES_FORCE_DOWNLOAD) FetchContent_Declare( googletest SOURCE_DIR /usr/src/googletest ) else() message(STATUS "Google Test not found. Fetching...") FetchContent_Declare( googletest GIT_REPOSITORY https://github.com/google/googletest.git GIT_TAG e2239ee6043f73722e7aa812a459f54a28552929 # release-1.11.0 ) endif() FetchContent_MakeAvailable(googletest) add_library(GTest::GTest ALIAS gtest) add_library(GTest::Main ALIAS gtest_main) else() find_package(GTest REQUIRED) if(TARGET GTest::gtest_main AND NOT TARGET GTest::Main) add_library(GTest::GTest ALIAS GTest::gtest) add_library(GTest::Main ALIAS GTest::gtest_main) endif() endif() endif(BUILD_TEST) if(BUILD_BENCHMARK) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) find_package(benchmark CONFIG QUIET) endif() if(NOT TARGET benchmark::benchmark) message(STATUS "Google Benchmark not found. Fetching...") option(BENCHMARK_ENABLE_TESTING "Enable testing of the benchmark library." OFF) option(BENCHMARK_ENABLE_INSTALL "Enable installation of benchmark." 
OFF) FetchContent_Declare( googlebench GIT_REPOSITORY https://github.com/google/benchmark.git GIT_TAG d17ea665515f0c54d100c6fc973632431379f64b # v1.6.1 ) set(HAVE_STD_REGEX ON) set(RUN_HAVE_STD_REGEX 1) FetchContent_MakeAvailable(googlebench) if(NOT TARGET benchmark::benchmark) add_library(benchmark::benchmark ALIAS benchmark) endif() else() find_package(benchmark CONFIG REQUIRED) endif() endif(BUILD_BENCHMARK) if(NOT DEPENDENCIES_FORCE_DOWNLOAD) set(CMAKE_FIND_DEBUG_MODE TRUE) find_package(ROCM 0.7.3 CONFIG QUIET PATHS /opt/rocm) set(CMAKE_FIND_DEBUG_MODE FALSE) endif() if(NOT ROCM_FOUND) if(NOT EXISTS "${FETCHCONTENT_BASE_DIR}/rocm-cmake-src") message(STATUS "ROCm CMake not found. Fetching...") set(rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download") FetchContent_Declare( rocm-cmake URL https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.tar.gz ) FetchContent_MakeAvailable(rocm-cmake) endif() find_package(ROCM CONFIG REQUIRED NO_DEFAULT_PATH HINTS "${rocm-cmake_SOURCE_DIR}") else() find_package(ROCM 0.7.3 CONFIG REQUIRED PATHS /opt/rocm) endif() # Restore user global state set(CMAKE_CXX_FLAGS ${USER_CXX_FLAGS}) if(DEFINED USER_BUILD_SHARED_LIBS) set(BUILD_SHARED_LIBS ${USER_BUILD_SHARED_LIBS}) else() unset(BUILD_SHARED_LIBS CACHE ) endif() set(ROCM_WARN_TOOLCHAIN_VAR ${USER_ROCM_WARN_TOOLCHAIN_VAR} CACHE BOOL "") include(ROCMSetupVersion) include(ROCMCreatePackage) include(ROCMInstallTargets) include(ROCMPackageConfigHelpers) include(ROCMInstallSymlinks) include(ROCMHeaderWrapper) include(ROCMCheckTargetIds) include(ROCMClients) rocPRIM-rocm-5.7.1/cmake/GenerateResourceSpec.cmake000077500000000000000000000066241446201466700221210ustar00rootroot00000000000000#!/usr/bin/cmake -P find_program(ROCMINFO_EXECUTABLE rocminfo ) if(NOT ROCMINFO_EXECUTABLE) message(FATAL_ERROR "rocminfo not found") endif() execute_process( COMMAND ${ROCMINFO_EXECUTABLE} RESULT_VARIABLE ROCMINFO_EXIT_CODE OUTPUT_VARIABLE ROCMINFO_STDOUT ERROR_VARIABLE 
ROCMINFO_STDERR
)

# Abort when rocminfo itself failed, forwarding whatever it printed on stderr.
if(ROCMINFO_EXIT_CODE)
  message(SEND_ERROR "rocminfo exited with ${ROCMINFO_EXIT_CODE}")
  message(FATAL_ERROR ${ROCMINFO_STDERR})
endif()

# Collect every "--gfx<hex digits>" occurrence of the rocminfo output.
string(REGEX MATCHALL [[--(gfx[0-9a-f]+)]]
  ROCMINFO_MATCHES
  ${ROCMINFO_STDOUT}
)

# NOTE: Unfortunately we don't have structs in CMake,
#       neither do we have std::partition only list(SORT)
#
# Transform raw regex matches to pairs of gfx IP and device id
# This will be our struct emulation. In C++ it would be
#
# struct device
# {
#   std::string ip;
#   int id;
# };
#
# std::vector<device> GFXIP_AND_ID{ {"gfx900",0},{"gfx803",1},{"gfx900",2} };
# std::sort(GFXIP_AND_ID.begin(), GFXIP_AND_ID.end(),
#   [](const device& lhs, const device& rhs)
#   {
#     return std::lexicographical_compare(lhs.ip.begin(), lhs.ip.end(),
#                                         rhs.ip.begin(), rhs.ip.end());
#   });
#
set(GFXIP_AND_ID)
set(ID 0)
foreach(ROCMINFO_MATCH IN LISTS ROCMINFO_MATCHES)
  string(REGEX REPLACE
    "--" ""
    ROCMINFO_MATCH
    ${ROCMINFO_MATCH}
  )
  # Entries look like "gfx90a:3": gfx IP, a colon, then the running match index.
  list(APPEND GFXIP_AND_ID "${ROCMINFO_MATCH}:${ID}")
  math(EXPR ID "${ID} + 1")
endforeach()
list(SORT GFXIP_AND_ID)

# The grouping below reads element 0 unconditionally; fail with a clear
# message instead of an obscure list(GET) error when nothing was matched.
if(NOT GFXIP_AND_ID)
  message(FATAL_ERROR "rocminfo did not report any gfx devices")
endif()

# Now comes the tricky part: implementing the following C++ logic
#
# std::stringstream JSON_PAYLOAD;
# auto it = GFXIP_AND_ID.begin();
# while (it != GFXIP_AND_ID.end())
# {
#   auto IT = std::find_if(it, GFXIP_AND_ID.end(),
#     [=](const device& ip_id){ return ip_id.ip.compare(it->ip) != 0; });
#   JSON_PAYLOAD << "\n \"" << it->ip << "\": [";
#   std::for_each(it, IT, [&](const device& ip_id)
#   {
#     JSON_PAYLOAD <<
#       "\n {\n" <<
#       " \"id\": \"" << ip_id.id << "\"\n" <<
#       " },";
#   });
#   JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma
#   JSON_PAYLOAD << "\n ],";
#   it = IT;
# }
# JSON_PAYLOAD.seekp(-1, std::ios_base::end); // discard trailing comma
#
set(JSON_PAYLOAD)
set(IT1 0)
list(GET GFXIP_AND_ID ${IT1} I1)
string(REGEX REPLACE ":[0-9]+" "" IP1 ${I1})
list(LENGTH GFXIP_AND_ID COUNT)
while(IT1 LESS COUNT)
  # Open the JSON array of the current gfx IP.
  string(APPEND JSON_PAYLOAD "\n \"${IP1}\": [")
  set(IT2 ${IT1})
  list(GET GFXIP_AND_ID ${IT2} I2)
  string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2})
  # The IP part may contain hex letters (it is matched with gfx[0-9a-f]+
  # above), so the prefix has to be stripped with the same character class;
  # otherwise the whole "gfx90a:N" entry would end up as the id.
  string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 ${I2})
  # Emit one object per consecutive entry that shares the current IP.
  while(${IP2} STREQUAL ${IP1} AND IT2 LESS COUNT)
    string(APPEND JSON_PAYLOAD
      "\n {\n"
      " \"id\": \"${ID2}\"\n"
      " },"
    )
    math(EXPR IT2 "${IT2} + 1")
    if(IT2 LESS COUNT)
      list(GET GFXIP_AND_ID ${IT2} I2)
      string(REGEX REPLACE [[:[0-9]+$]] "" IP2 ${I2})
      string(REGEX REPLACE [[^gfx[0-9a-f]+:]] "" ID2 ${I2})
    endif()
  endwhile()
  # Discard the trailing comma of the last object, then close the array.
  string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD ${JSON_PAYLOAD})
  string(APPEND JSON_PAYLOAD "\n ],")
  set(IT1 ${IT2})
  set(IP1 ${IP2})
endwhile()
# Discard the trailing comma of the last array.
string(REGEX REPLACE [[,$]] "" JSON_PAYLOAD ${JSON_PAYLOAD})

set(JSON_HEAD [[{ "version": { "major": 1, "minor": 0 }, "local": [ {]]
)
set(JSON_TAIL [[ } ] }]]
)

# Write head, per-IP device lists and tail as a single resources.json file.
file(WRITE
  ${CMAKE_CURRENT_BINARY_DIR}/resources.json
  ${JSON_HEAD}
  ${JSON_PAYLOAD}
  ${JSON_TAIL}
)rocPRIM-rocm-5.7.1/cmake/Summary.cmake000066400000000000000000000042561446201466700174750ustar00rootroot00000000000000# MIT License
#
# Copyright (c) 2017-2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. function(print_configuration_summary) message(STATUS "") message(STATUS "******** Summary ********") message(STATUS "General:") message(STATUS " System : ${CMAKE_SYSTEM_NAME}") message(STATUS " C++ compiler : ${CMAKE_CXX_COMPILER}") message(STATUS " C++ compiler version : ${CMAKE_CXX_COMPILER_VERSION}") string(STRIP "${CMAKE_CXX_FLAGS}" CMAKE_CXX_FLAGS_STRIP) message(STATUS " CXX flags : ${CMAKE_CXX_FLAGS_STRIP}") message(STATUS " Build type : ${CMAKE_BUILD_TYPE}") message(STATUS " Install prefix : ${CMAKE_INSTALL_PREFIX}") if(NOT USE_HIP_CPU) message(STATUS " Device targets : ${GPU_TARGETS}") endif() message(STATUS "") message(STATUS " ONLY_INSTALL : ${ONLY_INSTALL}") message(STATUS " BUILD_TEST : ${BUILD_TEST}") message(STATUS " BUILD_BENCHMARK : ${BUILD_BENCHMARK}") message(STATUS " BUILD_EXAMPLE : ${BUILD_EXAMPLE}") message(STATUS " USE_HIP_CPU : ${USE_HIP_CPU}") endfunction() rocPRIM-rocm-5.7.1/cmake/VerifyCompiler.cmake000066400000000000000000000032171446201466700207730ustar00rootroot00000000000000# MIT License # # Copyright (c) 2018-2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in all # copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE # SOFTWARE. list(APPEND CMAKE_PREFIX_PATH ${ROCM_PATH} ${ROCM_PATH}/hip ${ROCM_PATH}/llvm /opt/rocm/llvm /opt/rocm /opt/rocm/hip) find_package(hip REQUIRED CONFIG PATHS ${HIP_DIR} ${ROCM_PATH} /opt/rocm) if(HIP_COMPILER STREQUAL "clang") if(NOT (HIP_CXX_COMPILER MATCHES ".*hipcc" OR HIP_CXX_COMPILER MATCHES ".*clang\\+\\+")) message(FATAL_ERROR "On ROCm platform 'hipcc' or HIP-aware Clang must be used as C++ compiler.") endif() else() message(FATAL_ERROR "HIP_COMPILER must be 'clang' (AMD ROCm platform)") endif() rocPRIM-rocm-5.7.1/conanfile.py000066400000000000000000000012051446201466700162550ustar00rootroot00000000000000# Copyright 2021 Advanced Micro Devices, Inc. # This conanfile is used to install development requirements, # e.g. # conan install -o clients=True -if build/deps . 
from conans import ConanFile, CMake class ConanPkgReqs(ConanFile): settings = "os", "compiler", "build_type", "arch" generators = "cmake_find_package" options = { "shared": [True, False], "clients": [True, False], } default_options = { "shared": True, "clients": False, } def requirements(self): if self.options.clients: self.requires("gtest/1.11.0") self.requires("benchmark/1.5.2") rocPRIM-rocm-5.7.1/custom.properties000066400000000000000000000001361446201466700173770ustar00rootroot00000000000000booktitle=rocPRIM API Guide spreadsheet.xml=docs/classification-map.xml document.locale=enusrocPRIM-rocm-5.7.1/docs/000077500000000000000000000000001446201466700146775ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/docs/.doxygen/000077500000000000000000000000001446201466700164325ustar00rootroot00000000000000rocPRIM-rocm-5.7.1/docs/.doxygen/Doxyfile000066400000000000000000003203471446201466700201510ustar00rootroot00000000000000# Doxyfile 1.8.11 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. 
See http://www.gnu.org/software/libiconv # for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = rocPRIM # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give the viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = docBin # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise cause # performance problems for the file system. # The default value is: NO.
CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. 
Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = YES # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. 
STRIP_FROM_PATH = # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that Rational Rose comments are # not recognized any more. # The default value is: NO.
MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 8 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. ALIASES = skip_doxy_start="\{" ALIASES += skip_doxy_end="\}" # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. 
For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the latter case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting.
Disable only in # case of backward compatibility issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match function declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES.
IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = NO # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. 
INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = NO # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. 
# Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. 
HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = YES # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. 
If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = NO # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. 
# Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. 
GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). 
Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. 
QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. # The default value is: NO. WARN_NO_PARAMDOC = NO # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. # The default value is: NO. WARN_AS_ERROR = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. 
If left blank the output is written to standard # error (stderr). WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = mainpage.dox \ primitivesmodule.dox \ warpmodule.dox \ blockmodule.dox \ devicemodule.dox \ utilsmodule.dox \ iteratormodule.dox \ intrinsicsmodule.dox \ glossary.dox \ ../../rocprim/include/rocprim # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.pyw, *.f90, *.f, *.for, *.tcl, # *.vhd, *.vhdl, *.ucf, *.qsf, *.as and *.js. 
FILE_PATTERNS = # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = YES # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = detail::* # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. 
If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # properly processed by doxygen.
FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERNS (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES.
STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). 
# # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # generated with the -Duse-libclang=ON option for CMake. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = NO # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split.
# Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. 
See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. 
# Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # http://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. 
The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries 1 will produce a fully collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a fully expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: http://developer.apple.com/tools/xcode/), introduced with # OSX 10.5 (Leopard).
To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. 
The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. 
Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace # (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. 
For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. 
# The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NONE # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. 
# # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 4 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. # Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes have effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # http://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps.
Use this if you do not have LaTeX # installed or if you want the formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = NO # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. # The default value is: http://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code.
See the MathJax site # (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use + S # (what the is depends on the OS and browser, but it is typically # , /