pax_global_header00006660000000000000000000000064150664215320014516gustar00rootroot0000000000000052 comment=78d2482de1dd1bcf484a8bc31dfebfea9cf7bbb4 hipFFT-rocm-7.1.0/000077500000000000000000000000001506642153200136015ustar00rootroot00000000000000hipFFT-rocm-7.1.0/.azuredevops/000077500000000000000000000000001506642153200162265ustar00rootroot00000000000000hipFFT-rocm-7.1.0/.azuredevops/rocm-ci.yml000066400000000000000000000012401506642153200202770ustar00rootroot00000000000000resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo trigger: batch: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .jenkins - docs - '.*.y*ml' - '*.md' pr: autoCancel: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .jenkins - docs - '.*.y*ml' - '*.md' drafts: false jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/hipFFT.yml@pipelines_repo hipFFT-rocm-7.1.0/.clang-format000066400000000000000000000065421506642153200161630ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true 
AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: { AfterCaseLabel: 'true' AfterClass: 'true' AfterControlStatement: 'true' AfterEnum : 'true' AfterFunction : 'true' AfterNamespace : 'true' AfterStruct : 'true' AfterUnion : 'true' BeforeCatch : 'true' BeforeElse : 'true' IndentBraces : 'false' # AfterExternBlock : 'true' } #BreakAfterJavaFieldAnnotations: true #BreakBeforeInheritanceComma: false #BreakBeforeBinaryOperators: None #BreakBeforeTernaryOperators: true #BreakConstructorInitializersBeforeComma: true #BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' #CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true SpaceBeforeCpp11BracedList: false DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IndentCaseLabels: false IndentPPDirectives: None #FixNamespaceComments: true IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true MacroBlockBegin: '' MacroBlockEnd: '' #JavaScriptQuotes: Double MaxEmptyLinesToKeep: 1 NamespaceIndentation: All ObjCBlockIndentWidth: 4 #ObjCSpaceAfterProperty: true #ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left SpaceAfterCStyleCast: false 
SpaceBeforeAssignmentOperators: true SpaceBeforeParens: Never SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false #SpaceAfterTemplateKeyword: true #SpaceBeforeInheritanceColon: true #SortUsingDeclarations: true SortIncludes: true # Comments are for developers, they should arrange them ReflowComments: false #IncludeBlocks: Preserve --- hipFFT-rocm-7.1.0/.githooks/000077500000000000000000000000001506642153200155065ustar00rootroot00000000000000hipFFT-rocm-7.1.0/.githooks/install000077500000000000000000000002221506642153200170760ustar00rootroot00000000000000#!/usr/bin/env bash cd $(git rev-parse --git-dir) cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" hipFFT-rocm-7.1.0/.githooks/pre-commit000077500000000000000000000017661506642153200175220ustar00rootroot00000000000000#!/bin/sh # # This pre-commit hook checks if any versions of clang-format # are installed, and if so, uses the installed version to format # the staged changes. base=/opt/rocm/hcc/bin/clang-format format="" # Redirect output to stderr. exec 1>&2 # check if clang-format is installed type "$base" >/dev/null 2>&1 && format="$base" # no versions of clang-format are installed if [ -z "$format" ] then echo "$base is not installed. Pre-commit hook will not be executed." 
exit 0 fi # Do everything from top - level cd $(git rev-parse --show-toplevel) if git rev-parse --verify HEAD >/dev/null 2>&1 then against=HEAD else # Initial commit: diff against an empty tree object against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 fi # do the formatting for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') do if [ -e "$file" ] then echo "$format $file" "$format" -i -style=file "$file" fi done hipFFT-rocm-7.1.0/.github/000077500000000000000000000000001506642153200151415ustar00rootroot00000000000000hipFFT-rocm-7.1.0/.github/CODEOWNERS000066400000000000000000000005741506642153200165420ustar00rootroot00000000000000* @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd # Documentation files docs/ @ROCm/rocm-documentation *.md @ROCm/rocm-documentation *.rst @ROCm/rocm-documentation .readthedocs.yaml @ROCm/rocm-documentation # Header directory for Doxygen documentation library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd hipFFT-rocm-7.1.0/.github/CONTRIBUTING.md000066400000000000000000000151701506642153200173760ustar00rootroot00000000000000 # Contributing to hipFFT # We welcome contributions to hipFFT. Please follow these details to help ensure your contributions will be successfully accepted. ## Issue Discussion ## Please use the GitHub Issues tab to notify us of issues. * Use your best judgment for issue creation. If your issue is already listed, upvote the issue and comment or post to provide additional details, such as how you reproduced this issue. * If you're not sure if your issue is the same, err on the side of caution and file your issue. You can add a comment to include the issue number (and link) for the similar issue. If we evaluate your issue as being the same as the existing issue, we'll close the duplicate. * If your issue doesn't exist, use the issue template to file a new issue. 
* When filing an issue, be sure to provide as much information as possible, including script output so we can collect information about your configuration. This helps reduce the time required to reproduce your issue. * Check your issue regularly, as we may require additional information to successfully reproduce the issue. * You may also open an issue to ask questions to the maintainers about whether a proposed change meets the acceptance criteria, or to discuss an idea pertaining to the library. ## Acceptance Criteria ## When a contribution is submitted via a pull request, a number of automated checks are run in order to verify compilation correctness and prevent performance regressions. These checks include: * Building and testing the change on various OS platforms (Ubuntu, RHEL, etc.) * Running on different AMD GPU architectures (MI-series, Radeon series cards, etc.) * Running on different NVIDIA GPU architectures (V100, A100, etc) * Running benchmarks to check for performance degradation In order for a submission to be accepted: * It must pass all of the automated checks * It must undergo a code review Users can visualize our continuous integration infrastructure in: `hipFFT/.jenkins`. The GitHub "Issues" tab may also be used to discuss ideas surrounding particular features or changes before raising pull requests. ## Code Structure ## In a broad view, hipFFT library is structured as follows: ├── docs/: contains hipFFT documentation ├── library/: contains main source code and headers │   ├── src/amd_detail/ : for porting to AMD devices │   ├── src/nvidia_detail/ : for porting to NVIDIA devices ├── clients/: │   ├── bench/ : contains benchmarking code │   ├── samples/ : contains examples │   ├── tests/ : contains our test infrastructure ├── shared/: contains important global headers and those for linking to other applications ## Coding Style ## * All public APIs are C89 compatible; all other library code should use c++17. 
* Our minimum supported compiler is clang 3.6. * Avoid CamelCase: rule applies specifically to publicly visible APIs, but is encouraged (not mandated) for internal code. * C and C++ code should be formatted using `clang-format`. You can use the clang-format version available in `hipFFT/.clang-format`. To format a C/C++ file, use: ``` clang-format -style=file -i ``` * Python code should use: ``` yapf --style pep8 ``` ## Pull Request Guidelines ## Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). This repository follows the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code. Note that a [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user. The following guidelines apply: * When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch. * Note that releases are cut to release/rocm-rel-x.y, where x and y refer to the release major and minor numbers. * Ensure code builds successfully. * Do not break existing test cases * Code must also have benchmark tests, and performance must approach the compute bound limit or memory bound limit. ### Deliverables ### New changes should include test coverage. Our testing infrastructure is located in `clients/tests/`, and can be used as a reference. The following guidelines apply: * New functionality will only be merged with new unit tests. * New unit tests should integrate within the existing [googletest framework](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). * Tests must have good code coverage. 
### Process ### All pull requests must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged. Once a contribution is ready to be submitted, consider the following: * Before you create a PR, ensure that all files have been gone through the clang formatting: clang-format -i * While creating a PR, you can take a look at a `diff` of the changes you made using the PR's "Files" tab, and verify that no unintentional changes are being submitted. * Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table to view logs associated with a check if it fails. * During code reviews, another developer will take a look through your proposed change. If any modifications are requested (or further discussion about anything is needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas. * When a modification request has been completed, the conversation thread about it will be marked as resolved. * To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request. * Once your contribution is approved, we will use the *squash merge* option from GitHub to integrate it to the corresponding branch. ## Code License ## All code contributed to this project will be licensed under the license identified in the [LICENSE.md](https://github.com/ROCm/hipFFT/blob/develop/LICENSE.md). Your contribution will be accepted under the same license. 
hipFFT-rocm-7.1.0/.github/ISSUE_TEMPLATE.md000066400000000000000000000004611506642153200176470ustar00rootroot00000000000000### What is the expected behavior - ### What actually happens - ### How to reproduce - ### Environment | Hardware | description | |-----|-----| | GPU | device string | | CPU | device string | | Software | version | |-----|-----| | ROCK | v0.0 | | ROCR | v0.0 | | HCC | v0.0 | | Library | v0.0 | hipFFT-rocm-7.1.0/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000000701506642153200207370ustar00rootroot00000000000000resolves #___ Summary of proposed changes: - - - hipFFT-rocm-7.1.0/.github/dependabot.yml000066400000000000000000000013001506642153200177630ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "monthly" labels: - "documentation" - "dependencies" - "ci:docs-only" reviewers: - "samjwu" - "malcolmroberts" - "evetsso" hipFFT-rocm-7.1.0/.gitignore000066400000000000000000000006461506642153200155770ustar00rootroot00000000000000# Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # vim tags tags .tags .*.swp # Visual Studio Code .vscode # documentation artifacts build/ _build/ _images/ _static/ _templates/ _toc.yml docBin/ # python bytecode __pycache__ hipFFT-rocm-7.1.0/.readthedocs.yaml000066400000000000000000000005021506642153200170250ustar00rootroot00000000000000# Read 
the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt build: os: ubuntu-22.04 tools: python: "3.10" hipFFT-rocm-7.1.0/CHANGELOG.md000066400000000000000000000143271506642153200154210ustar00rootroot00000000000000# Changelog for hipFFT Documentation for hipFFT is available at [https://rocm.docs.amd.com/projects/hipFFT/en/latest/](https://rocm.docs.amd.com/projects/hipFFT/en/latest/). ## hipFFT 1.0.21 for ROCm 7.1.0 ### Added * Improved test coverage of multi-stream plans. * Improved test coverage of user-specified work areas. * Improved test coverage of default stride calculation. * **[Experimental]** Introduced the hipFFTW library, interfacing rocFFT on AMD platforms using the same symbols as FFTW3 (with partial support). ## hipFFT 1.0.20 for ROCm 7.0.0 ### Added * Added gfx950 support. ### Removed * Removed hipfft-rider legacy compatibility from clients * Remove support for the gfx940 and gfx941 targets from the client programs. ## hipFFT 1.0.18 for ROCm 6.4.0 ### Added * Implemented the `hipfftMpAttachComm`, `hipfftXtSetDistribution`, and `hipfftXtSetSubformatDefault` APIs to allow computing FFTs that are distributed between multiple MPI (Message Passing Interface) processes. These APIs can be enabled with the `HIPFFT_MPI_ENABLE` CMake option, which defaults to `OFF`. * The backend FFT library called by hipFFT must support MPI for these APIs to work. ### Changed * Building with the address sanitizer option sets xnack+ for the relevant GPU architectures. * Use find_package CUDAToolkit instead of CUDA in cmake for modern-cmake compatibility. * The `AMDGPU_TARGETS` build variable should be replaced with `GPU_TARGETS`. `AMDGPU_TARGETS` is deprecated. ### Resolved issues * Fixed client packages to depend on hipRAND instead of rocRAND. 
## hipFFT 1.0.17 for ROCm 6.3.0 ### Added * Support for the gfx1151, gfx1200, and gfx1201 architectures * hipfft-test now includes a --smoketest option. ### Changed * The AMD backend is now compiled using amdclang++ instead of hipcc. The NVIDIA CUDA backend still uses hipcc-nvcc. * CLI11 replaces Boost Program Options as the command line parser for clients. ## hipFFT 1.0.16 for ROCm 6.2.4 ### Changed * Support gfx1151 architecture. ## hipFFT 1.0.15 for ROCm 6.2.0 ### Fixes * Added hip::host as a public link library, as hipfft.h includes HIP runtime headers. * Prevent C++ exceptions leaking from public API functions. * Make output of hipfftXt match cufftXt in geometry and alignment for 2D and 3D FFTs. ## hipFFT 1.0.14 for ROCm 6.1.0 ### Changes * When building hipFFT from source, rocFFT code no longer needs to be initialized as a git submodule. ### Fixes * Fixed error when creating length-1 plans. ## hipFFT 1.0.13 for ROCm 6.0.0 ### Changes * `hipfft-rider` has been renamed to `hipfft-bench`; it is controlled by the `BUILD_CLIENTS_BENCH` CMake option (note that a link for the old file name is installed, and the old `BUILD_CLIENTS_RIDER` CMake option is accepted for backwards compatibility, but both will be removed in a future release) * Binaries in debug builds no longer have a `-d` suffix * The minimum rocFFT required version has been updated to 1.0.21 ### Additions * `hipfftXtSetGPUs`, `hipfftXtMalloc, hipfftXtMemcpy`, `hipfftXtFree`, and `hipfftXtExecDescriptor` APIs have been implemented to allow FFT computing on multiple devices in a single process ## hipFFT 1.0.12 for ROCm 5.6.0 ### Additions * `hipfftXtMakePlanMany`, `hipfftXtGetSizeMany`, and `hipfftXtExec` APIs have been implemented to allow half-precision transform requests ### Changes * Added the `--precision` argument to benchmark and test clients (`--double` is still accepted, but has been deprecated as a method to request a double-precision transform) ## hipFFT 1.0.11 for ROCm 5.5.0 ### Fixes * Fixed 
old version ROCm include and lib folders that were not removed during upgrades ## hipFFT 1.0.10 for ROCm 5.4.0 ### Additions * Added the `hipfftExtPlanScaleFactor` API to efficiently multiply each output element of an FFT by a given scaling factor (result scaling must be supported in the backend FFT library) ### Changes * rocFFT 1.0.19 or higher is now required for hipFFT builds on the rocFFT backend * Data are initialized directly on GPUs using hipRAND * Updated build files now use standard C++17 ## hipFFT 1.0.9 for ROCm 5.3.0 ### Changes * Cleaned up build warnings * GNUInstallDirs enhancements * GoogleTest 1.11 is required ## hipFFT 1.0.8 for ROCm 5.2.0 ### Additions * Added file and folder reorganization changes with backward compatibility support when using rocm-cmake wrapper functions * New packages for test and benchmark executables on all supported operating systems that use CPack * Implemented `hipfftMakePlanMany64` and `hipfftGetSizeMany64` ## hipFFT 1.0.7 for ROCm 5.1.0 ### Changes * Use `fft_params` struct for accuracy and benchmark clients ## hipFFT 1.0.6 for ROCm 5.0.0 ### Fixes * Incorrect reporting of rocFFT version ### Changes * Unconditionally enabled callback functionality: On the CUDA backend, callbacks only run correctly when hipFFT is built as a static library, and linked against the static cuFFT library ## hipFFT 1.0.5 for ROCm 4.5.0 ### Additions * Added support for Windows 10 as a build target ### Changes * Packaging has been split into a runtime package (`hipfft`) and a development package (`hipfft-devel`): The development package depends on the runtime package. When installing the runtime package, the package manager will suggest the installation of the development package to aid users transitioning from the previous version's combined package. This suggestion by package manager is for all supported operating systems (except CentOS 7) to aid in the transition. 
The `suggestion` feature in the runtime package is introduced as a deprecated feature and will be removed in a future ROCm release. ## hipFFT 1.0.4 for ROCm 4.4.0 ### Fixes * Add calls to rocFFT setup and cleanup * CMake fixes for clients and backend support ### Additions * Added support for Windows 10 as a build target ## hipFFT 1.0.3 for ROCm 4.3.0 ### Fixes * CMake updates ### Additions * New callback API in `hipfftXt.h` header ## hipFFT 1.0.2 for ROCm 4.2.0 * No changes ## hipFFT 1.0.1 for ROCm 4.1.0 ### Fixes * Batch support for `hipfftMakePlanMany` * Work area handling during plan creation and `hipfftSetWorkArea` * Honour `autoAllocate` flag ### Changes * Testing infrastructure reuses code from [rocFFT](https://github.com/ROCmSoftwarePlatform/rocFFT) hipFFT-rocm-7.1.0/CMakeLists.txt000066400000000000000000000235161506642153200163500ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2025 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ############################################################################ # CMake version according to latest ROCm platform requirements cmake_minimum_required( VERSION 3.17 ) # We use C++17 features, this will add compile option: -std=c++17 set( CMAKE_CXX_STANDARD 17 ) set(CMAKE_CXX_EXTENSIONS OFF) # Consider removing this in the future # This should appear before the project command, because it does not use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else() set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif() # Workarounds.. list( APPEND CMAKE_PREFIX_PATH /opt/rocm/llvm /opt/rocm ) list( APPEND CMAKE_MODULE_PATH ${ROCM_PATH}/lib/cmake/hip /opt/rocm/lib/cmake/hip /opt/rocm/hip/cmake ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. # MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
) endif() set( HIPFFT_BUILD_SCOPE ON ) project( hipfft LANGUAGES CXX ) # Build options option( BUILD_SHARED_LIBS "Build ${PROJECT_NAME} as a shared library" ON ) option( BUILD_VERBOSE "Output additional build information" OFF ) option( HIPFFT_MPI_ENABLE "Build with MPI support for distributed transforms" OFF ) set( BUILD_WITH_COMPILER "HOST-default" CACHE INTERNAL "Build ${PROJECT_NAME} with compiler HIP-clang, HIP-nvcc, or just the host default compiler, eg g++") set( BUILD_WITH_LIB "ROCM" CACHE STRING "Build ${PROJECT_NAME} with ROCM or CUDA libraries" ) option( BUILD_CLIENTS "Build all clients" OFF) option( BUILD_CLIENTS_BENCH "Build benchmark client" OFF ) option( BUILD_CLIENTS_TESTS "Build ${PROJECT_NAME} tests (requires 3rd dependencies)" OFF ) option( BUILD_CLIENTS_SAMPLES "Build examples" OFF ) option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) # Provide ability to disable hipRAND dependency option(USE_HIPRAND "Build using hipRAND for test input generation instead of host-side generation" ON) if( USE_HIPRAND ) add_compile_definitions(USE_HIPRAND) endif( ) option( WERROR "Treat warnings as errors" OFF ) set(DEFAULT_GPUS gfx803 gfx900 gfx906 gfx908 gfx90a gfx942 gfx950 gfx1030 gfx1100 gfx1101 gfx1102 gfx1151 gfx1200 gfx1201) if(BUILD_ADDRESS_SANITIZER) add_compile_options(-fsanitize=address) add_link_options(-fsanitize=address) add_link_options(-shared-libasan) SET(DEFAULT_GPUS gfx908:xnack+ gfx90a:xnack+ gfx942:xnack+) add_link_options(-fuse-ld=lld) add_compile_definitions(ADDRESS_SANITIZER) endif() # Set internal BUILD_WITH_COMPILER. if(NOT (CMAKE_CXX_COMPILER MATCHES ".*hipcc$" OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+")) set( BUILD_WITH_COMPILER "HOST-default" ) else() if( $ENV{HIP_PLATFORM} MATCHES "nvidia" ) set( BUILD_WITH_COMPILER "HIP-nvcc" ) else() set( BUILD_WITH_COMPILER "HIP-clang" ) if( NOT BUILD_WITH_LIB STREQUAL "ROCM" ) message( FATAL_ERROR "Detected HIP_COMPILER=clang, but BUILD_WITH_LIB is not ROCM!" 
) endif() endif() endif() string( TOUPPER "${BUILD_WITH_COMPILER}" BUILD_WITH_COMPILER ) string( TOUPPER "${BUILD_WITH_LIB}" BUILD_WITH_LIB ) # nvc++ doesn't understand warning flags if( NOT CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+" ) set( WARNING_FLAGS -Wall -Wno-unused-function -Wimplicit-fallthrough -Wunreachable-code -Wno-unknown-pragmas) if( WERROR ) set( WARNING_FLAGS ${WARNING_FLAGS} -Werror ) endif() endif() # Dependencies include(cmake/dependencies.cmake) if (BUILD_WITH_COMPILER STREQUAL "HIP-NVCC" ) set (BUILD_WITH_LIB "CUDA") set( HIP_PLATFORM "nvidia" ) set( CMAKE_CXX_EXTENSIONS OFF ) set( CMAKE_CXX_COMPILE_OPTIONS_PIC "-Xcompiler=${CMAKE_CXX_COMPILE_OPTIONS_PIC}" ) set( CMAKE_SHARED_LIBRARY_C_FLAGS "-Xlinker=${CMAKE_SHARED_LIBRARY_C_FLAGS}" ) set( CMAKE_SHARED_LIBRARY_CXX_FLAGS "-Xlinker=${CMAKE_SHARED_LIBRARY_CXX_FLAGS}" ) set( CMAKE_SHARED_LIBRARY_SONAME_C_FLAG "-Xlinker=-soname," ) set( CMAKE_SHARED_LIBRARY_SONAME_CXX_FLAG "-Xlinker=-soname," ) set( CMAKE_SHARED_LIBRARY_RUNTIME_C_FLAG "-Xlinker=-rpath," ) set( CMAKE_SHARED_LIBRARY_RUNTIME_CXX_FLAG "-Xlinker=-rpath," ) set( CMAKE_EXECUTABLE_RUNTIME_C_FLAG "-Xlinker=-rpath," ) set( CMAKE_EXECUTABLE_RUNTIME_CXX_FLAG "-Xlinker=-rpath," ) set( CMAKE_C_COMPILE_OPTIONS_VISIBILITY "-Xcompiler='${CMAKE_C_COMPILE_OPTIONS_VISIBILITY}'" ) set( CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY "-Xcompiler='${CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY}'" ) set( CMAKE_C_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN "-Xcompiler='${CMAKE_C_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN}'" ) set( CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN "-Xcompiler='${CMAKE_CXX_COMPILE_OPTIONS_VISIBILITY_INLINES_HIDDEN}'" ) foreach( FLAG IN ITEMS ${WARNING_FLAGS} ) set( NVCC_WARNING_FLAGS ${NVCC_WARNING_FLAGS} "-Xcompiler=${FLAG}" ) endforeach() set( WARNING_FLAGS ${NVCC_WARNING_FLAGS} ) else() # Check support for the GPU target(s) rocm_check_target_ids(GPU_TARGETS TARGETS "${GPU_TARGETS}") if( BUILD_WITH_COMPILER STREQUAL "HIP-CLANG" ) set( 
HIP_PLATFORM "amd" ) set( HIP_COMPILER "clang" ) endif() endif() # Show the actual compiler(internal option) message(STATUS "BUILD_WITH_COMPILER = " ${BUILD_WITH_COMPILER}) # Version set( VERSION_STRING "1.0.21" ) set( hipfft_SOVERSION 0.1 ) if( ROCmCMakeBuildTools_FOUND ) rocm_setup_version( VERSION ${VERSION_STRING} ) endif() add_subdirectory( library ) # Build clients of the library if( BUILD_CLIENTS ) set( BUILD_CLIENTS_BENCH ON ) set( BUILD_CLIENTS_SAMPLES ON ) set( BUILD_CLIENTS_TESTS ON ) endif() # old name for BUILD_CLIENTS_BENCH if( BUILD_CLIENTS_RIDER ) set( BUILD_CLIENTS_BENCH ${BUILD_CLIENTS_RIDER} ) endif() # Build clients of the library if( BUILD_CLIENTS_BENCH OR BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_TESTS ) include( clients/cmake/build-options.cmake ) rocm_package_setup_component(clients) if(NOT CLIENTS_OS) rocm_set_os_id(CLIENTS_OS) string(TOLOWER "${CLIENTS_OS}" CLIENTS_OS) rocm_read_os_release(CLIENTS_OS_VERSION VERSION_ID) endif() message(STATUS "OS: ${CLIENTS_OS} ${CLIENTS_OS_VERSION}") set(FFTW_DEB "libfftw3-bin") if(CLIENTS_OS STREQUAL "sles") set(FFTW_RPM "libfftw3-3") elseif(CLIENTS_OS STREQUAL "mariner") set(BOOST_RPM RPM "boost = ${Boost_VERSION_MAJOR}_${Boost_VERSION_MINOR}_${Boost_VERSION_PATCH}") set(FFTW_RPM "fftw-libs") else() set(FFTW_RPM "fftw-libs") endif() if( USE_HIPRAND ) set( HIPRAND_DEP hiprand ) endif() if(BUILD_CLIENTS_TESTS) rocm_package_setup_client_component( tests DEPENDS DEB ${FFTW_DEB} ${HIPRAND_DEP} RPM ${FFTW_RPM} ${HIPRAND_DEP} ) endif() if(BUILD_CLIENTS_BENCH) rocm_package_setup_client_component( benchmarks DEPENDS DEB ${HIPRAND_DEP} RPM ${HIPRAND_DEP} ) endif() add_subdirectory( clients ) endif() # Packaging... 
if(WIN32) set(CPACK_SOURCE_GENERATOR "ZIP") set(CPACK_GENERATOR "ZIP") if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE) endif() set(INSTALL_PREFIX "C:/hipSDK") set(CPACK_SET_DESTDIR OFF) set(CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK") set(CPACK_PACKAGING_INSTALL_PREFIX "") set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) endif() if( ROCmCMakeBuildTools_FOUND ) # Package specific CPACK vars if( NOT BUILD_WITH_LIB STREQUAL "CUDA" ) rocm_package_add_dependencies(DEPENDS "rocfft >= 1.0.21") else() if( NVHPC_FOUND ) string( REPLACE "." "-" NVHPC_PKG_VERSION ${NVHPC_VERSION} ) rocm_package_add_dependencies(DEPENDS "nvhpc-${NVHPC_PKG_VERSION}") else() rocm_package_add_dependencies(DEPENDS "cufft >= 10.0.0") endif() endif() set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" ) set( CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" ) # Give hipfft compiled for CUDA backend a different name if( BUILD_WITH_LIB STREQUAL "ROCM" ) set( package_name hipfft ) else() set( package_name hipfft-alt ) endif() set( HIPFFT_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file" ) rocm_create_package( NAME ${package_name} DESCRIPTION "ROCm FFT marshalling library" MAINTAINER "hipfft-maintainer@amd.com" LDCONFIG LDCONFIG_DIR ${HIPFFT_CONFIG_DIR} ) endif() hipFFT-rocm-7.1.0/CppCheckSuppressions.txt000066400000000000000000000001531506642153200204570ustar00rootroot00000000000000// has some false positives and isn't hard to run manually for periodic // dead code sweeps unusedFunction hipFFT-rocm-7.1.0/LICENSE.md000066400000000000000000000053571506642153200152170ustar00rootroot00000000000000MIT License Copyright (C) Advanced Micro Devices, Inc. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --- This product includes software from copyright holders as shown below, and distributed under their license terms as specified. CLI11 2.2 Copyright (c) 2017-2024 University of Cincinnati, developed by Henry Schreiner under NSF AWARD 1414736. All rights reserved. Redistribution and use in source and binary forms of CLI11, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. hipFFT-rocm-7.1.0/README.md000066400000000000000000000074531506642153200150710ustar00rootroot00000000000000# hipFFT hipFFT is an FFT marshalling library that supports [rocFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft) and [cuFFT](https://developer.nvidia.com/cufft) backends. hipFFT exports an interface that doesn't require the client to change, regardless of the chosen backend. It sits between your application and the backend FFT library, where it marshals inputs to the backend and marshals results back to your application. ## Documentation > [!NOTE] > The published hipFFT documentation is available at [hipFFT](https://rocm.docs.amd.com/projects/hipFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the projects/hipfft/docs folder of the rocm-libraries repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). 
To build our documentation locally, run the following commands: ```bash cd projects/hipfft/docs pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` ## Build and install You can download pre-built packages from the [ROCm package servers](https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html). If you're using Ubuntu, you can run: `sudo apt update && sudo apt install hipfft`. ### Building from source To build hipFFT from source, follow these steps: 1. Install the library build dependencies: * On AMD platforms, you must install [rocFFT](https://github.com/ROCm/rocm-libraries/tree/develop/projects/rocfft). 2. Install the client build dependencies: * The clients (samples, tests, etc.) included with the hipFFT source depend on hipRAND, FFTW, and GoogleTest. 3. Build hipFFT. Run these commands from the `rocm-libraries/projects/hipfft` directory: To show all build options: ```bash mkdir build && cd build cmake -LH .. ``` Here are some CMake build examples for an AMD GPU: * Case: Build a project using HIP language APIs + hipFFT with a standard host compiler * Code: `cmake -DCMAKE_CXX_COMPILER=g++ -DCMAKE_BUILD_TYPE=Release -L ..` * Case: Build a project using HIP language APIs + hipFFT + device kernels with HIP-Clang * Code: `cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_BUILD_TYPE=Release -DBUILD_CLIENTS=ON -L ..` ```note The `-DBUILD_CLIENTS=ON` option is only allowed with the amdclang++ or HIPCC compilers. ``` ## Code Coverage You can generate a test coverage report with the following commands: ```bash cmake -DCMAKE_CXX_COMPILER=amdclang++ -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CODE_COVERAGE=ON .. make -j coverage ``` The commands above print the coverage report to the terminal and save an HTML coverage report to `$PWD/coverage-report`. Note that hipFFT uses LLVM for code coverage, which only works with Clang compilers.
## Porting from CUDA If you have existing CUDA code and want to transition to HIP, follow these steps: 1. [HIPIFY](https://github.com/ROCm-Developer-Tools/HIPIFY) your code and fix all unsupported CUDA features and user-defined macros 2. Build with HIP-Clang to run on an AMD device More information about porting to HIP is available in the [HIP porting guide](https://rocm.docs.amd.com/projects/HIP/en/develop/user_guide/hip_porting_guide.html). ## Support You can report bugs and feature requests through the rocm-libraries GitHub [issue tracker](https://github.com/ROCm/rocm-libraries/issues). ## Contribute If you want to contribute to hipFFT, you must follow our [contribution guidelines](https://github.com/ROCm/rocm-libraries/blob/develop/projects/hipfft/.github/CONTRIBUTING.md). hipFFT-rocm-7.1.0/clients/000077500000000000000000000000001506642153200152425ustar00rootroot00000000000000hipFFT-rocm-7.1.0/clients/CMakeLists.txt000066400000000000000000000110051506642153200177770ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # #############################################################################

# Top-level build script for the hipFFT clients (samples, tests, bench).
# It can be configured on its own, in which case the hipfft package is located
# with find_package(), or from a build that already defines the hipfft target.

# CMake version according to latest ROCm platform requirements
cmake_minimum_required( VERSION 3.16 )

# We use C++17 features, this will add compile option: -std=c++17
set( CMAKE_CXX_STANDARD 17 )
set(CMAKE_CXX_EXTENSIONS OFF)

# Consider removing this in the future
# This should appear before the project command, because it does not use FORCE
if( WIN32 )
  # CMAKE_BINARY_DIR is used here because PROJECT_BINARY_DIR is only set by a
  # project() call; when this file is the top-level script no project() has
  # run yet, and the default prefix would collapse to "/package".
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does
# not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Marker read by the client sub-directories (bench uses it to pick its output directory).
set( HIPFFT_CLIENTS_BUILD_SCOPE ON )

# This project may compile dependencies for clients
project( hipfft-clients LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

include( build-options )

# Refuse BUILD_CLIENTS=ON when the compiler path does not look like one of the
# device-capable compilers named in the error message.
if(NOT (CMAKE_CXX_COMPILER MATCHES ".*hipcc$"
        OR CMAKE_CXX_COMPILER MATCHES ".*clang\\+\\+"
        OR CMAKE_CXX_COMPILER MATCHES ".*nvcc"
        OR CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+" ) )
  if(BUILD_CLIENTS)
    message( FATAL_ERROR "Using BUILD_CLIENTS=ON requires a compiler capable of building device code (hipcc, clang, nvcc, nvc++)." )
  endif()
endif()

# This option only works for make/nmake and the ninja generators, but no reason it shouldn't be on
# all the time
# This tells cmake to create a compile_commands.json file that can be used with clang tooling or vim
set( CMAKE_EXPORT_COMPILE_COMMANDS ON )

# if hipfft is not a target, then we know clients are built separately from the library and we must
# search for the hipfft package
if( NOT TARGET hipfft )
  find_package( hipfft REQUIRED CONFIG PATHS )
endif( )

if( BUILD_CLIENTS_SAMPLES )
  add_subdirectory( samples )
endif( )

if( BUILD_CLIENTS_TESTS )
  find_package( GTest QUIET )
  include( ExternalProject )
  if( NOT GTest_FOUND )
    # No installed GoogleTest: download and build a static copy inside the build tree.
    set( GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/src/gtest/googletest/include )
    set( GTEST_LIBRARIES
      ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}
      ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX} )
    ExternalProject_Add( gtest
      URL https://github.com/google/googletest/archive/release-1.11.0.tar.gz
      URL_HASH SHA256=b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5
      PREFIX ${CMAKE_CURRENT_BINARY_DIR}
      CMAKE_ARGS -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} -DBUILD_SHARED_LIBS=OFF -DCMAKE_POSITION_INDEPENDENT_CODE=ON
      INSTALL_COMMAND ""
      BUILD_BYPRODUCTS ${GTEST_LIBRARIES} )
    # Sets source_dir and binary_dir in this scope; add_subdirectory( tests ) below inherits them.
    ExternalProject_Get_Property( gtest source_dir binary_dir )
  endif()
  add_subdirectory( tests )
endif( )

if( BUILD_CLIENTS_BENCH )
  add_subdirectory( bench )
endif( )
hipFFT-rocm-7.1.0/clients/bench/000077500000000000000000000000001506642153200163215ustar00rootroot00000000000000hipFFT-rocm-7.1.0/clients/bench/CMakeLists.txt000066400000000000000000000073401506642153200210650ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# #############################################################################

# Build script for the hipfft-bench executable.
# Reads BUILD_WITH_LIB, USE_HIPRAND, WARNING_FLAGS, HIPFFT_BUILD_SCOPE and
# HIPFFT_CLIENTS_BUILD_SCOPE, which are expected to be set by the including scope.
CMAKE_MINIMUM_REQUIRED(VERSION 3.16)

project( hipfft-clients-bench LANGUAGES CXX )

# Sources and headers of the benchmark; the array validator comes from the shared directory.
set( hipfft_bench_source bench.cpp ../../shared/array_validator.cpp )
set( hipfft_bench_includes bench.h ../../shared/array_validator.h )

add_executable( hipfft-bench ${hipfft_bench_source} ${hipfft_bench_includes} )

target_compile_options( hipfft-bench PRIVATE ${WARNING_FLAGS} )

set_target_properties( hipfft-bench PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON )

# NOTE(review): the four include-directory arguments appear as bare `$` in this copy of the
# file; they look like generator expressions whose contents were lost - restore them from
# the upstream file before building.
target_include_directories( hipfft-bench PRIVATE $ $ $ $ )

# Taken when the compiler path does not end in "/hipcc" or the compiler ID is Clang:
# HIP is located explicitly for the non-CUDA backend, while the CUDA backend only gets
# the NVIDIA platform define and the HIP include directories.
if((NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$") OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
  if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
    if( WIN32 )
      find_package( HIP CONFIG REQUIRED )
    else()
      find_package( HIP MODULE REQUIRED )
    endif()
    target_link_libraries( hipfft-bench PRIVATE hip::host hip::device )
  else()
    target_compile_definitions( hipfft-bench PRIVATE __HIP_PLATFORM_NVIDIA__)
    target_include_directories( hipfft-bench PRIVATE ${HIP_INCLUDE_DIRS})
  endif()
endif()

# Backend-specific compile/link flags and libraries.
if ( BUILD_WITH_LIB STREQUAL "CUDA" )
  if( CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+$" )
    target_compile_options( hipfft-bench PRIVATE -cuda -Xptxas=-w)
    target_link_options( hipfft-bench PRIVATE -cuda)
  else()
    # NOTE(review): the target architecture is hard-coded to sm_53 here.
    target_compile_options( hipfft-bench PRIVATE -arch sm_53 -gencode=arch=compute_53,code=sm_53 -Xptxas=-w)
  endif()
  target_link_libraries( hipfft-bench PRIVATE ${CUDA_LIBRARIES} )
else()
  # hiprand is only searched for when requested and not already found.
  if( USE_HIPRAND AND NOT hiprand_FOUND )
    find_package( hiprand REQUIRED )
  endif()
  if( USE_HIPRAND )
    target_link_libraries( hipfft-bench PRIVATE hip::hiprand )
  endif()
endif()

target_link_libraries( hipfft-bench PRIVATE hip::hipfft )

set_target_properties( hipfft-bench PROPERTIES CXX_EXTENSIONS NO )

# NOTE(review): this RUNTIME_OUTPUT_DIRECTORY value is replaced by the
# set_target_properties() call further down, which sets the same property again.
set_target_properties( hipfft-bench PROPERTIES RUNTIME_OUTPUT_DIRECTORY "${PROJECT_BINARY_DIR}/staging" )

# Choose the output directory relative to PROJECT_BINARY_DIR depending on which
# build scope marker is set.
if( HIPFFT_BUILD_SCOPE )
  set( BENCH_OUT_DIR "/../staging" )
elseif( HIPFFT_CLIENTS_BUILD_SCOPE )
  set( BENCH_OUT_DIR "/../bin" )
else()
  set( BENCH_OUT_DIR "/bin")
endif()
string( CONCAT BENCH_OUT_DIR "${PROJECT_BINARY_DIR}" ${BENCH_OUT_DIR} )

set_target_properties( hipfft-bench PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${BENCH_OUT_DIR} )

rocm_install(TARGETS hipfft-bench COMPONENT benchmarks)
hipFFT-rocm-7.1.0/clients/bench/bench.cpp000066400000000000000000000364271506642153200201200ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE.
// NOTE(review): the six #include directives below carry no header name in this copy of
// the file; they are reproduced unchanged and must be restored from the upstream source.
#include
#include
#include
#include
#include
#include

#include "bench.h"

#include "../../shared/CLI11.hpp"
#include "../../shared/client_except.h"
#include "../../shared/gpubuf.h"

// initialize static class member of hipfft_params
// NOTE(review): the std::vector element type is not visible in this copy - confirm it
// against the member declaration in hipfft_params.
std::vector hipfft_params::externally_managed_workareas = std::vector();

// Entry point of the hipfft-bench client.
//
// Parses the command line with CLI11 (either one --token string or the individual
// transform options), validates the resulting hipfft_params, checks that the problem
// fits in free device memory, creates the plan, fills the input buffers, executes the
// transform once untimed and then once per trial timed with HIP events, and prints the
// per-trial times followed by " ms".
//
// Returns app.exit(e) on a CLI parse error, EXIT_FAILURE when the token cannot be
// parsed, and EXIT_SUCCESS when the problem is skipped. Failing HIP calls, plan creation
// and execution are reported by throwing std::runtime_error; nothing in this function
// catches those exceptions.
int main(int argc, char* argv[])
{
    // This helps with mixing output of both wide and narrow characters to the screen
    std::ios::sync_with_stdio(false);

    // Control output verbosity:
    int verbose{};

    // hip Device number for running tests:
    // NOTE(review): parsed from --device but never applied; the hipSetDevice call below
    // is commented out.
    int deviceId{};

    // Number of performance trial samples
    int ntrial{};

    // FFT parameters:
    hipfft_params params;

    // Token string to fully specify fft params.
    std::string token;

    // Command-line parser.
    CLI::App app{"hipfft-bench command line options"};

    // Declare the supported options. Some option pointers are declared to track passed opts.
    // The --version flag is registered, but its handler is commented out, so passing it
    // has no effect here.
    app.add_flag("-v, --version", "Print queryable version information from the rocfft library")
        // ->each([](const std::string&) {
        //     char v[256];
        //     rocfft_get_version_string(v, 256);
        //     std::cout << "version " << v << std::endl;
        //     std::exit(EXIT_SUCCESS);
        // })
        ;

    CLI::Option* opt_token
        = app.add_option("--token", token, "Token to read FFT params from")->default_val("");

    // Group together options that conflict with --token
    auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token");
    non_token
        ->add_flag("--double", "Double precision transform (deprecated: use --precision double)")
        ->each([&](const std::string&) { params.precision = fft_precision_double; });
    non_token->excludes(opt_token);
    non_token
        ->add_option("-t, --transformType",
                     params.transform_type,
                     "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
                     "forward\n3) real inverse")
        ->default_val(fft_transform_type_complex_forward);
    non_token
        ->add_option("--auto_allocation",
                     params.auto_allocate,
                     "hipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
        ->default_val("default");
    non_token
        ->add_option(
            "--precision", params.precision, "Transform precision: single (default), double, half")
        ->excludes("--double");
    // The flag's callback switches the placement; the pointer is kept so the choice can
    // be echoed after parsing.
    CLI::Option* opt_not_in_place
        = non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)")
              ->each([&](const std::string&) { params.placement = fft_placement_notinplace; });
    non_token
        ->add_option("--itype",
                     params.itype,
                     "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) "
                     "hermitian interleaved\n4) hermitian planar")
        ->default_val(fft_array_type_unset);
    non_token
        ->add_option("--otype",
                     params.otype,
                     "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) "
                     "hermitian interleaved\n4) hermitian planar")
        ->default_val(fft_array_type_unset);
    CLI::Option* opt_length
        = non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3);
    non_token
        ->add_option("-b, --batchSize",
                     params.nbatch,
                     "If this value is greater than one, arrays will be used")
        ->default_val(1);
    CLI::Option* opt_istride = non_token->add_option("--istride", params.istride, "Input strides");
    CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides");
    // idist/odist echo their value from the option callback.
    non_token->add_option("--idist", params.idist, "Logical distance between input batches")
        ->default_val(0)
        ->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; });
    non_token->add_option("--odist", params.odist, "Logical distance between output batches")
        ->default_val(0)
        ->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; });
    CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset");
    CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset");

    // Options that may be combined with --token.
    app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
    app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0);
    app.add_option("-N, --ntrial", ntrial, "Trial size for the problem")
        ->default_val(1)
        ->each([&](const std::string& val) {
            std::cout << "Running profile with " << val << " samples\n";
        });
    // Default value is set in fft_params.h based on if device-side PRNG was enabled.
    app.add_option("-g, --inputGen",
                   params.igen,
                   "Input data generation:\n0) PRNG sequence (device)\n"
                   "1) PRNG sequence (host)\n"
                   "2) linearly-spaced sequence (device)\n"
                   "3) linearly-spaced sequence (host)");
    app.add_option("--isize", params.isize, "Logical size of input buffer");
    app.add_option("--osize", params.osize, "Logical size of output buffer");
    app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output");

    // Parse args and catch any errors here
    try
    {
        app.parse(argc, argv);
    }
    catch(const CLI::ParseError& e)
    {
        return app.exit(e);
    }

    if(!token.empty())
    {
        // A token overrides the individual options: read every parameter from it.
        std::cout << "Reading fft params from token:\n" << token << std::endl;

        try
        {
            params.from_token(token);
        }
        catch(...)
        {
            std::cout << "Unable to parse token." << std::endl;
            return EXIT_FAILURE;
        }
    }
    else
    {
        // Echo the options that were passed on the command line.
        if(*opt_not_in_place)
        {
            std::cout << "out-of-place\n";
        }
        else
        {
            std::cout << "in-place\n";
        }

        if(*opt_length)
        {
            std::cout << "length:";
            for(auto& i : params.length)
                std::cout << " " << i;
            std::cout << "\n";
        }

        if(*opt_istride)
        {
            std::cout << "istride:";
            for(auto& i : params.istride)
                std::cout << " " << i;
            std::cout << "\n";
        }
        if(*opt_ostride)
        {
            std::cout << "ostride:";
            for(auto& i : params.ostride)
                std::cout << " " << i;
            std::cout << "\n";
        }

        if(*opt_ioffset)
        {
            std::cout << "ioffset:";
            for(auto& i : params.ioffset)
                std::cout << " " << i;
            std::cout << "\n";
        }
        if(*opt_ooffset)
        {
            std::cout << "ooffset:";
            for(auto& i : params.ooffset)
                std::cout << " " << i;
            std::cout << "\n";
        }
    }

    std::cout << std::flush;

    // Fixme: set the device id properly after the IDs are synced
    // between hip runtime and rocm-smi.
    // HIP_V_THROW(hipSetDevice(deviceId), "set device failed!");

    params.validate();

    if(!params.valid(verbose))
    {
        throw std::runtime_error("Invalid parameters, add --verbose=1 for detail");
    }

    // NOTE(review): with a non-zero verbosity the token line is printed a second time
    // inside the block below.
    std::cout << "Token: " << params.token() << std::endl;
    if(verbose)
    {
        std::cout << params.str() << std::endl;
        std::cout << "Token: " << params.token() << std::endl;
    }

    // Check free and total available memory:
    size_t free  = 0;
    size_t total = 0;
    if(hipMemGetInfo(&free, &total) != hipSuccess)
        throw std::runtime_error("hipMemGetInfo failed");

    // First check: the parameter footprint plus the twiddle-table footprint.
    const auto raw_vram_footprint
        = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
    if(!vram_fits_problem(raw_vram_footprint, free))
    {
        std::cout << "SKIPPED: Problem size (" << raw_vram_footprint
                  << ") raw data too large for device.\n";
        return EXIT_SUCCESS;
    }

    // Second check: the footprint reported by params.vram_footprint(), which may itself
    // request a skip by throwing ROCFFT_SKIP.
    size_t vram_footprint = 0;
    try
    {
        vram_footprint = params.vram_footprint();
    }
    catch(ROCFFT_SKIP& e)
    {
        std::cout << "SKIPPED: " << e.msg << "\n";
        return EXIT_SUCCESS;
    }
    if(!vram_fits_problem(vram_footprint, free))
    {
        // NOTE(review): this message repeats the "raw data" wording of the first check
        // although the value printed comes from params.vram_footprint().
        std::cout << "SKIPPED: Problem size (" << vram_footprint
                  << ") raw data too large for device.\n";
        return EXIT_SUCCESS;
    }

    // Create plans:
    auto ret = params.create_plan();
    if(ret != fft_status_success)
        throw std::runtime_error("Plan creation failed");

    // Status of the most recent HIP call; reused throughout the rest of the function.
    hipError_t hip_rt;

    // GPU input buffer:
    // NOTE(review): the element types of the std::vector declarations in this function
    // are not visible in this copy of the file.
    auto        ibuffer_sizes = params.ibuffer_sizes();
    std::vector ibuffer(ibuffer_sizes.size());
    std::vector pibuffer(ibuffer_sizes.size());
    for(unsigned int i = 0; i < ibuffer.size(); ++i)
    {
        hip_rt = ibuffer[i].alloc(ibuffer_sizes[i]);
        if(hip_rt != hipSuccess)
            throw std::runtime_error("Creating input Buffer failed");
        // pibuffer holds the data pointers that are later handed to params.execute().
        pibuffer[i] = ibuffer[i].data();
    }

    // CPU-side input buffer
    std::vector ibuffer_cpu;

    // True when the selected generator is one of the two host-side generators.
    auto is_host_gen = (params.igen == fft_input_generator_host
                        || params.igen == fft_input_random_generator_host);

#ifdef USE_HIPRAND
    if(!is_host_gen)
    {
        // Input data:
        params.compute_input(ibuffer);

        if(verbose > 1)
        {
            // Copy input to CPU
            ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
            for(unsigned int idx = 0; idx < ibuffer.size(); ++idx)
            {
                HIP_V_THROW(hipMemcpy(ibuffer_cpu.at(idx).data(),
                                      ibuffer[idx].data(),
                                      ibuffer_sizes[idx],
                                      hipMemcpyDeviceToHost),
                            "hipMemcpy failed");
            }

            std::cout << "GPU input:\n";
            params.print_ibuffer(ibuffer_cpu);
        }
    }
#endif
    if(is_host_gen)
    {
        // Input data:
        ibuffer_cpu = allocate_host_buffer(params.precision, params.itype, params.isize);
        params.compute_input(ibuffer_cpu);

        if(verbose > 1)
        {
            std::cout << "GPU input:\n";
            params.print_ibuffer(ibuffer_cpu);
        }

        // Upload the host-generated input to the device buffers.
        for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx)
        {
            HIP_V_THROW(hipMemcpy(pibuffer[idx],
                                  ibuffer_cpu[idx].data(),
                                  ibuffer_cpu[idx].size(),
                                  hipMemcpyHostToDevice),
                        "hipMemcpy failed");
        }
    }

    // GPU output buffer:
    // For an in-place transform the output aliases the input buffers; otherwise separate
    // buffers are allocated in obuffer_data.
    std::vector  obuffer_data;
    std::vector* obuffer = &obuffer_data;
    if(params.placement == fft_placement_inplace)
    {
        obuffer = &ibuffer;
    }
    else
    {
        auto obuffer_sizes = params.obuffer_sizes();
        obuffer_data.resize(obuffer_sizes.size());
        for(unsigned int i = 0; i < obuffer_data.size(); ++i)
        {
            hip_rt = obuffer_data[i].alloc(obuffer_sizes[i]);
            if(hip_rt != hipSuccess)
                throw std::runtime_error("Creating output Buffer failed");
        }
    }
    std::vector pobuffer(obuffer->size());
    for(unsigned int i = 0; i < obuffer->size(); ++i)
    {
        pobuffer[i] = obuffer->at(i).data();
    }

    // Execute once before the timed loop; this run is not measured.
    auto res = params.execute(pibuffer.data(), pobuffer.data());
    if(res != fft_status_success)
        throw std::runtime_error("Execution failed");

    // Run the transform several times and record the execution time:
    std::vector gpu_time(ntrial);

    // NOTE(review): the two events are created here and no hipEventDestroy call appears
    // in this function.
    hipEvent_t start, stop;
    hip_rt = hipEventCreate(&start);
    if(hip_rt != hipSuccess)
        throw std::runtime_error("hipEventCreate failed");

    hip_rt = hipEventCreate(&stop);
    if(hip_rt != hipSuccess)
        throw std::runtime_error("hipEventCreate failed");
    for(size_t itrial = 0; itrial < gpu_time.size(); ++itrial)
    {
        // Regenerate (device) or re-upload (host) the input before each trial; this
        // happens before the start event is recorded, so it is outside the timed span.
#ifdef USE_HIPRAND
        // Compute input on default device
        if(!is_host_gen)
            params.compute_input(ibuffer);
#endif
        if(is_host_gen)
        {
            for(unsigned int idx = 0; idx < ibuffer_cpu.size(); ++idx)
            {
                HIP_V_THROW(hipMemcpy(pibuffer[idx],
                                      ibuffer_cpu[idx].data(),
                                      ibuffer_cpu[idx].size(),
                                      hipMemcpyHostToDevice),
                            "hipMemcpy failed");
            }
        }

        hip_rt = hipEventRecord(start);
        if(hip_rt != hipSuccess)
            throw std::runtime_error("hipEventRecord failed");

        res = params.execute(pibuffer.data(), pobuffer.data());

        hip_rt = hipEventRecord(stop);
        if(hip_rt != hipSuccess)
            throw std::runtime_error("hipEventRecord failed");
        hip_rt = hipEventSynchronize(stop);
        if(hip_rt != hipSuccess)
            throw std::runtime_error("hipEventSynchronize failed");

        // The execute status is only checked after the stop event has been synchronized.
        if(res != fft_status_success)
            throw std::runtime_error("Execution failed");

        float time;
        hip_rt = hipEventElapsedTime(&time, start, stop);
        if(hip_rt != hipSuccess)
            throw std::runtime_error("hipEventElapsedTime failed");
        gpu_time[itrial] = time;

        if(verbose > 2)
        {
            // Copy the result back to the host and print it.
            auto output = allocate_host_buffer(params.precision, params.otype, params.osize);
            for(unsigned int idx = 0; idx < output.size(); ++idx)
            {
                hip_rt = hipMemcpy(
                    output[idx].data(), pobuffer[idx], output[idx].size(), hipMemcpyDeviceToHost);
                if(hip_rt != hipSuccess)
                    throw std::runtime_error("hipMemcpy failed");
            }
            std::cout << "GPU output:\n";
            params.print_obuffer(output);
        }
    }

    // Report one time per trial on a single line.
    std::cout << "\nExecution gpu time:";
    for(const auto& i : gpu_time)
    {
        std::cout << " " << i;
    }
    std::cout << " ms" << std::endl;
}
hipFFT-rocm-7.1.0/clients/bench/bench.h000066400000000000000000000054151506642153200175560ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef HIPFFT_BENCH_H #define HIPFFT_BENCH_H #include "../hipfft_params.h" #include "hipfft/hipfft.h" #include // This is used to either wrap a HIP function call, or to explicitly check a variable // for an error condition. If an error occurs, we throw. 
// Note: std::runtime_error does not take unicode strings as input, so only strings // supported inline void hip_V_Throw(hipError_t res, const std::string& msg, size_t lineno, const std::string& fileName) { if(res != hipSuccess) { std::stringstream tmp; tmp << "HIP_V_THROWERROR< "; tmp << res; tmp << " > ("; tmp << fileName; tmp << " Line: "; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm << std::endl; throw std::runtime_error(errorm); } } inline void lib_V_Throw(hipfftResult res, const std::string& msg, size_t lineno, const std::string& fileName) { if(res != HIPFFT_SUCCESS) { std::stringstream tmp; tmp << "LIB_V_THROWERROR< "; tmp << res; tmp << " > ("; tmp << fileName; tmp << " Line: "; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm << std::endl; throw std::runtime_error(errorm); } } #define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__) #define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__) #endif // HIPFFT_BENCH_H hipFFT-rocm-7.1.0/clients/cmake/000077500000000000000000000000001506642153200163225ustar00rootroot00000000000000hipFFT-rocm-7.1.0/clients/cmake/FindFFTW.cmake000066400000000000000000000105451506642153200207000ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# #############################################################################

# Find module for FFTW 3.
#
# Inputs:
#   FFTW_ROOT (CMake or environment variable) - optional install prefix used as a hint.
#   Components FLOAT / SINGLE / DOUBLE        - precisions to look for.
# Outputs:
#   FFTW_FOUND         - set by find_package_handle_standard_args.
#   FFTW_INCLUDE_DIRS  - directory containing fftw3.h.
#   FFTW_LIBRARIES     - libraries found for the requested precisions, plus the
#                        matching omp or threads library when one is available.
#   FFTW_MULTITHREAD   - TRUE when an omp/threads library was found, or on Windows.

#if( FFTW_FIND_VERSION VERSION_LESS "3" )
#  message( FFTW_FIND_VERSION is ${FFTW_FIND_VERSION})
#  message( FATAL_ERROR "FindFFTW can not configure versions less than FFTW 3.0.0" )
#endif( )

find_path(FFTW_INCLUDE_DIRS
  NAMES fftw3.h
  HINTS
    ${FFTW_ROOT}/include
    $ENV{FFTW_ROOT}/include
  PATHS
    /usr/include
    /usr/local/include
)
mark_as_advanced( FFTW_INCLUDE_DIRS )

# message( STATUS "FFTW_FIND_COMPONENTS: ${FFTW_FIND_COMPONENTS}" )
# message( STATUS "FFTW_FIND_REQUIRED_FLOAT: ${FFTW_FIND_REQUIRED_FLOAT}" )
# message( STATUS "FFTW_FIND_REQUIRED_DOUBLE: ${FFTW_FIND_REQUIRED_DOUBLE}" )

set( FFTW_LIBRARIES "" )

# Variables that must all hold a found value for FFTW to count as found. Each requested
# precision adds its own find_library() result, so a missing component is detected on
# its own instead of depending on how the combined FFTW_LIBRARIES list evaluates.
set( _fftw_required_vars FFTW_INCLUDE_DIRS FFTW_LIBRARIES )

if( FFTW_FIND_REQUIRED_FLOAT OR FFTW_FIND_REQUIRED_SINGLE )
  find_library( FFTW_LIBRARIES_SINGLE
    NAMES fftw3f fftw3f-3 fftw3 fftw3-3
    HINTS
      ${FFTW_ROOT}/lib
      $ENV{FFTW_ROOT}/lib
    PATHS
      /usr/lib
      /usr/local/lib
    PATH_SUFFIXES
      x86_64-linux-gnu
    DOC "FFTW dynamic library single"
  )
  mark_as_advanced( FFTW_LIBRARIES_SINGLE )
  list( APPEND _fftw_required_vars FFTW_LIBRARIES_SINGLE )
  # Only append a real path: a "-NOTFOUND" entry in the middle of the list would not
  # make the list evaluate false and would end up on the link line.
  if( FFTW_LIBRARIES_SINGLE )
    list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_SINGLE} )
  endif()

  # Look for omp (preferred) or thread libraries. These are not a
  # hard requirement, but are nice to have to make FFTW run faster.
  find_library( FFTWF_OMP_LIBRARY fftw3f_omp )
  find_library( FFTWF_THREADS_LIBRARY fftw3f_threads )
  if( FFTWF_OMP_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTWF_OMP_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  elseif( FFTWF_THREADS_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTWF_THREADS_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  endif()
endif( )

if( FFTW_FIND_REQUIRED_DOUBLE )
  find_library( FFTW_LIBRARIES_DOUBLE
    NAMES fftw3
    HINTS
      ${FFTW_ROOT}/lib
      $ENV{FFTW_ROOT}/lib
    PATHS
      /usr/lib
      /usr/local/lib
    PATH_SUFFIXES
      x86_64-linux-gnu
    DOC "FFTW dynamic library double"
  )
  mark_as_advanced( FFTW_LIBRARIES_DOUBLE )
  list( APPEND _fftw_required_vars FFTW_LIBRARIES_DOUBLE )
  if( FFTW_LIBRARIES_DOUBLE )
    list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} )
  endif()

  # Look for omp (preferred) or thread libraries. These are not a
  # hard requirement, but are nice to have to make FFTW run faster.
  find_library( FFTW_OMP_LIBRARY fftw3_omp )
  find_library( FFTW_THREADS_LIBRARY fftw3_threads )
  if( FFTW_OMP_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTW_OMP_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  elseif( FFTW_THREADS_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTW_THREADS_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  endif()
endif( )

include( FindPackageHandleStandardArgs )
FIND_PACKAGE_HANDLE_STANDARD_ARGS( FFTW
  REQUIRED_VARS ${_fftw_required_vars} )
unset( _fftw_required_vars )

# assume the threads feature is always enabled on Windows, since it's
# not a separate library there
if( FFTW_FOUND AND WIN32 )
  set( FFTW_MULTITHREAD TRUE )
endif()

if( NOT FFTW_FOUND )
  message( STATUS "FindFFTW could not find all of the following fftw libraries" )
  message( STATUS "${FFTW_FIND_COMPONENTS}" )
else( )
  message(STATUS "FindFFTW configured variables:" )
  message(STATUS "FFTW_INCLUDE_DIRS: ${FFTW_INCLUDE_DIRS}" )
  message(STATUS "FFTW_LIBRARIES: ${FFTW_LIBRARIES}" )
endif()
hipFFT-rocm-7.1.0/clients/cmake/build-options.cmake000066400000000000000000000036751506642153200221270ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software.
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # #############################################################################

# This file is intended to be used in two ways; independently in a stand alone PROJECT
# and as part of a superbuild. If the file is included in a stand alone project, the
# variables are not expected to be preset, and this will produce options() in the GUI
# for the user to examine. If this file is included in a superbuild, the options will be
# presented in the superbuild GUI, but then passed into the ExternalProject as -D
# parameters, which would already define them.

# Each option is only declared when it is not already set to a true value, so a value
# passed in with -D is left untouched. All options default to OFF.

if( NOT BUILD_CLIENTS_TESTS )
  option( BUILD_CLIENTS_TESTS "Build hipFFT unit tests" OFF )
endif( )

if( NOT BUILD_CLIENTS_SAMPLES )
  option( BUILD_CLIENTS_SAMPLES "Build hipFFT samples" OFF )
endif( )

# The clients CMakeLists.txt also gates the bench sub-directory on BUILD_CLIENTS_BENCH;
# declare it here as well so a stand-alone configure exposes it like the other two.
if( NOT BUILD_CLIENTS_BENCH )
  option( BUILD_CLIENTS_BENCH "Build hipFFT benchmarks" OFF )
endif( )
hipFFT-rocm-7.1.0/clients/hipfft_params.h000066400000000000000000002057461506642153200202540ustar00rootroot00000000000000// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef HIPFFT_PARAMS_H #define HIPFFT_PARAMS_H #include #include #include #include #include "../shared/client_except.h" #include "../shared/concurrency.h" #include "../shared/fft_params.h" #include "../shared/hipfft_brick.h" #include "hipfft/hipfft.h" #include "hipfft/hipfftXt.h" #include #ifdef HIPFFT_MPI_ENABLE #include "hipfft/hipfftMp.h" #include #endif template && (std::is_same_v && ...), bool> = true> static void set_with_random_nonnegative_values(const std::string& token, T& val, Args&... 
args) { // using a hash of the token as random seed to avoid // dependencies on externally-defined variables std::hash hasher; std::ranlux24_base gen(hasher(token)); std::uniform_int_distribution dis(static_cast(0), std::numeric_limits::max()); val = dis(gen); ((args = dis(gen)), ...); return; } inline fft_status fft_status_from_hipfftparams(const hipfftResult_t val) { switch(val) { case HIPFFT_SUCCESS: return fft_status_success; case HIPFFT_INVALID_PLAN: case HIPFFT_ALLOC_FAILED: return fft_status_failure; case HIPFFT_INVALID_TYPE: case HIPFFT_INVALID_VALUE: case HIPFFT_INVALID_SIZE: case HIPFFT_INCOMPLETE_PARAMETER_LIST: case HIPFFT_INVALID_DEVICE: case HIPFFT_NOT_IMPLEMENTED: case HIPFFT_NOT_SUPPORTED: return fft_status_invalid_arg_value; case HIPFFT_INTERNAL_ERROR: case HIPFFT_EXEC_FAILED: case HIPFFT_SETUP_FAILED: case HIPFFT_UNALIGNED_DATA: case HIPFFT_PARSE_ERROR: return fft_status_failure; case HIPFFT_NO_WORKSPACE: return fft_status_invalid_work_buffer; default: return fft_status_failure; } } inline std::string hipfftResult_string(const hipfftResult_t val) { switch(val) { case HIPFFT_SUCCESS: return "HIPFFT_SUCCESS (0)"; case HIPFFT_INVALID_PLAN: return "HIPFFT_INVALID_PLAN (1)"; case HIPFFT_ALLOC_FAILED: return "HIPFFT_ALLOC_FAILED (2)"; case HIPFFT_INVALID_TYPE: return "HIPFFT_INVALID_TYPE (3)"; case HIPFFT_INVALID_VALUE: return "HIPFFT_INVALID_VALUE (4)"; case HIPFFT_INTERNAL_ERROR: return "HIPFFT_INTERNAL_ERROR (5)"; case HIPFFT_EXEC_FAILED: return "HIPFFT_EXEC_FAILED (6)"; case HIPFFT_SETUP_FAILED: return "HIPFFT_SETUP_FAILED (7)"; case HIPFFT_INVALID_SIZE: return "HIPFFT_INVALID_SIZE (8)"; case HIPFFT_UNALIGNED_DATA: return "HIPFFT_UNALIGNED_DATA (9)"; case HIPFFT_INCOMPLETE_PARAMETER_LIST: return "HIPFFT_INCOMPLETE_PARAMETER_LIST (10)"; case HIPFFT_INVALID_DEVICE: return "HIPFFT_INVALID_DEVICE (11)"; case HIPFFT_PARSE_ERROR: return "HIPFFT_PARSE_ERROR (12)"; case HIPFFT_NO_WORKSPACE: return "HIPFFT_NO_WORKSPACE (13)"; case HIPFFT_NOT_IMPLEMENTED: 
return "HIPFFT_NOT_IMPLEMENTED (14)"; case HIPFFT_NOT_SUPPORTED: return "HIPFFT_NOT_SUPPORTED (16)"; default: return "invalid hipfftResult"; } } class hipfft_params : public fft_params { public: // plan handles are pointers for rocFFT backend, and ints for cuFFT #ifdef __HIP_PLATFORM_AMD__ static constexpr hipfftHandle INVALID_PLAN_HANDLE = nullptr; #else static constexpr hipfftHandle INVALID_PLAN_HANDLE = -1; #endif hipfftHandle plan = INVALID_PLAN_HANDLE; // keep track of token to check when attempting to create new plan std::string current_token; // hipFFT has two ways to specify transform type - the hipfftType // enum, and separate hipDataType enums for input/output. // hipfftType has no way to express an fp16 transform, so // hipfft_transform_type will not be set in that case. std::optional hipfft_transform_type; hipDataType inputType = HIP_C_32F; hipDataType outputType = HIP_C_32F; int direction; std::vector int_length; std::vector int_inembed; std::vector int_onembed; std::vector ll_length; std::vector ll_inembed; std::vector ll_onembed; template struct many_api_layout_args { T *input_embed, *output_embed; T input_stride, output_stride, input_distance, output_distance; }; struct hipLibXtDesc_deleter { void operator()(hipLibXtDesc* d) { hipfftXtFree(d); } }; // allocated memory on devices for multi-GPU transforms - inplace // just uses xt_output std::unique_ptr xt_input; std::unique_ptr xt_output; // rocFFT brick decomposition for Xt memory - multi-GPU tests will // confirm that rocFFT's decomposition matches cuFFT's std::vector xt_inBricks; std::vector xt_outBricks; // backend library can write N worksize values for N GPUs, so // allocate a vector for that if necessary std::vector auto_allocated_worksizes; // if auto_allocate == fft_auto_allocation_off, the hipFFT plan(s) // will be provided with externally-managed work area(s): static std::vector externally_managed_workareas; size_t auto_allocated_extra_vram_footprint() const { return 
std::accumulate(auto_allocated_worksizes.begin(), auto_allocated_worksizes.end(), static_cast(0)); } static size_t externally_managed_extra_vram_footprint() { return std::accumulate(externally_managed_workareas.begin(), externally_managed_workareas.end(), static_cast(0), [](size_t total, const gpubuf& buf) { return total + buf.size(); }); } bool is_preventing_auto_allocation_at_generation() const { if(auto_allocate != fft_auto_allocation_off) return false; // Let hipFFT sometimes auto-allocate nonetheless so that tests cover its // ability to free resources (allocated at generation) when/if some // externally-managed workarea(s) are provided after plan generation // Note: this member function must return the same result even if called // more than once by a given instance, it must be stable for any instance return std::hash()(token()) % 2 == 1; } hipfft_params() = default; hipfft_params(const fft_params& p) : fft_params(p) { } hipfft_params(hipfft_params&& p) = default; hipfft_params& operator=(hipfft_params&& other) = default; ~hipfft_params() { free(); }; void free() { if(plan != INVALID_PLAN_HANDLE) { hipfftDestroy(plan); plan = INVALID_PLAN_HANDLE; } xt_input.reset(); xt_output.reset(); } size_t vram_footprint() override { size_t val = fft_params::vram_footprint(); // auto-allocated plans fail here if not enough VRAM, skip these tests try { if(create_plan() != fft_status_success) { throw std::runtime_error("Plan creation or struct setup failed"); } } catch(fft_params::work_buffer_alloc_failure& e) { val += auto_allocated_extra_vram_footprint(); val += externally_managed_extra_vram_footprint(); std::stringstream msg; msg << "Plan work buffer size (" << val << " bytes raw data) too large for device"; throw ROCFFT_SKIP{msg.str()}; } val += auto_allocated_extra_vram_footprint(); val += externally_managed_extra_vram_footprint(); return val; } fft_status setup_structs() { // set direction switch(transform_type) { case fft_transform_type_complex_forward: case 
fft_transform_type_real_forward: direction = HIPFFT_FORWARD; break; case fft_transform_type_complex_inverse: case fft_transform_type_real_inverse: direction = HIPFFT_BACKWARD; break; } // set i/o types and transform type switch(transform_type) { case fft_transform_type_complex_forward: case fft_transform_type_complex_inverse: { switch(precision) { case fft_precision_half: inputType = HIP_C_16F; outputType = HIP_C_16F; hipfft_transform_type.reset(); break; case fft_precision_single: inputType = HIP_C_32F; outputType = HIP_C_32F; hipfft_transform_type = HIPFFT_C2C; break; case fft_precision_double: inputType = HIP_C_64F; outputType = HIP_C_64F; hipfft_transform_type = HIPFFT_Z2Z; break; } break; } case fft_transform_type_real_forward: { switch(precision) { case fft_precision_half: inputType = HIP_R_16F; outputType = HIP_C_16F; hipfft_transform_type.reset(); break; case fft_precision_single: inputType = HIP_R_32F; outputType = HIP_C_32F; hipfft_transform_type = HIPFFT_R2C; break; case fft_precision_double: inputType = HIP_R_64F; outputType = HIP_C_64F; hipfft_transform_type = HIPFFT_D2Z; break; } break; } case fft_transform_type_real_inverse: { switch(precision) { case fft_precision_half: inputType = HIP_C_16F; outputType = HIP_R_16F; hipfft_transform_type.reset(); break; case fft_precision_single: inputType = HIP_C_32F; outputType = HIP_R_32F; hipfft_transform_type = HIPFFT_C2R; break; case fft_precision_double: inputType = HIP_C_64F; outputType = HIP_R_64F; hipfft_transform_type = HIPFFT_Z2D; break; } break; } default: throw std::runtime_error("Invalid transform type"); } int_length.resize(dim()); int_inembed.resize(dim()); int_onembed.resize(dim()); ll_length.resize(dim()); ll_inembed.resize(dim()); ll_onembed.resize(dim()); switch(dim()) { case 3: ll_inembed[2] = istride[1] / istride[2]; ll_onembed[2] = ostride[1] / ostride[2]; [[fallthrough]]; case 2: ll_inembed[1] = istride[0] / istride[1]; ll_onembed[1] = ostride[0] / ostride[1]; [[fallthrough]]; case 1: 
ll_inembed[0] = istride[dim() - 1]; ll_onembed[0] = ostride[dim() - 1]; break; default: throw std::runtime_error("Invalid dimension"); } for(size_t i = 0; i < dim(); ++i) { ll_length[i] = length[i]; int_length[i] = length[i]; int_inembed[i] = ll_inembed[i]; int_onembed[i] = ll_onembed[i]; } // reset auto_allocated_worksizes auto_allocated_worksizes.resize(get_num_used_gpus()); std::for_each(auto_allocated_worksizes.begin(), auto_allocated_worksizes.end(), [](decltype(auto_allocated_worksizes)::value_type& val) { val = 0; }); hipfftResult ret = HIPFFT_SUCCESS; return fft_status_from_hipfftparams(ret); } fft_status create_plan() override { // check if we need to make a new plan if(current_token == token()) { return fft_status_success; } else { if(plan != INVALID_PLAN_HANDLE) { hipfftDestroy(plan); plan = INVALID_PLAN_HANDLE; } } auto fft_ret = setup_structs(); if(fft_ret != fft_status_success) { return fft_ret; } hipfftResult ret{HIPFFT_INTERNAL_ERROR}; switch(get_create_type()) { case PLAN_Nd: { ret = create_plan_Nd(); break; } case PLAN_MANY: { ret = create_plan_many(); break; } case CREATE_MAKE_PLAN_Nd: { ret = create_make_plan_Nd(); break; } case CREATE_MAKE_PLAN_MANY: { ret = create_make_plan_many(); break; } case CREATE_MAKE_PLAN_MANY64: { ret = create_make_plan_many64(); break; } case CREATE_XT_MAKE_PLAN_MANY: { ret = create_xt_make_plan_many(); break; } default: { throw std::runtime_error("no valid plan creation type"); } } if(ret == HIPFFT_SUCCESS && auto_allocate == fft_auto_allocation_off) { ret = set_externally_managed_work_areas(); } // hipFFT can fail plan creation due to allocation failure - // tests are expecting a specific exception in that case, // because the test was unable to run. Doesn't mean the test // case failed. 
if(ret == HIPFFT_ALLOC_FAILED) { if(!final_attempt_at_plan_creation && externally_managed_extra_vram_footprint() > 0) { final_attempt_at_plan_creation = true; // device allocation(s) in externally_managed_workareas might be // larger than needed or even unnecessary for the instance of interest. // Free them up and try again before concluding. externally_managed_workareas.clear(); return create_plan(); } else { throw fft_params::work_buffer_alloc_failure( "plan create failed due to allocation failure", externally_managed_extra_vram_footprint() + auto_allocated_extra_vram_footprint()); } } // store token to check if plan was already made current_token = token(); return fft_status_from_hipfftparams(ret); } hipfftResult_t set_stream(hipStream_t stream) { if(plan == INVALID_PLAN_HANDLE) throw std::runtime_error("Plan must be created before setting a desired stream"); return hipfftSetStream(plan, stream); } void validate_fields() const override { validate_brick_volume(); // multi-process only works with batch-1 FFTs, as hipFFT has // no place in the API to communicate batch indexes for // bricks if(mp_lib != fft_mp_lib_none && nbatch > 1) throw std::runtime_error("multi-process FFTs require batch-1"); // if user provided decomposition if(!ifields.empty() || !ofields.empty()) { // then library-decomposed multi-GPU must not also be requested if(multiGPU > 1) throw std::runtime_error( "cannot request both library-decomposed GPU and user decomposition"); // count bricks per rank std::map rank_ibrick_count; std::map rank_obrick_count; for(const auto& b : ifields.front().bricks) rank_ibrick_count[b.rank]++; for(const auto& b : ofields.front().bricks) rank_obrick_count[b.rank]++; // make sure there's only one input/output brick per rank auto count_is_one = [](const std::pair& entry) { return entry.second == 1; }; if(!std::all_of(rank_ibrick_count.begin(), rank_ibrick_count.end(), count_is_one) || !std::all_of(rank_obrick_count.begin(), rank_obrick_count.end(), count_is_one)) 
throw std::runtime_error("multiple bricks per rank are not supported"); // also ensure that each input brick maps to an output on same rank if(rank_ibrick_count != rank_obrick_count) throw std::runtime_error("input and output bricks do not match up"); } } fft_status set_callbacks(void* load_cb_host, void* load_cb_data, void* store_cb_host, void* store_cb_data, size_t load_cb_shared_mem_bytes = 0, size_t store_cb_shared_mem_bytes = 0) override { if(run_callbacks) { if(!hipfft_transform_type) throw std::runtime_error("callbacks require a valid hipfftType"); hipfftResult ret{HIPFFT_EXEC_FAILED}; switch(*hipfft_transform_type) { case HIPFFT_R2C: ret = hipfftXtSetCallback(plan, &load_cb_host, HIPFFT_CB_LD_REAL, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_COMPLEX, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_LD_REAL, load_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_ST_COMPLEX, store_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_D2Z: ret = hipfftXtSetCallback( plan, &load_cb_host, HIPFFT_CB_LD_REAL_DOUBLE, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_COMPLEX_DOUBLE, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_LD_REAL_DOUBLE, load_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_ST_COMPLEX_DOUBLE, store_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_C2R: ret = 
hipfftXtSetCallback(plan, &load_cb_host, HIPFFT_CB_LD_COMPLEX, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback(plan, &store_cb_host, HIPFFT_CB_ST_REAL, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_LD_COMPLEX, load_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_ST_REAL, store_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_Z2D: ret = hipfftXtSetCallback( plan, &load_cb_host, HIPFFT_CB_LD_COMPLEX_DOUBLE, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_REAL_DOUBLE, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_LD_COMPLEX_DOUBLE, load_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_ST_REAL_DOUBLE, store_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_C2C: ret = hipfftXtSetCallback(plan, &load_cb_host, HIPFFT_CB_LD_COMPLEX, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_COMPLEX, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_LD_COMPLEX, load_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_ST_COMPLEX, store_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; case HIPFFT_Z2Z: ret = hipfftXtSetCallback( 
plan, &load_cb_host, HIPFFT_CB_LD_COMPLEX_DOUBLE, &load_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallback( plan, &store_cb_host, HIPFFT_CB_ST_COMPLEX_DOUBLE, &store_cb_data); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_LD_COMPLEX_DOUBLE, load_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); ret = hipfftXtSetCallbackSharedSize( plan, HIPFFT_CB_ST_COMPLEX_DOUBLE, store_cb_shared_mem_bytes); if(ret != HIPFFT_SUCCESS) return fft_status_from_hipfftparams(ret); break; default: throw std::runtime_error("Invalid execution type"); } } return fft_status_success; } virtual fft_status execute(void** in, void** out) override { return execute(in[0], out[0]); }; fft_status execute(void* ibuffer, void* obuffer) { hipfftResult ret{HIPFFT_EXEC_FAILED}; // if we're doing multi-GPU, we need to use ExecDescriptor // methods to execute. if(multiGPU > 1) { // rotate between generic ExecDescriptor and specific // ExecDescriptorX2Y functions by hashing token (for // stability across reruns of test cases) // // the specific functions are only for the main transform // types expressible through the hipfftType enum bool generic_ExecDescriptor = !hipfft_transform_type || std::hash()(token()) % 2; if(generic_ExecDescriptor) { ret = hipfftXtExecDescriptor(plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get(), direction); } else { switch(*hipfft_transform_type) { case HIPFFT_R2C: ret = hipfftXtExecDescriptorR2C( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get()); break; case HIPFFT_C2R: ret = hipfftXtExecDescriptorC2R( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get()); break; case HIPFFT_C2C: ret = hipfftXtExecDescriptorC2C( plan, placement == fft_placement_inplace ? 
xt_output.get() : xt_input.get(), xt_output.get(), direction); break; case HIPFFT_D2Z: ret = hipfftXtExecDescriptorD2Z( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get()); break; case HIPFFT_Z2D: ret = hipfftXtExecDescriptorZ2D( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get()); break; case HIPFFT_Z2Z: ret = hipfftXtExecDescriptorZ2Z( plan, placement == fft_placement_inplace ? xt_output.get() : xt_input.get(), xt_output.get(), direction); } } return fft_status_from_hipfftparams(ret); } // otherwise, we have two ways to execute in hipFFT - // hipfftExecFOO and hipfftXtExec // Transforms that aren't supported by the hipfftType enum // require using the Xt method, but otherwise we hash the // token to decide how to execute this FFT. we want test // cases to rotate between different execution APIs, but we also // need the choice of API to be stable across reruns of the // same test cases. if(!hipfft_transform_type || std::hash()(token()) % 2) { ret = hipfftXtExec(plan, ibuffer, obuffer, direction); } else { try { switch(*hipfft_transform_type) { case HIPFFT_R2C: ret = hipfftExecR2C( plan, (hipfftReal*)ibuffer, (hipfftComplex*)(placement == fft_placement_inplace ? ibuffer : obuffer)); break; case HIPFFT_D2Z: ret = hipfftExecD2Z(plan, (hipfftDoubleReal*)ibuffer, (hipfftDoubleComplex*)(placement == fft_placement_inplace ? ibuffer : obuffer)); break; case HIPFFT_C2R: ret = hipfftExecC2R( plan, (hipfftComplex*)ibuffer, (hipfftReal*)(placement == fft_placement_inplace ? ibuffer : obuffer)); break; case HIPFFT_Z2D: ret = hipfftExecZ2D(plan, (hipfftDoubleComplex*)ibuffer, (hipfftDoubleReal*)(placement == fft_placement_inplace ? ibuffer : obuffer)); break; case HIPFFT_C2C: ret = hipfftExecC2C( plan, (hipfftComplex*)ibuffer, (hipfftComplex*)(placement == fft_placement_inplace ? 
ibuffer : obuffer), direction); break; case HIPFFT_Z2Z: ret = hipfftExecZ2Z(plan, (hipfftDoubleComplex*)ibuffer, (hipfftDoubleComplex*)(placement == fft_placement_inplace ? ibuffer : obuffer), direction); break; default: throw std::runtime_error("Invalid execution type"); } } catch(const std::exception& e) { std::cerr << e.what() << std::endl; } catch(...) { std::cerr << "unknown exception in execute(void* ibuffer, void* obuffer)" << std::endl; } } return fft_status_from_hipfftparams(ret); } bool is_contiguous() const { // compute contiguous stride, dist and check that the actual // strides/dists match std::vector contiguous_istride = compute_stride(ilength(), {}, placement == fft_placement_inplace && transform_type == fft_transform_type_real_forward); std::vector contiguous_ostride = compute_stride(olength(), {}, placement == fft_placement_inplace && transform_type == fft_transform_type_real_inverse); if(istride != contiguous_istride || ostride != contiguous_ostride) return false; return compute_idist() == idist && compute_odist() == odist; } // stride is row-major like everything else in fft_params. 
brick // indexes/strides are col-major because those would normally be // passed to rocFFT directly static bool xt_desc_matches_brick(const hostbuf& field, const std::vector& stride, size_t dist, const hipXtDesc* desc, const std::vector& bricks, size_t elem_size, const char* dir) { // construct field stride that includes batch distance too, since // brick coordinates include it auto field_stride_cm = stride; std::reverse(field_stride_cm.begin(), field_stride_cm.end()); field_stride_cm.push_back(dist); std::atomic compare_err = false; std::atomic runtime_err = false; std::vector brick_hosts; brick_hosts.resize(bricks.size()); #ifdef _OPENMP #pragma omp parallel for num_threads(rocfft_concurrency()) #endif for(size_t i = 0; i < bricks.size(); ++i) { // copy the ith brick back to host memory rocfft_scoped_device device(desc->GPUs[i]); hostbuf& brick_host = brick_hosts[i]; brick_host.alloc(desc->size[i]); if(hipMemcpy(brick_host.data(), desc->data[i], brick_host.size(), hipMemcpyDeviceToHost) != hipSuccess) { runtime_err = true; continue; } // convert to row-major auto brick_length_rm = bricks[i].length(); std::reverse(brick_length_rm.begin(), brick_length_rm.end()); // start at brick origin auto brick_idx_rm = brick_length_rm; std::fill(brick_idx_rm.begin(), brick_idx_rm.end(), 0); do { auto brick_idx_cm = brick_idx_rm; std::reverse(brick_idx_cm.begin(), brick_idx_cm.end()); auto field_offset = bricks[i].field_offset(brick_idx_cm, field_stride_cm); auto brick_offset = bricks[i].brick_offset(brick_idx_cm); if(memcmp(brick_host.data_offset(brick_offset * elem_size), field.data_offset(field_offset * elem_size), elem_size) != 0) { compare_err = true; break; } } while(increment_rowmajor(brick_idx_rm, brick_length_rm)); } if(runtime_err) throw std::runtime_error("failed to memcpy brick back to host"); return !compare_err; } // call the hipFFT APIs to distribute data to multiple GPUs void multi_gpu_prepare(std::vector& ibuffer, std::vector& pibuffer, std::vector& pobuffer) 
override { if(multiGPU <= 1) return; // input data is on the device - copy it back to the host so // hipfftXtMemcpy can deal with it hostbuf input_host; input_host.alloc(ibuffer.front().size()); if(hipMemcpy(input_host.data(), ibuffer.front().data(), ibuffer.front().size(), hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("copy back to host failed"); // allocate data on the multiple GPUs if(placement == fft_placement_inplace) { hipLibXtDesc* xt_tmp = nullptr; if(hipfftXtMalloc(plan, &xt_tmp, HIPFFT_XT_FORMAT_INPLACE) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMalloc failed"); xt_output.reset(xt_tmp); xt_tmp = nullptr; if(hipfftXtMemcpy(plan, xt_output.get(), input_host.data(), HIPFFT_COPY_HOST_TO_DEVICE) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy failed"); pibuffer.clear(); std::copy_n(xt_output->descriptor->data, xt_output->descriptor->nGPUs, std::back_inserter(pibuffer)); pobuffer.clear(); } else { hipLibXtDesc* xt_tmp = nullptr; if(hipfftXtMalloc(plan, &xt_tmp, HIPFFT_XT_FORMAT_INPUT) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMalloc failed"); xt_input.reset(xt_tmp); xt_tmp = nullptr; if(hipfftXtMemcpy(plan, xt_input.get(), input_host.data(), HIPFFT_COPY_HOST_TO_DEVICE) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy failed"); if(hipfftXtMalloc(plan, &xt_tmp, HIPFFT_XT_FORMAT_OUTPUT) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMalloc failed"); xt_output.reset(xt_tmp); xt_tmp = nullptr; pibuffer.clear(); std::copy_n(xt_input->descriptor->data, xt_input->descriptor->nGPUs, std::back_inserter(pibuffer)); pobuffer.clear(); std::copy_n(xt_output->descriptor->data, xt_output->descriptor->nGPUs, std::back_inserter(pobuffer)); } // create bricks for this transform so we can confirm data layout hipLibXtDesc* compare_desc = placement == fft_placement_inplace ? 
xt_output.get() : xt_input.get(); xt_inBricks.resize(compare_desc->descriptor->nGPUs); xt_outBricks.resize(compare_desc->descriptor->nGPUs); set_io_bricks(ilength_cm(), olength_cm(), nbatch, xt_inBricks, xt_outBricks); // check cufftXtMemcpy versus hipfft's implementation if(!xt_desc_matches_brick(input_host, istride, idist, compare_desc->descriptor, xt_inBricks, var_size(precision, itype), "input")) throw std::runtime_error("Xt input does not match"); } // call the hipFFT APIs to gather the data back from the multiple GPUs virtual void multi_gpu_finalize(std::vector& obuffer, std::vector& pobuffer) override { if(multiGPU <= 1) return; // allocate a host buffer for hipFFTXtMemcpy's sake hostbuf output_host; output_host.alloc(obuffer.front().size()); if(hipfftXtMemcpy(plan, output_host.data(), xt_output.get(), HIPFFT_COPY_DEVICE_TO_HOST) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy failed"); // check cufftXtMemcpy versus hipfft's implementation if(placement == fft_placement_notinplace) { if(!xt_desc_matches_brick(output_host, ostride, odist, xt_output->descriptor, xt_outBricks, var_size(precision, otype), "output")) throw std::runtime_error("Xt output does not match"); } // copy final result back to device for comparison if(hipMemcpy(obuffer.front().data(), output_host.data(), obuffer.front().size(), hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("finalizing hipMemcpy failed"); pobuffer.clear(); pobuffer.push_back(obuffer.front().data()); } private: // hipFFT provides multiple ways to create FFT plans: // - hipfftPlan1d/2d/3d (combined allocate + init for specific dim) // - hipfftPlanMany (combined allocate + init with dim as param) // - hipfftCreate + hipfftMakePlan1d/2d/3d (separate alloc + init for specific dim) // - hipfftCreate + hipfftMakePlanMany (separate alloc + init with dim as param) // - hipfftCreate + hipfftMakePlanMany64 (separate alloc + init with dim as param, 64-bit) // - hipfftCreate + hipfftXtMakePlanMany (separate 
alloc + init with separate i/o/exec types) // // Rotate through the choices for better test coverage. enum PlanCreateAPI { PLAN_Nd, PLAN_MANY, CREATE_MAKE_PLAN_Nd, CREATE_MAKE_PLAN_MANY, CREATE_MAKE_PLAN_MANY64, CREATE_XT_MAKE_PLAN_MANY, }; // check that worksize estimates can be successfully queried with or without a valid plan hipfftResult_t check_worksize_estimate() { hipfftResult_t ret{HIPFFT_INTERNAL_ERROR}; if(!hipfft_transform_type) { throw std::runtime_error("Estimating worksize requires a valid type of transform"); } std::vector worksize_estimate(get_num_used_gpus(), absurd_init_worksize_estimate); switch(get_create_type()) { case CREATE_MAKE_PLAN_Nd: { switch(dim()) { case 1: if(plan == INVALID_PLAN_HANDLE) ret = hipfftEstimate1d( int_length[0], *hipfft_transform_type, nbatch, worksize_estimate.data()); else ret = hipfftGetSize1d(plan, int_length[0], *hipfft_transform_type, nbatch, worksize_estimate.data()); break; case 2: if(plan == INVALID_PLAN_HANDLE) ret = hipfftEstimate2d(int_length[0], int_length[1], *hipfft_transform_type, worksize_estimate.data()); else ret = hipfftGetSize2d(plan, int_length[0], int_length[1], *hipfft_transform_type, worksize_estimate.data()); break; case 3: if(plan == INVALID_PLAN_HANDLE) ret = hipfftEstimate3d(int_length[0], int_length[1], int_length[2], *hipfft_transform_type, worksize_estimate.data()); else ret = hipfftGetSize3d(plan, int_length[0], int_length[1], int_length[2], *hipfft_transform_type, worksize_estimate.data()); break; default: throw std::runtime_error("invalid dim"); } break; } case CREATE_MAKE_PLAN_MANY: { auto layout_args = make_valid_layout_args_for_plan_many(); if(plan == INVALID_PLAN_HANDLE) ret = hipfftEstimateMany(dim(), int_length.data(), layout_args.input_embed, layout_args.input_stride, layout_args.input_distance, layout_args.output_embed, layout_args.output_stride, layout_args.output_distance, *hipfft_transform_type, nbatch, worksize_estimate.data()); else ret = hipfftGetSizeMany(plan, dim(), 
int_length.data(), layout_args.input_embed, layout_args.input_stride, layout_args.input_distance, layout_args.output_embed, layout_args.output_stride, layout_args.output_distance, *hipfft_transform_type, nbatch, worksize_estimate.data()); break; } case CREATE_MAKE_PLAN_MANY64: { if(plan == INVALID_PLAN_HANDLE) { // no direct equivalent in estimate-fetching APIs std::for_each(worksize_estimate.begin(), worksize_estimate.end(), [](decltype(worksize_estimate)::value_type& val) { val = 0; }); ret = HIPFFT_SUCCESS; } else { auto layout_args = make_valid_layout_args_for_plan_many(); ret = hipfftGetSizeMany64(plan, dim(), ll_length.data(), layout_args.input_embed, layout_args.input_stride, layout_args.input_distance, layout_args.output_embed, layout_args.output_stride, layout_args.output_distance, *hipfft_transform_type, nbatch, worksize_estimate.data()); } break; } case CREATE_XT_MAKE_PLAN_MANY: { if(plan == INVALID_PLAN_HANDLE) { // no direct equivalent in estimate-fetching APIs std::for_each(worksize_estimate.begin(), worksize_estimate.end(), [](decltype(worksize_estimate)::value_type& val) { val = 0; }); ret = HIPFFT_SUCCESS; } else { auto executionType = get_xt_api_execution_type(); auto layout_args = make_valid_layout_args_for_plan_many(); ret = hipfftXtGetSizeMany(plan, dim(), ll_length.data(), layout_args.input_embed, layout_args.input_stride, layout_args.input_distance, inputType, layout_args.output_embed, layout_args.output_stride, layout_args.output_distance, outputType, nbatch, worksize_estimate.data(), executionType); } break; } case PLAN_Nd: case PLAN_MANY: default: { // should be indirectly disabled via get_create_type() return HIPFFT_INTERNAL_ERROR; } } // check that the value(s) of worksize_estimate were actually set, assuming that // setting a worksize_estimate equal to absurd_init_worksize_estimate by hipFFT // cannot be considered "correct". 
// Note: worksize_estimate value(s) are *not* guaranteed to be greater than or equal // to the actual value(s) of the work area(s), queriable after plan generation via // hipfftGetSize. if(ret == HIPFFT_SUCCESS) { // the estimate can't have any knowledge about the number of GPUs being used if // the plan wasn't created first const size_t num_values_to_check = plan == INVALID_PLAN_HANDLE ? 1 : worksize_estimate.size(); for(size_t idx = 0; ret == HIPFFT_SUCCESS && idx < num_values_to_check; idx++) { ret = worksize_estimate[idx] != absurd_init_worksize_estimate ? HIPFFT_SUCCESS : HIPFFT_INTERNAL_ERROR; } } return ret; } // provide a work area to a successfully generated plan hipfftResult_t set_externally_managed_work_areas() { std::vector req_workarea_sizes(get_num_used_gpus(), absurd_init_worksize_estimate); hipfftResult_t ret = hipfftGetSize(plan, req_workarea_sizes.data()); if(ret != HIPFFT_SUCCESS) { return ret; } else if(std::any_of(req_workarea_sizes.begin(), req_workarea_sizes.end(), [](const decltype(req_workarea_sizes)::value_type& val) { return val == absurd_init_worksize_estimate; })) { return HIPFFT_INTERNAL_ERROR; } // req_workarea_sizes are known and validated // check if the current externally_managed_workareas can be used as is or not if(externally_managed_workareas.size() < get_num_used_gpus()) externally_managed_workareas.resize(get_num_used_gpus()); std::vector workareas(get_num_used_gpus(), nullptr); for(auto workarea_idx = 0; workarea_idx < get_num_used_gpus(); workarea_idx++) { const auto req_size = req_workarea_sizes[workarea_idx]; auto& buf = externally_managed_workareas[workarea_idx]; if(buf.size() < req_size) { // too small, free and reallocate to meet current needs buf.free(); if(buf.alloc(req_size) != hipSuccess) { return HIPFFT_ALLOC_FAILED; } } workareas[workarea_idx] = buf.data(); } if(get_num_used_gpus() > 1) { // TODO: enable below once hipfftXtSetWorkArea is enabled #if(0) ret = hipfftXtSetWorkArea(plan, workareas.data); #else throw 
unimplemented_exception( "No implementation support for externally-managed work areas with multi-gpu usage"); #endif } else { ret = hipfftSetWorkArea(plan, workareas[0]); } if(ret == HIPFFT_SUCCESS) { // the above "SetWorkArea" frees auto_allocated worksizes (if any) auto_allocated_worksizes.clear(); } return ret; } // return true if we need to use hipFFT APIs that separate plan // allocation and plan init bool need_separate_create_make() const { // scale factor and multi-GPU and disabled auto-allocation need API // calls between create + init if(scale_factor != 1.0 || multiGPU > 1 || mp_lib != fft_mp_lib_none || auto_allocate == fft_auto_allocation_off) return true; return false; } template < typename T, std::enable_if_t || std::is_same_v, bool> = true> many_api_layout_args make_valid_layout_args_for_plan_many() { many_api_layout_args ret; if constexpr(std::is_same_v) { ret.input_embed = int_inembed.data(); ret.output_embed = int_onembed.data(); } else { ret.input_embed = ll_inembed.data(); ret.output_embed = ll_onembed.data(); } ret.input_stride = static_cast(istride.back()); ret.output_stride = static_cast(ostride.back()); ret.input_distance = static_cast(idist); ret.output_distance = static_cast(odist); if(is_using_default_layout()) { // If using a default layout, users can // (A) either set explicitly inembed, onembed, strides, and distances (like above); // (B) or use nullptr as arguments for inembed and onembed. Strides and // distances are supposed to be ignored in that case. // --> choose randomly between either valid usage when a default layout is // used, so that all possible valid use case scenarios are considered. const std::string test_token = token(); int randomizer; set_with_random_nonnegative_values(test_token, randomizer); if(randomizer % 2 == 0) { ret.input_embed = nullptr; ret.output_embed = nullptr; // FIXME: negative values are not truly ignored for now. 
set_with_random_nonnegative_values(test_token, ret.input_stride, ret.output_stride, ret.input_distance, ret.output_distance); } } return ret; } // Not all plan options work with all creation types. Return a // suitable plan creation type for the current FFT parameters. int get_create_type() { bool contiguous = is_contiguous(); bool batched = nbatch > 1; std::vector allowed_apis; // half-precision requires XtMakePlanMany if(precision == fft_precision_half) { allowed_apis.push_back(CREATE_XT_MAKE_PLAN_MANY); } else { // separate alloc + init "Many" APIs are always allowed allowed_apis.push_back(CREATE_MAKE_PLAN_MANY); allowed_apis.push_back(CREATE_MAKE_PLAN_MANY64); allowed_apis.push_back(CREATE_XT_MAKE_PLAN_MANY); if(!need_separate_create_make()) allowed_apis.push_back(PLAN_MANY); // non-many APIs are only allowed if FFT is contiguous, and // only the 1D API allows for batched FFTs. if(contiguous && (!batched || dim() == 1)) { if(!need_separate_create_make()) allowed_apis.push_back(PLAN_Nd); allowed_apis.push_back(CREATE_MAKE_PLAN_Nd); } } // hash the token to decide how to create this FFT. we want // test cases to rotate between different create APIs, but we // also need the choice of API to be stable across reruns of // the same test cases. 
return allowed_apis[std::hash()(token()) % allowed_apis.size()]; } // call hipfftPlan* functions hipfftResult_t create_plan_Nd() { auto ret = HIPFFT_INVALID_PLAN; switch(dim()) { case 1: ret = hipfftPlan1d(&plan, int_length[0], *hipfft_transform_type, nbatch); break; case 2: ret = hipfftPlan2d(&plan, int_length[0], int_length[1], *hipfft_transform_type); break; case 3: ret = hipfftPlan3d( &plan, int_length[0], int_length[1], int_length[2], *hipfft_transform_type); break; default: throw std::runtime_error("invalid dim"); } return ret; } hipfftResult_t create_plan_many() { auto layout_args = make_valid_layout_args_for_plan_many(); auto ret = hipfftPlanMany(&plan, dim(), int_length.data(), layout_args.input_embed, layout_args.input_stride, layout_args.input_distance, layout_args.output_embed, layout_args.output_stride, layout_args.output_distance, *hipfft_transform_type, nbatch); return ret; } // call hipfftCreate + hipfftMake* functions, inserting calls to // relevant pre-Make APIs (scale factor, XtSetGPUs) hipfftResult_t create_with_pre_make() { hipfftResult_t ret{HIPFFT_INVALID_PLAN}; if(auto_allocate == fft_auto_allocation_off) { ret = check_worksize_estimate(); // read worksize estimate before plan creation if(ret != HIPFFT_SUCCESS) return ret; } ret = hipfftCreate(&plan); if(ret != HIPFFT_SUCCESS) return ret; if(scale_factor != 1.0) { ret = hipfftExtPlanScaleFactor(plan, scale_factor); if(ret != HIPFFT_SUCCESS) return ret; } if(multiGPU > 1) { int deviceCount = 0; if(hipGetDeviceCount(&deviceCount) != hipSuccess) throw std::runtime_error("hipGetDeviceCount failed"); // ensure that users request less than or equal to the total number of devices if(static_cast(multiGPU) > deviceCount) throw std::runtime_error("not enough devices for requested multi-gpu computation!"); std::vector GPUs(multiGPU); std::iota(GPUs.begin(), GPUs.end(), 0); ret = hipfftXtSetGPUs(plan, static_cast(multiGPU), GPUs.data()); if(ret != HIPFFT_SUCCESS) return ret; } if(mp_lib == 
fft_mp_lib_mpi) { #ifdef HIPFFT_MPI_ENABLE ret = hipfftMpAttachComm(plan, HIPFFT_COMM_MPI, mp_comm); if(ret != HIPFFT_SUCCESS) return ret; int mpi_rank = 0; MPI_Comm_rank(*static_cast(mp_comm), &mpi_rank); const auto& in_bricks = ifields.front().bricks; const auto& out_bricks = ofields.front().bricks; // find the input/output brick for this rank auto curr_rank_brick = [mpi_rank](const fft_brick& b) { return b.rank == mpi_rank; }; auto in_brick = std::find_if(in_bricks.begin(), in_bricks.end(), curr_rank_brick); auto out_brick = std::find_if(out_bricks.begin(), out_bricks.end(), curr_rank_brick); if(in_brick != in_bricks.end() && out_brick != out_bricks.end()) { std::vector input_lower; std::vector input_upper; std::vector output_lower; std::vector output_upper; std::vector input_stride; std::vector output_stride; // convert brick info to long long int for hipFFT auto convert_intvec = [](const std::vector& in, std::vector& out) { // start with index 1 because hipFFT only wants to be // told about FFT dimensions, not batch dimension for(size_t i = 1; i < in.size(); ++i) out.push_back(static_cast(in[i])); }; convert_intvec(in_brick->lower, input_lower); convert_intvec(in_brick->upper, input_upper); convert_intvec(out_brick->lower, output_lower); convert_intvec(out_brick->upper, output_upper); convert_intvec(in_brick->stride, input_stride); convert_intvec(out_brick->stride, output_stride); ret = hipfftXtSetDistribution(plan, static_cast(dim()), input_lower.data(), input_upper.data(), output_lower.data(), output_upper.data(), input_stride.data(), output_stride.data()); } if(ret != HIPFFT_SUCCESS) return ret; #else throw std::runtime_error("MPI is not enabled"); #endif } if(auto_allocate == fft_auto_allocation_off) { ret = check_worksize_estimate(); // read worksize estimate again after plan creation if(ret != HIPFFT_SUCCESS) return ret; } if(is_preventing_auto_allocation_at_generation()) { ret = hipfftSetAutoAllocation(plan, 0); } return ret; } hipfftResult_t 
create_make_plan_Nd() { auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented std::vector tmp_worksize(get_num_used_gpus()); size_t* worksize_ptr = is_preventing_auto_allocation_at_generation() ? tmp_worksize.data() : auto_allocated_worksizes.data(); switch(dim()) { case 1: return hipfftMakePlan1d( plan, int_length[0], *hipfft_transform_type, nbatch, worksize_ptr); case 2: return hipfftMakePlan2d( plan, int_length[0], int_length[1], *hipfft_transform_type, worksize_ptr); case 3: return hipfftMakePlan3d(plan, int_length[0], int_length[1], int_length[2], *hipfft_transform_type, worksize_ptr); default: throw std::runtime_error("invalid dim"); } } hipfftResult_t create_make_plan_many() { auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented std::vector tmp_worksize(get_num_used_gpus()); size_t* worksize_ptr = is_preventing_auto_allocation_at_generation() ? tmp_worksize.data() : auto_allocated_worksizes.data(); auto layout_args = make_valid_layout_args_for_plan_many(); return hipfftMakePlanMany(plan, dim(), int_length.data(), layout_args.input_embed, layout_args.input_stride, layout_args.input_distance, layout_args.output_embed, layout_args.output_stride, layout_args.output_distance, *hipfft_transform_type, nbatch, worksize_ptr); } hipfftResult_t create_make_plan_many64() { auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented std::vector tmp_worksize(get_num_used_gpus()); size_t* worksize_ptr = is_preventing_auto_allocation_at_generation() ? 
tmp_worksize.data() : auto_allocated_worksizes.data(); auto layout_args = make_valid_layout_args_for_plan_many(); return hipfftMakePlanMany64(plan, dim(), ll_length.data(), layout_args.input_embed, layout_args.input_stride, layout_args.input_distance, layout_args.output_embed, layout_args.output_stride, layout_args.output_distance, *hipfft_transform_type, nbatch, worksize_ptr); } hipDataType get_xt_api_execution_type() const { // execution type is always complex, matching the precision // of the transform // Initializing as double by default hipDataType ret = HIP_C_64F; switch(precision) { case fft_precision_half: ret = HIP_C_16F; break; case fft_precision_single: ret = HIP_C_32F; break; case fft_precision_double: ret = HIP_C_64F; break; default: throw std::runtime_error("Invalid precision"); } return ret; } hipfftResult_t create_xt_make_plan_many() { auto ret = create_with_pre_make(); if(ret != HIPFFT_SUCCESS) return ret; // do not register plan's worksizes as "auto-allocated" if auto-allocation was explicitly prevented std::vector tmp_worksize(get_num_used_gpus()); size_t* worksize_ptr = is_preventing_auto_allocation_at_generation() ? tmp_worksize.data() : auto_allocated_worksizes.data(); auto executionType = get_xt_api_execution_type(); auto layout_args = make_valid_layout_args_for_plan_many(); return hipfftXtMakePlanMany(plan, dim(), ll_length.data(), layout_args.input_embed, layout_args.input_stride, layout_args.input_distance, inputType, layout_args.output_embed, layout_args.output_stride, layout_args.output_distance, outputType, nbatch, worksize_ptr, executionType); } static constexpr size_t absurd_init_worksize_estimate = std::numeric_limits::max(); bool final_attempt_at_plan_creation = false; size_t get_num_used_gpus() const { return multiGPU > 1 ? multiGPU : 1; }; }; #endif hipFFT-rocm-7.1.0/clients/hipfftw_helper.h000066400000000000000000001460661506642153200204360ustar00rootroot00000000000000// Copyright (C) 2025 Advanced Micro Devices, Inc. 
All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef HIPFFTW_HELPER_H #define HIPFFTW_HELPER_H #include "../shared/environment.h" #include "../shared/fft_params.h" #include #include #include #include #include #include #include #ifdef WIN32 #include // psapi.h requires windows.h to be included first #include typedef HMODULE LIB_HANDLE_T; #else #include #include typedef void* LIB_HANDLE_T; #endif template struct hipfftw_trait; template <> struct hipfftw_trait { using plan_t = fftwf_plan; using complex_t = fftwf_complex; using real_t = float; }; template <> struct hipfftw_trait { using plan_t = fftw_plan; using complex_t = fftw_complex; using real_t = double; }; template using hipfftw_real_t = typename hipfftw_trait::real_t; template using hipfftw_complex_t = typename hipfftw_trait::complex_t; template using hipfftw_plan_t = typename hipfftw_trait::plan_t; // singleton class encapsulating the dynamically-loaded hipfftw library class dynamically_loaded_hipfftw { private: LIB_HANDLE_T lib_handle; std::ostringstream load_error_info; dynamically_loaded_hipfftw() { #ifdef __HIP_PLATFORM_AMD__ const std::string lib_basename = "hipfftw"; #else const std::string lib_basename = "cufftw"; #endif #ifdef WIN32 const std::string lib_fullame = lib_basename + ".dll"; lib_handle = LoadLibraryA(lib_fullame.c_str()); #else const std::string lib_fullame = "lib" + lib_basename + ".so"; lib_handle = dlopen(lib_fullame.c_str(), RTLD_LAZY); #endif load_error_info.clear(); if(!lib_handle) { load_error_info << "failed to open library " << lib_fullame; #ifdef WIN32 load_error_info << ". System's error code = " << GetLastError(); #else load_error_info << ". 
System's error message = " << dlerror(); #endif // do not throw from here to ease exception handling } } /* disable copies and moves */ dynamically_loaded_hipfftw(const dynamically_loaded_hipfftw&) = delete; dynamically_loaded_hipfftw(dynamically_loaded_hipfftw&&) = delete; dynamically_loaded_hipfftw& operator=(const dynamically_loaded_hipfftw&) = delete; dynamically_loaded_hipfftw& operator=(dynamically_loaded_hipfftw&&) = delete; static const dynamically_loaded_hipfftw& get_instance() { static dynamically_loaded_hipfftw singleton_instance; return singleton_instance; } public: static LIB_HANDLE_T get_lib() { return get_instance().lib_handle; } static std::string get_load_error_info() { return get_instance().load_error_info.str(); } ~dynamically_loaded_hipfftw() { if(lib_handle) { #ifdef WIN32 (void)FreeLibrary(lib_handle); #else (void)dlclose(lib_handle); #endif } lib_handle = nullptr; } }; // exception specific to issues when loading hipfftw and/or when fetching // the address of the supposedly-available functions therefrom struct hipfftw_undefined_function_ptr : std::runtime_error { using std::runtime_error::runtime_error; }; // helper struct for retrieving a function's return type template struct func_ret; template struct func_ret { using type = R; }; template using func_ret_t = typename func_ret::type; template , bool> = true> struct dynamically_loaded_function_t { private: // address of the desired function, to be fetched from a dynamically loaded shared library func_type* func_ptr; // address of the reference function (from linked fftw3) func_type* const reference_func_ptr; // symbol of said function std::string func_symbol; public: dynamically_loaded_function_t(const char* symbol, func_type* ref_func_address) : func_ptr(nullptr) , reference_func_ptr(ref_func_address) , func_symbol(symbol) { } // forwarding functional calls template func_ret_t operator()(Args... 
args) const { if(!may_be_used()) throw hipfftw_undefined_function_ptr(dynamically_loaded_hipfftw::get_load_error_info()); return func_ptr(args...); } template func_ret_t call(Args... args) const { if constexpr(!call_reference) { return this->operator()(args...); } else { if(!reference_func_ptr) throw hipfftw_undefined_function_ptr( "Ill-defined reference function pointer for symbol " + func_symbol); return reference_func_ptr(args...); } // unreachable } void load_implementation() { const auto hipfftw_lib = dynamically_loaded_hipfftw::get_lib(); if(!hipfftw_lib) { // make func_ptr unambiguously unset to force the dedicated exception // to be thrown at forwarded functional call(s) func_ptr = nullptr; return; } #ifdef WIN32 func_ptr = reinterpret_cast(GetProcAddress(hipfftw_lib, func_symbol.c_str())); #else func_ptr = reinterpret_cast(dlsym(hipfftw_lib, func_symbol.c_str())); #endif } bool may_be_used() const { return func_ptr != nullptr; } std::string get_symbol() const { return func_symbol; } }; template static void load_implementations(dynamically_loaded_function_t& first, Args&... 
others) { first.load_implementation(); if constexpr(sizeof...(others) > 0) load_implementations(others...); } // define singleton structures encapsulating all the hipfftw function // pointers (one specialization per supported precision) template struct hipfftw_funcs; #define HIPFFTW_STRINGIFY(x) #x #define HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, func) \ dynamically_loaded_function_t func \ = dynamically_loaded_function_t(HIPFFTW_STRINGIFY(prefix##func), \ &(prefix##func)); #define HIPFFTW_FUNCS_SPECIALIZATION(prefix, specialization) \ template <> \ struct hipfftw_funcs \ { \ private: \ hipfftw_funcs() \ { \ load_implementations(malloc, \ alloc_real, \ alloc_complex, \ free, \ destroy_plan, \ cleanup, \ execute, \ plan_dft_1d, \ plan_dft_2d, \ plan_dft_3d, \ plan_dft, \ plan_dft_r2c_1d, \ plan_dft_r2c_2d, \ plan_dft_r2c_3d, \ plan_dft_r2c, \ plan_dft_c2r_1d, \ plan_dft_c2r_2d, \ plan_dft_c2r_3d, \ plan_dft_c2r, \ print_plan, \ set_timelimit, \ cost, \ flops); \ } \ /* disable copies and moves */ \ hipfftw_funcs(const hipfftw_funcs&) = delete; \ hipfftw_funcs& operator=(const hipfftw_funcs&) = delete; \ hipfftw_funcs(hipfftw_funcs&&) = delete; \ hipfftw_funcs& operator=(hipfftw_funcs&&) = delete; \ \ public: \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, malloc) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, alloc_real) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, alloc_complex) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, free) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, destroy_plan) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, cleanup) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, execute) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_1d) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_2d) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_3d) \ 
HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_r2c_1d) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_r2c_2d) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_r2c_3d) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_r2c) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_c2r_1d) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_c2r_2d) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_c2r_3d) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, plan_dft_c2r) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, print_plan) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, set_timelimit) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, cost) \ HIPFFTW_DECLARE_DYNAMICALLY_LOADED_FUNCTION_POINTER(prefix, flops) \ static const hipfftw_funcs& get_instance() \ { \ static const hipfftw_funcs instance; \ return instance; \ } \ } HIPFFTW_FUNCS_SPECIALIZATION(fftwf_, fft_precision_single); HIPFFTW_FUNCS_SPECIALIZATION(fftw_, fft_precision_double); // structure enabling verbosity for hipfftw's exception handler and redirecting std::cerr // to a runtime buffer throughout its lifetime (unless it was already enabled prior/externally) struct hipfftw_exception_logger { bool active; std::stringstream buffer; std::streambuf* const original_cerr_rdbuf = nullptr; std::unique_ptr hipfftw_temp_logger_env; public: hipfftw_exception_logger() : active(false) , original_cerr_rdbuf(std::cerr.rdbuf()) { #ifdef __HIP_PLATFORM_AMD__ const auto env_val = rocfft_getenv("HIPFFTW_LOG_EXCEPTIONS"); // activate temporary redirection only if not already used otherwise // (e.g., in test user's environment ) if(env_val.empty() || std::stoull(env_val) == 0) { hipfftw_temp_logger_env = std::make_unique("HIPFFTW_LOG_EXCEPTIONS", "1"); const auto 
temp_env_val = rocfft_getenv("HIPFFTW_LOG_EXCEPTIONS"); active = !temp_env_val.empty() && std::stoull(temp_env_val) != 0; } #endif if(active) std::cerr.rdbuf(buffer.rdbuf()); } hipfftw_exception_logger(const hipfftw_exception_logger&) = delete; hipfftw_exception_logger(hipfftw_exception_logger&&) = delete; hipfftw_exception_logger& operator=(const hipfftw_exception_logger&) = delete; hipfftw_exception_logger& operator=(hipfftw_exception_logger&&) = delete; ~hipfftw_exception_logger() { if(active) { // restore cerr to its original state std::cerr.rdbuf(original_cerr_rdbuf); } } bool is_active() const { return active; } std::string get_log() const { return buffer.str(); } }; // bit-flagging enum used for representing (combinations of) plan creation // function(s) to consider enum hipfftw_plan_creation_func : unsigned { NONE = 0x0, // not to be used (exceptfor validating values) PLAN_DFT_ND = 0x1 << 0, PLAN_DFT = 0x1 << 1, PLAN_MANY = 0x1 << 2, PLAN_GURU = 0x1 << 3, PLAN_GURU64 = 0x1 << 4, ANY = PLAN_DFT_ND | PLAN_DFT | PLAN_MANY | PLAN_GURU | PLAN_GURU64 }; static const std::vector hipfftw_plan_creation_func_candidates = {hipfftw_plan_creation_func::PLAN_DFT_ND, hipfftw_plan_creation_func::PLAN_DFT, hipfftw_plan_creation_func::PLAN_MANY, hipfftw_plan_creation_func::PLAN_GURU, hipfftw_plan_creation_func::PLAN_GURU64}; static bool hipfftw_creation_options_are_well_defined(hipfftw_plan_creation_func creation_options) { return creation_options == (creation_options & hipfftw_plan_creation_func::ANY); } static std::string hipfftw_creation_options_to_string(hipfftw_plan_creation_func creation_options, fft_transform_type dft_type, int intended_rank) { if(!hipfftw_creation_options_are_well_defined(creation_options)) throw std::invalid_argument( "invalid creation_options for hipfftw_creation_options_to_string"); if(creation_options == hipfftw_plan_creation_func::NONE) return "none"; if(creation_options == hipfftw_plan_creation_func::ANY) return "any"; 
if(std::find(hipfftw_plan_creation_func_candidates.begin(), hipfftw_plan_creation_func_candidates.end(), creation_options) == hipfftw_plan_creation_func_candidates.end()) { // 2 or more qualifying candidates flagged in creation_options std::string ret; for(auto candidate : hipfftw_plan_creation_func_candidates) { if(creation_options & candidate) { if(!ret.empty()) ret += "_or_"; ret += hipfftw_creation_options_to_string(candidate, dft_type, intended_rank); } } return ret; } // creation_options is one unique qualifying candidate std::ostringstream ret; const std::string real_or_empty_qualifier = is_real(dft_type) ? (is_fwd(dft_type) ? "_r2c" : "_c2r") : ""; switch(creation_options) { case hipfftw_plan_creation_func::PLAN_DFT_ND: ret << "plan_dft" << real_or_empty_qualifier << "_" << (intended_rank < 0 ? "negative" : "") << std::abs(intended_rank) << "d"; break; case hipfftw_plan_creation_func::PLAN_DFT: ret << "plan_dft" << real_or_empty_qualifier; break; case hipfftw_plan_creation_func::PLAN_MANY: ret << "plan_many_dft" << real_or_empty_qualifier; break; case hipfftw_plan_creation_func::PLAN_GURU: ret << "plan_guru_dft" << real_or_empty_qualifier; break; case hipfftw_plan_creation_func::PLAN_GURU64: ret << "plan_guru64_dft" << real_or_empty_qualifier; break; default: throw std::runtime_error("hipfftw_creation_options_to_string: internal error encountered " "(unexpected value for creation_options)"); break; } return ret.str(); } template < fft_precision prec, std::enable_if_t = true> struct hipfftw_plan_bundle_t { private: const decltype(hipfftw_funcs::destroy_plan)& plan_destructor; public: hipfftw_plan_t plan; std::pair creation_io; // not owned hipfftw_plan_creation_func creation_func; std::string plan_token; // <-- plan details, except for creation io data pointers hipfftw_plan_bundle_t(decltype(plan_destructor) plan_destructor_func) : plan_destructor(plan_destructor_func) , plan(nullptr) , creation_io({nullptr, nullptr}) , 
creation_func(hipfftw_plan_creation_func::NONE) , plan_token("") { } ~hipfftw_plan_bundle_t() { // make sure the plan destructor may be used to avoid // throwing from the hipfftw_plan_bundle_t destructor if(plan_destructor.may_be_used()) { // should be stable even if plan == nullptr; plan_destructor(plan); } else if(plan) { std::cerr << "WARNING: A " << (prec == fft_precision_single ? "single" : "double") << "-precision plan was seemingly created but its destructor cannot be used " << std::endl; } } // disable copies and moves hipfftw_plan_bundle_t(const hipfftw_plan_bundle_t&) = delete; hipfftw_plan_bundle_t& operator=(const hipfftw_plan_bundle_t&) = delete; hipfftw_plan_bundle_t(hipfftw_plan_bundle_t&&) = delete; hipfftw_plan_bundle_t& operator=(hipfftw_plan_bundle_t&&) = delete; }; static bool rank_is_valid_for_hipfftw(int r) { return r > 0; } template , bool> = true> static bool lengths_are_valid_for_hipfftw_as(const std::vector len, int intended_rank) { if(!rank_is_valid_for_hipfftw(intended_rank)) return false; // impossible to validate lengths for an invalid rank // check that lengths are all strictly positive and representable with // type T without data loss return len.size() == intended_rank && std::all_of(len.begin(), len.end(), [](const decltype(len)::value_type& val) { return val > 0 && val <= std::numeric_limits::max(); }); } static bool sign_is_valid_for_hipfftw(int s, const fft_transform_type& dft_kind) { if(is_real(dft_kind)) return true; // sign is irrelevant for real transforms return s == (is_fwd(dft_kind) ? 
FFTW_FORWARD : FFTW_BACKWARD); } static constexpr unsigned hipfftw_valid_flags_mask = FFTW_WISDOM_ONLY | FFTW_MEASURE | FFTW_DESTROY_INPUT | FFTW_UNALIGNED | FFTW_CONSERVE_MEMORY | FFTW_EXHAUSTIVE | FFTW_PRESERVE_INPUT | FFTW_PATIENT | FFTW_ESTIMATE; static bool flags_are_valid_for_hipfftw(unsigned f) { return (f & hipfftw_valid_flags_mask) == f; } template < fft_precision prec, std::enable_if_t = true> struct hipfftw_helper { private: // plan_bundle stores information about the latest plan possibly created by this // object. A shard_ptr is used to make hipfftw_helper safe w.r.t. shallow // copies (as required by gtest for parameterized tests). // This member is also made mutable so we can release/create it even from a // const-qualified objects (e.g., to release owned resources upon test completion, // or to re-create the plan at execution if needed or found necessary) mutable std::shared_ptr> plan_bundle; fft_transform_type dft_kind; int rank = 0; std::vector lengths; fft_result_placement plan_placement; int sign = 0; unsigned flags = std::numeric_limits::max(); template void reset_member_value(T& member, const T& new_value) { if(new_value != member) { member = new_value; plan_bundle.reset(); } } hipfftw_plan_creation_func get_creation_func(hipfftw_plan_creation_func creation_options) const { if(!hipfftw_creation_options_are_well_defined(creation_options)) throw std::invalid_argument("invalid creation_options for get_creation_func"); if(!can_use_creation_options(creation_options)) { // e.g., rank < 0 with creation_options == hipfftw_plan_creation_func::PLAN_DFT_ND throw std::invalid_argument( "The plan creation options " + hipfftw_creation_options_to_string(creation_options, dft_kind, rank) + " cannot be used with this object"); } std::vector valid_candidates; for(auto candidate : hipfftw_plan_creation_func_candidates) { if(!(creation_options & candidate)) continue; // candidate is not in given creation_options if(can_use_creation_options(candidate)) { // If 
creation_options != candidate for all candidates, creation_optionsactually // combines 2 or more candidates --> only the candidates actually supporting plan // creation will be considered "valid". If there exists one (usable) candidate s.t. // creation_options == candidate however, this choice is considered "enforced" // (e.g. for function-specific argument validation testing purposes) if(creation_options == candidate || can_create_plan_with(candidate)) valid_candidates.push_back(candidate); } } if(valid_candidates.empty()) return hipfftw_plan_creation_func::NONE; // "randomly" (yet reproducibly) choose return valid_candidates[std::hash()(token()) % valid_candidates.size()]; } template hipfftw_plan_t make_plan(void* in, void* out, hipfftw_plan_creation_func chosen_creation) const { if(std::find(hipfftw_plan_creation_func_candidates.begin(), hipfftw_plan_creation_func_candidates.end(), chosen_creation) == hipfftw_plan_creation_func_candidates.end()) { throw std::invalid_argument("Invalid chosen_creation for hipfftw_helper::make_plan"); } // fetch/infer plan creation function arguments const auto& hipfftw_impl = hipfftw_funcs::get_instance(); const auto int_len = get_length_as(); const int* int_len_ptr = int_len.empty() ? 
nullptr : int_len.data(); switch(chosen_creation) { case hipfftw_plan_creation_func::PLAN_DFT_ND: { if(!can_use_creation_options(hipfftw_plan_creation_func::PLAN_DFT_ND)) throw std::runtime_error("hipfftw_plan_creation_func::PLAN_DFT_ND cannot be used."); if(rank == 1) { if(dft_kind == fft_transform_type_real_forward) { return hipfftw_impl.plan_dft_r2c_1d.template call( int_len_ptr[0], static_cast*>(in), static_cast*>(out), flags); } else if(dft_kind == fft_transform_type_real_inverse) { return hipfftw_impl.plan_dft_c2r_1d.template call( int_len_ptr[0], static_cast*>(in), static_cast*>(out), flags); } else { return hipfftw_impl.plan_dft_1d.template call( int_len_ptr[0], static_cast*>(in), static_cast*>(out), sign, flags); } } else if(rank == 2) { if(dft_kind == fft_transform_type_real_forward) { return hipfftw_impl.plan_dft_r2c_2d.template call( int_len_ptr[0], int_len_ptr[1], static_cast*>(in), static_cast*>(out), flags); } else if(dft_kind == fft_transform_type_real_inverse) { return hipfftw_impl.plan_dft_c2r_2d.template call( int_len_ptr[0], int_len_ptr[1], static_cast*>(in), static_cast*>(out), flags); } else { return hipfftw_impl.plan_dft_2d.template call( int_len_ptr[0], int_len_ptr[1], static_cast*>(in), static_cast*>(out), sign, flags); } } else { if(dft_kind == fft_transform_type_real_forward) { return hipfftw_impl.plan_dft_r2c_3d.template call( int_len_ptr[0], int_len_ptr[1], int_len_ptr[2], static_cast*>(in), static_cast*>(out), flags); } else if(dft_kind == fft_transform_type_real_inverse) { return hipfftw_impl.plan_dft_c2r_3d.template call( int_len_ptr[0], int_len_ptr[1], int_len_ptr[2], static_cast*>(in), static_cast*>(out), flags); } else { return hipfftw_impl.plan_dft_3d.template call( int_len_ptr[0], int_len_ptr[1], int_len_ptr[2], static_cast*>(in), static_cast*>(out), sign, flags); } } } break; case hipfftw_plan_creation_func::PLAN_DFT: { if(!can_use_creation_options(hipfftw_plan_creation_func::PLAN_DFT)) throw 
std::runtime_error("hipfftw_plan_creation_func::PLAN_DFT cannot be used."); if(dft_kind == fft_transform_type_real_forward) { return hipfftw_impl.plan_dft_r2c.template call( rank, int_len_ptr, static_cast*>(in), static_cast*>(out), flags); } else if(dft_kind == fft_transform_type_real_inverse) { return hipfftw_impl.plan_dft_c2r.template call( rank, int_len_ptr, static_cast*>(in), static_cast*>(out), flags); } else { return hipfftw_impl.plan_dft.template call( rank, int_len_ptr, static_cast*>(in), static_cast*>(out), sign, flags); } } break; case hipfftw_plan_creation_func::PLAN_MANY: [[fallthrough]]; case hipfftw_plan_creation_func::PLAN_GURU: [[fallthrough]]; case hipfftw_plan_creation_func::PLAN_GURU64: throw std::runtime_error("Enforced plan creation is not implemented yet"); break; default: throw std::runtime_error("Unknown kind of plan creation"); break; } // unreachable } public: hipfftw_helper() = default; ~hipfftw_helper() = default; hipfftw_helper(hipfftw_helper&& other) = default; hipfftw_helper& operator=(hipfftw_helper&& other) = default; hipfftw_helper(const hipfftw_helper& other) = default; hipfftw_helper& operator=(const hipfftw_helper& rhs) = default; void set_creation_args(fft_transform_type dft_kind_to_set, int rank_to_set, const std::vector& lengths_to_set, fft_result_placement placement_to_set, int sign_to_set, unsigned flags_to_set) { reset_member_value(dft_kind, dft_kind_to_set); reset_member_value(rank, rank_to_set); reset_member_value(lengths, lengths_to_set); reset_member_value(plan_placement, placement_to_set); reset_member_value(sign, sign_to_set); reset_member_value(flags, flags_to_set); } // getters fft_transform_type get_dft_kind() const { return dft_kind; } int get_rank() const { return rank; } // returns the lengths as an std::vector if they may all be safely converted to T // (the returned vector is empty otherwise) template , bool> = true> std::vector get_length_as() const { if constexpr(std::is_same_v) return lengths; std::vector 
ret; if(std::any_of(lengths.begin(), lengths.end(), [](const typename decltype(lengths)::value_type& val) { return val < std::numeric_limits::lowest() || val > std::numeric_limits::max(); })) { // not a safe conversion, return empty lengths return ret; } ret.assign(lengths.begin(), lengths.end()); return ret; } fft_result_placement get_placement() const { return plan_placement; } int get_sign() const { return sign; } unsigned get_flags() const { return flags; } std::shared_ptr> get_plan_bundle() const { return plan_bundle; } template , bool> = true> std::vector get_strides_as(fft_io io) const { if(!rank_is_valid_for_hipfftw(rank) || !has_valid_lengths()) throw std::runtime_error( "cannot calculate default strides with invalid rank or invalid lengths"); // only default strides for now std::vector strides(rank, 1); if(rank > 1) { if(is_complex(dft_kind)) strides[rank - 2] = lengths.back(); else { if(is_fwd(dft_kind) == (io == fft_io::fft_io_out)) strides[rank - 2] = lengths.back() / 2 + 1; else { if(plan_placement == fft_placement_inplace) strides[rank - 2] = 2 * (lengths.back() / 2 + 1); else strides[rank - 2] = lengths.back(); } } } for(auto dim = rank - 3; dim >= 0; dim--) strides[dim] = strides[dim + 1] * lengths[dim + 1]; std::vector ret; if(std::any_of(strides.begin(), strides.end(), [](const typename decltype(strides)::value_type& val) { return val < std::numeric_limits::lowest() || val > std::numeric_limits::max(); })) { // not a safe conversion, return empty lengths return ret; } ret.assign(strides.begin(), strides.end()); return ret; } template , bool> = true> T get_dist_as(fft_io io) const { if(!rank_is_valid_for_hipfftw(rank) || !has_valid_lengths()) throw std::runtime_error( "cannot calculate default distance(s) with invalid rank or invalid lengths"); // only default distances for now ptrdiff_t dist = 0; if(rank == 1) { if(is_complex(dft_kind)) dist = lengths.back(); else { if(is_fwd(dft_kind) == (io == fft_io::fft_io_out)) dist = lengths.back() / 2 + 1; 
else { if(plan_placement == fft_placement_inplace) dist = 2 * (lengths.back() / 2 + 1); else dist = lengths.back(); } } } else { const auto strides = get_strides_as(io); dist = strides.front() * lengths.front(); } if(dist < std::numeric_limits::lowest() || dist > std::numeric_limits::max()) throw std::runtime_error("distance cannot be safely converted to the desired type"); return static_cast(dist); } template , bool> = true> T get_nbatch_as(fft_io io) const { // only unbatched for now T ret = 1; return ret; } // validity checks bool has_valid_rank() const { return rank_is_valid_for_hipfftw(rank); } bool has_valid_lengths() const { return lengths_are_valid_for_hipfftw_as(lengths, rank); } bool has_valid_sign() const { return sign_is_valid_for_hipfftw(sign, dft_kind); } bool has_valid_flags() const { return flags_are_valid_for_hipfftw(flags); } // checks if the current parameters can be used with (any of) the given option(s) of // plan creation (NOT whether they're valid or not). For instance, one cannot possibly // communicate rank > 3 with hipfftw_plan_creation_func::PLAN_DFT_ND, or communicate // non-default strides with hipfftw_plan_creation_func::PLAN_DFT_ND or // hipfftw_plan_creation_func::PLAN_DFT... // TODO: expand logic when extra configuration parameters are added (e.g. batch sizes, // strides, etc.) 
bool can_use_creation_options(hipfftw_plan_creation_func creation_options) const { if(!hipfftw_creation_options_are_well_defined(creation_options)) throw std::invalid_argument( "ill-defined creation_options used in can_use_creation_options"); if(creation_options == hipfftw_plan_creation_func::NONE) return false; if(std::find(hipfftw_plan_creation_func_candidates.begin(), hipfftw_plan_creation_func_candidates.end(), creation_options) == hipfftw_plan_creation_func_candidates.end()) { // creation_options combines several candidates in hipfftw_plan_creation_func_candidates // --> parse them individually and find out if any applicable can be used return std::any_of(hipfftw_plan_creation_func_candidates.begin(), hipfftw_plan_creation_func_candidates.end(), [=](const hipfftw_plan_creation_func& candidate) { return (creation_options & candidate) && can_use_creation_options(candidate); }); } // "creation_options" actually is an individual value in hipfftw_plan_creation_func_candidates switch(creation_options) { case hipfftw_plan_creation_func::PLAN_DFT_ND: // rank is not passed as an argument but dictated by the called function, // (must be 1, 2, or 3), and as many lengths must be passed as individual // integer values return (rank == 1 || rank == 2 || rank == 3) && get_length_as().size() == rank; break; case hipfftw_plan_creation_func::PLAN_DFT: // the lengths must be representable as integers, if not empty (supposedly // intentionally, e.g., for input validation testing purposes) return lengths.empty() || get_length_as().size() == rank; break; case hipfftw_plan_creation_func::PLAN_MANY: [[fallthrough]]; case hipfftw_plan_creation_func::PLAN_GURU: [[fallthrough]]; case hipfftw_plan_creation_func::PLAN_GURU64: return false; break; default: throw std::runtime_error("hipfftw_helper: internal error encountered (unexpected value " "for creation_options)"); break; } // unreachable } // checks validity of configuration parameters and whether creation can be // attempted via (any 
of) the given option(s) bool is_valid_for_creation_with(hipfftw_plan_creation_func creation_options) const { if(!hipfftw_creation_options_are_well_defined(creation_options)) throw std::invalid_argument("invalid creation_options for is_valid_for_creation_with"); // TODO: expand the global validity checks below when this struct is // expanded to cover more configurations (e.g., batching, srides, etc.) return has_valid_rank() && has_valid_lengths() && has_valid_sign() && has_valid_flags() && can_use_creation_options(creation_options); } bool is_valid_for_creation() const { return is_valid_for_creation_with(hipfftw_plan_creation_func::ANY); } // check expected support by (any of) the given option(s) bool has_unsupported_args_for(hipfftw_plan_creation_func creation_options) const { // extra conditions for configurations supported by hipfftw: if(rank > 3) return true; if(flags & FFTW_WISDOM_ONLY) return true; if(dft_kind == fft_transform_type_real_inverse && rank > 1 && (flags & FFTW_PRESERVE_INPUT)) return true; if(!(creation_options & hipfftw_plan_creation_func::PLAN_GURU64) && has_valid_rank() && has_valid_lengths()) { // cannot handle data sizes involving more elements than the // largest representable int value if(get_num_elements_in(fft_io_in) > std::numeric_limits::max() || get_num_elements_in(fft_io_out) > std::numeric_limits::max()) return true; } return false; } bool can_create_plan_with(hipfftw_plan_creation_func creation_options) const { if(!hipfftw_creation_options_are_well_defined(creation_options)) throw std::invalid_argument("invalid creation_option for can_create_plan_with"); if(!is_valid_for_creation_with(creation_options)) return false; if(has_unsupported_args_for(creation_options)) return false; return true; } bool can_create_plan() const { return can_create_plan_with(hipfftw_plan_creation_func::ANY); } // create a token consistent with other tests to enable kernel precompilation // for valid cases, and/or capturing all required details about members 
otherwise std::string token() const { std::ostringstream ret; switch(dft_kind) { case fft_transform_type_complex_forward: ret << "complex_forward"; break; case fft_transform_type_complex_inverse: ret << "complex_inverse"; break; case fft_transform_type_real_forward: ret << "real_forward"; break; case fft_transform_type_real_inverse: ret << "real_inverse"; break; default: throw std::runtime_error("unknown type of transform"); } // report rank if invalid if(!has_valid_rank() || lengths.empty()) ret << "_invalid_rank" << (rank < 0 ? "_negative_" : "_") << std::abs(rank); ret << "_len"; if(lengths.empty()) ret << "_none"; else { for(const auto& len : lengths) ret << (len < 0 ? "_negative_" : "_") << std::abs(len); } if constexpr(prec == fft_precision_single) ret << "_single"; else ret << "_double"; ret << (plan_placement == fft_placement_inplace ? "_ip" : "_op"); // only supporting unbatched cases as of now ret << "_batch_1"; if(has_valid_rank() && has_valid_lengths()) { ret << "_istride"; for(const auto& stride : get_strides_as(fft_io::fft_io_in)) ret << "_" << stride; if(!is_real(dft_kind)) ret << "_CI"; else if(dft_kind == fft_transform_type_real_forward) ret << "_R"; else ret << "_HI"; ret << "_ostride"; for(const auto& stride : get_strides_as(fft_io::fft_io_out)) ret << "_" << stride; if(!is_real(dft_kind)) ret << "_CI"; else if(dft_kind == fft_transform_type_real_forward) ret << "_HI"; else ret << "_R"; ret << "_idist_" << get_dist_as(fft_io::fft_io_in); ret << "_odist_" << get_dist_as(fft_io::fft_io_out); ret << "_ioffset_0_0_ooffset_0_0"; } if(!has_valid_sign()) ret << "_invalid_sign" << (sign < 0 ? "_negative_" : "_") << std::abs(sign); ret << "_flags_" << flags; return ret.str(); } // create_plan invokes an hipfftw plan creation function for the object's configuration // parameters, the corresponding plan pointer returned by hipfftw is stored internally. 
// IMPORTANT NOTE: if one wants to target a specific creation function (as represented // by any value in hipfftw_plan_creation_func_candidates), setting the creation_options // argument to that specific value effectively bypasses the verification that the // object's configuration is actually (expected to be) supported and attempts the plan // creation anyways (unless it simply cannot be done, e.g., attempting // creation_options = hipfftw_plan_creation_func::PLAN_DFT_ND herein on an object // holding a value for rank > 3 simply cannot be done) void create_plan(void* in, void* out, hipfftw_plan_creation_func creation_options = hipfftw_plan_creation_func::ANY) const { const auto& hipfftw_impl = hipfftw_funcs::get_instance(); const hipfftw_plan_creation_func chosen_option = get_creation_func(creation_options); if(chosen_option == hipfftw_plan_creation_func::NONE) { plan_bundle = std::make_shared>(hipfftw_impl.destroy_plan); plan_bundle->creation_io = {in, out}; plan_bundle->plan = nullptr; plan_bundle->creation_func = chosen_option; plan_bundle->plan_token = ""; return; } // early return if there is no need to (re)build if(plan_bundle && plan_bundle->plan_token == token() && plan_bundle->creation_io.first == in && plan_bundle->creation_io.second == out && plan_bundle->creation_func == chosen_option) return; // create the desired plan plan_bundle = std::make_shared>(hipfftw_impl.destroy_plan); plan_bundle->plan = make_plan(in, out, chosen_option); plan_bundle->creation_io = {in, out}; plan_bundle->creation_func = chosen_option; plan_bundle->plan_token = token(); } // returns a reference FFTW plan for the current configuration // The returned plan is NOT owned by this object! 
hipfftw_plan_t get_reference_plan(void* in, void* out, hipfftw_plan_creation_func creation_options = hipfftw_plan_creation_func::ANY) const { const hipfftw_plan_creation_func chosen_option = get_creation_func(creation_options); if(chosen_option == hipfftw_plan_creation_func::NONE) { return nullptr; } constexpr bool make_reference_plan = true; return make_plan(in, out, chosen_option); } void execute(void* execute_in, void* execute_out) const { if(!plan_bundle || plan_bundle->plan_token != token()) { // plan is not created or possibly not up-to-date create_plan(execute_in, execute_out); } const auto& hipfftw_impl = hipfftw_funcs::get_instance(); if(execute_in == plan_bundle->creation_io.first && execute_out == plan_bundle->creation_io.second) { hipfftw_impl.execute(plan_bundle->plan); } else { throw std::runtime_error("New-array execution functions not implemented yet."); } } // TODO: revise/expand logic below when the structure is expanded for more cases (batches, // non-default strides, etc.) size_t get_num_elements_in(fft_io in_or_out) const { if(in_or_out != fft_io_in && in_or_out != fft_io_out) throw std::invalid_argument("invalid in_or_out for get_num_elements_in"); if(!has_valid_rank() || !has_valid_lengths()) throw std::runtime_error("get_num_elements_in requires valid rank and lengths"); const auto tmp = get_length_as(); if(tmp.empty() || tmp.size() != rank) { throw std::runtime_error( "get_num_elements_in failed to correctly convert lengths to size_t values"); } size_t num_elems = 1; if(is_complex(dft_kind)) { num_elems *= tmp[rank - 1]; } else { const size_t cmplx_len = tmp[rank - 1] / 2 + 1; if(is_fwd(dft_kind) == (in_or_out == fft_io_out)) num_elems *= cmplx_len; else num_elems *= plan_placement == fft_placement_inplace ? 
2 * cmplx_len : tmp[rank - 1]; } num_elems *= product(tmp.begin(), tmp.begin() + rank - 1); return num_elems; } size_t get_data_byte_size(fft_io in_or_out) const { if(in_or_out != fft_io_in && in_or_out != fft_io_out) throw std::invalid_argument("invalid in_or_out for get_data_byte_size"); // for in-place, input and output data sizes are enforced equal std::vector io_range_to_consider = {in_or_out}; if(plan_placement == fft_placement_inplace) io_range_to_consider.push_back(in_or_out == fft_io::fft_io_in ? fft_io::fft_io_out : fft_io::fft_io_in); size_t ret = 0; for(auto io : io_range_to_consider) { const size_t num_elems = get_num_elements_in(io); if(is_complex(dft_kind) || (is_fwd(dft_kind) == (io == fft_io_out))) ret = std::max(ret, num_elems * sizeof(hipfftw_complex_t)); else ret = std::max(ret, num_elems * sizeof(hipfftw_real_t)); } return ret; } void release_plan() const { plan_bundle.reset(); } }; #endif hipFFT-rocm-7.1.0/clients/samples/000077500000000000000000000000001506642153200167065ustar00rootroot00000000000000hipFFT-rocm-7.1.0/clients/samples/CMakeLists.txt000066400000000000000000000115731506642153200214550ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ############################################################################# CMAKE_MINIMUM_REQUIRED( VERSION 3.16 ) project( hipfft-clients-samples-rocfft LANGUAGES CXX ) # We use C++17 features, this will add compile option: -std=c++17 set( CMAKE_CXX_STANDARD 17 ) if( NOT TARGET hipfft ) find_package( hipfft REQUIRED CONFIG PATHS ) endif( ) set( sample_list hipfft_1d_z2z hipfft_1d_d2z hipfft_2d_z2z hipfft_2d_d2z hipfft_3d_z2z hipfft_3d_d2z hipfft_planmany_2d_z2z hipfft_planmany_2d_r2c hipfft_multigpu_2d_z2z hipfft_setworkarea ) # callback sample has its own HIP code, so it needs to be built with hipcc or clang++ if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang" ) # on cuFFT backend, use of callbacks requires linking against the # static cuFFT library if( NOT (BUILD_WITH_LIB STREQUAL "CUDA") OR NOT BUILD_SHARED_LIBS ) list( APPEND sample_list hipfft_callback ) else() message( STATUS "hipfft_callback sample disabled on non-static CUDA build" ) endif() else() message( STATUS "hipfft_callback sample disabled, requires hipcc or Clang++ build" ) endif() foreach( sample ${sample_list} ) add_executable( ${sample} ${sample}.cpp ) set_target_properties( ${sample} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) target_link_libraries( ${sample} PRIVATE hip::hipfft ) target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} ) if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) if( WIN32 ) find_package( HIP CONFIG REQUIRED ) else() find_package( HIP MODULE REQUIRED ) 
endif() if( NOT BUILD_WITH_LIB STREQUAL "CUDA" ) target_link_libraries( ${sample} PRIVATE hip::host hip::device ) else() target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVIDIA__) target_include_directories( ${sample} PRIVATE ${HIP_INCLUDE_DIRS}) endif() endif() if ( BUILD_WITH_LIB STREQUAL "CUDA" ) if( CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+$" ) target_compile_options( ${sample} PRIVATE -cuda -Xptxas=-w) target_link_options( ${sample} PRIVATE -cuda) else() target_compile_options( ${sample} PRIVATE -arch sm_53 -gencode=arch=compute_53,code=sm_53 -Xptxas=-w) endif() target_link_libraries( ${sample} PRIVATE ${CUDA_LIBRARIES} ) else() if( USE_HIPRAND AND NOT hiprand_FOUND ) find_package( hiprand REQUIRED ) endif() if ( USE_HIPRAND ) target_link_libraries( ${sample} PRIVATE hip::hiprand ) endif() endif() target_include_directories( ${sample} PRIVATE $ $ $ ${HIP_ROOT_DIR} ) set_target_properties( ${sample} PROPERTIES CXX_EXTENSIONS NO ) if( HIPFFT_BUILD_SCOPE ) set( SAMPLES_OUT_DIR "/../staging" ) elseif( HIPFFT_CLIENTS_BUILD_SCOPE ) set( SAMPLES_OUT_DIR "/../bin" ) else() set( SAMPLES_OUT_DIR "/bin" ) endif() string( CONCAT SAMPLES_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_OUT_DIR} ) set_target_properties( ${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${SAMPLES_OUT_DIR} ) endforeach() # callback code must be compiled as relocatable device code if( hipfft_callback IN_LIST sample_list ) if( BUILD_WITH_LIB STREQUAL "CUDA" ) target_compile_options( hipfft_callback PRIVATE -dc ) else() # -fgpu-rdc causes failure at link stage on Windows if (NOT WIN32) target_compile_options( hipfft_callback PRIVATE -fgpu-rdc ) target_link_options( hipfft_callback PRIVATE -fgpu-rdc ) endif() endif() endif() hipFFT-rocm-7.1.0/clients/samples/hipfft_1d_d2z.cpp000066400000000000000000000074131506642153200220420ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 1D double-precision real-to-complex transform\n"; const size_t Nx = 8; const size_t Ncomplex = Nx / 2 + 1; std::vector rdata(Nx); size_t real_bytes = sizeof(decltype(rdata)::value_type) * rdata.size(); std::vector> cdata(Ncomplex); size_t complex_bytes = sizeof(std::complex) * cdata.size(); // Create HIP device object double* x; hipError_t hip_rt; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Inititalize the data for(size_t i = 0; i < Nx; i++) { rdata[i] = i; } std::cout << "input:\n"; for(size_t i = 0; i < rdata.size(); i++) { std::cout << rdata[i] << " "; } std::cout << std::endl; hip_rt = hipMemcpy(x, rdata.data(), real_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create the plan hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan1d(&plan, // plan handle Nx, // transform length HIPFFT_D2Z, // transform type (HIPFFT_R2C for single-precision) 1); // number of transforms (deprecated) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan1d failed"); // Execute plan: // hipfftExecD2Z: double precision, hipfftExecR2C: for single-precision // Direction is implied by real-to-complex direction hipfft_rt = hipfftExecD2Z(plan, x, (hipfftDoubleComplex*)x); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecD2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < cdata.size(); i++) { std::cout << 
cdata[i] << " "; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-7.1.0/clients/samples/hipfft_1d_z2z.cpp000066400000000000000000000072251506642153200220710ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 1D double-precision complex-to-complex transform\n"; const int Nx = 8; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 std::vector> cdata(Nx); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); // Create HIP device object and copy data to device // Use hipfftComplex for single-precision hipError_t hip_rt; hipfftDoubleComplex* x; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Inititalize the data for(size_t i = 0; i < Nx; i++) { cdata[i] = i; } std::cout << "input:\n"; for(size_t i = 0; i < cdata.size(); i++) { std::cout << cdata[i] << " "; } std::cout << std::endl; hip_rt = hipMemcpy(x, cdata.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create the plan hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan1d(&plan, // plan handle Nx, // transform length HIPFFT_Z2Z, // transform type (HIPFFT_C2C for single-precision) 1); // number of transforms if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan1d failed"); // Execute plan: // hipfftExecZ2Z: double precision, hipfftExecC2C: for single-precision hipfft_rt = hipfftExecZ2Z(plan, x, x, direction); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecZ2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < cdata.size(); i++) { std::cout << cdata[i] << " "; } std::cout << std::endl; hipfftDestroy(plan); 
hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-7.1.0/clients/samples/hipfft_2d_d2z.cpp000066400000000000000000000100261506642153200220350ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 2D double-precision real-to-complex transform\n"; const size_t Nx = 4; const size_t Ny = 5; std::cout << "Nx: " << Nx << "\tNy: " << Ny << std::endl; const size_t Nycomplex = Ny / 2 + 1; const size_t rstride = Nycomplex * 2; // Ny for out-of-place std::cout << "Input:\n"; std::vector rdata(Nx * rstride); for(size_t i = 0; i < Nx * rstride; i++) { rdata[i] = i; } for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { auto pos = i * rstride + j; std::cout << rdata[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; double* x; hipError_t hip_rt; hip_rt = hipMalloc(&x, rdata.size() * sizeof(decltype(rdata)::value_type)); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMemcpy( x, rdata.data(), rdata.size() * sizeof(decltype(rdata)::value_type), hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create plan: hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan2d(&plan, // plan handle Nx, // transform length Ny, // transform length HIPFFT_D2Z); // transform type (HIPFFT_R2C for single-precision) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlandd failed"); // Execute plan: // hipfftExecD2Z: double precision. 
hipfftExecR2C: single-precision hipfft_rt = hipfftExecD2Z(plan, x, (hipfftDoubleComplex*)x); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecD2Z failed"); // Copy the output data to the host: std::vector> cdata(Nx * Nycomplex); hip_rt = hipMemcpy( cdata.data(), x, cdata.size() * sizeof(decltype(cdata)::value_type), hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "Output:\n"; for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Nycomplex; j++) { auto pos = i * Nycomplex + j; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-7.1.0/clients/samples/hipfft_2d_z2z.cpp000066400000000000000000000076241506642153200220750ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 2D double-precision complex-to-complex transform\n"; const int Nx = 4; const int Ny = 4; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 std::vector> cdata(Nx * Ny); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); // Create HIP device object and copy data to device: // hipfftComplex for single-precision hipError_t hip_rt; hipfftDoubleComplex* x; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Inititalize the data for(size_t i = 0; i < Nx * Ny; i++) { cdata[i] = i; } std::cout << "input:\n"; for(int i = 0; i < Nx; i++) { for(int j = 0; j < Ny; j++) { int pos = i * Ny + j; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; hip_rt = hipMemcpy(x, cdata.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create plan hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan2d(&plan, // plan handle Nx, // transform length Ny, // transform length HIPFFT_Z2Z); // transform type (HIPFFT_C2C for single-precision) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlandd failed"); // Execute plan // hipfftExecZ2Z: double precision, hipfftExecC2C: for single-precision hipfft_rt = hipfftExecZ2Z(plan, x, x, direction); if(hipfft_rt != 
HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecZ2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { auto pos = i * Ny + j; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-7.1.0/clients/samples/hipfft_3d_d2z.cpp000066400000000000000000000105061506642153200220410ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 3D double-precision real-to-complex transform\n"; const size_t Nx = 4; const size_t Ny = 5; const size_t Nz = 6; std::cout << "Nx: " << Nx << "\tNy " << Ny << "\tNz " << Nz << std::endl; const size_t Nzcomplex = Nz / 2 + 1; const size_t rstride = Nzcomplex * 2; // Nz for out-of-place const size_t real_bytes = sizeof(double) * Nx * Ny * rstride; const size_t complex_bytes = 2 * sizeof(double) * Nx * Ny * Nzcomplex; double* x; hipError_t hip_rt; hip_rt = hipMalloc(&x, real_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Inititalize the data std::vector rdata(Nx * Ny * rstride); for(size_t i = 0; i < Nx * Ny * rstride; i++) { rdata[i] = i; } std::cout << "input:\n"; for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { for(size_t k = 0; k < rstride; k++) { auto pos = (i * Ny + j) * rstride + k; std::cout << rdata[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hip_rt = hipMemcpy(x, rdata.data(), real_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create plan: hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan3d(&plan, // plan handle Nx, Ny, Nz, // transform lengths HIPFFT_D2Z); // transform type (HIPFFT_R2C for single-precision) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan3d failed"); // Execute plan: // hipfftExecD2Z: double precision, hipfftExecR2C: single-precision hipfft_rt = hipfftExecD2Z(plan, x, (hipfftDoubleComplex*)x); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecD2Z failed"); std::cout << "output:\n"; 
std::vector> cdata(Nx * Ny * Nz); hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { for(size_t k = 0; k < Nzcomplex; k++) { auto pos = (i * Ny + j) * Nzcomplex + k; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-7.1.0/clients/samples/hipfft_3d_z2z.cpp000066400000000000000000000102711506642153200220660ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 3D double-precision complex-to-complex transform\n"; const int Nx = 4; const int Ny = 4; const int Nz = 4; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 std::vector> cdata(Nx * Ny * Nz); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); // Create HIP device object and copy data to device: // hipfftComplex for single-precision hipError_t hip_rt; hipfftDoubleComplex* x; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); std::cout << "Input:\n"; for(size_t i = 0; i < Nx * Ny * Nz; i++) { cdata[i] = i; } for(int i = 0; i < Nx; i++) { for(int j = 0; j < Ny; j++) { for(int k = 0; k < Nz; k++) { int pos = (i * Ny + j) * Nz + k; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hip_rt = hipMemcpy(x, cdata.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Create plan hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan3d(&plan, // plan handle Nx, // transform length Ny, // transform length Nz, // transform length HIPFFT_Z2Z); // transform type (HIPFFT_C2C for single-precision) if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan3d failed"); // Execute plan // hipfftExecZ2Z: double precision, hipfftExecC2C: for single-precision hipfft_rt = hipfftExecZ2Z(plan, x, x, direction); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecZ2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) 
throw std::runtime_error("hipMemcpy failed"); for(int i = 0; i < Nx; i++) { for(int j = 0; j < Ny; j++) { for(int k = 0; k < Nz; k++) { int pos = (i * Ny + j) * Nz + k; std::cout << cdata[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-7.1.0/clients/samples/hipfft_callback.cpp000066400000000000000000000140471506642153200225140ustar00rootroot00000000000000// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #ifndef WIN32 #include #include #include #include #include "../hipfft_params.h" struct load_cbdata { hipfftDoubleComplex* filter; double scale; }; __device__ hipfftDoubleComplex load_callback(hipfftDoubleComplex* input, size_t offset, void* cbdata, void* sharedMem) { auto data = static_cast(cbdata); // NB: for optimal performance, one may need a custom // multiplication operator. return hipCmul(hipCmul(input[offset], data->filter[offset]), make_hipDoubleComplex(data->scale, 0)); } __device__ auto load_callback_dev = load_callback; #endif int main() { #ifdef WIN32 std::cout << "This sample is temporarily disabled on Windows" << std::endl; return EXIT_SUCCESS; #else std::cout << "hipfft 1D double-precision complex-to-complex transform with callback\n"; const int Nx = 8; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 std::vector cdata(Nx), filter(Nx); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); // Create HIP device object and copy data to device // Use hipfftComplex for single-precision hipError_t hip_rt; hipfftDoubleComplex *x, *filter_dev; hip_rt = hipMalloc(&x, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMalloc(&filter_dev, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Initialize the data and filter for(size_t i = 0; i < Nx; i++) { cdata[i].x = i; cdata[i].y = i; filter[i].x = rand() / static_cast(RAND_MAX); filter[i].y = 0; } hip_rt = hipMemcpy(x, cdata.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); hip_rt = hipMemcpy(filter_dev, filter.data(), complex_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "input:\n"; for(size_t i = 0; i < cdata.size(); i++) { std::cout << "(" << cdata[i].x << ", " << cdata[i].y << ") "; } std::cout << std::endl; // Create the plan hipfftHandle plan = 
hipfft_params::INVALID_PLAN_HANDLE; hipfftResult hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfft_rt = hipfftPlan1d(&plan, // plan handle Nx, // transform length HIPFFT_Z2Z, // transform type (HIPFFT_C2C for single-precision) 1); // number of transforms if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftPlan1d failed"); // prepare callback load_cbdata cbdata_host; cbdata_host.filter = filter_dev; cbdata_host.scale = 1.0 / static_cast(Nx); void* cbdata_dev; hip_rt = hipMalloc(&cbdata_dev, sizeof(load_cbdata)); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); void* cbptr_host = nullptr; hip_rt = hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*)); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpyFromSymbol failed"); // set callback hipfft_rt = hipfftXtSetCallback(plan, &cbptr_host, HIPFFT_CB_LD_COMPLEX_DOUBLE, &cbdata_dev); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtSetCallback failed"); // Execute plan: // hipfftExecZ2Z: double precision, hipfftExecC2C: for single-precision hipfft_rt = hipfftExecZ2Z(plan, x, x, direction); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftExecZ2Z failed"); std::cout << "output:\n"; hip_rt = hipMemcpy(cdata.data(), x, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(size_t i = 0; i < cdata.size(); i++) { std::cout << "(" << cdata[i].x << ", " << cdata[i].y << ") "; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(cbdata_dev); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); hip_rt = hipFree(filter_dev); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); hip_rt = hipFree(x); 
if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; #endif } hipFFT-rocm-7.1.0/clients/samples/hipfft_multigpu_2d_z2z.cpp000066400000000000000000000127111506642153200240140ustar00rootroot00000000000000// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "Multi-gpu hipFFT 2D double-precision complex-to-complex transform\n"; // 2D FFTs are encountered in diverse applications of image processing, // examples range from image denoising to RTM seismic imaging. // In this example we compare the 2D FFT computation using single vs multiple GPUs. 
// Note that when using cuFFTXt with two or more GPUs, its latest version requires // a minimum size per dimension greater or equal than 32 and less equal than 4096 // for single precision, and 2048 for double precision. const int Nx = 512; const int Ny = 512; int direction = HIPFFT_FORWARD; // forward=-1, backward=1 int verbose = 0; // Initialize reference data std::vector> cinput(Nx * Ny); for(size_t i = 0; i < Nx * Ny; i++) { cinput[i] = i; } if(verbose) { std::cout << "Input:\n"; for(int i = 0; i < Nx; i++) { for(int j = 0; j < Ny; j++) { int pos = i * Ny + j; std::cout << cinput[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; } // Define list of GPUs to use std::array gpus = {0, 1}; // Create the multi-gpu plan hipLibXtDesc* desc; // input descriptor hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; if(hipfftCreate(&plan) != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); // Create a GPU stream and assign it to the plan hipStream_t stream{}; if(hipStreamCreate(&stream) != hipSuccess) throw std::runtime_error("hipStreamCreate failed."); if(hipfftSetStream(plan, stream) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftSetStream failed."); // Assign GPUs to the plan hipfftResult hipfft_rt = hipfftXtSetGPUs(plan, gpus.size(), gpus.data()); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtSetGPUs failed."); // Make the 2D plan size_t workSize[gpus.size()]; hipfft_rt = hipfftMakePlan2d(plan, Nx, Ny, HIPFFT_Z2Z, workSize); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftMakePlan2d failed."); // Copy input data to GPUs hipfftXtSubFormat_t format = HIPFFT_XT_FORMAT_INPLACE_SHUFFLED; hipfft_rt = hipfftXtMalloc(plan, &desc, format); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMalloc failed."); hipfft_rt = hipfftXtMemcpy(plan, reinterpret_cast(desc), reinterpret_cast(cinput.data()), HIPFFT_COPY_HOST_TO_DEVICE); if(hipfft_rt != HIPFFT_SUCCESS) throw 
std::runtime_error("hipfftXtMemcpy failed."); // Execute the plan hipfft_rt = hipfftXtExecDescriptor(plan, desc, desc, direction); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy failed."); // Print output if(verbose) { // Move result to the host hipfft_rt = hipfftXtMemcpy(plan, reinterpret_cast(cinput.data()), reinterpret_cast(desc), HIPFFT_COPY_DEVICE_TO_HOST); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy D2H failed."); std::cout << "Output:\n"; for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { auto pos = i * Ny + j; std::cout << cinput[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; } // Clean up if(hipfftXtFree(desc) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtFree failed."); if(hipfftDestroy(plan) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftDestroy failed."); if(hipStreamDestroy(stream) != hipSuccess) throw std::runtime_error("hipStreamDestroy failed."); return 0; } hipFFT-rocm-7.1.0/clients/samples/hipfft_planmany_2d_r2c.cpp000066400000000000000000000125731506642153200237340ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP int main() { std::cout << "hipfft 2D single-precision real-to-complex transform using " "advanced interface\n"; int rank = 2; int n[2] = {4, 5}; int howmany = 3; // batch size int n1_complex_elements = n[1] / 2 + 1; int n1_padding_real_elements = n1_complex_elements * 2; int istride = 1; int ostride = istride; int inembed[2] = {istride * n[0], istride * n1_padding_real_elements}; int onembed[2] = {ostride * n[0], ostride * n1_complex_elements}; int idist = inembed[0] * inembed[1]; int odist = onembed[0] * onembed[1]; std::cout << "n: " << n[0] << " " << n[1] << "\n" << "howmany: " << howmany << "\n" << "istride: " << istride << "\tostride: " << ostride << "\n" << "inembed: " << inembed[0] << " " << inembed[1] << "\n" << "onembed: " << onembed[0] << " " << onembed[1] << "\n" << "idist: " << idist << "\todist: " << odist << "\n" << std::endl; std::vector data(howmany * idist); const auto total_bytes = data.size() * sizeof(decltype(data)::value_type); std::cout << "input:\n"; std::fill(data.begin(), data.end(), 0.0); for(int ibatch = 0; ibatch < howmany; ++ibatch) { for(int i = 0; i < n[0]; i++) { for(int j = 0; j < n[1]; j++) { const auto pos = ibatch * idist + istride * (i * inembed[1] + j); data[pos] = i + ibatch + j; } } } for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < 
inembed[0]; i++) { for(int j = 0; j < inembed[1]; j++) { const auto pos = ibatch * idist + i * inembed[1] + j; std::cout << data[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftHandle hipForwardPlan; hipfftResult hipfft_rt; hipfft_rt = hipfftPlanMany(&hipForwardPlan, rank, n, inembed, istride, idist, onembed, ostride, odist, HIPFFT_R2C, // Use HIPFFT_D2Z for double-precsion. howmany); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); hipfftReal* gpu_data; hipError_t hip_rt; hip_rt = hipMalloc((void**)&gpu_data, total_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMemcpy(gpu_data, (void*)data.data(), total_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); hipfft_rt = hipfftExecR2C(hipForwardPlan, gpu_data, (hipfftComplex*)gpu_data); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to execute plan"); hip_rt = hipMemcpy((void*)data.data(), gpu_data, total_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "output:\n"; const std::complex* output = (std::complex*)data.data(); for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < onembed[0]; i++) { for(int j = 0; j < onembed[1]; j++) { const auto pos = ibatch * odist + i * onembed[1] + j; std::cout << output[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftDestroy(hipForwardPlan); hip_rt = hipFree(gpu_data); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-7.1.0/clients/samples/hipfft_planmany_2d_z2z.cpp000066400000000000000000000117361506642153200237730ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP int main() { std::cout << "hipfft 2D double-precision complex-to-complex transform using " "advanced interface\n"; int rank = 2; int n[2] = {4, 5}; int howmany = 3; // array is contiguous in memory int istride = 1; // in-place transforms require istride=ostride int ostride = istride; // we choose to have no padding around our data: int inembed[2] = {istride * n[0], istride * n[1]}; // in-place transforms require inembed=oneembed: int onembed[2] = {inembed[0], inembed[1]}; int idist = inembed[0] * inembed[1]; int odist = onembed[0] * onembed[1]; std::cout << "n: " << n[0] << " " << n[1] << "\n" << "howmany: " << howmany << "\n" << "istride: " << istride << "\tostride: " << ostride << "\n" << "inembed: " << inembed[0] << " " << inembed[1] << "\n" << "onembed: " << onembed[0] << " " << onembed[1] << "\n" << "idist: " << idist << "\todist: " << odist << "\n" << std::endl; std::vector> data(howmany * idist); const auto total_bytes = data.size() * sizeof(decltype(data)::value_type); std::cout << "input:\n"; std::fill(data.begin(), data.end(), 0.0); for(int ibatch = 0; ibatch < howmany; ++ibatch) { for(int i = 0; i < n[0]; i++) { for(int j = 0; j < n[1]; j++) { const auto pos = ibatch * idist + istride * (i * inembed[1] + j); data[pos] = std::complex(i + ibatch, j); } } } for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < inembed[0]; i++) { for(int j = 0; j < inembed[1]; j++) { const auto pos = ibatch * idist + i * inembed[1] + j; std::cout << data[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hipfftHandle hipPlan; hipfftResult hipfft_rt; hipfft_rt = hipfftPlanMany( &hipPlan, rank, n, inembed, istride, idist, onembed, ostride, odist, HIPFFT_Z2Z, howmany); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to 
create plan"); hipError_t hip_rt; hipfftDoubleComplex* d_in_out; hip_rt = hipMalloc((void**)&d_in_out, total_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hip_rt = hipMemcpy(d_in_out, (void*)data.data(), total_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); hipfft_rt = hipfftExecZ2Z(hipPlan, d_in_out, d_in_out, HIPFFT_FORWARD); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to execute plan"); hip_rt = hipMemcpy((void*)data.data(), d_in_out, total_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "output:\n"; for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < onembed[0]; i++) { for(int j = 0; j < onembed[1]; j++) { const auto pos = ibatch * odist + i * onembed[1] + j; std::cout << data[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; hip_rt = hipFree(d_in_out); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-7.1.0/clients/samples/hipfft_setworkarea.cpp000066400000000000000000000113751506642153200233100ustar00rootroot00000000000000// Copyright (C) 2019 - 2022 Advanced Micro Devices, Inc. All rights // reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP #include "../hipfft_params.h" int main() { std::cout << "hipfft 1D single-precision real-to-complex transform showing " "work memory usage\n"; int major_version; hipfftGetProperty(HIPFFT_MAJOR_VERSION, &major_version); std::cout << "hipFFT major_version " << major_version << std::endl; const size_t N = 9; const size_t Ncomplex = (N / 2 + 1); std::vector rdata(N); std::vector> cdata(Ncomplex); size_t real_bytes = sizeof(decltype(rdata)::value_type) * rdata.size(); size_t complex_bytes = sizeof(decltype(cdata)::value_type) * cdata.size(); hipError_t hip_rt = hipSuccess; hipfftResult hipfft_rt = HIPFFT_SUCCESS; std::cout << "input:\n"; for(size_t i = 0; i < N; i++) { rdata[i] = i; } for(size_t i = 0; i < N; i++) { std::cout << rdata[i] << " "; } std::cout << std::endl; // Create HIP device object. 
hipfftReal* x; hip_rt = hipMalloc(&x, real_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hipfftComplex* y; hip_rt = hipMalloc(&y, complex_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Copy input data to device hip_rt = hipMemcpy(x, rdata.data(), real_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); size_t workSize; hipfft_rt = hipfftEstimate1d(N, HIPFFT_R2C, 1, &workSize); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftEstimate1d failed"); std::cout << "hipfftEstimate 1d workSize: " << workSize << std::endl; hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; hipfft_rt = hipfftCreate(&plan); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftCreate failed"); hipfft_rt = hipfftSetAutoAllocation(plan, 0); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftSetAutoAllocation failed"); hipfft_rt = hipfftMakePlan1d(plan, N, HIPFFT_R2C, 1, &workSize); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftMakePlan1d failed"); // Set work buffer hipfftComplex* workBuf; hip_rt = hipMalloc(&workBuf, workSize); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); hipfft_rt = hipfftSetWorkArea(plan, workBuf); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftSetWorkArea failed"); hipfft_rt = hipfftGetSize(plan, &workSize); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftGetSize failed"); std::cout << "hipfftGetSize workSize: " << workSize << std::endl; // Execute plan hipfft_rt = hipfftExecR2C(plan, x, (hipfftComplex*)y); // Copy result back to host hip_rt = hipMemcpy(cdata.data(), y, complex_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); std::cout << "output:\n"; for(size_t i = 0; i < Ncomplex; i++) { std::cout << cdata[i] << " "; } std::cout << std::endl; hipfftDestroy(plan); hip_rt = hipFree(x); 
if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); hip_rt = hipFree(workBuf); if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } hipFFT-rocm-7.1.0/clients/tests/000077500000000000000000000000001506642153200164045ustar00rootroot00000000000000hipFFT-rocm-7.1.0/clients/tests/CMakeLists.txt000066400000000000000000000256431506642153200211560ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# #############################################################################

# Build script for the hipFFT client test executables (hipfft-test and,
# when HIPFFT_MPI_ENABLE is set, hipfft_mpi_worker).

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif( )

# Dependencies
find_package( ROCmCMakeBuildTools REQUIRED CONFIG PATHS /opt/rocm )
include(ROCMInstallTargets)

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/../cmake )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( hipfft-clients-tests LANGUAGES CXX )

if( NOT HIPFFT_BUILD_SCOPE )
  find_package( hipfft REQUIRED CONFIG PATHS )
endif()

# Boost_USE_STATIC_LIBS is an input to the Boost find logic, so it has to be
# set before find_package( Boost ) runs in order to have any effect.
set( Boost_USE_STATIC_LIBS OFF )
find_package( Boost REQUIRED)

find_package( FFTW 3.0 REQUIRED MODULE COMPONENTS FLOAT DOUBLE )

set( BUILD_WITH_LIB "ROCM" CACHE STRING "Build ${PROJECT_NAME} with ROCM or CUDA libraries" )

set( THREADS_PREFER_PTHREAD_FLAG ON )
find_package( Threads REQUIRED )

set( hipfft-test_source
  gtest_main.cpp
  hipfft_accuracy_test.cpp
  simple_test.cpp
  accuracy_test_1D.cpp
  accuracy_test_2D.cpp
  accuracy_test_3D.cpp
  accuracy_test_callback.cpp
  hipfftw_test.cpp
  multi_device_test.cpp
  multi_stream_test.cpp
  ../../shared/array_validator.cpp
  )

add_executable( hipfft-test ${hipfft-test_source} ${hipfft-test_includes} )

# Every target in TEST_TARGETS gets the common settings applied in the
# foreach() loop further down.
set( TEST_TARGETS hipfft-test )

# MPI worker for MPI tests
if( HIPFFT_MPI_ENABLE )
  # build MPI worker to support the tests
  add_executable( hipfft_mpi_worker hipfft_mpi_worker.cpp )
  list( APPEND TEST_TARGETS hipfft_mpi_worker )
  target_include_directories( hipfft_mpi_worker PRIVATE ${MPI_C_INCLUDE_PATH} )
  add_compile_definitions( HIPFFT_MPI_ENABLE )
endif()

if( NOT BUILD_WITH_LIB STREQUAL "CUDA" )
  if( WIN32 )
    find_package( HIP CONFIG REQUIRED )
  else()
    find_package( hip REQUIRED CONFIG PATHS /opt/rocm/lib/cmake/hip/ )
  endif()
endif()

# Output directory of the test binaries, relative to PROJECT_BINARY_DIR; the
# location depends on which enclosing build included this file.
if( HIPFFT_BUILD_SCOPE )
  set( TESTS_OUT_DIR "/../staging" )
elseif( HIPFFT_CLIENTS_BUILD_SCOPE )
  set( TESTS_OUT_DIR "/../bin" )
else()
  set( TESTS_OUT_DIR "/bin" )
endif()
string( CONCAT TESTS_OUT_DIR "${PROJECT_BINARY_DIR}" ${TESTS_OUT_DIR} )

option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON )

if( BUILD_CLIENTS_TESTS_OPENMP AND NOT BUILD_WITH_LIB STREQUAL "CUDA" )
  # Attempt to find a config version, which provides openmp_LIB_DIR.
  find_package( OpenMP CONFIG PATHS "${HIP_CLANG_ROOT}/lib/cmake" )
  # if( DEFINED <name> ) takes the variable NAME; writing ${openmp_LIB_DIR}
  # here would test whether a variable named after its value exists.
  if( NOT OPENMP_FOUND OR NOT DEFINED openmp_LIB_DIR )
    # Fall-back to module mode.
    find_package( OpenMP REQUIRED )
    set( BUILD_RPATH "${HIP_CLANG_ROOT}/lib" )
    set( INSTALL_RPATH "$ORIGIN/../llvm/lib" )
  else()
    set( BUILD_RPATH "${HIP_CLANG_ROOT}/${openmp_LIB_DIR}" )
    set( INSTALL_RPATH "$ORIGIN/../llvm/${openmp_LIB_DIR}" )
  endif()
endif()

foreach( target ${TEST_TARGETS} )

  set_target_properties( ${target} PROPERTIES
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
  )

  # Apply the OpenMP runtime search paths computed above to this target only;
  # the loop already visits every entry of TEST_TARGETS.
  if( BUILD_CLIENTS_TESTS_OPENMP )
    set_target_properties( ${target} PROPERTIES BUILD_RPATH "${BUILD_RPATH}" )
    set_target_properties( ${target} PROPERTIES INSTALL_RPATH "${INSTALL_RPATH}" )
  endif()

  if( BUILD_WITH_LIB STREQUAL "ROCM" )
    target_compile_options( ${target} PRIVATE ${WARNING_FLAGS} )
    target_link_libraries( ${target} PRIVATE hip::host hip::device )
    foreach( gpu_target ${GPU_TARGETS} )
      target_compile_options( ${target} PRIVATE --offload-arch=${gpu_target} )
    endforeach()

    if( NOT hiprand_FOUND )
      find_package( hiprand REQUIRED )
    endif()
    if( USE_HIPRAND )
      target_link_libraries( ${target} PRIVATE hip::hiprand )
    endif()
  else()
    target_compile_definitions( ${target} PRIVATE __HIP_PLATFORM_NVIDIA__)
    target_include_directories( ${target} PRIVATE ${HIP_INCLUDE_DIRS})
    if( CMAKE_CXX_COMPILER MATCHES ".*nvc\\+\\+$" )
      target_compile_options( ${target} PRIVATE -cuda -Xptxas=-w)
      target_link_options( ${target} PRIVATE -cuda)
    else()
      target_compile_options( ${target} PRIVATE -arch sm_53 -gencode=arch=compute_53,code=sm_53 -Xptxas=-w)
    endif()
    if( NVHPC_FOUND )
      target_link_libraries( ${target} PRIVATE NVHPC::CUDART )
    else()
      target_link_libraries( ${target} PRIVATE CUDA::cudart )
    endif()
    target_compile_definitions( ${target} PUBLIC _CUFFT_BACKEND )
  endif()

  target_include_directories( ${target} PRIVATE $ $ $ )
  target_link_libraries( ${target} PRIVATE hip::hipfft ${FFTW_LIBRARIES} )

  if( BUILD_CLIENTS_TESTS_OPENMP )
    if( BUILD_WITH_LIB STREQUAL "CUDA" )
      message( STATUS "OpenMP is not supported on CUDA, building tests without it" )
    else()
      # BUILD_RPATH / INSTALL_RPATH were already applied at the top of this
      # loop body, so no second, absolute-path override is done here.
      target_link_libraries( ${target} PRIVATE OpenMP::OpenMP_CXX )
    endif()
  endif()

  if( HIPFFT_MPI_ENABLE )
    target_link_libraries( ${target} PRIVATE MPI::MPI_CXX )
  endif()

  set_target_properties(${target} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${TESTS_OUT_DIR})
  rocm_install(TARGETS ${target} COMPONENT tests)
endforeach()

find_package( GTest 1.11.0 )
if( GTest_FOUND )
  target_include_directories( hipfft-test PRIVATE $ )
  target_link_libraries( hipfft-test PRIVATE GTest::gtest )
else()
  # gtest built by hipFFT
  add_dependencies( hipfft-test gtest )
  target_include_directories( hipfft-test PRIVATE hipfft-test_include_dirs ${GTEST_INCLUDE_DIRS} )
  target_link_libraries( hipfft-test PRIVATE ${GTEST_LIBRARIES} )
endif()

# tests have callback functions, which need to be built as relocatable device code
if( BUILD_WITH_LIB STREQUAL "CUDA" )
  target_compile_options( hipfft-test PRIVATE -dc )
else()
  # -fgpu-rdc causes failure at link stage on Windows
  if (NOT WIN32)
    target_compile_options( hipfft-test PRIVATE -fgpu-rdc )
    target_link_options( hipfft-test PRIVATE -fgpu-rdc )
  endif()
endif()

if(FFTW_MULTITHREAD)
  target_compile_options( hipfft-test PRIVATE -DFFTW_MULTITHREAD )
endif( )

target_link_libraries( hipfft-test PRIVATE Threads::Threads ${CMAKE_DL_LIBS} )

# hipfft-test opens the hipfftw library but does not link to it
if( TARGET hipfftw )
  add_dependencies( hipfft-test hipfftw )
endif()

if (WIN32)
  # Ensure tests run with HIP DLLs and not anything the driver owns
  # in system32.  Libraries like amdhip64.dll are also in the HIP
  # runtime, and we need to run with those.  But the only way to make a
  # same-named DLL override something in system32 is to have it next
  # to the executable.  So copy them in.
  file( GLOB third_party_dlls
    LIST_DIRECTORIES OFF
    CONFIGURE_DEPENDS
    ${HIP_DIR}/bin/*.dll
    C:/Windows/System32/libomp140*.dll
  )
  foreach( file_i ${third_party_dlls})
    add_custom_command( TARGET hipfft-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} $ )
  endforeach( file_i )
endif()

option(BUILD_CODE_COVERAGE "Build with code coverage flags (clang only)" OFF)
set(COVERAGE_TEST_OPTIONS "--smoketest;--gtest_filter=-*call*" CACHE STRING "Command line arguments for hipfft-test when generating a code coverage report (Note: an additional run of hipfft-test targeting multi_gpu* and callback* tests is always executed and coverage results are aggregated)")

if (BUILD_CODE_COVERAGE)
  # code_cov_tests: wipe the previous report, then run hipfft-test twice so
  # that raw profiles from both runs land in ./coverage-report/profraw.
  add_custom_target(
    code_cov_tests
    DEPENDS hipfft-test
    COMMAND ${CMAKE_COMMAND} -E rm -rf ./coverage-report
    COMMAND ${CMAKE_COMMAND} -E make_directory ./coverage-report/profraw
    COMMAND ${CMAKE_COMMAND} -E env LLVM_PROFILE_FILE="./coverage-report/profraw/hipfft-coverage_%p.profraw" GTEST_LISTENER=NO_PASS_LINE_IN_LOG $ --precompile=./clients/staging/hipfft-test-precompile.db ${COVERAGE_TEST_OPTIONS}
    COMMAND ${CMAKE_COMMAND} -E env LLVM_PROFILE_FILE="./coverage-report/profraw/hipfft-coverage_%p.profraw" GTEST_LISTENER=NO_PASS_LINE_IN_LOG $ --precompile=./clients/staging/hipfft-test-precompile-multi_gpu-plus-callback.db --gtest_filter=multi_gpu*:callback*
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
  )

  find_program(
    LLVM_PROFDATA
    llvm-profdata
    REQUIRED
    HINTS ${ROCM_PATH}/llvm/bin
    PATHS /opt/rocm/llvm/bin
  )

  find_program(
    LLVM_COV
    llvm-cov
    REQUIRED
    HINTS ${ROCM_PATH}/llvm/bin
    PATHS /opt/rocm/llvm/bin
  )

  # coverage: merge the raw profiles and emit both a textual summary and an
  # HTML report for the hipfft and hipfftw shared libraries.
  add_custom_target(
    coverage
    DEPENDS code_cov_tests
    COMMAND ${LLVM_PROFDATA} merge -sparse ./coverage-report/profraw/hipfft-coverage_*.profraw -o ./coverage-report/hipfft.profdata
    COMMAND ${LLVM_COV} report -object ./library/libhipfftw.so -object ./library/libhipfft.so -instr-profile=./coverage-report/hipfft.profdata
    COMMAND ${LLVM_COV} show -object ./library/libhipfftw.so -object ./library/libhipfft.so -instr-profile=./coverage-report/hipfft.profdata -format=html -output-dir=coverage-report
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
  )
endif()
hipFFT-rocm-7.1.0/clients/tests/accuracy_test_1D.cpp000066400000000000000000000734121506642153200222740ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include "../../shared/fft_params.h" #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" using ::testing::ValuesIn; // TODO: handle special case where length=2 for real/complex transforms.
const static std::vector pow2_range = {2, 4, 8, 16, 32, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912}; // 2^30 is 1073741824; const static std::vector pow2_range_half = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; const static std::vector pow3_range = {3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441, 1594323, 4782969, 14348907, 43046721, 129140163, 387420489}; const static std::vector pow5_range = {5, 25, 125, 625, 3125, 15625, 78125, 390625, 1953125, 9765625, 48828125, 244140625}; // radix 7, 11, 13 sizes that are either pure powers or sizes people have wanted in the wild const static std::vector radX_range = {7, 49, 84, 112, 11, 13, 52, 104, 208, 343, 2401, 16807}; const static std::vector mix_range = {6, 10, 12, 15, 20, 30, 56, 120, 150, 225, 240, 300, 336, 486, 600, 900, 1250, 1500, 1875, 2160, 2187, 2250, 2500, 3000, 4000, 12000, 24000, 72000}; const static std::vector prime_range = {17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}; static std::vector small_1D_sizes() { static const size_t SMALL_1D_MAX = 8192; // generate a list of sizes from 2 and up, skipping any sizes that are already covered std::vector covered_sizes = merge_and_sort_values( {pow2_range, pow3_range, pow5_range, radX_range, mix_range, prime_range}); std::vector output; for(size_t i = 2; i < SMALL_1D_MAX; ++i) { if(!std::binary_search(covered_sizes.begin(), covered_sizes.end(), i)) { output.push_back(i); } } return output; } const static std::vector> stride_range = {{1}}; const static std::vector batch_range_1D = {4, 2, 1}; const static std::vector> stride_range_for_prime = {{1}, {2}, {3}, {64}, {65}}; //TODO: this will be merged back to stride_range const static std::vector> ioffset_range_zero = {{0, 0}}; const static std::vector> ooffset_range_zero = {{0, 0}}; const static 
std::vector> ioffset_range = {{0, 0}, {1, 1}}; const static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P(pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, 
generate_lengths({pow5_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({radX_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({radX_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); 
INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); // small 1D sizes just need to make sure our factorization isn't // completely broken, so we just check simple C2C outplace interleaved const static std::vector small_1D_lengths = small_1D_sizes(); INSTANTIATE_TEST_SUITE_P( small_1D, accuracy_test, ::testing::ValuesIn(param_generator_base(test_prob, {fft_transform_type_complex_forward}, generate_lengths({small_1D_lengths}), {fft_precision_single}, {1}, generate_types, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, {fft_placement_notinplace}, false /* planar */)), accuracy_test::TestName); // NB: // We have known non-unit strides issues for 1D: // - C2C middle size(for instance, single precision, 8192) // - C2C large size(for instance, single precision, 524288) // We need to fix non-unit strides first, and then address non-unit strides + batch tests. // Then check these problems of R2C and C2R. After that, we could open arbitrary permutations in the // main tests. // // The below test covers non-unit strides, pow of 2, middle sizes, which has SBCC/SBRC kernels // involved. 
const static std::vector pow2_range_for_stride = {4096, 8192, 524288}; const static std::vector pow2_range_for_stride_half = {4096, 8192}; const static std::vector> stride_range_for_pow2 = {{2}, {3}}; const static std::vector batch_range_for_stride = {2, 1}; INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex, accuracy_test, ::testing::ValuesIn(param_generator_complex(test_prob, generate_lengths({pow2_range_for_stride}), precision_range_sp_dp, batch_range_for_stride, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real, accuracy_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride}), precision_range_sp_dp, batch_range_for_stride, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real_half, accuracy_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride_half}), {fft_precision_half}, batch_range_for_stride, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); // Create an array parameters for strided 2D batched transforms. 
//
// For every entry of v_lengths the entry is used as the transform length and is also
// reused as the input and output stride; lengths[0] becomes the batch count and the
// distance between batches is 1. Every combination of transform type (taken from
// trans_type_range_complex), precision, the types yielded by generate_types() and the
// input/output offsets is considered. A combination is kept only when its hashed roll
// does not exceed the scaled run probability and param.valid(0) accepts it.
//
// base_prob       - base probability of keeping a combination before scaling
// v_lengths       - literal lengths (1 to 3 entries each), not generate_lengths() input
// precision_range - precisions to iterate over
// ioffset_range   - input offsets to iterate over
// ooffset_range   - output offsets to iterate over
// place_range     - placements handed to generate_types()
//
// Returns the vector of accepted fft_params.
inline auto param_generator_complex_1d_batched_2d(const double base_prob,
                                                  const std::vector>& v_lengths,
                                                  const std::vector& precision_range,
                                                  const std::vector>& ioffset_range,
                                                  const std::vector>& ooffset_range,
                                                  const std::vector& place_range)
{
    std::vector params;

    for(const auto& transform_type : trans_type_range_complex)
    {
        for(const auto& lengths : v_lengths)
        {
            // try to ensure that we are given literal lengths, not
            // something to be passed to generate_lengths
            if(lengths.empty() || lengths.size() > 3)
            {
                assert(false);
                continue;
            }
            for(const auto precision : precision_range)
            {
                for(const auto& types : generate_types(transform_type, place_range, false))
                {
                    for(const auto& ioffset : ioffset_range)
                    {
                        for(const auto& ooffset : ooffset_range)
                        {
                            fft_params param;

                            param.length         = lengths;
                            param.istride        = lengths;
                            param.ostride        = lengths;
                            param.nbatch         = lengths[0];
                            param.precision      = precision;
                            param.transform_type = std::get<0>(types);
                            param.placement      = std::get<1>(types);
                            param.idist          = 1;
                            param.odist          = 1;
                            param.itype          = std::get<2>(types);
                            param.otype          = std::get<3>(types);
                            param.ioffset        = ioffset;
                            param.ooffset        = ooffset;

                            param.validate();

                            // The roll is derived from the seed and the problem token;
                            // the problem is dropped when the roll exceeds run_prob.
                            const double roll = hash_prob(random_seed, param.token());
                            const double run_prob
                                = base_prob
                                  * (param.is_planar() ? complex_planar_prob_factor : 1.0)
                                  * (param.is_interleaved() ? complex_interleaved_prob_factor
                                                            : 1.0)
                                  * (param.is_real() ? real_prob_factor : 1.0);

                            if(roll > run_prob)
                            {
                                if(verbose > 4)
                                {
                                    // Report the relation that actually triggered the
                                    // skip: run_prob is smaller than the roll.
                                    std::cout << "Test skipped (probability " << run_prob
                                              << " < " << roll << ")\n";
                                }
                                continue;
                            }
                            if(param.valid(0))
                            {
                                params.push_back(param);
                            }
                        }
                    }
                }
            }
        }
    }

    return params;
}

// Suites built from the generator above, one per family of lengths.
const static std::vector pow2_range_2D
    = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192};

INSTANTIATE_TEST_SUITE_P(
    pow2_1D_complex_batched_2D_strided, accuracy_test,
    ::testing::ValuesIn(param_generator_complex_1d_batched_2d(
        test_prob, generate_lengths({pow2_range_2D}), precision_range_sp_dp,
        ioffset_range_zero, ooffset_range_zero, place_range)),
    accuracy_test::TestName);

const static std::vector pow3_range_2D = {3, 27, 81, 243, 729, 2187, 6561};

INSTANTIATE_TEST_SUITE_P(
    pow3_1D_complex_batched_2D_strided, accuracy_test,
    ::testing::ValuesIn(param_generator_complex_1d_batched_2d(
        test_prob, generate_lengths({pow3_range_2D}), precision_range_sp_dp,
        ioffset_range_zero, ooffset_range_zero, place_range)),
    accuracy_test::TestName);

const static std::vector pow5_range_2D = {5, 25, 125, 625, 3125, 15625};

INSTANTIATE_TEST_SUITE_P(
    pow5_1D_complex_batched_2D_strided, accuracy_test,
    ::testing::ValuesIn(param_generator_complex_1d_batched_2d(
        test_prob, generate_lengths({pow5_range_2D}), precision_range_sp_dp,
        ioffset_range_zero, ooffset_range_zero, place_range)),
    accuracy_test::TestName);

const static std::vector prime_range_2D = {7, 11, 13, 17, 19, 23, 29, 263, 269, 271, 277};

INSTANTIATE_TEST_SUITE_P(
    prime_1D_complex_batched_2D_strided, accuracy_test,
    ::testing::ValuesIn(param_generator_complex_1d_batched_2d(
        test_prob, generate_lengths({prime_range_2D}), precision_range_sp_dp,
        ioffset_range_zero, ooffset_range_zero, place_range)),
    accuracy_test::TestName);

// Lengths for the suite that runs with fft_auto_allocation_off.
const static std::vector lengths_for_disabled_autoalloc = merge_and_sort_values(
    {pow2_range, pow3_range, pow5_range, radX_range, mix_range, small_1D_lengths, prime_range},
    128);

INSTANTIATE_TEST_SUITE_P(
    various_1D, accuracy_test,
    ::testing::ValuesIn(param_generator(
        test_prob, generate_lengths({lengths_for_disabled_autoalloc}), precision_range_sp_dp,
        batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero,
        place_range, false, false, fft_auto_allocation_off)),
    accuracy_test::TestName);
hipFFT-rocm-7.1.0/clients/tests/accuracy_test_2D.cpp000066400000000000000000000430601506642153200222710ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include "../../shared/fft_params.h" #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" using ::testing::ValuesIn; // Set parameters // TODO: enable 16384, 32768 when omp support is available (takes too // long!)
const static std::vector pow2_range = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}; // For the current configuration, half-precision has a fft size limit of 65536 const static std::vector pow2_range_half = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024}; const static std::vector pow3_range = {3, 27, 81, 243, 729, 2187, 6561}; const static std::vector pow5_range = {5, 25, 125, 625, 3125, 15625}; const static std::vector prime_range = {7, 11, 13, 17, 19, 23, 29, 263, 269, 271, 277}; const static std::vector mix_range = {56, 120, 336, 2160, 5000, 6000, 8000}; const static std::vector> stride_range = {{1}}; static std::vector> ioffset_range_zero = {{0, 0}}; static std::vector> ooffset_range_zero = {{0, 0}}; static std::vector> ioffset_range = {{0, 0}, {1, 1}}; static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P(pow2_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow2_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow2_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_2D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half, {2, 4, 8, 16, 32}}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_2D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half, {2, 4, 8, 16, 32}}), {fft_precision_half}, batch_range, stride_range, stride_range, 
ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range, pow3_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range, pow3_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range, pow5_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range, pow5_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range, prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range, prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, 
generate_lengths({mix_range, mix_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range, mix_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); // test length-1 on one dimension against a variety of non-1 lengths INSTANTIATE_TEST_SUITE_P(len1_2D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({{1}, {4, 8, 8192, 3, 27, 7, 11, 5000, 8000}}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); // length-1 on the other dimension INSTANTIATE_TEST_SUITE_P(len1_swap_2D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({{4, 8, 8192, 3, 27, 7, 11, 5000, 8000}, {1}}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); const static std::vector lengths_for_disabled_autoalloc = merge_and_sort_values({pow2_range, pow3_range, prime_range, mix_range}, 12); INSTANTIATE_TEST_SUITE_P( various_2D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({lengths_for_disabled_autoalloc, lengths_for_disabled_autoalloc}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false, fft_auto_allocation_off)), accuracy_test::TestName); hipFFT-rocm-7.1.0/clients/tests/accuracy_test_3D.cpp000066400000000000000000000335121506642153200222730ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include "../../shared/fft_params.h" #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" using ::testing::ValuesIn; // Set parameters // TODO: 512, 1024, 2048 make the tests take too long; re-enable when // test speed is improved. 
static std::vector pow2_range = {4, 8, 16, 32, 128, 256}; static std::vector pow2_range_half = {4, 8, 16, 32}; // SBCC+SBRC as a sub-node of a 3D TRTRTR std::vector> pow2_adhoc = {{4, 4, 8192}}; static std::vector pow3_range = {3, 9, 27, 81, 243}; static std::vector pow5_range = {5, 25, 125}; static std::vector prime_range = {7, 11, 13, 17, 19, 23, 29}; static std::vector> stride_range = {{1}}; static std::vector> ioffset_range_zero = {{0, 0}}; static std::vector> ooffset_range_zero = {{0, 0}}; static std::vector> ioffset_range = {{0, 0}, {1, 1}}; static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P( pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow2_range, pow2_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow2_range, pow2_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_3D_half, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_half, pow2_range_half, pow2_range_half}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_3D_half, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_half, pow2_range_half, pow2_range_half}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow3_3D, accuracy_test, 
::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range, pow3_range, pow3_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow3_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range, pow3_range, pow3_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range, pow5_range, pow5_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range, pow5_range, pow5_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( prime_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range, prime_range, prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_prime_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range, prime_range, prime_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( mix_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow3_range, 
prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_mix_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range, pow3_range, prime_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, false, false)), accuracy_test::TestName); // Test combinations of SBRC sizes, plus a non-SBRC size (10) to // exercise fused SBRC+transpose kernels. static std::vector sbrc_range = {50, 64, 81, 100, 200, 10, 128, 256}; static std::vector sbrc_batch_range = {2, 1}; INSTANTIATE_TEST_SUITE_P( sbrc_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({sbrc_range, sbrc_range, sbrc_range}), precision_range_sp_dp, sbrc_batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); // pick small sizes that will exercise 2D_SINGLE and a couple of sizes that won't static std::vector inner_batch_3D_range = {4, 8, 16, 32, 20, 24, 64}; static std::vector inner_batch_3D_batch_range = {3, 2, 1}; INSTANTIATE_TEST_SUITE_P( inner_batch_3D, accuracy_test, // TODO: enable for real as well, but currently real kernels have // trouble with weird strides ::testing::ValuesIn(param_generator_complex( test_prob, generate_lengths({inner_batch_3D_range, inner_batch_3D_range, inner_batch_3D_range}), precision_range_sp_dp, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, false, false)), accuracy_test::TestName); const static std::vector lengths_for_disabled_autoalloc = merge_and_sort_values( {pow2_range, pow3_range, pow5_range, prime_range, sbrc_range}, 5); INSTANTIATE_TEST_SUITE_P( various_3D, accuracy_test, 
::testing::ValuesIn(param_generator(test_prob, generate_lengths({lengths_for_disabled_autoalloc, lengths_for_disabled_autoalloc, lengths_for_disabled_autoalloc}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, false, fft_auto_allocation_off)), accuracy_test::TestName); hipFFT-rocm-7.1.0/clients/tests/accuracy_test_callback.cpp000066400000000000000000000136041506642153200235610ustar00rootroot00000000000000// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h"
#include "../../shared/params_gen.h"

// Problem lengths shared by the callback suites and the result-scaling suite.
//
// NOTE(review): element types are spelled out as size_t -- confirm against
// the parameter types declared in params_gen.h.
std::vector<std::vector<size_t>> callback_sizes = {
    // some single kernel sizes
    {4},
    {16},
    {81},
    {100},

    // L1D_TRTRT sizes
    {70},
    {77},
    {1344},

    // L1D_CC sizes
    {8192},
    {10000},

    // prime
    {23},
    {29},

    // 2D_SINGLE sizes, small and big
    {16, 8},
    {32, 32},
    {9, 81},
    {27, 81},
    {81, 27},
    {256, 9},
    {9, 256},
    {125, 32},
    {32, 125},

    // 2D_RTRT
    {20, 40},
    {81, 81},

    // 2D_RC
    {128, 64},
    {128, 256},

    // more complicated children of 2D_RTRT (L1D_TRTRT, L1D_CC, prime)
    {4, 63},
    {63, 4},
    {4, 8192},
    {8192, 4},
    {4, 23},
    {23, 4},

    // 3D_TRTRTR, with complicated children
    {63, 5, 6},
    {6, 5, 63},
    {23, 5, 6},
    {6, 5, 23},
    {70, 5, 6},
    {6, 5, 70},
    {8192, 5, 6},
    {6, 5, 8192},

    // 3D_RTRT, with complicated children
    {23, 4, 4},
    {4, 4, 23},
    {70, 4, 4},
    {4, 4, 70},
    {8192, 4, 4},
    {4, 4, 8192},

    // 3D odd lengths
    {27, 27, 27},

    // 3D_BLOCK_RC
    {64, 64, 64},
};

const static std::vector<std::vector<size_t>> stride_range = {{1}};

const static std::vector<std::vector<size_t>> ioffset_range_zero = {{0, 0}};
const static std::vector<std::vector<size_t>> ooffset_range_zero = {{0, 0}};

const static std::vector<std::vector<size_t>> ioffset_range = {{0, 0}, {1, 1}};
const static std::vector<std::vector<size_t>> ooffset_range = {{0, 0}, {1, 1}};

// Only forward transforms are listed for the callback suites.
auto transform_types = {fft_transform_type_complex_forward, fft_transform_type_real_forward};

// The callback suites are only compiled when __HIP__ is defined.  On WIN32 the
// zero-offset suite is registered under a DISABLED_ name instead of "callback".
#ifdef __HIP__
INSTANTIATE_TEST_SUITE_P(
#ifdef WIN32
    DISABLED_callback_no_offset,
#else
    callback,
#endif
    accuracy_test,
    ::testing::ValuesIn(param_generator_base(test_prob,
                                             transform_types,
                                             callback_sizes,
                                             precision_range_sp_dp,
                                             batch_range,
                                             generate_types,
                                             stride_range,
                                             stride_range,
                                             ioffset_range_zero,
                                             ooffset_range_zero,
                                             place_range,
                                             false,
                                             true)),
    accuracy_test::TestName);

// Same as above but also sweeping the non-zero offsets; disabled by default.
INSTANTIATE_TEST_SUITE_P(DISABLED_callback,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_base(test_prob,
                                                                  transform_types,
                                                                  callback_sizes,
                                                                  precision_range_sp_dp,
                                                                  batch_range,
                                                                  generate_types,
                                                                  stride_range,
                                                                  stride_range,
                                                                  ioffset_range,
                                                                  ooffset_range,
                                                                  place_range,
                                                                  false,
                                                                  true)),
                         accuracy_test::TestName);
#endif

// one of the obvious use cases for callbacks is to implement result
// scaling manually, so use the same sizes to test rocFFT's own
// result scaling feature.
//
// Builds the parameter set for the given problem lengths via
// param_generator() (zero offsets, unit stride) and sets scale_factor to
// 7.23 on every generated entry.
//
// v_lengths: list of problem lengths to generate test parameters for.
// Returns whatever container param_generator() returns, with scale_factor
// overwritten on each element.
inline auto param_generator_scaling(const std::vector<std::vector<size_t>>& v_lengths)
{
    // Use the caller-supplied lengths rather than reaching for the global
    // callback_sizes, so the parameter is actually honoured.
    auto params = param_generator(test_prob,
                                  v_lengths,
                                  precision_range_sp_dp,
                                  batch_range,
                                  stride_range,
                                  stride_range,
                                  ioffset_range_zero,
                                  ooffset_range_zero,
                                  place_range,
                                  false);

    for(auto& param : params)
        param.scale_factor = 7.23;
    return params;
}

// cuFFT does not support result scaling
#ifndef _CUFFT_BACKEND
INSTANTIATE_TEST_SUITE_P(scaling,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_scaling(callback_sizes)),
                         accuracy_test::TestName);
#endif
hipFFT-rocm-7.1.0/clients/tests/gtest_main.cpp000066400000000000000000000621531506642153200212510ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
/// @file
/// @brief googletest based unit tester for hipfft
///

// NOTE(review): the targets of the eight directives below are not present in
// this view of the file -- restore them from the original source.
#include
#include
#include
#include
#include
#include
#include
#include

#include "../../shared/CLI11.hpp"
#include "../../shared/concurrency.h"
#include "../../shared/device_properties.h"
#include "../../shared/environment.h"
#include "../../shared/hostbuf.h"
#include "../../shared/sys_mem.h"
#include "../../shared/work_queue.h"
#include "../hipfft_params.h"
#include "hipfft/hipfft.h"
#include "hipfft_accuracy_test.h"

// initialize static class member of hipfft_params
// NOTE(review): the container's element type is not visible here -- confirm
// against the declaration in hipfft_params.h.
std::vector hipfft_params::externally_managed_workareas = std::vector();

// Control output verbosity:
int verbose;

// User-defined random seed
size_t random_seed;
// Random device that supplies the default value of the --seed option.
std::random_device default_seed_dev;

// Overall probability of running conventional tests
double test_prob;

// Modifier for probability of running tests with complex interleaved data
double complex_interleaved_prob_factor;

// Modifier for probability of running tests with real data
double real_prob_factor;

// Modifier for probability of running tests with complex planar data
double complex_planar_prob_factor;

// Modifier for probability of running tests with callbacks
double callback_prob_factor;

// Constraints for the hipfftw tests
size_t max_length_for_hipfftw_test;
size_t max_io_gb_for_hipfftw_test;

// Transform parameters for manual test:
hipfft_params manual_params;

// Host memory limitation for tests (GiB):
size_t ramgb;

// Device memory limitation for tests (GiB):
size_t vramgb;

// Allow skipping tests if there is a runtime error
bool skip_runtime_fails;
// But count the number of failures
int n_hip_failures = 0;

// Manually specified precision cutoffs:
double single_epsilon;
double double_epsilon;
double half_epsilon;

// Measured precision cutoffs:
double max_linf_eps_double = 0.0;
double max_l2_eps_double = 0.0;
double max_linf_eps_single = 0.0;
double max_l2_eps_single = 0.0;
double max_linf_eps_half = 0.0;
double max_l2_eps_half = 0.0;

// Control whether we use FFTW's wisdom (which we use to imply FFTW_MEASURE).
bool use_fftw_wisdom = false;

// Compare results against FFTW in accuracy tests
bool fftw_compare = true;

// Cache the last cpu fft that was requested
last_cpu_fft_cache last_cpu_fft_data;

// Multi-process library to use
fft_params::fft_mp_lib mp_lib = fft_params::fft_mp_lib_none;

// Number of multi-process ranks to launch
int mp_ranks = 1;

// Multi-process launch command (e.g. mpirun --np 4 /path/to/hipfft_mpi_worker)
std::string mp_launch;

// Populate gtest's per-test "should run" state without executing any test.
//
// Temporarily sets the list_tests flag, points the stdout file descriptor at
// the null device so the resulting listing is not shown, calls RUN_ALL_TESTS()
// once, and then restores both the descriptor and the flag.
//
// NOTE(review): the results of open()/dup()/dup2() are not checked, and stdout
// is not flushed before the descriptor is swapped, so output still sitting in
// a stdio buffer at that point may end up in the null device -- confirm
// whether that matters for redirected (non-terminal) output.
void init_gtest_flags()
{
    // HACK: gtest maintains a "should run" flag on each test case,
    // but only sets it during RUN_ALL_TESTS.  Precompiling should
    // ideally only happen for the test cases that would actually
    // run.
    //
    // So call RUN_ALL_TESTS once with the "list tests" temporarily set
    // to true, to initialize all of that.
    //
    // gtest will then print all of the test cases to stdout.
    // Temporarily redirect stdout to /dev/null as well.
    bool temp_list_tests = true;
    // after this swap temp_list_tests holds the flag's previous value
    std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests));

    // move stdout to devnull
#ifdef WIN32
    int stdout_fd = _fileno(stdout);
    int devnull = _open("NUL", _O_WRONLY);
    int stdout_copy = _dup(stdout_fd);
    _dup2(devnull, stdout_fd);
#else
    int stdout_fd = STDOUT_FILENO;
    int devnull = open("/dev/null", O_WRONLY);
    int stdout_copy = dup(stdout_fd);
    dup2(devnull, stdout_fd);
#endif

    // return value is deliberately discarded; this call only lists tests
    (void)RUN_ALL_TESTS();

    // put stdout back
#ifdef WIN32
    _dup2(stdout_copy, stdout_fd);
    _close(stdout_copy);
    _close(devnull);
#else
    dup2(stdout_copy, stdout_fd);
    close(stdout_copy);
    close(devnull);
#endif

    // swap the saved value back into the flag
    std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests));
}

// Create FFT plans up front for every accuracy test case that gtest would run.
//
// Steps, as implemented below:
//  1. init_gtest_flags() is called so that should_run() is meaningful.
//  2. For each registered test whose name contains "vs_fftw/", the text after
//     that marker is taken as a token and its "_batch_<n>" field is rewritten
//     to batch 1; names without a parsable batch field are skipped.
//  3. The tokens are shuffled and pushed onto a work queue.
//  4. rocfft_concurrency() worker threads pop tokens and create the plan for
//     each one, plus the plan for the matching inverse transform when the
//     token describes a forward transform.  An empty token ends a worker.
//
// ROCFFT_INTERNAL_COMPILE_ONLY is set to "1" through an EnvironmentSetTemp
// object for the rest of this function (presumably restored when that object
// goes out of scope -- confirm in environment.h).
//
// precompile_file: not referenced in this function body.
void precompile_test_kernels(const std::string& precompile_file)
{
    std::cout << "precompiling test kernels...\n";
    WorkQueue tokenQueue;

    init_gtest_flags();

    std::vector tokens;
    auto ut = testing::UnitTest::GetInstance();

    for(int ts_index = 0; ts_index < ut->total_test_suite_count(); ++ts_index)
    {
        const auto ts = ut->GetTestSuite(ts_index);
        for(int ti_index = 0; ti_index < ts->total_test_count(); ++ti_index)
        {
            const auto ti = ts->GetTestInfo(ti_index);
            std::string name = ti->name();
            // only precompile test cases that will run
            if(!ti->should_run())
                continue;
            // only care about accuracy tests
            const auto pos = name.find("vs_fftw/");
            if(pos != std::string::npos)
            {
                // drop everything up to and including "vs_fftw/" (8 characters)
                name.erase(0, pos + 8);
                // change batch to 1, so we don't waste time creating
                // multiple plans that differ only by batch
                auto idx = name.find("_batch_");
                if(idx == std::string::npos)
                    continue;
                // advance idx to batch number (7 == length of "_batch_")
                idx += 7;
                auto end = name.find('_', idx);
                if(end == std::string::npos)
                    continue;
                name.replace(idx, end - idx, "1");
                tokens.emplace_back(std::move(name));
            }
        }
    }

    // shuffle the tokens with a freshly seeded engine before queueing them
    std::random_device dev;
    std::mt19937 dist(dev());
    std::shuffle(tokens.begin(), tokens.end(), dist);
    auto precompile_begin = std::chrono::steady_clock::now();
    std::cout << "precompiling kernels for " << tokens.size() << " tokens...\n";
    for(auto&& t : tokens)
        tokenQueue.push(std::move(t));

    EnvironmentSetTemp env_compile_only{"ROCFFT_INTERNAL_COMPILE_ONLY", "1"};

    const size_t NUM_THREADS = rocfft_concurrency();
    std::vector threads;
    for(size_t i = 0; i < NUM_THREADS; ++i)
    {
        threads.emplace_back([&tokenQueue]() {
            for(;;)
            {
                std::string token{tokenQueue.pop()};
                // an empty token is the stop signal for this worker
                if(token.empty())
                    break;

                try
                {
                    hipfft_params params;
                    params.from_token(token);
                    params.validate();
                    params.create_plan();
                    if(params.is_forward())
                    {
                        hipfft_params inverse_params;
                        inverse_params.inverse_from_forward(params);
                        inverse_params.validate();
                        inverse_params.create_plan();
                    }
                }
                catch(fft_params::work_buffer_alloc_failure&)
                {
                    // work buffer allocation failures are ignored here; move
                    // on to the next token
                    continue;
                }
                catch(std::exception& e)
                {
                    // failed to create a plan, abort
                    //
                    // we could continue on, but the test should just
                    // fail later anyway in the same way.  so report
                    // which token failed early and get out
                    //
                    // NOTE(review): nothing in this lambda catches the
                    // exception thrown below.
                    throw std::runtime_error(token + " plan creation failure: " + e.what());
                }
            }
        });
        // insert empty tokens to tell threads to stop
        // (one is pushed per worker created)
        tokenQueue.push({});
    }
    for(auto& t : threads)
        t.join();

    auto precompile_end = std::chrono::steady_clock::now();
    std::chrono::duration precompile_ms = precompile_end - precompile_begin;
    std::cout << "done precompiling FFT plans in " << static_cast(precompile_ms.count()) << " ms\n";
}

int main(int argc, char* argv[])
{
    CLI::App app{
        "\n"
        "hipFFT Runtime Test command line options\n"
        "NB: input parameters are row-major.\n"
        "\n"
        "FFTW accuracy test cases are named using these identifiers:\n"
        "\n"
        " len_: problem dimensions, row-major\n"
        " single,double: precision\n"
        " ip,op: in-place or out-of-place\n"
        " batch_: batch size\n"
        " istride__: input stride (ostride for output stride), format may be:\n"
        " CI - complex interleaved\n"
        " CP - complex planar\n"
        " R - real\n"
        " HI - hermitian interleaved\n"
        " HP - hermitian planar\n"
        "\n"
        "Usage"};

    // Override CLI11 help to print it along gtest's help
    app.set_help_flag("");
    const auto opt_help = app.add_flag("-h, --help", "Produces this help message");

    app.add_option("-v, --verbose", verbose, "Print out detailed information for the tests")
        ->default_val(0);
    app.add_option("--test_prob", test_prob, "Probability of running individual tests")
        ->default_val(1.0)
        ->check(CLI::Range(0.0, 1.0));
    app.add_option("--real_prob",
                   real_prob_factor,
                   "Probability multiplier for running individual real/complex transforms")
        ->default_val(1.0)
        ->check(CLI::PositiveNumber);
    app.add_option("--planar_prob",
                   complex_planar_prob_factor,
                   "Probability multiplier for running individual planar transforms")
        ->default_val(0.1)
        ->check(CLI::PositiveNumber);
    app.add_option(
           "--complex_interleaved_prob_factor",
           complex_interleaved_prob_factor,
           "Probability multiplier for running individual transforms with complex interleaved data")
        ->default_val(1)
        ->check(CLI::PositiveNumber);
app.add_option("--callback_prob", callback_prob_factor, "Probability multiplier for running individual callback transforms") ->default_val(0.1) ->check(CLI::NonNegativeNumber); app.add_option("--max_hipfftw_test_len", max_length_for_hipfftw_test, "Maximum length to be considered in hipfftw tests") ->default_val(8192) ->check(CLI::PositiveNumber); app.add_option("--max_io_gb_for_hipfftw_test", max_io_gb_for_hipfftw_test, "Maximum size of I/O to be considered in hipfftw tests in GiB") ->default_val(1) /* 1 GiB */ ->check(CLI::PositiveNumber); app.add_option("--fftw_compare", fftw_compare, "Compare to FFTW in accuracy tests") ->default_val(true); app.add_option("--mp_lib", mp_lib, "Multi-process library type: none (default), mpi") ->default_val("none"); app.add_option("--mp_ranks", mp_ranks, "Number of multi-process ranks to launch") ->default_val(1) ->check(CLI::NonNegativeNumber); app.add_option("--mp_launch", mp_launch, "Command line prefix to launch multi-process transforms, e.g. \"mpirun --np 4 " "/path/to/hipfft_mpi_worker\"") ->default_val("") ->each([&](const std::string&) { if(mp_lib == fft_params::fft_mp_lib_none) { throw CLI::ValidationError( "--mp_launch requires an mp library (see mp_lib in --help)"); } }) ->needs("--mp_lib"); app.add_option("--seed", random_seed, "Random seed; if unset, use an actual random seed") ->default_val(default_seed_dev()); app.add_flag("--smoketest", "Run a short (approx 5 minute) randomized selection of tests") ->each([&](const std::string&) { // The objective is to have an test that takes about 5 minutes, so just set the // probability per test to a small value to achieve this result. test_prob = 0.002; }); // Token string to fully specify fft params for the manual test. 
std::string test_token; auto* opt_token = app.add_option("--token", test_token, "Test token name for manual test")->default_val(""); // Group together options that conflict with --token auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token"); non_token->excludes(opt_token); // Declare the supported options. Some option pointers are declared to track passed opts. non_token->add_flag("--callback", "Inject load/store callbacks")->each([&](const std::string&) { manual_params.run_callbacks = true; }); non_token ->add_option("--auto_allocation", manual_params.auto_allocate, "hipFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") ->default_val("default"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)") ->each([&](const std::string&) { manual_params.precision = fft_precision_double; }); non_token ->add_option("-t, --transformType", manual_params.transform_type, "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ->default_val(fft_transform_type_complex_forward); non_token ->add_option("--precision", manual_params.precision, "Transform precision: single (default), double, half") ->excludes("--double"); non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)") ->each([&](const std::string&) { manual_params.placement = fft_placement_notinplace; }); non_token ->add_option("--itype", manual_params.itype, "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token ->add_option("--otype", manual_params.otype, "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token->add_option("--length", manual_params.length, "Lengths")->expected(1, 3); non_token ->add_option("-b, --batchSize", manual_params.nbatch, 
"If this value is greater than one, arrays will be used") ->default_val(1); non_token->add_option("--istride", manual_params.istride, "Input stride"); non_token->add_option("--ostride", manual_params.ostride, "Output stride"); non_token->add_option("--idist", manual_params.idist, "Logical distance between input batches") ->default_val(0); non_token->add_option("--odist", manual_params.odist, "Logical distance between output batches") ->default_val(0); non_token->add_option("--ioffset", manual_params.ioffset, "Input offset"); non_token->add_option("--ooffset", manual_params.ooffset, "Output offset"); non_token->add_option("--isize", manual_params.isize, "Logical size of input buffer"); non_token->add_option("--osize", manual_params.osize, "Logical size of output buffer"); non_token->add_option( "--scalefactor", manual_params.scale_factor, "Scale factor to apply to output"); // Default value is set in fft_params.h based on if device-side PRNG was enabled. non_token->add_option("-g, --inputGen", manual_params.igen, "Input data generation:\n0) PRNG sequence (device)\n" "1) PRNG sequence (host)\n" "2) linearly-spaced sequence (device)\n" "3) linearly-spaced sequence (host)"); const auto* opt_version = app.add_flag( "--version", "Print queryable version information from the hipfft library's backend (and return)"); app.add_option("--R", ramgb, "RAM limit in GiB for tests") ->default_val(host_memory::singleton().get_total_gbytes()); app.add_option("--V", vramgb, "VRAM limit in GiB for tests")->default_val(0); app.add_option("--half_epsilon", half_epsilon)->default_val(9.77e-4); app.add_option("--single_epsilon", single_epsilon)->default_val(3.75e-5); app.add_option("--double_epsilon", double_epsilon)->default_val(1e-15); app.add_option("--skip_runtime_fails", skip_runtime_fails, "Skip the test if there is a runtime failure") ->default_val(true); app.add_option("-w, --wise", use_fftw_wisdom, "Use FFTW wisdom"); // Filename for fftw and fftwf wisdom. 
std::string fftw_wisdom_filename; app.add_option("-W, --wisdomfile", fftw_wisdom_filename, "FFTW3 wisdom filename") ->default_val("wisdom3.txt"); // Filename for precompiled kernels to be written to std::string precompile_file; app.add_option("--precompile", precompile_file, "Precompile kernels to a file for all test cases before running tests") ->default_val(""); // Try parsing initial args that will be used to configure tests // Allow extras to pass on gtest arguments without error app.allow_extras(); try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } if(!test_token.empty()) { std::cout << "Reading fft params from token:\n" << test_token << std::endl; try { manual_params.from_token(test_token); std::cout << "manual_params.token() = " << manual_params.token() << std::endl; } catch(...) { std::cout << "Unable to parse token." << std::endl; return 1; } } if(manual_params.length.empty()) { manual_params.length.push_back(8); // TODO: add random size? } if(manual_params.istride.empty()) { manual_params.istride.push_back(1); // TODO: add random size? } if(manual_params.ostride.empty()) { manual_params.ostride.push_back(1); // TODO: add random size? } // User-settable options defining the values of all the actual test parameters // (e.g., probability factors and value of manual_params) must be handled // before invoking ::testing::InitGoogleTest as it triggers evaluation of said // parameters (e.g., args of "::testing::Values{In}" in instantiations of test // suites). 
// set any "unset" parameters of manual_params before initiating gtests // (makes the token reported by gtest less ambiguous) manual_params.validate(); // extract remaining arguments for subsequent gtest initialization std::vector remaining_args = app.remaining(); std::string gtest_help_opt = "--help"; // NB: If we initialize gtest first, then it removes all of its own command-line // arguments and sets argc and argv correctly; std::vector gtest_argv; gtest_argv.insert(gtest_argv.begin(), argv[0]); for(std::string& s : remaining_args) { gtest_argv.push_back(&s[0]); } if(*opt_help) { // make sure gtest prints its help as well gtest_argv.push_back(>est_help_opt[0]); } gtest_argv.push_back(NULL); decltype(argc) gtest_argc = gtest_argv.size() - 1; ::testing::InitGoogleTest(>est_argc, gtest_argv.data()); // gtest-relevant args are removed if(*opt_help) { std::cout << app.help() << "\n"; return EXIT_SUCCESS; } // no help was used, gtest_argc is expected to be 1 at this point. If not, some of the // used options were not recognized at all if(gtest_argc > 1) { std::cout << "Unrecognised option(s) found:\n "; for(auto i = 1; i < gtest_argc; i++) std::cout << gtest_argv[i] << " "; std::cout << "\nRun with --help for more information.\n"; return EXIT_FAILURE; } if(*opt_version || verbose > 0) { int hipfft_version; hipfftGetVersion(&hipfft_version); std::cout << "hipFFT version: " << hipfft_version << std::endl; if(*opt_version) { return EXIT_SUCCESS; } } std::cout << "Using random_seed = " << random_seed << std::endl; std::cout << "half epsilon: " << half_epsilon << "\tsingle epsilon: " << single_epsilon << "\tdouble epsilon: " << double_epsilon << std::endl; // if precompiling, tell rocFFT to use the specified cache file // to write kernels to // // but if our environment already has a cache file for RTC, then // we should just use that std::unique_ptr env_precompile; if(!precompile_file.empty() && rocfft_getenv("ROCFFT_RTC_CACHE_PATH").empty()) { env_precompile = 
std::make_unique("ROCFFT_RTC_CACHE_PATH", precompile_file.c_str()); } #ifdef FFTW_MULTITHREAD fftw_init_threads(); fftwf_init_threads(); fftw_plan_with_nthreads(rocfft_concurrency()); fftwf_plan_with_nthreads(rocfft_concurrency()); #endif // Set host memory limit from command-line options host_memory::singleton().set_limit_gbytes(ramgb); std::cout << "Host memory limit: " << ramgb << " GiB" << std::endl; if(use_fftw_wisdom) { if(verbose) { std::cout << "Using " << fftw_wisdom_filename << " wisdom file\n"; } std::ifstream fftw_wisdom_file(fftw_wisdom_filename); std::string allwisdom = std::string(std::istreambuf_iterator(fftw_wisdom_file), std::istreambuf_iterator()); std::string fftw_wisdom; std::string fftwf_wisdom; bool load_wisdom = false; bool load_fwisdom = false; std::istringstream input; input.str(allwisdom); // Separate the single-precision and double-precision wisdom: for(std::string line; std::getline(input, line);) { if(line.rfind("(fftw", 0) == 0 && line.find("fftw_wisdom") != std::string::npos) { load_wisdom = true; } if(line.rfind("(fftw", 0) == 0 && line.find("fftwf_wisdom") != std::string::npos) { load_fwisdom = true; } if(load_wisdom) { fftw_wisdom.append(line + "\n"); } if(load_fwisdom) { fftwf_wisdom.append(line + "\n"); } if(line.rfind(")", 0) == 0) { load_wisdom = false; load_fwisdom = false; } } fftw_import_wisdom_from_string(fftw_wisdom.c_str()); fftwf_import_wisdom_from_string(fftwf_wisdom.c_str()); } if(!precompile_file.empty()) precompile_test_kernels(precompile_file); auto retval = RUN_ALL_TESTS(); if(use_fftw_wisdom) { std::string fftw_wisdom = std::string(fftw_export_wisdom_to_string()); std::string fftwf_wisdom = std::string(fftwf_export_wisdom_to_string()); fftw_wisdom.append(std::string(fftwf_export_wisdom_to_string())); std::ofstream fftw_wisdom_file(fftw_wisdom_filename); fftw_wisdom_file << fftw_wisdom; fftw_wisdom_file << fftwf_wisdom; fftw_wisdom_file.close(); } std::cout << "half precision max l-inf epsilon: " << 
max_linf_eps_half << std::endl; std::cout << "half precision max l2 epsilon: " << max_l2_eps_half << std::endl; std::cout << "single precision max l-inf epsilon: " << max_linf_eps_single << std::endl; std::cout << "single precision max l2 epsilon: " << max_l2_eps_single << std::endl; std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << std::endl; std::cout << "double precision max l2 epsilon: " << max_l2_eps_double << std::endl; hipfft_params::externally_managed_workareas.clear(); return retval; } // instantiation of the paramameterized accuracy_test for the // configuration set manually: INSTANTIATE_TEST_SUITE_P(manual, accuracy_test, ::testing::Values(static_cast(manual_params)), accuracy_test::TestName); hipFFT-rocm-7.1.0/clients/tests/hipfft_accuracy_test.cpp000066400000000000000000001010361506642153200233020ustar00rootroot00000000000000// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include #include #include #include #include "hipfft/hipfft.h" #include "../hipfft_params.h" #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/gpubuf.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" #include "../../shared/rocfft_complex.h" #include "../../shared/subprocess.h" extern std::string mp_launch; extern last_cpu_fft_cache last_cpu_fft_data; // clang-format off // tokens of tests found to be symptomatic static const std::vector symptomatic_tokens = { #ifndef _CUFFT_BACKEND // cases specific to ROCM backend #else // cases specific to CUFFT backend "real_forward_len_16384_half_ip_batch_4_istride_1_R_ostride_1_HI_idist_16386_odist_8193_ioffset_0_0_ooffset_0_0", "real_forward_len_32768_half_ip_batch_4_istride_1_R_ostride_1_HI_idist_32770_odist_16385_ioffset_0_0_ooffset_0_0", "real_forward_len_65536_half_ip_batch_2_istride_1_R_ostride_1_HI_idist_65538_odist_32769_ioffset_0_0_ooffset_0_0", #endif // common to both backends }; // clang-format on void fft_vs_reference(hipfft_params& params, bool round_trip) { switch(params.precision) { case fft_precision_half: fft_vs_reference_impl(params, round_trip); break; case fft_precision_single: fft_vs_reference_impl(params, round_trip); break; case fft_precision_double: fft_vs_reference_impl(params, round_trip); break; } } // Test for comparison between FFTW and hipFFT. TEST_P(accuracy_test, vs_fftw) { hipfft_params params(GetParam()); params.validate(); if(!params.valid(verbose)) { if(verbose) { std::cout << "Invalid parameters, skip this test." 
<< std::endl; } GTEST_SKIP(); } switch(params.mp_lib) { case fft_params::fft_mp_lib_none: { // skipping symptomatic case(s), unless forcefully/knowingly executing normally-disabled // test tokens (e.g., by using --gtest_also_run_disabled_tests) const char* test_suite_name = ::testing::UnitTest::GetInstance()->current_test_info()->test_suite_name(); if(!symptomatic_tokens.empty() && std::strstr(test_suite_name, "DISABLED") == nullptr && std::find(symptomatic_tokens.begin(), symptomatic_tokens.end(), params.token()) != symptomatic_tokens.end()) { GTEST_SKIP() << "Symptomatic test that's currently disabled by default (force-skipping). Use " "CLI arguments '--gtest_also_run_disabled_tests' to force the test execution " "(via another test suite)."; } // only do round trip for forward FFTs const bool do_round_trip = params.is_forward(); try { fft_vs_reference(params, do_round_trip); } catch(HOSTBUF_MEM_USAGE& e) { // explicitly clear cache last_cpu_fft_data = last_cpu_fft_cache(); GTEST_SKIP() << e.msg; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(const fft_params::unimplemented_exception& e) { GTEST_SKIP() << "Unimplemented exception: " << e.what(); } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } break; } case fft_params::fft_mp_lib_mpi: { // split launcher into tokens since the first one is the exe // and the remainder is the start of its argv boost::escaped_list_separator sep('\\', ' ', '\"'); boost::tokenizer> tokenizer(mp_launch, sep); std::string exe; std::vector argv; for(auto t : tokenizer) { if(t.empty()) continue; if(exe.empty()) exe = t; else argv.push_back(t); } // append test token and ask for accuracy test argv.push_back("--token"); argv.push_back(params.token()); argv.push_back("--accuracy"); // throws an exception if launch fails or if subprocess // returns nonzero exit code execute_subprocess(exe, argv, {}); break; } default: GTEST_FAIL() << "Invalid communicator choice!"; break; } SUCCEED(); } 
INSTANTIATE_TEST_SUITE_P(DISABLED_symptomatic_tokens, accuracy_test, ::testing::ValuesIn(param_generator_token(test_prob, symptomatic_tokens)), accuracy_test::TestName); #ifdef __HIP__ // load/store callbacks - cbdata in each is actually a scalar double // with a number to apply to each element template __host__ __device__ Tdata load_callback(Tdata* input, size_t offset, void* cbdata, void* sharedMem) { auto testdata = static_cast(cbdata); // multiply each element by scalar if(input == testdata->base) return input[offset] * testdata->scalar; // wrong base address passed, return something obviously wrong else { // wrong base address passed, return something obviously wrong return input[0]; } } __device__ auto load_callback_dev_half = load_callback; __device__ auto load_callback_dev_complex_half = load_callback>; __device__ auto load_callback_dev_float = load_callback; __device__ auto load_callback_dev_complex_float = load_callback>; __device__ auto load_callback_dev_double = load_callback; __device__ auto load_callback_dev_complex_double = load_callback>; // load/store callbacks - cbdata in each is actually a scalar double // with a number to apply to each element template __host__ __device__ Tdata load_callback_round_trip_inverse(Tdata* input, size_t offset, void* cbdata, void* sharedMem) { auto testdata = static_cast(cbdata); // subtract each element by scalar if(input == testdata->base) return input[offset] - testdata->scalar; // wrong base address passed, return something obviously wrong else { // wrong base address passed, return something obviously wrong return input[0]; } } __device__ auto load_callback_round_trip_inverse_dev_half = load_callback_round_trip_inverse; __device__ auto load_callback_round_trip_inverse_dev_complex_half = load_callback_round_trip_inverse>; __device__ auto load_callback_round_trip_inverse_dev_float = load_callback_round_trip_inverse; __device__ auto load_callback_round_trip_inverse_dev_complex_float = 
load_callback_round_trip_inverse>; __device__ auto load_callback_round_trip_inverse_dev_double = load_callback_round_trip_inverse; __device__ auto load_callback_round_trip_inverse_dev_complex_double = load_callback_round_trip_inverse>; void* get_load_callback_host(fft_array_type itype, fft_precision precision, bool round_trip_inverse = false) { void* load_callback_host = nullptr; switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(precision) { case fft_precision_half: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_half), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_complex_half), sizeof(void*)), hipSuccess); } return load_callback_host; case fft_precision_single: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_float), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_complex_float), sizeof(void*)), hipSuccess); } return load_callback_host; case fft_precision_double: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_double), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_complex_double), sizeof(void*)), hipSuccess); } return load_callback_host; } } case fft_array_type_real: { switch(precision) { case fft_precision_half: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_half), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_half), sizeof(void*)), hipSuccess); } return load_callback_host; case 
fft_precision_single: if(round_trip_inverse) { EXPECT_EQ( hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_float), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_float), sizeof(void*)), hipSuccess); } return load_callback_host; case fft_precision_double: if(round_trip_inverse) { EXPECT_EQ( hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_double), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_double), sizeof(void*)), hipSuccess); } return load_callback_host; } } default: // planar is unsupported for now return load_callback_host; } } template __host__ __device__ static void store_callback(Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem) { auto testdata = static_cast(cbdata); // add scalar to each element if(output == testdata->base) { output[offset] = element + testdata->scalar; } // otherwise, wrong base address passed, just don't write } __device__ auto store_callback_dev_half = store_callback; __device__ auto store_callback_dev_complex_half = store_callback>; __device__ auto store_callback_dev_float = store_callback; __device__ auto store_callback_dev_complex_float = store_callback>; __device__ auto store_callback_dev_double = store_callback; __device__ auto store_callback_dev_complex_double = store_callback>; template __host__ __device__ static void store_callback_round_trip_inverse( Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem) { auto testdata = static_cast(cbdata); // divide each element by scalar if(output == testdata->base) { output[offset] = element / testdata->scalar; } // otherwise, wrong base address passed, just don't write } __device__ auto store_callback_round_trip_inverse_dev_half = store_callback_round_trip_inverse; __device__ auto 
store_callback_round_trip_inverse_dev_complex_half = store_callback_round_trip_inverse>; __device__ auto store_callback_round_trip_inverse_dev_float = store_callback_round_trip_inverse; __device__ auto store_callback_round_trip_inverse_dev_complex_float = store_callback_round_trip_inverse>; __device__ auto store_callback_round_trip_inverse_dev_double = store_callback_round_trip_inverse; __device__ auto store_callback_round_trip_inverse_dev_complex_double = store_callback_round_trip_inverse>; void* get_store_callback_host(fft_array_type otype, fft_precision precision, bool round_trip_inverse = false) { void* store_callback_host = nullptr; switch(otype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(precision) { case fft_precision_half: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_half), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_complex_half), sizeof(void*)), hipSuccess); } return store_callback_host; case fft_precision_single: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_float), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_complex_float), sizeof(void*)), hipSuccess); } return store_callback_host; case fft_precision_double: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_double), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_complex_double), sizeof(void*)), hipSuccess); } return store_callback_host; } } case fft_array_type_real: { switch(precision) { case fft_precision_half: if(round_trip_inverse) { EXPECT_EQ( 
hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_half), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_half), sizeof(void*)), hipSuccess); } return store_callback_host; case fft_precision_single: if(round_trip_inverse) { EXPECT_EQ( hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_float), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_float), sizeof(void*)), hipSuccess); } return store_callback_host; case fft_precision_double: if(round_trip_inverse) { EXPECT_EQ( hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_double), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_double), sizeof(void*)), hipSuccess); } return store_callback_host; } } default: // planar is unsupported for now return store_callback_host; } } // implement result scaling as a store callback, as rocFFT tests do void apply_store_callback(const fft_params& params, std::vector& output) { if(!params.run_callbacks && params.scale_factor == 1.0) return; callback_test_data cbdata; cbdata.scalar = params.store_cb_scalar; cbdata.base = output.front().data(); switch(params.otype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size 
= sizeof(std::complex); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } } } break; case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: { // planar wouldn't run callbacks, but we could still want scaling switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(std::complex); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; } } break; } case fft_precision_single: { const size_t elem_size = sizeof(std::complex); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; } } break; } case fft_precision_double: { const size_t elem_size = sizeof(std::complex); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = 
output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; } } break; } } } break; case fft_array_type_real: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(rocfft_fp16); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(float); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(double); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } } } break; default: // this is FFTW data which should always be interleaved (if complex) abort(); } } // apply load callback if necessary void apply_load_callback(const fft_params& params, std::vector& input) { if(!params.run_callbacks) return; // we're applying callbacks to FFTW input/output which we can // assume is contiguous and non-planar callback_test_data cbdata; cbdata.scalar = params.load_cb_scalar; cbdata.base = input.front().data(); switch(params.itype) { case fft_array_type_complex_interleaved: case 
fft_array_type_hermitian_interleaved: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(std::complex); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } } } break; case fft_array_type_real: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(rocfft_fp16); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(float); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(double); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } } } break; 
default: // this is FFTW data which should always be interleaved (if complex) abort(); } } #else // Stubs for callback tests. // Many seem to be called unconditionally, so we can't throw exceptions in // most cases. void* get_load_callback_host(fft_array_type itype, fft_precision precision, bool round_trip_inverse = false) { return nullptr; } void apply_load_callback(const fft_params& params, std::vector& input) {} // implement result scaling as a store callback, as rocFFT tests do void apply_store_callback(const fft_params& params, std::vector& output) { if(params.scale_factor == 1.0) return; switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(rocfft_fp16); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; element = static_cast(element) * params.scale_factor; } } break; } case fft_precision_single: { const size_t elem_size = sizeof(float); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; element = element * params.scale_factor; } } break; } case fft_precision_double: { const size_t elem_size = sizeof(double); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; element = element * params.scale_factor; } } break; } } } void* get_store_callback_host(fft_array_type otype, fft_precision precision, bool round_trip_inverse = false) { throw std::runtime_error("get_store_callback_host not implemented"); return nullptr; } #endif hipFFT-rocm-7.1.0/clients/tests/hipfft_accuracy_test.h000066400000000000000000000025431506642153200227520ustar00rootroot00000000000000// Copyright (C) 2022 - 2023 Advanced Micro 
Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef ROCFFT_ACCURACY_TEST #define ROCFFT_ACCURACY_TEST #include "../../shared/accuracy_test.h" #include "../hipfft_params.h" void fft_vs_reference(hipfft_params& params, bool round_trip = false); #endif hipFFT-rocm-7.1.0/clients/tests/hipfft_mpi_worker.cpp000066400000000000000000000033441506642153200226320ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include "../../shared/mpi_worker.h" #include "../hipfft_params.h" // initialize static class member of hipfft_params std::vector hipfft_params::externally_managed_workareas = std::vector(); int main(int argc, char* argv[]) { return mpi_worker_main, false>( "hipFFT MPI worker process", argc, argv, [](const std::vector& lib_strings) { return std::array(); }); } hipFFT-rocm-7.1.0/clients/tests/hipfftw_test.cpp000066400000000000000000003116241506642153200216250ustar00rootroot00000000000000// Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../hipfftw_helper.h" #include "../../shared/environment.h" #include "../../shared/gpubuf.h" #include "../../shared/hostbuf.h" #include "../../shared/params_gen.h" #include "../../shared/test_params.h" #include #include #include #include #include #include #include #ifdef WIN32 #include #else #include #endif #ifdef _OPENMP #include #endif // test details namespace { // //--------------------------------------------------------------------------------------------- // COMMONS AND HELPERS //--------------------------------------------------------------------------------------------- // /* Returns the upper bound (in bytes) for the I/O data used in hipfftw tests. The bound is evaluated once, on first call (function-local static): vramgb * ONE_GiB or, if that product is 0, one eighth of the total memory reported by hipMemGetInfo (when that query succeeds); the value is then capped by ramgb * ONE_GiB and by max_io_gb_for_hipfftw_test * ONE_GiB. The bound is reported on std::cout if verbose > 0. */ size_t max_byte_size_for_hipfftw_tests() { auto get_io_byte_size_limit = []() { size_t tmp = vramgb * ONE_GiB; if(tmp == 0) { size_t free = 0, total = 0; if(hipMemGetInfo(&free, &total) == hipSuccess) tmp = total / 8; } tmp = std::min(tmp, ramgb * ONE_GiB); tmp = std::min(tmp, max_io_gb_for_hipfftw_test * ONE_GiB); if(verbose > 0) { std::cout << "Limit for the size of I/O data used in hipfftw tests: "; if(tmp >= ONE_GiB) std::cout << float(tmp) / ONE_GiB << " GiB." << std ::endl; else if(tmp >= ONE_MiB) std::cout << float(tmp) / ONE_MiB << " MiB." << std ::endl; else std::cout << float(tmp) / ONE_KiB << " KiB." 
<< std ::endl; } return tmp; }; static const size_t io_byte_size_limit = get_io_byte_size_limit(); return io_byte_size_limit; } /* Returns the pseudo-random engine used by the randomizers below; it is constructed once (function-local static) from random_seed. */ std::ranlux24_base& get_pseudo_rng() { static std::ranlux24_base gen(random_seed); return gen; } // NOTE: this function makes use of comparison operators < (through std::lower_bound) and == which must be defined for the // specialization type T template void insert_into_unique_sorted_params(std::vector& unique_sorted_params, const T& param_to_insert) { auto it = std::lower_bound( unique_sorted_params.begin(), unique_sorted_params.end(), param_to_insert); if(it != unique_sorted_params.end() && *it == param_to_insert) return; // it's already in there generated unique_sorted_params.insert(it, param_to_insert); } enum class hipfftw_internal_exception { none, flow_redirection, invalid_args, unsupported_args, ill_defined }; // for well-defined internal exceptions, we may expect a specific report thereof // in the hipfftw exception log (if logging is activated) template constexpr std::string_view hipfftw_expected_log_instance; template <> constexpr std::string_view hipfftw_expected_log_instance< hipfftw_internal_exception::invalid_args> = R"(Invalid argument reported)"; template <> constexpr std::string_view hipfftw_expected_log_instance< hipfftw_internal_exception::unsupported_args> = R"(Unsupported usage reported)"; template <> constexpr std::string_view hipfftw_expected_log_instance< hipfftw_internal_exception::flow_redirection> = R"(Redirecting execution flow)"; // randomizers // Note: albeit not supported, ranks > 3 are "valid" rank argument // --> limiting rank value to max of 10 by default to avoid ridiculously long // lengths possibly created in automated parameter generations; template ::lowest(), int max_rank = validity_flag ? 
10 : 0, std::enable_if_t<(min_rank <= max_rank) && (!validity_flag || min_rank > 0) && (validity_flag || max_rank <= 0), bool> = true> int get_random_rank() { static std::uniform_int_distribution rank_rng(min_rank, max_rank); auto ret = rank_rng(get_pseudo_rng()); if(rank_is_valid_for_hipfftw(ret) != validity_flag) { throw std::runtime_error( "failed to generate a rank value of desired validity randomly"); } return ret; } /* Draws desired_rank lengths whose absolute values are taken in [min_abs_len, max_abs_len], and re-draws all of them until lengths_are_valid_for_hipfftw_as(ret, desired_rank) matches validity_flag. When validity_flag is false, every drawn value is negated at random. Returns an empty vector if desired_rank is not a valid rank; throws std::invalid_argument if the bounds are inconsistent. NOTE(review): the re-draw loop has no iteration cap. */ template std::vector get_random_lengths(int desired_rank, ptrdiff_t max_abs_len = std::numeric_limits::max(), ptrdiff_t min_abs_len = 0) { std::vector ret; // cannot generate lengths for invalid ranks --> return empty lengths in that case if(!rank_is_valid_for_hipfftw(desired_rank)) return ret; if(min_abs_len < 0 || min_abs_len > max_abs_len) throw std::invalid_argument("invalid bounds used for get_random_lengths"); // generate values that are all representable as integers auto& pseudo_rng = get_pseudo_rng(); std::uniform_int_distribution length_rng(min_abs_len, max_abs_len); // setter lambda auto set_random_len = [&]() { for(auto& l : ret) { const ptrdiff_t val = length_rng(pseudo_rng); if constexpr(validity_flag) l = val; else { if(pseudo_rng() % 2) l = -val; else l = val; } } }; ret.resize(desired_rank); set_random_len(); while(lengths_are_valid_for_hipfftw_as(ret, desired_rank) != validity_flag) set_random_len(); return ret; } /* Returns a sign value whose validity for intended_dft_kind (as reported by sign_is_valid_for_hipfftw) matches validity_flag. 
For valid signs of complex transforms the starting value is FFTW_FORWARD or FFTW_BACKWARD, chosen from is_fwd(intended_dft_kind); in the other cases random values are drawn until the desired validity is met. Throws std::invalid_argument if an invalid sign is requested for a real transform. */ template int get_random_sign(fft_transform_type intended_dft_kind) { if(!validity_flag && is_real(intended_dft_kind)) throw std::invalid_argument("An invalid sign cannot be generated for real transforms " "(sign is ignored in that case)"); std::uniform_int_distribution sign_rng(std::numeric_limits::lowest(), std::numeric_limits::max()); int tmp = validity_flag && is_complex(intended_dft_kind) ? (is_fwd(intended_dft_kind) ? 
FFTW_FORWARD : FFTW_BACKWARD) : sign_rng(get_pseudo_rng()); while(sign_is_valid_for_hipfftw(tmp, intended_dft_kind) != validity_flag) tmp = sign_rng(get_pseudo_rng()); return tmp; } /* Returns random flags of the requested validity: valid flags are random bits masked with hipfftw_valid_flags_mask (std::runtime_error is thrown if the masked value is still rejected by flags_are_valid_for_hipfftw); invalid flags are re-drawn until flags_are_valid_for_hipfftw rejects them. */ template unsigned get_random_flags() { std::uniform_int_distribution flags_rng(std::numeric_limits::lowest(), std::numeric_limits::max()); auto tmp = flags_rng(get_pseudo_rng()); if constexpr(validity_flag) { tmp &= hipfftw_valid_flags_mask; if(!flags_are_valid_for_hipfftw(tmp)) throw std::runtime_error("failed to create random valid flags"); return tmp; } while(flags_are_valid_for_hipfftw(tmp)) tmp = flags_rng(get_pseudo_rng()); return tmp; } /* Returns a random index drawn in [0, upper_bound - 1]; throws std::invalid_argument if upper_bound is 0. */ size_t get_random_idx(size_t upper_bound) { if(upper_bound == 0) throw std::invalid_argument("upper_bound must be strictly positive for get_random_idx"); std::uniform_int_distribution idx_rng(0, upper_bound - 1); return idx_rng(get_pseudo_rng()); } // calculates the threshold value X such that max_data_idx is no greater than // (resp. is larger than) num_elems, if all elements of lengths are no greater than // (resp. are larger than) X, and lengths.size() == rank [using bisection] ptrdiff_t get_len_threshold(size_t num_elems, int rank, bool is_real_inplace) { if(rank < 1) throw std::invalid_argument("invalid rank used in get_len_threshold"); if(num_elems == 0) return 1; constexpr ptrdiff_t X_max = std::numeric_limits::max(); // we need to find X in [0, X_max] s.t. // largest_idx(X) <= num_elems && largest_idx(X + 1) > num_elems auto largest_idx = [&](ptrdiff_t X) { size_t ret = rank > 1 && is_real_inplace ? 
2 * (X / 2 + 1) : X; for(auto i = 1; i < rank; i++) ret *= X; return ret; }; // initialization ptrdiff_t X_down = rank == 1 ? static_cast(num_elems) : static_cast(std::floor(std::pow(num_elems, 1.0 / rank))); ptrdiff_t diff = 1; ptrdiff_t X_up = X_down; while(largest_idx(X_up) <= num_elems && X_up < X_max) { X_down = X_up; X_up = X_up <= X_max - diff ? 
X_up + diff : X_max; diff *= 2; } /* then lower X_down, by steps of doubling size, while largest_idx(X_down) still exceeds num_elems */ diff = 1; while(largest_idx(X_down) > num_elems && X_down > 0) { X_up = X_down; X_down = X_down >= diff ? X_down - diff : 0; diff *= 2; } // bisection while(X_up - X_down > 1) { const auto tmp = (X_up + X_down) / 2; if(largest_idx(tmp) <= num_elems) X_down = tmp; else X_up = tmp; } return X_down; } /* Returns the number of elements (real or complex values, depending on dft_kind) that fit in data_byte_size bytes. */ template size_t max_num_elems_for_data_size(size_t data_byte_size, fft_transform_type dft_kind) { return data_byte_size / (is_real(dft_kind) ? sizeof(hipfftw_real_t) : sizeof(hipfftw_complex_t)); } // exception for hip runtime error(s) specifically struct hip_runtime_error : public std::runtime_error { const hipError_t hip_error; hip_runtime_error(const std::string& info, hipError_t hip_status) : std::runtime_error::runtime_error(info) , hip_error(hip_status) { } }; /* Returns the device id reported by hipGetDevice; throws hip_runtime_error if that call does not return hipSuccess. */ int get_current_device_id() { int ret = hipInvalidDeviceId; const auto hip_status = hipGetDevice(&ret); if(hip_status != hipSuccess) throw hip_runtime_error("hipGetDevice failed.", hip_status); return ret; } // //--------------------------------------------------------------------------------------------- // EXISTENCE OF UTILITY FUNCTIONS //--------------------------------------------------------------------------------------------- // template void test_existence_of_utility_functions() { try { // call utility functions - they need to exist but don't need to work const auto& hipfftw_ = hipfftw_funcs::get_instance(); hipfftw_.print_plan(nullptr); hipfftw_.set_timelimit(0.0); hipfftw_.cost(nullptr); hipfftw_.flops(nullptr, nullptr, nullptr, nullptr); hipfftw_.cleanup(); } catch(const hipfftw_undefined_function_ptr& e) { GTEST_FAIL() << 
"Undefined function pointers detected. Error info: " << e.what(); } catch(...) 
{ GTEST_FAIL() << "Unexpected failure"; } } // //--------------------------------------------------------------------------------------------- // ALLOCATION AND FREE //--------------------------------------------------------------------------------------------- // enum class hipfftw_alloc_func_type { unspecified, real, complex }; /* true if and only if func is one of the enumerators of hipfftw_alloc_func_type */ bool hipfftw_alloc_func_type_is_valid(hipfftw_alloc_func_type func) { return func == hipfftw_alloc_func_type::unspecified || func == hipfftw_alloc_func_type::real || func == hipfftw_alloc_func_type::complex; } // bit mask to prevent allocation kind(s), by increasing "rank" to enable meaningful // comparison operators (implicitly defined based on the underlying type) enum hipfftw_alloc_memkind : unsigned { none = 0x0, pageable_host = 0x1 << 0, pinned_host = 0x1 << 1, any = pageable_host | pinned_host }; const std::vector hipfftw_possible_memkinds = {pinned_host, pageable_host}; /* true if and only if kind sets no bit outside of hipfftw_alloc_memkind::any */ bool hipfftw_alloc_kind_is_valid(hipfftw_alloc_memkind kind) { return kind == (kind & hipfftw_alloc_memkind::any); } /* Returns "none", the name of the single kind set in kind, or the names of all the kinds set in kind joined by "_or_" (in the order of hipfftw_possible_memkinds). Throws std::invalid_argument if kind is not valid. 
*/ std::string hipfftw_alloc_kind_to_string(hipfftw_alloc_memkind kind) { if(!hipfftw_alloc_kind_is_valid(kind)) throw std::invalid_argument("alloc_kind_to_string: invalid kind"); if(kind == hipfftw_alloc_memkind::none) return "none"; if(std::find(hipfftw_possible_memkinds.begin(), hipfftw_possible_memkinds.end(), kind) == hipfftw_possible_memkinds.end()) { // several values enabled std::string ret; for(auto to_consider : hipfftw_possible_memkinds) { if(!(kind & to_consider)) continue; if(!ret.empty()) ret += "_or_"; ret += hipfftw_alloc_kind_to_string(to_consider); } return ret; } // kind is a well-defined value in hipfftw_possible_memkinds switch(kind) { case hipfftw_alloc_memkind::pinned_host: return "pinned_host"; break; case hipfftw_alloc_memkind::pageable_host: return "pageable_host"; break; default: throw std::runtime_error("alloc_kind_to_string: internal error encountered " "(unexpected value for kind)"); break; } // unreachable } template struct 
hipfftw_malloc_params { /* argument handed to the allocation function under test */ size_t alloc_arg; /* which allocation function is exercised */ hipfftw_alloc_func_type alloc_func; /* bit mask of the memory kind(s) targeted by the test */ hipfftw_alloc_memkind alloc_kind; /* requested byte size: alloc_arg counts chars, real values or complex values depending on alloc_func */ size_t get_byte_size() const { return alloc_arg * (alloc_func == hipfftw_alloc_func_type::unspecified ? sizeof(char) : (alloc_func == hipfftw_alloc_func_type::real ? sizeof(hipfftw_real_t) : sizeof(hipfftw_complex_t))); } /* token describing the parameters (also used for ordering and equality below); throws std::runtime_error if alloc_func or alloc_kind is invalid */ std::string to_string() const { if(!hipfftw_alloc_func_type_is_valid(alloc_func)) throw std::runtime_error("invalid type of allocation function"); if(!hipfftw_alloc_kind_is_valid(alloc_kind)) throw std::runtime_error("invalid allocation kind(s)"); std::string ret; if constexpr(prec == fft_precision_single) ret += "fftwf_"; else ret += "fftw_"; if(alloc_func == hipfftw_alloc_func_type::unspecified) ret += "malloc_"; else if(alloc_func == hipfftw_alloc_func_type::real) ret += "alloc_real_"; else ret += "alloc_complex_"; ret += std::to_string(alloc_arg); ret += "_alloc_kind_" + hipfftw_alloc_kind_to_string(alloc_kind); return ret; } // for using with insert_into_unique_sorted_params bool operator<(const hipfftw_malloc_params& other) const { return to_string() < other.to_string(); } bool operator==(const hipfftw_malloc_params& other) const { return to_string() == other.to_string(); } }; /* Builds the unique, sorted list of allocation test parameters: for every allocation function and every combination of memory kinds (from none to any), one case with argument 0 and one case with a random argument of at least 1. 
*/ template std::vector> params_for_testing_hipfftw_malloc() { std::vector> ret; // testing argument value 0 and a randomly chosen one (max 64MiB in byte size, arbitrarily chosen) constexpr size_t max_test_alloc_size = 1ULL << 26; const std::vector func_range = {hipfftw_alloc_func_type::unspecified, hipfftw_alloc_func_type::real, hipfftw_alloc_func_type::complex}; std::vector memkind_range; // add all possible combinations of memory kinds: for(auto kind = static_cast::type>( hipfftw_alloc_memkind::none); kind <= static_cast::type>( hipfftw_alloc_memkind::any); kind++) { memkind_range.push_back(static_cast(kind)); } hipfftw_malloc_params to_add; for(auto func : func_range) { size_t max_arg = max_test_alloc_size; if(func == hipfftw_alloc_func_type::real) max_arg 
/= sizeof(hipfftw_real_t); else if(func == hipfftw_alloc_func_type::complex) max_arg /= sizeof(hipfftw_complex_t); std::uniform_int_distribution arg_rng(1, max_arg); for(auto kind : memkind_range) { for(auto arg : {size_t(0), arg_rng(get_pseudo_rng())}) { to_add.alloc_arg = arg; to_add.alloc_func = func; to_add.alloc_kind = kind; insert_into_unique_sorted_params(ret, to_add); } } } return ret; } /* Value-parameterized fixture for the allocation functions. SetUp sets the byte-size-limit environment variable to "0" for every memory kind that is not targeted by the test parameters, and works out whether a null result must be expected; TearDown clears those temporary settings and frees the allocation. */ template class hipfftw_allocation_test : public ::testing::TestWithParam> { protected: void* test_allocation = nullptr; bool expect_no_allocation; std::map> temp_alloc_limit_env; void SetUp() override { if(test_allocation) GTEST_FAIL() << "Starting from an unclean slate (test_allocation is not nullptr)"; const hipfftw_malloc_params& params = this->GetParam(); // check validity of params if(!hipfftw_alloc_kind_is_valid(params.alloc_kind)) GTEST_FAIL() << "invalid value for allocation kind"; if(!hipfftw_alloc_func_type_is_valid(params.alloc_func)) GTEST_FAIL() << "unknown allocation function"; size_t limit_for_alloc_kind = 0; for(auto alloc_kind_candidate : hipfftw_possible_memkinds) { if(alloc_kind_candidate != hipfftw_alloc_memkind::pinned_host && alloc_kind_candidate != hipfftw_alloc_memkind::pageable_host) { throw std::runtime_error("unexpected memory allocation kind " + hipfftw_alloc_kind_to_string(alloc_kind_candidate)); } const std::string control_env_var = alloc_kind_candidate == hipfftw_alloc_memkind::pinned_host ? 
"HIPFFTW_BYTE_SIZE_LIMIT_PINNED_HOST_ALLOC" : "HIPFFTW_BYTE_SIZE_LIMIT_PAGEABLE_HOST_ALLOC"; if(alloc_kind_candidate & params.alloc_kind) { const auto test_user_limit = rocfft_getenv(control_env_var.c_str()); limit_for_alloc_kind = std::max(limit_for_alloc_kind, test_user_limit.empty() ? 
std::numeric_limits::max() : size_t(std::stoull(test_user_limit))); } else { // disable the other possible allocation kind(s) by temporarily // setting the corresponding byte size limit to 0 temp_alloc_limit_env[alloc_kind_candidate] = std::make_unique(control_env_var.c_str(), "0"); // skip if temporary limit(s) was(were) not successfully set const auto tmp_limit = rocfft_getenv(control_env_var.c_str()); if(tmp_limit.empty() || std::stoull(tmp_limit) != 0) { GTEST_SKIP() << "failed to set environment variable disabling " << hipfftw_alloc_kind_to_string(alloc_kind_candidate) << " allocation(s) by hipFFTW"; } } } const size_t req_byte_size = params.get_byte_size(); expect_no_allocation = params.alloc_kind == hipfftw_alloc_memkind::none || req_byte_size == 0 || req_byte_size > limit_for_alloc_kind; } void TearDown() override { /* drop the temporary environment settings created in SetUp */ temp_alloc_limit_env.clear(); const hipfftw_funcs& hipfftw_impl = hipfftw_funcs::get_instance(); if(test_allocation && !hipfftw_impl.free.may_be_used()) GTEST_FAIL() << "An allocation was created but it can't be freed"; // note: free should be stable even with nullptr if(hipfftw_impl.free.may_be_used()) hipfftw_impl.free(test_allocation); } /* Allocates with the function selected by the test parameters, checks whether an allocation was (not) expected and which type of memory was obtained, then writes to and reads back the whole allocation and compares the accumulated values with their expected sums. */ void test_malloc_write_and_read() { const hipfftw_malloc_params& params = this->GetParam(); const hipfftw_funcs& hipfftw_impl = hipfftw_funcs::get_instance(); hipfftw_exception_logger exception_logger; struct allocation_test_to_be_skipped : std::runtime_error { using std::runtime_error::runtime_error; }; struct allocation_test_failed : std::runtime_error { using std::runtime_error::runtime_error; }; struct allocation_test_success { }; // used to cut test execution short when applicable try { // The test // - fills 
values in the allocated arrays as 0, 1, ..., max_elem, 0, 1, ..., max_elem, 0, 1, etc. // (max_elem, max_elem-1, ..., 1, 0, max_elem, max_elem-1, ..., for imaginary values) // - reads the values thereafter and accumulates them as a sum of doubles // - checks the result. 
// --> the expected_result's must be exactly representable as double values const size_t max_elem = static_cast(std::numeric_limits::max()); const size_t cycle_sz = max_elem + 1; const size_t max_representable_result = 1ULL << std::numeric_limits::digits; auto sum_of_integers = [](size_t to, size_t from = 0) { if(from > to) throw std::invalid_argument("invalid argument for sum_of_integers lambda"); return (to + from) * (to - from + 1) / 2; }; const size_t sum_of_cycles = (params.alloc_arg / cycle_sz) * sum_of_integers(max_elem); const size_t tail_sz = params.alloc_arg % cycle_sz; const size_t expected_result_r = sum_of_cycles + (tail_sz > 0 ? sum_of_integers(tail_sz - 1) : 0); const size_t expected_result_i = sum_of_cycles + (tail_sz > 0 ? sum_of_integers(max_elem, max_elem + 1 - tail_sz) : 0); if(expected_result_r > max_representable_result || (params.alloc_func == hipfftw_alloc_func_type::complex && expected_result_i > max_representable_result)) throw allocation_test_to_be_skipped("Test cannot reliably check for argument " + std::to_string(params.alloc_arg)); if(params.alloc_func == hipfftw_alloc_func_type::unspecified) test_allocation = hipfftw_impl.malloc(params.alloc_arg); else if(params.alloc_func == hipfftw_alloc_func_type::real) test_allocation = hipfftw_impl.alloc_real(params.alloc_arg); else test_allocation = hipfftw_impl.alloc_complex(params.alloc_arg); // check that the allocation behaved as expected if(expect_no_allocation) { if(!test_allocation) throw allocation_test_success(); else // no allocation should have happened, nullptr should have been returned throw allocation_test_failed( "An allocation was unexpectedly produced for this test"); } if(!test_allocation) { throw allocation_test_failed("allocation failed"); } // check that the allocation is of the expected type hipPointerAttribute_t attributes; auto hip_status = hipPointerGetAttributes(&attributes, test_allocation); if(hip_status != hipSuccess) throw hip_runtime_error("hipPointerGetAttributes 
failed.", hip_status); switch(attributes.type) { case hipMemoryType::hipMemoryTypeHost: EXPECT_NE(params.alloc_kind & hipfftw_alloc_memkind::pinned_host, 0); break; case hipMemoryType::hipMemoryTypeUnregistered: EXPECT_NE(params.alloc_kind & hipfftw_alloc_memkind::pageable_host, 0); break; default: GTEST_FAIL() << "Unexpected kind of memory created: attributes.type = " << attributes.type; break; } // check that the host can write to the entire allocation #ifdef _OPENMP #pragma omp parallel for #endif for(size_t idx = 0; idx < params.alloc_arg; idx++) { uint8_t val = static_cast(idx % cycle_sz); if(params.alloc_func == hipfftw_alloc_func_type::unspecified) // write as uint8_t static_cast(test_allocation)[idx] = val; else if(params.alloc_func == hipfftw_alloc_func_type::real) // write as float/double static_cast*>(test_allocation)[idx] = val; else // write as complex value of req. precision { static_cast*>(test_allocation)[idx][0] = val; static_cast*>(test_allocation)[idx][1] = max_elem - val; } } // check that the host can read from the entire allocation double result[2] = {0, 0}; #ifdef _OPENMP #pragma omp parallel for reduction(+ : result) #endif for(size_t idx = 0; idx < params.alloc_arg; idx++) { if(params.alloc_func == hipfftw_alloc_func_type::unspecified) { // read as uint8_t, accumulate as double result[0] += static_cast(test_allocation)[idx]; } else if(params.alloc_func == hipfftw_alloc_func_type::real) { // read as float/double, accumulate as double result[0] += static_cast*>(test_allocation)[idx]; } else // complex value of req. precision { // read as complex value of req. 
precision, accumulate as doubles result[0] += static_cast*>(test_allocation)[idx][0]; result[1] += static_cast*>(test_allocation)[idx][1]; } } // validity checks if(result[0] != expected_result_r) throw allocation_test_failed("incorrect result for accumulated real parts"); if(params.alloc_func == hipfftw_alloc_func_type::complex && result[1] != expected_result_i) throw allocation_test_failed( "incorrect result for accumulated imaginary parts"); } catch(const allocation_test_success&) { // so far so good } catch(const hipfftw_undefined_function_ptr& e) { GTEST_FAIL() << "undefined function pointers detected. Error info: " << e.what(); } catch(const hip_runtime_error& e) { ++n_hip_failures; if(skip_runtime_fails) GTEST_SKIP() << e.what() << "\nError code: " << e.hip_error << "."; else GTEST_FAIL() << e.what() << "\nError code: " << e.hip_error << "."; } catch(const allocation_test_to_be_skipped& e) { GTEST_SKIP() << e.what(); } catch(const allocation_test_failed& e) { std::ostringstream gtest_info; gtest_info << e.what(); const auto log_content = exception_logger.get_log(); if(!log_content.empty()) gtest_info << "\nContent of error log :\n " << log_content; GTEST_FAIL() << gtest_info.str(); } catch(...) 
{ std::ostringstream gtest_info; gtest_info << "unidentified exception caught during test."; const auto log_content = exception_logger.get_log(); if(!log_content.empty()) gtest_info << "\nContent of error log :\n " << log_content; GTEST_FAIL() << gtest_info.str(); } // pinned host allocation is the first-ranked choice of hipfftw // check that the execution flow was redirected if disabled for(auto possible_memkind : hipfftw_possible_memkinds) { if(possible_memkind <= params.alloc_kind) continue; // possible_memkind is higher ranked than the test's targeted kind // flow re-direction must have happened if(test_allocation && exception_logger.is_active()) { const auto log_content = exception_logger.get_log(); if(log_content.find(hipfftw_expected_log_instance< hipfftw_internal_exception::flow_redirection>) == std::string::npos) { GTEST_FAIL() << "No instance of \"" << hipfftw_expected_log_instance< hipfftw_internal_exception:: flow_redirection> << "\" in log despite " << hipfftw_alloc_kind_to_string(possible_memkind) << " allocation kind supposedly " "disabled via environment variable.\nContent of log:\n" << log_content; } else break; } } } public: static std::string TestName( const testing::TestParamInfo& info) { return info.param.to_string(); } }; using allocation_sp = hipfftw_allocation_test; TEST_P(allocation_sp, malloc_write_and_read) { test_malloc_write_and_read(); } using allocation_dp = hipfftw_allocation_test; TEST_P(allocation_dp, malloc_write_and_read) { test_malloc_write_and_read(); } // //--------------------------------------------------------------------------------------------- // INPUT VALIDATION FOR PLAN CREATION AND EXECUTION //--------------------------------------------------------------------------------------------- // // bit-flagging enum used for configuring tests' execution I/O enum hipfftw_execution_io_args : unsigned { use_creation_io = 0x0, non_null_new_in = 0x1 << 0, non_null_new_out = 0x1 << 1, new_io_same_placement = 0x1 << 2, // all 
flags must be up to be generally clean with new I/O clean_new_io = non_null_new_in | non_null_new_out | new_io_same_placement }; /* true if and only if args sets no bit outside of hipfftw_execution_io_args::clean_new_io */ static bool hipfftw_execution_io_args_are_well_defined(hipfftw_execution_io_args args) { return args == (args & hipfftw_execution_io_args::clean_new_io); } enum class hipfftw_step { plan_creation, plan_execution }; /* Parameters of an input-validation test: the plan creation function to use, whether the input/output pointers given at plan creation are null (first: input, second: output), the I/O configuration used at execution, and the helper describing the plan itself. */ template struct hipfftw_input_validation_params { hipfftw_plan_creation_func creation_options; std::pair creation_io_is_null; hipfftw_execution_io_args execution_io; hipfftw_helper plan_helper; // NOTE: placement is to be respected by tests' choice of I/O at plan creation! fft_result_placement creation_placement() const { return plan_helper.get_placement(); } /* placement used at execution: the creation placement, flipped when new I/O is used without the new_io_same_placement flag */ fft_result_placement execution_placement() const { auto ret = creation_placement(); if(!use_creation_io_at_execution() && !(execution_io & hipfftw_execution_io_args::new_io_same_placement)) { if(ret == fft_placement_inplace) ret = fft_placement_notinplace; else ret = fft_placement_inplace; } return ret; } bool use_creation_io_at_execution() const { return execution_io == hipfftw_execution_io_args::use_creation_io; } /* tells whether the input (resp. 
output) argument used at execution is null; throws std::invalid_argument for any other io_label */ bool is_execution_arg_null(fft_io io_label) const { if(io_label != fft_io::fft_io_in && io_label != fft_io::fft_io_out) throw std::invalid_argument( "invalid io_label for hipfftw_input_validation_params::is_execution_arg_null"); if(use_creation_io_at_execution()) return io_label == fft_io::fft_io_in ? creation_io_is_null.first : creation_io_is_null.second; const auto non_null_io_mask = io_label == fft_io::fft_io_in ? 
hipfftw_execution_io_args::non_null_new_in : hipfftw_execution_io_args::non_null_new_out; return !(execution_io & non_null_io_mask); } // checks consistency between values for test parameters that may have // overlapping scopes/meaning, in some specific cases bool can_be_tested(bool io_allocation_is_allowed = true) const { if(!use_creation_io_at_execution()) return false; // cannot be done yet if(!hipfftw_execution_io_args_are_well_defined(execution_io)) return false; if(!plan_helper.can_use_creation_options(creation_options)) return false; if(creation_placement() == fft_placement_inplace) { if(creation_io_is_null.first != creation_io_is_null.second) return false; } else { if(creation_io_is_null.first && creation_io_is_null.second) return false; } if(execution_placement() == fft_placement_inplace) { if(is_execution_arg_null(fft_io::fft_io_in) != is_execution_arg_null(fft_io::fft_io_out)) return false; // would be out-of-place at execution } else { if(is_execution_arg_null(fft_io::fft_io_in) && is_execution_arg_null(fft_io::fft_io_out)) return false; // would be in-place at execution } if(!io_allocation_is_allowed) { // do not allow SetUp to allocate bool ret = creation_io_is_null.first && (creation_placement() == fft_placement_inplace || creation_io_is_null.second); if(!use_creation_io_at_execution() && ret) { ret = is_execution_arg_null(fft_io::fft_io_in) && (execution_placement() == fft_placement_inplace || is_execution_arg_null(fft_io::fft_io_out)); } return ret; } return true; } /* tells whether the I/O pointers used for the given step are acceptable; throws std::invalid_argument for an unknown step and std::runtime_error if execution_io is not well defined */ bool has_valid_io_for(hipfftw_step step) const { if(step != hipfftw_step::plan_creation && step != hipfftw_step::plan_execution) throw std::invalid_argument("Invalid step for has_valid_io_for"); if(step == hipfftw_step::plan_creation) { // anything goes if using FFTW_ESTIMATE or FFTW_WISDOM_ONLY in flags const 
auto flags = plan_helper.get_flags(); if(flags & FFTW_ESTIMATE || flags & FFTW_WISDOM_ONLY) return true; return !creation_io_is_null.first && (creation_placement() == 
fft_placement_inplace || !creation_io_is_null.second); } if(!hipfftw_execution_io_args_are_well_defined(execution_io)) throw std::runtime_error("invalid plan execution args"); if(use_creation_io_at_execution()) return !creation_io_is_null.first && (creation_placement() == fft_placement_inplace || !creation_io_is_null.second); return execution_io == hipfftw_execution_io_args::clean_new_io; } // helper to determine if an internal exception may reliably be expected // for the given step (creation/execution) and, if yes, which kind of // internal exception hipfftw_internal_exception expected_internal_exception_for(hipfftw_step step) const { if(step != hipfftw_step::plan_creation && step != hipfftw_step::plan_execution) throw std::invalid_argument("Invalid step in expected_internal_exception_for"); if(!can_be_tested()) throw std::runtime_error( hipfftw_creation_options_to_string( creation_options, plan_helper.get_dft_kind(), plan_helper.get_rank()) + " cannot be tested for these parameters"); const bool valid_args_for_plan_creation = plan_helper.is_valid_for_creation_with(creation_options) && has_valid_io_for(hipfftw_step::plan_creation); const bool plan_can_be_created = valid_args_for_plan_creation && plan_helper.can_create_plan_with(creation_options); if(step == hipfftw_step::plan_creation) { if(plan_can_be_created) return hipfftw_internal_exception::none; // plan cannot be created if(valid_args_for_plan_creation) return hipfftw_internal_exception::unsupported_args; // plan cannot be created and arguments were invalid... // We may however have a mixed bag of some invalid and other unsupported args. 
// In such cases, the specific exception to expect would be ill-defined if(!plan_helper.has_unsupported_args_for(creation_options)) return hipfftw_internal_exception::invalid_args; else return hipfftw_internal_exception::ill_defined; } else { if(!plan_can_be_created || !has_valid_io_for(hipfftw_step::plan_execution)) { return hipfftw_internal_exception::invalid_args; } return hipfftw_internal_exception::none; } } /* token describing the parameters (also used for ordering and equality below); throws std::runtime_error if execution_io is not well defined */ std::string to_string() const { std::ostringstream ret; ret << plan_helper.token(); ret << "_creation_func_" << hipfftw_creation_options_to_string( creation_options, plan_helper.get_dft_kind(), plan_helper.get_rank()); ret << "_creation_in_ptr" << (creation_io_is_null.first ? "_" : "_not_") << "nullptr"; ret << "_creation_out_ptr" << (creation_io_is_null.second ? "_" : "_not_") << "nullptr"; if(!hipfftw_execution_io_args_are_well_defined(execution_io)) throw std::runtime_error("invalid plan execution args"); if(!use_creation_io_at_execution()) { ret << "_execution_new_in_ptr" << (is_execution_arg_null(fft_io::fft_io_in) ? "_" : "_not_") << "nullptr"; ret << "_execution_out_ptr" << (is_execution_arg_null(fft_io::fft_io_out) ? "_" : "_not_") << "nullptr"; ret << "_execution_placement" << ((execution_io & hipfftw_execution_io_args::new_io_same_placement) ? 
"_same_as_" : "_different_than_") << "creation_placement"; } return ret.str(); } // for using with insert_into_unique_sorted_params bool operator<(const hipfftw_input_validation_params& other) const { return to_string() < other.to_string(); } bool operator==(const hipfftw_input_validation_params& other) const { return to_string() == other.to_string(); } }; template std::vector> params_for_testing_input_validation_params() { // constexpr used for readability of template specialization values below constexpr bool valid_value = true; constexpr int min_unsupported_rank = 4; // scope of plan hipfftw_helpers configured with (zero or possibly many) // invalid/unsupported parameter value(s) std::vector> helper_scope; for(auto dft_kind : trans_type_range_full) { std::vector rank_range = {1, 2, 3}; rank_range.push_back(get_random_rank()); rank_range.push_back(get_random_rank()); for(auto rank : rank_range) { for(auto placement : place_range) { std::vector> range_of_lengths; // most creation funcs take lengths as pointers // --> test for empty lengths (re-interpreted as a nullptr // arg by hipfftw_helper) range_of_lengths.emplace_back(std::vector()); if(rank > 0) { const bool is_real_inplace = is_real(dft_kind) && placement == fft_placement_inplace; const ptrdiff_t allocatable_len_threshold = std::min( get_len_threshold(max_num_elems_for_data_size( max_byte_size_for_hipfftw_tests(), dft_kind), rank, is_real_inplace), static_cast(max_length_for_hipfftw_test)); const auto valid_int_lengths = get_random_lengths(rank, allocatable_len_threshold); // always add valid integer lengths for valid ranks range_of_lengths.emplace_back(valid_int_lengths); // invalid integer lengths (most likely nonzero) const auto invalid_int_lengths = get_random_lengths( rank, allocatable_len_threshold); range_of_lengths.emplace_back(invalid_int_lengths); // invalid integer lengths (some zero) auto invalid_int_lengths_due_to_some_zero = valid_int_lengths; 
invalid_int_lengths_due_to_some_zero[get_random_idx(rank)] = 0; range_of_lengths.emplace_back(invalid_int_lengths_due_to_some_zero); if(rank > 1 && rank < min_unsupported_rank) { // no support for layouts that trigger an int overflow for any relevant // element index (unless GURU64 creation functions are used) const auto min_overflowing_len = get_len_threshold( std::numeric_limits::max(), rank, is_real_inplace); const auto unsupported_int_lengths = get_random_lengths( rank, std::numeric_limits::max(), min_overflowing_len + 1); range_of_lengths.emplace_back(unsupported_int_lengths); } } for(const auto& lengths : range_of_lengths) { std::vector sign_range = {get_random_sign(dft_kind)}; if(is_complex(dft_kind)) sign_range.push_back(get_random_sign(dft_kind)); for(auto sign : sign_range) { // FFTW_ESTIMATE is always supported std::vector flags_range = {FFTW_ESTIMATE}; // some invalid flags flags_range.push_back(get_random_flags()); // unsupported FFTW_WISDOM_ONLY flags_range.push_back(FFTW_WISDOM_ONLY | get_random_flags()); // unsupported FFTW_PRESERVE_INPUT for multi-dimensional c2r flags_range.push_back(FFTW_PRESERVE_INPUT | get_random_flags()); for(auto flags : flags_range) { hipfftw_helper helper_to_add; helper_to_add.set_creation_args( dft_kind, rank, lengths, placement, sign, flags); helper_scope.emplace_back(helper_to_add); } } } } } } // create a full-scope map containing all the generated test parameters; the map keys // capture the hipfftw's function name that the tests would target // --> ease for guaranteeing coverage even with low test probability in the end // TODO: add messed up (new) I/O arguments when new-array executes are enabled std::map>> full_scope_tests; hipfftw_input_validation_params test_to_add; for(const auto& helper : helper_scope) { // do not allocate for the lengths designed to trigger an overflow // (allocation sizes would be ridiculously large) const bool test_may_allocate = helper.has_valid_rank() && helper.has_valid_lengths() && 
helper.get_data_byte_size(fft_io::fft_io_in) <= max_byte_size_for_hipfftw_tests() && helper.get_data_byte_size(fft_io::fft_io_out) <= max_byte_size_for_hipfftw_tests(); for(auto creation : hipfftw_plan_creation_func_candidates) { // full range considered for creation_io_is_null and execution_io // parameters: some might be ruled out later on because they can't // be tested (e.g., "not_inplace" required yet using nullptr for // creation input and output would be nonsensical) const std::vector> creation_io_is_null_range = {{false, false}, {true, false}, {false, true}, {true, true}}; for(auto set_creation_io_as_null : creation_io_is_null_range) { for(std::underlying_type_t exec_io = hipfftw_execution_io_args::use_creation_io; exec_io <= hipfftw_execution_io_args::clean_new_io; exec_io++) { test_to_add.creation_options = creation; test_to_add.creation_io_is_null = set_creation_io_as_null; test_to_add.execution_io = static_cast(exec_io); test_to_add.plan_helper = helper; // skip params if they can't/shouldn't be used anyways if(!test_to_add.can_be_tested(test_may_allocate)) continue; // tests expect a failure at execution at least if(test_to_add.expected_internal_exception_for(hipfftw_step::plan_execution) == hipfftw_internal_exception::none) continue; if(test_to_add.expected_internal_exception_for(hipfftw_step::plan_creation) == hipfftw_internal_exception::invalid_args || test_to_add.expected_internal_exception_for( hipfftw_step::plan_creation) == hipfftw_internal_exception::unsupported_args) { insert_into_unique_sorted_params( full_scope_tests[hipfftw_creation_options_to_string( creation, helper.get_dft_kind(), helper.get_rank())], test_to_add); } else { insert_into_unique_sorted_params(full_scope_tests["execute"], test_to_add); } } } } } std::vector> ret; for(auto pair : full_scope_tests) { const auto& targeted_func = pair.first; auto& targeted_tests = pair.second; if(targeted_tests.empty()) { throw std::runtime_error("params_for_testing_input_validation_params: 
empty list " "of (supposedly broad-spectrum) tests for " + targeted_func); } // add one randomly-chosen one to guarantee coverage for the targeted function const auto forced_coverage_idx = get_random_idx(targeted_tests.size()); ret.emplace_back(targeted_tests[forced_coverage_idx]); targeted_tests.erase(targeted_tests.begin() + forced_coverage_idx); // consider all other probabilistically for(const auto& test : targeted_tests) { const double roll = hash_prob(random_seed, test.to_string()); // not distinguishing between real/complex for this list generation if(roll > test_prob) { if(verbose > 4) { std::cout << "Test skipped: (roll=" << roll << " > " << test_prob << ")\n"; } continue; } ret.emplace_back(test); } } return ret; } template class hipfftw_argument_validation : public ::testing::TestWithParam> { protected: void SetUp() override { const hipfftw_input_validation_params& params = this->GetParam(); if(!params.can_be_tested()) GTEST_FAIL() << "invalid parameters which cannot be tested"; // get_data_byte_size requires valid ranks and lengths to be calculated (of course) // --> make sure the I/O data sizes are not zero for test consistency w.r.t. testing // for nullptr data args I/O const size_t input_data_size = params.plan_helper.has_valid_rank() && params.plan_helper.has_valid_lengths() ? params.plan_helper.get_data_byte_size(fft_io_in) : sizeof(hipfftw_complex_t); const size_t output_data_size = params.plan_helper.has_valid_rank() && params.plan_helper.has_valid_lengths() ? 
params.plan_helper.get_data_byte_size(fft_io_out) : sizeof(hipfftw_complex_t); if(params.creation_io_is_null.first) plan_creation_input.free(); else plan_creation_input.alloc(input_data_size); if(params.creation_placement() == fft_placement_inplace || params.creation_io_is_null.second) plan_creation_output.free(); else plan_creation_output.alloc(output_data_size); if(params.use_creation_io_at_execution()) { plan_execution_input.free(); plan_execution_output.free(); } else { if(!params.is_execution_arg_null(fft_io::fft_io_in)) plan_execution_input.alloc(input_data_size); else plan_execution_input.free(); if(params.execution_placement() == fft_placement_inplace || params.is_execution_arg_null(fft_io::fft_io_out)) plan_execution_output.free(); else plan_execution_output.alloc(output_data_size); } } void TearDown() override { plan_creation_input.free(); plan_creation_output.free(); plan_execution_input.free(); plan_execution_output.free(); this->GetParam().plan_helper.release_plan(); } hostbuf plan_creation_input; hostbuf plan_creation_output; hostbuf plan_execution_input; hostbuf plan_execution_output; void expect_failure(hipfftw_step step_target) { const hipfftw_input_validation_params& params = this->GetParam(); std::unique_ptr exception_logger; bool check_log_content = false; std::string log_content; const auto expected_exception = params.expected_internal_exception_for(step_target); if(expected_exception != hipfftw_internal_exception::invalid_args && expected_exception != hipfftw_internal_exception::unsupported_args) GTEST_FAIL() << "invalid expected_exception to be tested for: only invalid or " "unsupported arguments may be tested"; const auto expected_log_instance = expected_exception == hipfftw_internal_exception::invalid_args ? 
hipfftw_expected_log_instance : hipfftw_expected_log_instance; try { if(step_target == hipfftw_step::plan_creation) { exception_logger = std::make_unique(); check_log_content = exception_logger->is_active(); } params.plan_helper.create_plan(plan_creation_input.data(), params.creation_placement() == fft_placement_inplace ? plan_creation_input.data() : plan_creation_output.data(), params.creation_options); if(step_target == hipfftw_step::plan_creation) { log_content = exception_logger->get_log(); exception_logger.reset(); const std::shared_ptr> plan_bundle = params.plan_helper.get_plan_bundle(); if(!plan_bundle) throw std::runtime_error( "the plan bundle could not be retrieved from the parameters"); if(plan_bundle->plan) throw std::runtime_error( hipfftw_creation_options_to_string(plan_bundle->creation_func, params.plan_helper.get_dft_kind(), params.plan_helper.get_rank()) + " actually created a plan for these parameters"); } else { exception_logger = std::make_unique(); check_log_content = exception_logger->is_active(); void* exec_in = params.use_creation_io_at_execution() ? plan_creation_input.data() : plan_execution_input.data(); void* exec_out = params.use_creation_io_at_execution() ? (params.creation_placement() == fft_placement_inplace ? plan_creation_input.data() : plan_creation_output.data()) : (params.execution_placement() == fft_placement_inplace ? plan_execution_input.data() : plan_execution_output.data()); // intentionally do not check that hipfftw_test_plan != nullptr as that's // kind of the point of this test: even if it doesn't report error codes, // execution must not misbehave (e.g. 
must not segfault) with invalid argument // (if hipfftw's exception handler is made verbose, it should print failure // info to the log, and that's verified in the end) params.plan_helper.execute(exec_in, exec_out); log_content = exception_logger->get_log(); exception_logger.reset(); } } catch(const hipfftw_undefined_function_ptr& e) { GTEST_FAIL() << "undefined function pointers detected. Error info: " << e.what(); } catch(const std::runtime_error e) { if(log_content.empty() && exception_logger) log_content = exception_logger->get_log(); std::ostringstream gtest_info; gtest_info << e.what(); if(!log_content.empty()) gtest_info << "\nContent of error log:\n" << log_content; GTEST_FAIL() << gtest_info.str(); } catch(...) { if(log_content.empty() && exception_logger) log_content = exception_logger->get_log(); std::ostringstream gtest_info; gtest_info << "unidentified exception caught during test."; if(!log_content.empty()) gtest_info << "\nContent of error log:\n" << log_content; GTEST_FAIL() << gtest_info.str(); } if(log_content.empty() && exception_logger) log_content = exception_logger->get_log(); if(check_log_content && log_content.find(expected_log_instance) == std::string::npos) { GTEST_FAIL() << "No instance of \"" << expected_log_instance << "\" detected in error log when testing for plan " << (step_target == hipfftw_step::plan_creation ? "creation." 
: "execution.") << "\nContent of error log:\n" << log_content; } } void input_validation_test() { const auto& param = this->GetParam(); if(param.expected_internal_exception_for(hipfftw_step::plan_execution) == hipfftw_internal_exception::none) { GTEST_FAIL() << "Invalid parameters for testing input validation (no internal " "exception expected up to execution)"; } // only one well-defined kind of exception should be expected at plan creation // for reliable testing: if plan creation arguments are a mixed bags of invalid // and unsupported values, implementation details may trigger one kind of // exception or the other (internally) const auto expected_plan_creation_exception = param.expected_internal_exception_for(hipfftw_step::plan_creation); if(expected_plan_creation_exception == hipfftw_internal_exception::invalid_args || expected_plan_creation_exception == hipfftw_internal_exception::unsupported_args) { expect_failure(hipfftw_step::plan_creation); } // always test for execution expect_failure(hipfftw_step::plan_execution); } public: static std::string TestName( const testing::TestParamInfo& info) { return info.param.to_string(); } }; // //--------------------------------------------------------------------------------------------- // FUNCTIONAL VALIDATION //--------------------------------------------------------------------------------------------- // enum class hipfftw_data_memory_type { pageable_host, pinned_host, #ifndef WIN32 // linux-only managed, #endif device }; const std::vector& get_possible_data_mem_types() { auto create_possible_cases = []() { // always testable std::vector ret = {hipfftw_data_memory_type::pageable_host, hipfftw_data_memory_type::pinned_host, hipfftw_data_memory_type::device}; #ifndef WIN32 // "managed" may or may not be supported hipDeviceProp_t props; if(hipGetDeviceProperties(&props, get_current_device_id()) == hipSuccess) { // explicitly ruling out gfx908 (MI100) if(std::strstr(props.gcnArchName, "gfx908") == nullptr && 
props.managedMemory == 1) ret.push_back(hipfftw_data_memory_type::managed); } #endif return ret; }; const static std::vector possible_cases = create_possible_cases(); return possible_cases; }; std::string hipfftw_data_mem_type_to_string(hipfftw_data_memory_type mem_type) { switch(mem_type) { case hipfftw_data_memory_type::pageable_host: return "pageable_host"; break; case hipfftw_data_memory_type::pinned_host: return "pinned_host"; break; #ifndef WIN32 case hipfftw_data_memory_type::managed: return "managed"; break; #endif case hipfftw_data_memory_type::device: return "device"; break; default: throw std::runtime_error("internal error: unexpected value of mem_tye in " "hipfftw_data_mem_type_to_string"); break; } }; template struct hipfftw_functional_validation_params { // define type of I/O argument memory to be tested at a given step (creation/execution) // by a map: mem_type[{step_label, io_label}] represents the test's target memory type to consider // for the "io_label" I/O argument at step "step_label" std::map, hipfftw_data_memory_type> mem_type; hipfftw_execution_io_args execution_io; hipfftw_helper plan_helper; fft_transform_type get_dft_kind() const { return plan_helper.get_dft_kind(); } fft_result_placement get_placement() const { return plan_helper.get_placement(); } bool use_creation_io_at_execution() const { return execution_io == hipfftw_execution_io_args::use_creation_io; } fft_array_type get_array_type(fft_io io) const { if(io != fft_io::fft_io_in && io != fft_io::fft_io_out) throw std::invalid_argument("invalid io argument for " "hipfftw_functional_validation_params::get_array_type"); const auto dft_kind = plan_helper.get_dft_kind(); if(is_complex(dft_kind)) return fft_array_type_complex_interleaved; else if(is_fwd(dft_kind) == (io == fft_io::fft_io_in)) return fft_array_type_real; else return fft_array_type_hermitian_interleaved; } std::vector get_lengths() const { return plan_helper.template get_length_as(); } std::vector get_ilengths() const { 
auto ilengths = get_lengths(); if(plan_helper.get_dft_kind() == fft_transform_type_real_inverse) ilengths.back() = ilengths.back() / 2 + 1; return ilengths; } int get_rank() const { return plan_helper.get_rank(); } std::vector get_istride() const { return plan_helper.template get_strides_as(fft_io::fft_io_in); } std::vector get_ostride() const { return plan_helper.template get_strides_as(fft_io::fft_io_out); } size_t get_idist() const { return plan_helper.template get_dist_as(fft_io::fft_io_in); } size_t get_odist() const { return plan_helper.template get_dist_as(fft_io::fft_io_out); } size_t get_nbatch() const { return plan_helper.template get_nbatch_as(fft_io::fft_io_in); } std::vector get_contiguous_istride() const { // equivalent to plan's strides for now, to be reconsidered once more general // configurations are enabled return plan_helper.template get_strides_as(fft_io::fft_io_in); } size_t get_contiguous_idist() const { // equivalent to plan's strides for now, to be reconsidered once more general // configurations are enabled return plan_helper.template get_dist_as(fft_io::fft_io_in); } bool can_be_tested() const { for(auto step : {hipfftw_step::plan_creation, hipfftw_step::plan_execution}) { for(auto io : {fft_io::fft_io_in, fft_io::fft_io_out}) { const std::pair key = {step, io}; if(mem_type.find(key) == mem_type.end()) { // incomplete mem_type map return false; } } } // if using new I/O at execution, they must be clean // TODO: enabled clean_new_io when new-array executes are implemented if(execution_io != hipfftw_execution_io_args::use_creation_io /* && execution_io != hipfftw_execution_io_args::clean_new_io*/) { return false; } if(!plan_helper.can_create_plan()) return false; const auto placement = plan_helper.get_placement(); if(placement == fft_placement_inplace) { if(mem_type.at({hipfftw_step::plan_creation, fft_io::fft_io_in}) != mem_type.at({hipfftw_step::plan_creation, fft_io::fft_io_out})) return false; 
if(mem_type.at({hipfftw_step::plan_execution, fft_io::fft_io_in}) != mem_type.at({hipfftw_step::plan_execution, fft_io::fft_io_out})) return false; } if(execution_io == hipfftw_execution_io_args::use_creation_io) { if(mem_type.at({hipfftw_step::plan_creation, fft_io::fft_io_in}) != mem_type.at({hipfftw_step::plan_execution, fft_io::fft_io_in}) || mem_type.at({hipfftw_step::plan_creation, fft_io::fft_io_out}) != mem_type.at({hipfftw_step::plan_execution, fft_io::fft_io_out})) return false; } return true; } std::string to_string() const { for(auto step : {hipfftw_step::plan_creation, hipfftw_step::plan_execution}) { for(auto io : {fft_io::fft_io_in, fft_io::fft_io_out}) { const std::pair key = {step, io}; if(mem_type.find(key) == mem_type.end()) throw std::runtime_error("incomplete mem_type map"); } } std::ostringstream ret; ret << plan_helper.token(); ret << "_creation_input_mem_type_" << hipfftw_data_mem_type_to_string( mem_type.at({hipfftw_step::plan_creation, fft_io::fft_io_in})); if(plan_helper.get_placement() == fft_placement_notinplace) { ret << "_creation_output_mem_type_" << hipfftw_data_mem_type_to_string( mem_type.at({hipfftw_step::plan_creation, fft_io::fft_io_out})); } if(execution_io == hipfftw_execution_io_args::clean_new_io) { ret << "_execution_input_mem_type_" << hipfftw_data_mem_type_to_string( mem_type.at({hipfftw_step::plan_execution, fft_io::fft_io_in})); if(plan_helper.get_placement() == fft_placement_notinplace) { ret << "_execution_output_mem_type_" << hipfftw_data_mem_type_to_string( mem_type.at({hipfftw_step::plan_execution, fft_io::fft_io_out})); } } return ret.str(); } // for using with insert_into_unique_sorted_params bool operator<(const hipfftw_functional_validation_params& other) const { return to_string() < other.to_string(); } bool operator==(const hipfftw_functional_validation_params& other) const { return to_string() == other.to_string(); } }; template class hipfftw_functional_validation : public ::testing::TestWithParam> { 
protected: void SetUp() override { try { const hipfftw_functional_validation_params& params = this->GetParam(); if(!params.can_be_tested()) GTEST_FAIL() << "invalid parameters, cannot be tested"; if(reference_plan) GTEST_FAIL() << "Starting from an unclean slate (reference plan is not nullptr)"; execution_results_on_host.resize(1); execution_results_on_host[0].alloc( params.plan_helper.get_data_byte_size(fft_io::fft_io_out)); std::vector io_range = {fft_io::fft_io_in}; if(params.get_placement() == fft_placement_notinplace) io_range.push_back(fft_io::fft_io_out); std::vector step_range = {hipfftw_step::plan_creation}; if(!params.use_creation_io_at_execution()) step_range.push_back(hipfftw_step::plan_execution); for(auto io : io_range) { const size_t data_size = params.plan_helper.get_data_byte_size(io); auto& io_verification_vec = io == fft_io::fft_io_in ? verification_input : verification_output; io_verification_vec.resize(1); io_verification_vec[0].alloc(data_size); for(auto step : step_range) { const std::pair map_key = {step, io}; const auto mem_type = params.mem_type.at(map_key); if(mem_type == hipfftw_data_memory_type::pageable_host || mem_type == hipfftw_data_memory_type::pinned_host) { host_io_buffer[map_key].alloc( data_size, mem_type == hipfftw_data_memory_type::pinned_host); } else { #ifndef WIN32 const auto hip_status = gpu_io_buffer[map_key].alloc( data_size, mem_type == hipfftw_data_memory_type::managed); #else const auto hip_status = gpu_io_buffer[map_key].alloc(data_size); #endif if(hip_status != hipSuccess) { std::ostringstream gtest_info; gtest_info << "failed to allocate a buffer of type " << hipfftw_data_mem_type_to_string(mem_type) << " and byte size " << std::to_string(data_size) << ". 
Current device ID is " << get_current_device_id(); throw hip_runtime_error(gtest_info.str(), hip_status); } } } } if(verification_input.size() != 1 || (params.get_placement() != fft_placement_inplace && verification_output.size() != 1)) GTEST_FAIL() << "Verification IO buffer incorrectly initialized"; // generate input data const std::vector field_lower(params.get_rank(), 0); const auto ilength = params.get_ilengths(); std::vector contiguous_istride(params.get_rank()); size_t val = 1; for(int dim = params.get_rank() - 1; dim >= 0; dim--) { contiguous_istride[dim] = val; val *= ilength[dim]; } const auto contiguous_idist = val; set_input>(verification_input, fft_input_random_generator_host, params.get_array_type(fft_io::fft_io_in), params.get_lengths(), ilength, params.get_istride(), params.get_idist(), params.get_nbatch(), get_curr_device_prop(), field_lower, 0 /* field_lower_batch */, contiguous_istride, contiguous_idist); // create the reference plan (systematically using the most general guru64 creation) reference_plan = params.plan_helper.get_reference_plan( verification_input[0].data(), params.get_placement() == fft_placement_inplace ? 
verification_input[0].data() : verification_output[0].data()); if(!reference_plan) { GTEST_FAIL() << "could not create a reference plan"; } } catch(const hip_runtime_error& e) { ++n_hip_failures; if(skip_runtime_fails) GTEST_SKIP() << e.what() << "\nError code: " << e.hip_error << "."; else GTEST_FAIL() << e.what() << "\nError code: " << e.hip_error << "."; } } void TearDown() override { verification_input.clear(); verification_output.clear(); execution_results_on_host.clear(); host_io_buffer.clear(); gpu_io_buffer.clear(); if constexpr(prec == fft_precision_single) fftwf_destroy_plan(reference_plan); else fftw_destroy_plan(reference_plan); this->GetParam().plan_helper.release_plan(); } // verification buffers (set_input and other common routines require std::vector's of size 1 for these) std::vector verification_input; std::vector verification_output; std::vector execution_results_on_host; // possible host buffers (pageable or pinned host allocation) std::map, hostbuf> host_io_buffer; // possible nonhost buffers (may be current/other device or runtime-managed) std::map, gpubuf> gpu_io_buffer; // reference plan hipfftw_plan_t reference_plan = nullptr; void functional_test() const { const hipfftw_functional_validation_params& params = this->GetParam(); try { std::ostringstream gtest_info; if(verification_input.size() != 1 || (params.get_placement() != fft_placement_inplace && verification_output.size() != 1)) GTEST_FAIL() << "The verification I/O buffer(s) were not initialized as " "needed; host buffer(s) are required"; if(execution_results_on_host.size() != 1) GTEST_FAIL() << "Improper test initialization: no host buffer to copy the " "execution results"; // get/define raw pointers as needed std::map, void*> test_io_ptr; std::map, hipfftw_data_memory_type> test_io_type; for(auto step : {hipfftw_step::plan_creation, hipfftw_step::plan_execution}) { if(step == hipfftw_step::plan_execution && params.use_creation_io_at_execution()) { for(auto io : {fft_io::fft_io_in, 
fft_io::fft_io_out}) { const std::pair map_key = {step, io}; const std::pair creation_key = {hipfftw_step::plan_creation, io}; test_io_ptr[map_key] = test_io_ptr[creation_key]; test_io_type[map_key] = test_io_type[creation_key]; } continue; } for(auto io : {fft_io::fft_io_in, fft_io::fft_io_out}) { const std::pair map_key = {step, io}; if(io == fft_io::fft_io_out && params.get_placement() == fft_placement_inplace) { const std::pair input_key = {step, fft_io::fft_io_in}; test_io_ptr[map_key] = test_io_ptr[input_key]; test_io_type[map_key] = test_io_type[input_key]; continue; } if(params.mem_type.find(map_key) == params.mem_type.end()) GTEST_FAIL() << "incomplete mem_type map in test parameters"; test_io_type[map_key] = params.mem_type.at(map_key); if(test_io_type[map_key] == hipfftw_data_memory_type::pageable_host || test_io_type[map_key] == hipfftw_data_memory_type::pinned_host) { const auto host_buf_it = host_io_buffer.find(map_key); if(host_buf_it == host_io_buffer.end() || host_buf_it->second.size() < params.plan_helper.get_data_byte_size(io)) { GTEST_FAIL() << "The test " << (io == fft_io::fft_io_in ? "input" : "output") << " buffer was not initialized (host buffer required)"; } else { test_io_ptr[map_key] = host_buf_it->second.data(); } } else { const auto gpu_buf_it = gpu_io_buffer.find(map_key); if(gpu_buf_it == gpu_io_buffer.end() || gpu_buf_it->second.size() < params.plan_helper.get_data_byte_size(io)) { GTEST_FAIL() << "The test " << (io == fft_io::fft_io_in ? 
"input" : "output") << " buffer was not initialized (GPU buffer required)"; } else { test_io_ptr[map_key] = gpu_buf_it->second.data(); } } } } // copy input data to hipfftw's input const std::pair exec_in_key = {hipfftw_step::plan_execution, fft_io::fft_io_in}; if(test_io_type.at(exec_in_key) == hipfftw_data_memory_type::device) { // an explicit host-to-device copy is needed const auto hip_status = hipMemcpyAsync(test_io_ptr.at(exec_in_key), verification_input[0].data(), params.plan_helper.get_data_byte_size(fft_io::fft_io_in), hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw hip_runtime_error("hipMemcpyAsync failed.", hip_status); } else { std::memcpy(test_io_ptr.at(exec_in_key), verification_input[0].data(), params.plan_helper.get_data_byte_size(fft_io::fft_io_in)); } std::shared_future reference_cpu_dft = std::async(std::launch::async, [&]() { if constexpr(prec == fft_precision_single) fftwf_execute(reference_plan); else fftw_execute(reference_plan); }); auto exception_logger = std::make_unique(); params.plan_helper.create_plan( test_io_ptr.at({hipfftw_step::plan_creation, fft_io::fft_io_in}), test_io_ptr.at({hipfftw_step::plan_creation, fft_io::fft_io_out})); params.plan_helper.execute( test_io_ptr.at({hipfftw_step::plan_execution, fft_io::fft_io_in}), test_io_ptr.at({hipfftw_step::plan_execution, fft_io::fft_io_out})); if(exception_logger->is_active()) { const auto log_content = exception_logger->get_log(); if(!log_content.empty()) { GTEST_FAIL() << "Non-empty log content detected:\n" << log_content; } } exception_logger.reset(); // copy hipfftw results back into the execution_results_on_host[0] buffer // for verification purposes const std::pair exec_out_key = {hipfftw_step::plan_execution, fft_io::fft_io_out}; if(test_io_type.at(exec_out_key) == hipfftw_data_memory_type::device) { // making this copy synchronous as the next step is verifying the results const auto hip_status = hipMemcpy(execution_results_on_host[0].data(), 
test_io_ptr.at(exec_out_key), params.plan_helper.get_data_byte_size(fft_io::fft_io_out), hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw hip_runtime_error("hipMemcpy failed (copying output results).", hip_status); } else { std::memcpy(execution_results_on_host[0].data(), test_io_ptr.at(exec_out_key), params.plan_helper.get_data_byte_size(fft_io::fft_io_out)); } // compare results if(reference_cpu_dft.valid()) reference_cpu_dft.get(); const auto test_lengths = params.get_lengths(); auto test_olengths = test_lengths; if(params.get_dft_kind() == fft_transform_type_real_forward) test_olengths.back() = test_olengths.back() / 2 + 1; const auto total_length = product(test_lengths.begin(), test_lengths.end()); const auto& reference_output = params.get_placement() == fft_placement_inplace ? verification_input : verification_output; const auto ref_norm = norm(reference_output, test_olengths, params.get_nbatch(), prec, params.get_array_type(fft_io::fft_io_out), params.get_ostride(), params.get_odist(), {0} /* offset */); const double test_epsilon = prec == fft_precision_single ? 
single_epsilon : double_epsilon; const double linf_cutoff = test_epsilon * ref_norm.l_inf * log(total_length); // compare results const auto diff = distance(reference_output, execution_results_on_host, test_olengths, params.get_nbatch(), prec, params.get_array_type(fft_io::fft_io_out), params.get_ostride(), params.get_odist(), params.get_array_type(fft_io::fft_io_out), params.get_ostride(), params.get_odist(), nullptr, linf_cutoff, {0} /* offset */, {0} /* offset */); EXPECT_LE(diff.l_inf, linf_cutoff); EXPECT_LE(diff.l_2 / ref_norm.l_2, sqrt(log2(total_length)) * test_epsilon); if constexpr(prec == fft_precision_single) { max_linf_eps_single = std::max(max_linf_eps_single, diff.l_inf / ref_norm.l_inf / log(total_length)); max_l2_eps_single = std::max( max_l2_eps_single, diff.l_2 / ref_norm.l_2 * sqrt(log2(total_length))); } else { max_linf_eps_double = std::max(max_linf_eps_double, diff.l_inf / ref_norm.l_inf / log(total_length)); max_l2_eps_double = std::max( max_l2_eps_double, diff.l_2 / ref_norm.l_2 * sqrt(log2(total_length))); } } catch(const hipfftw_undefined_function_ptr& e) { GTEST_FAIL() << "undefined function pointers detected. Error info: " << e.what(); } catch(const hip_runtime_error e) { ++n_hip_failures; if(skip_runtime_fails) GTEST_SKIP() << e.what() << "\nError code: " << e.hip_error << "."; else GTEST_FAIL() << e.what() << "\nError code: " << e.hip_error << "."; } catch(const std::runtime_error e) { GTEST_FAIL() << e.what(); } catch(...) 
{ GTEST_FAIL() << "unidentified exception caught during test."; } } public: static std::string TestName( const testing::TestParamInfo& info) { return info.param.to_string(); } }; template std::vector> params_for_functional_tests(size_t desired_full_suite_size) { std::vector> full_list; hipfftw_functional_validation_params to_add; // for readability of template specialization values below constexpr bool valid_value = true; const auto& possible_mem_types = get_possible_data_mem_types(); while(full_list.size() < desired_full_suite_size) { // TODO: randomly alternate between hipfftw_execution_io_args::use_creation_io and // hipfftw_execution_io_args::clean_new_io when the latter is available to_add.execution_io = hipfftw_execution_io_args::use_creation_io; const auto dft_kind = trans_type_range_full[get_random_idx(trans_type_range_full.size())]; const auto rank = get_random_rank(); const auto placement = place_range[get_random_idx(place_range.size())]; const bool is_real_inplace = is_real(dft_kind) && placement == fft_placement_inplace; const ptrdiff_t len_threshold = std::min(get_len_threshold(max_num_elems_for_data_size( max_byte_size_for_hipfftw_tests(), dft_kind), rank, is_real_inplace), static_cast(max_length_for_hipfftw_test)); to_add.plan_helper.set_creation_args( dft_kind, rank, get_random_lengths(rank, len_threshold), placement, is_fwd(dft_kind) ? 
FFTW_FORWARD : FFTW_BACKWARD, FFTW_ESTIMATE); for(auto step : {hipfftw_step::plan_creation, hipfftw_step::plan_execution}) { if(step == hipfftw_step::plan_execution && to_add.execution_io == hipfftw_execution_io_args::use_creation_io) { for(auto io : {fft_io::fft_io_in, fft_io::fft_io_out}) { const std::pair key = {step, io}; const std::pair creation_key = {hipfftw_step::plan_creation, io}; to_add.mem_type[key] = to_add.mem_type[creation_key]; } continue; } for(auto io : {fft_io::fft_io_in, fft_io::fft_io_out}) { const std::pair key = {step, io}; if(placement == fft_placement_inplace && io == fft_io::fft_io_out) { auto input_key = key; input_key.second = fft_io::fft_io_in; to_add.mem_type[key] = to_add.mem_type[input_key]; } else { to_add.mem_type[key] = possible_mem_types[get_random_idx(possible_mem_types.size())]; } } } // skip params if they can't be tested for some reason if(!to_add.can_be_tested()) continue; insert_into_unique_sorted_params(full_list, to_add); } if(test_prob == 1.0 && real_prob_factor == 1.0) return full_list; std::vector> ret; for(const auto& test : full_list) { const double roll = hash_prob(random_seed, test.to_string()); const double run_prob = test_prob * (is_real(test.plan_helper.get_dft_kind()) ? 
real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped: (roll=" << roll << " > " << run_prob << ")\n"; } continue; } ret.emplace_back(test); } return ret; } } // hipfftw test details' anonymous namespace // //--------------------------------------------------------------------------------------------- // INSTANTIATION OF TESTS //--------------------------------------------------------------------------------------------- // TEST(hipfftw_test, utility_functions) { test_existence_of_utility_functions(); test_existence_of_utility_functions(); } INSTANTIATE_TEST_SUITE_P( #ifdef __HIP_PLATFORM_AMD__ hipfftw_test, #else DISABLED_hipfftw_test, #endif allocation_sp, ::testing::ValuesIn(params_for_testing_hipfftw_malloc()), allocation_sp::TestName); INSTANTIATE_TEST_SUITE_P( #ifdef __HIP_PLATFORM_AMD__ hipfftw_test, #else DISABLED_hipfftw_test, #endif allocation_dp, ::testing::ValuesIn(params_for_testing_hipfftw_malloc()), allocation_dp::TestName); using argument_validation_sp = hipfftw_argument_validation; TEST_P(argument_validation_sp, creation_and_execution) { input_validation_test(); } using argument_validation_dp = hipfftw_argument_validation; TEST_P(argument_validation_dp, creation_and_execution) { input_validation_test(); } INSTANTIATE_TEST_SUITE_P( #ifdef __HIP_PLATFORM_AMD__ hipfftw_test, #else DISABLED_hipfftw_test, #endif argument_validation_sp, ::testing::ValuesIn(params_for_testing_input_validation_params()), argument_validation_sp::TestName); INSTANTIATE_TEST_SUITE_P( #ifdef __HIP_PLATFORM_AMD__ hipfftw_test, #else DISABLED_hipfftw_test, #endif argument_validation_dp, ::testing::ValuesIn(params_for_testing_input_validation_params()), argument_validation_dp::TestName); using hipfftw_functional_validation_sp = hipfftw_functional_validation; TEST_P(hipfftw_functional_validation_sp, accuracy_vs_fftw) { functional_test(); } using hipfftw_functional_validation_dp = hipfftw_functional_validation; 
TEST_P(hipfftw_functional_validation_dp, accuracy_vs_fftw) { functional_test(); } static constexpr size_t full_suite_size = 1024; // per precision INSTANTIATE_TEST_SUITE_P( hipfftw_test, hipfftw_functional_validation_sp, ::testing::ValuesIn(params_for_functional_tests(full_suite_size)), hipfftw_functional_validation_sp::TestName); INSTANTIATE_TEST_SUITE_P( hipfftw_test, hipfftw_functional_validation_dp, ::testing::ValuesIn(params_for_functional_tests(full_suite_size)), hipfftw_functional_validation_dp::TestName); // params_for_functional_tests may return empty vectors for low test probabilities. // The following ensures such cases do not make gtest report an error due to uninstantiated // hipfftw_functional_validation_{sp,dp}. GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(hipfftw_functional_validation_sp); GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(hipfftw_functional_validation_dp); hipFFT-rocm-7.1.0/clients/tests/multi_device_test.cpp000066400000000000000000000252131506642153200226230ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// NOTE(review): in this copy of the file, whatever stood between '<' and '>' appears to be
// missing (bare `#include`, `std::vector` / `std::optional` / `static_cast` without
// arguments). Those spots are left exactly as found; confirm against the original source.

#include "../../shared/accuracy_test.h"
#include "../../shared/params_gen.h"
#include "../hipfft_params.h"
#include
#include
#include
#include

// Declared here, defined in another translation unit.
extern fft_params::fft_mp_lib mp_lib;
extern int mp_ranks;

// Problem lengths shared by every suite in this file: two 2D and two 3D sizes.
static const std::vector> multi_gpu_sizes = {
    {128, 256},
    {192, 768},
    {64, 128, 256},
    {96, 160, 192},
};

// Batch counts handed to the parameter generators below.
static const std::vector multi_gpu_batch_range = {10, 1};

// A single all-zero offset pair for input and for output.
static std::vector> ioffset_range_zero = {{0, 0}};
static std::vector> ooffset_range_zero = {{0, 0}};

// The user-side data splits that param_generator_multi_gpu knows how to set up.
enum SplitType
{
    // split both input and output on slow FFT dimension
    SLOW_INOUT,
    // split only input on slow FFT dimension, output is not split
    SLOW_IN,
    // split only output on slow FFT dimension, input is not split
    SLOW_OUT,
    // split input on slow FFT dimension, and output on fast FFT dimension
    SLOW_IN_FAST_OUT,
    // 3D pencil decomposition - one dimension is contiguous on input
    // and another dimension contiguous on output, remaining dims are
    // both split
    PENCIL_3D,
};

// Builds the parameter list for one multi-GPU accuracy suite.
//
// type               - user-side split to apply. When empty, the split is left to the
//                      library and each kept case gets multiGPU = min(nbatch, device count).
// auto_alloc_setting - forwarded unchanged to param_generator_complex / param_generator_real.
//
// Returns an empty list when a split is requested but mp_lib is fft_mp_lib_none, or when no
// split is requested and hipGetDeviceCount reported fewer than two devices.
std::vector param_generator_multi_gpu(const std::optional type,
                                      fft_auto_allocation auto_alloc_setting
                                      = fft_auto_allocation_default)
{
    // the return code is deliberately ignored: on failure the count stays 0
    int localDeviceCount = 0;
    (void)hipGetDeviceCount(&localDeviceCount);

    // if we have an explicit split of data on the user side, we need
    // to use the multiprocessing API
    if(type)
    {
        if(mp_lib == fft_params::fft_mp_lib_none)
            return {};
    }
    // data is not explicitly split up, that means the library is
    // asked to do the split. We need multiple GPUs to do this.
    else if(localDeviceCount < 2)
        return {};

    static const std::vector> stride_range = {{1}};

    // complex cases use every placement in place_range
    auto params_complex = param_generator_complex(test_prob,
                                                  multi_gpu_sizes,
                                                  precision_range_sp_dp,
                                                  multi_gpu_batch_range,
                                                  stride_generator(stride_range),
                                                  stride_generator(stride_range),
                                                  ioffset_range_zero,
                                                  ooffset_range_zero,
                                                  place_range,
                                                  false,
                                                  false,
                                                  auto_alloc_setting);

    // real cases are generated out-of-place only
    auto params_real = param_generator_real(test_prob,
                                            multi_gpu_sizes,
                                            precision_range_sp_dp,
                                            multi_gpu_batch_range,
                                            stride_generator(stride_range),
                                            stride_generator(stride_range),
                                            ioffset_range_zero,
                                            ooffset_range_zero,
                                            {fft_placement_notinplace},
                                            false,
                                            false,
                                            auto_alloc_setting);

    std::vector all_params;

    // Turns each generated single-device case into a multi-device one (or drops it) and
    // appends the result to all_params.
    auto distribute_params = [=, &all_params](const std::vector& params) {
        for(auto& p : params)
        {
            // test library splitting
            if(!type)
            {
                auto param_multi = p;
                // for single-batch, cuFFT only allows in-place
                if(p.nbatch == 1 && p.placement == fft_placement_notinplace)
                    continue;
                param_multi.multiGPU = std::min(static_cast(p.nbatch), localDeviceCount);
                all_params.emplace_back(std::move(param_multi));
            }
            else
            {
                // the API only allows for batch-1 multi-process FFTs
                if(p.nbatch > 1)
                    continue;

                // user-specified split
                int brickCount = mp_ranks;

                // start with all-ones in grids
                // (one entry more than the FFT rank; the cases below never touch index 0)
                std::vector input_grid(p.length.size() + 1, 1);
                std::vector output_grid(p.length.size() + 1, 1);

                auto p_dist = p;
                switch(*type)
                {
                case SLOW_INOUT:
                    input_grid[1] = brickCount;
                    output_grid[1] = brickCount;
                    break;
                case SLOW_IN:
                    // this type only specifies input field and no output
                    // field, but multi-process transforms require both
                    // fields.
                    // NOTE(review): this branch only runs when `type` is set, and the function
                    // has already returned in that case if mp_lib == fft_mp_lib_none. The
                    // condition below therefore looks always true, which would leave the
                    // assignment unreachable and the SLOW_IN suite empty - confirm intent.
                    if(mp_lib != fft_params::fft_mp_lib_none)
                        continue;
                    input_grid[1] = brickCount;
                    break;
                case SLOW_OUT:
                    // this type only specifies output field and no input
                    // field, but multi-process transforms require both
                    // fields.
                    // NOTE(review): same observation as for SLOW_IN above.
                    if(mp_lib != fft_params::fft_mp_lib_none)
                        continue;
                    output_grid[1] = brickCount;
                    break;
                case SLOW_IN_FAST_OUT:
                    // requires at least rank-2 FFT
                    if(p.length.size() < 2)
                        continue;
                    input_grid[1] = brickCount;
                    output_grid.back() = brickCount;
                    break;
                case PENCIL_3D:
                    // need at least 2 bricks per split dimension, or 4 devices.
                    // also needs to be a 3D problem.
                    if(brickCount < 4 || p.length.size() != 3)
                        continue;
                    // make fast dimension contiguous on input
                    // NOTE(review): the two factors multiply back to brickCount only when the
                    // truncated square root divides it (e.g. 4, 6, 8, 9 but not 5 or 7).
                    input_grid[1] = static_cast(sqrt(brickCount));
                    input_grid[2] = brickCount / input_grid[1];
                    // make middle dimension contiguous on output
                    output_grid[1] = input_grid[1];
                    output_grid[3] = input_grid[2];
                    break;
                }

                p_dist.mp_lib = mp_lib;
                p_dist.distribute_input(localDeviceCount, input_grid);
                p_dist.distribute_output(localDeviceCount, output_grid);

                // "placement" flag is meaningless if exactly one of
                // input+output is a field. So just add those cases if
                // the flag is "out-of-place", since "in-place" is
                // exactly the same test case.
                if(p_dist.placement == fft_placement_inplace
                   && p_dist.ifields.empty() != p_dist.ofields.empty())
                    continue;

                // in-place transforms require identical input/output layouts
                if(p.placement == fft_placement_inplace && input_grid != output_grid)
                    continue;

                all_params.push_back(std::move(p_dist));
            }
        }
    };

    distribute_params(params_complex);
    distribute_params(params_real);

    return all_params;
}

// split both input and output on slowest FFT dim
INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_dim,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(SLOW_INOUT)),
                         accuracy_test::TestName);

// split slowest FFT dim only on input, or only on output
INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_input_dim,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN)),
                         accuracy_test::TestName);
INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_output_dim,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(SLOW_OUT)),
                         accuracy_test::TestName);

// split input on slowest FFT and output on fastest, to minimize data
// movement (only makes sense for rank-2 and higher FFTs)
INSTANTIATE_TEST_SUITE_P(multi_gpu_slowin_fastout,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN_FAST_OUT)),
                         accuracy_test::TestName);

// 3D pencil decompositions
INSTANTIATE_TEST_SUITE_P(multi_gpu_3d_pencils,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(PENCIL_3D)),
                         accuracy_test::TestName);

// library-decided splits
INSTANTIATE_TEST_SUITE_P(multi_gpu,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu({})),
                         accuracy_test::TestName);

// Note: disabled for now due to implementation issues and
// unimplemented features in hipFFT (to fix first)
INSTANTIATE_TEST_SUITE_P(DISABLED_various_multi_gpu,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu({},
                                                                       fft_auto_allocation_off)),
                         accuracy_test::TestName);
hipFFT-rocm-7.1.0/clients/tests/multi_stream_test.cpp000066400000000000000000001017571506642153200226670ustar00rootroot00000000000000// Copyright (c) 2025 Advanced Micro Devices, Inc. All rights
// reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "hipfft/hipfft.h" #include #include "../../shared/arithmetic.h" #include "../../shared/fft_params.h" #include "../../shared/params_gen.h" // hash_prob #include "../../shared/test_params.h" // externally-declared test parameters #include "../hipfft_params.h" #include // copy_n, any_of #include // M_PI, cos, sin #include #include // hasher #include #include // numeric_limits #include // ranlux24_base, uniform_int_distribution #include #include #ifdef WIN32 #include // Sleep #else #include // usleep #endif #ifndef M_PI #define M_PI 3.14159265358979323846 #endif DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP // a simple class capturing only the most elementary DFT-defining parameters // for tetsing multi-stream DFT operations class ParamsForMultiStreamDFT { class unsupported_case : public std::runtime_error { public: unsupported_case(const std::string& msg) : std::runtime_error(msg) { } }; public: bool is_inverse() const { return transform_type != fft_transform_type_complex_forward && transform_type != fft_transform_type_real_forward; } size_t dim() const { return lengths.size(); } bool is_real() const { return transform_type == fft_transform_type_real_inverse || transform_type == fft_transform_type_real_forward; } size_t real_scalar_type_size() const { size_t ret = 0; switch(precision) { case fft_precision_half: ret = sizeof(rocfft_fp16); break; case fft_precision_single: ret = sizeof(float); break; case fft_precision_double: ret = sizeof(double); break; default: throw std::runtime_error("Unknown precision"); break; } return ret; } hipfft_params make_sub_dft_params(size_t step_id, size_t stream_id) const { if(step_id >= 
num_steps || stream_id >= num_streams) { throw std::domain_error( "step_id must be in (0, num_steps( and streamd_id must be in (0, num_streams( "); } hipfft_params sub_dft; size_t total_batch_for_step = 0; if(step_id == is_inverse() ? 1 : 0) { // batched 1D DFTs sub_dft.length = {lengths.back()}; total_batch_for_step = product(lengths.begin(), lengths.end() - 1); // same placement, type of transform, precision as parent problem sub_dft.placement = placement; sub_dft.transform_type = transform_type; sub_dft.precision = precision; // default distances and strides are ok } else { // batched DFTs of dimension dim() - 1 sub_dft.length.clear(); std::copy_n(lengths.begin(), dim() - 1, std::back_inserter(sub_dft.length)); total_batch_for_step = is_real() ? lengths.back() / 2 + 1 : lengths.back(); // same precision as parent problem sub_dft.precision = precision; // always in-place sub_dft.placement = fft_placement_inplace; // always complex-to-complex: sub_dft.transform_type = is_inverse() ? fft_transform_type_complex_inverse : fft_transform_type_complex_forward; // Distances and strides sub_dft.idist = sub_dft.odist = 1; std::vector sub_strides(sub_dft.length.size()); for(int stride_idx = sub_strides.size() - 1; stride_idx >= 0; stride_idx--) { if(stride_idx == sub_strides.size() - 1) { sub_strides[stride_idx] = total_batch_for_step; } else { sub_strides[stride_idx] = sub_strides[stride_idx + 1] * sub_dft.length[stride_idx + 1]; } } sub_dft.istride = sub_dft.ostride = sub_strides; } if(total_batch_for_step < num_streams) { throw unsupported_case("Stream-decomposition not implemented yet"); } sub_dft.nbatch = total_batch_for_step / num_streams + (stream_id < total_batch_for_step % num_streams ? 
1 : 0); sub_dft.precision = precision; sub_dft.validate(); return sub_dft; } std::vector lengths; fft_precision precision = fft_precision_single; fft_transform_type transform_type = fft_transform_type_complex_forward; fft_result_placement placement = fft_placement_inplace; bool is_supported() const { // check if division of work in num_streams streams is supported yet try { if(dim() < num_steps || dim() > 3 || (std::any_of(lengths.begin(), lengths.end(), [](const decltype(lengths)::value_type& val) { return val <= 0; }))) { throw unsupported_case("Invalid lengths"); } if(precision != fft_precision_single && precision != fft_precision_double) { throw unsupported_case("Precision not supported"); } if(transform_type != fft_transform_type_real_forward && transform_type != fft_transform_type_real_inverse && transform_type != fft_transform_type_complex_forward && transform_type != fft_transform_type_complex_inverse) { throw unsupported_case("Unknown transform type"); } for(size_t step_id = 0; step_id < num_steps; step_id++) { #ifdef __HIP_PLATFORM_NVIDIA__ size_t stream_start_idx_real = 0; #endif for(size_t stream_id = 0; stream_id < num_streams; stream_id++) { #ifdef __HIP_PLATFORM_NVIDIA__ if(stream_start_idx_real % 2) throw unsupported_case("Unaligned I/O data for cuFFT backend"); #endif auto sub_dft = make_sub_dft_params(step_id, stream_id); #ifdef __HIP_PLATFORM_NVIDIA__ if(sub_dft.is_real()) { const auto real_dist = is_inverse() ? 
sub_dft.odist : sub_dft.idist; stream_start_idx_real += real_dist * sub_dft.nbatch; } #endif } } } catch(const unsupported_case& e) { if(verbose > 0) { std::cout << e.what() << std::endl; } return false; } return true; } template size_t get_data_size() const { size_t total_size = real_scalar_type_size(); if(!is_real()) total_size *= 2 * lengths.back(); else { if((for_input_data == is_inverse()) || placement == fft_placement_inplace) { total_size *= 2 * (lengths.back() / 2 + 1); } else total_size *= lengths.back(); } for(size_t length_idx = 0; length_idx < dim() - 1; length_idx++) total_size *= lengths[length_idx]; return total_size; } template std::vector get_data_strides() const { std::vector ret(1, 1); if(is_real()) { if(for_input_data == is_inverse()) ret.insert(ret.begin(), lengths.back() / 2 + 1); else { if(placement == fft_placement_inplace) ret.insert(ret.begin(), 2 * (lengths.back() / 2 + 1)); else ret.insert(ret.begin(), lengths.back()); } } else ret.insert(ret.begin(), lengths.back()); for(size_t i = 2; i < dim(); i++) { ret.insert(ret.begin(), ret.front() * lengths[dim() - i]); } return ret; } std::string get_test_name() const { // use fft_params' token member function to generate unambiguous test names fft_params for_test_name; for_test_name.length = lengths; for_test_name.precision = precision; for_test_name.transform_type = transform_type; for_test_name.placement = placement; for_test_name.validate(); return for_test_name.token(); } constexpr static size_t num_streams = 4; constexpr static size_t num_steps = 2; }; // Base gtest class for multi-strean tests. 
class multiStreamTest : public ::testing::TestWithParam { protected: void SetUp() override {} void TearDown() override {} public: static std::string TestName(const testing::TestParamInfo& info) { return info.param.get_test_name(); } }; template static void init_data(hostbuf& hostbuffer, const ParamsForMultiStreamDFT& params, const std::vector& harmonic) { if(params.dim() < 2 || params.dim() > 3) throw std::invalid_argument("Only dimensions 2 and 3 can be considered"); if(harmonic.size() != params.dim()) throw std::invalid_argument( "As many harmonic components as the problem's dimension required"); real_data_type* data_ptr = static_cast(hostbuffer.data()); const std::vector strides = params.get_data_strides(); auto phase = [](const std::vector& k, const std::vector& h, const std::vector& l) { real_data_type ret = 0; for(size_t i = 0; i < k.size(); i++) ret += static_cast((k[i] * h[i]) % l[i]) / static_cast(l[i]); return 2.0 * M_PI * ret; }; // set pre-factor to have clear unit spike(s) at targeted harmonic component(s) after transform real_data_type pre_factor = 1.0 / product(params.lengths.begin(), params.lengths.end()); if(params.transform_type == fft_transform_type_real_forward) { auto length_it = params.lengths.begin(); // multiply by 2 if the chosen harmonic component is not its own hermitian symmetric if(std::any_of(harmonic.begin(), harmonic.end(), [&](decltype(harmonic[0])& h) { return h != (*length_it - h) % *length_it++; })) { pre_factor *= 2.0; } } std::vector multi_index(params.dim(), 0); for(size_t k = 0; k < (params.dim() == 2 ? 1 : params.lengths[0]); k++) { if(params.dim() > 2) multi_index[0] = k; for(size_t j = 0; j < params.lengths[params.dim() - 2]; j++) { multi_index[params.dim() - 2] = j; for(size_t i = 0; i < (params.transform_type == fft_transform_type_real_inverse ? params.lengths.back() / 2 + 1 : params.lengths.back()); i++) { const size_t data_idx = (params.dim() > 2 ? 
k * strides[0] : 0) + j * strides[params.dim() - 2] + i * strides[params.dim() - 1]; multi_index[params.dim() - 1] = i; if(params.transform_type != fft_transform_type_real_forward) { data_ptr[2 * data_idx] = pre_factor * std::cos(phase(multi_index, harmonic, params.lengths)); data_ptr[2 * data_idx + 1] = pre_factor * (params.is_inverse() ? -1.0 : +1.0) * std::sin(phase(multi_index, harmonic, params.lengths)); } else { data_ptr[data_idx] = pre_factor * std::cos(phase(multi_index, harmonic, params.lengths)); } } } } } template static real_data_type max_error(const hostbuf& hostbuffer, const ParamsForMultiStreamDFT& params, const std::vector& harmonic) { if(params.dim() < 2 || params.dim() > 3) throw std::invalid_argument("Only dimensions 2 and 3 can be considered"); if(harmonic.size() != params.dim()) throw std::invalid_argument( "As many harmonic components as the problem's dimension required"); const real_data_type* data_ptr = static_cast(hostbuffer.data()); const std::vector strides = params.get_data_strides(); auto spike_expected = [&](const std::vector& multi_index) { bool ret = multi_index == harmonic; if(!ret && params.transform_type == fft_transform_type_real_forward) { // could be the hermitian symmetry of the expected one auto length_it = params.lengths.begin(); auto multi_idx_it = multi_index.begin(); ret = std::all_of(harmonic.begin(), harmonic.end(), [&](decltype(harmonic[0])& h) { return h == (*length_it - *multi_idx_it++) % *length_it++; }); } return ret; }; std::vector multi_index(params.dim(), 0); real_data_type max_abs_diff{0}; for(size_t k = 0; k < (params.dim() == 2 ? 1 : params.lengths[0]); k++) { if(params.dim() > 2) multi_index[0] = k; for(size_t j = 0; j < params.lengths[params.dim() - 2]; j++) { multi_index[params.dim() - 2] = j; for(size_t i = 0; i < (params.transform_type == fft_transform_type_real_forward ? params.lengths.back() / 2 + 1 : params.lengths.back()); i++) { const size_t data_idx = (params.dim() > 2 ? 
k * strides[0] : 0) + j * strides[params.dim() - 2] + i * strides[params.dim() - 1]; multi_index[params.dim() - 1] = i; const real_data_type expected_real_value = spike_expected(multi_index) ? 1.0 : 0.0; if(params.transform_type != fft_transform_type_real_inverse) { max_abs_diff = std::max( max_abs_diff, std::fabs(data_ptr[2 * data_idx] - expected_real_value)); // imaginary part always expected to be 0 max_abs_diff = std::max(max_abs_diff, std::fabs(data_ptr[2 * data_idx + 1])); } else { max_abs_diff = std::max(max_abs_diff, std::fabs(data_ptr[data_idx] - expected_real_value)); } } } } return max_abs_diff; } template static void allocate_buffer(gpubuf_t& buffer, const size_t desired_size) { auto ret = buffer.alloc(desired_size); if(ret != hipSuccess) { n_hip_failures++; std::stringstream info; info << "Test failed to allocate " << desired_size << " bytes for gpu data"; if(skip_runtime_fails) { GTEST_SKIP() << info.str(); } else { GTEST_FAIL() << info.str(); } } } TEST_P(multiStreamTest, impulseSignalOnOutput) { fft_status fft_error_code = fft_status_success; hipError_t hip_error_code = hipSuccess; hipfftResult_t hipfft_error_code = HIPFFT_SUCCESS; std::stringstream info; ParamsForMultiStreamDFT parameters = GetParam(); std::hash hasher; std::ranlux24_base gen(random_seed + hasher(parameters.get_test_name())); std::uniform_int_distribution harmonic_rng(std::numeric_limits::min(), std::numeric_limits::max()); if(!parameters.is_supported()) GTEST_SKIP() << "Test not supported yet"; // RAII encapsulation struct for hip streams: struct sub_dft_stream_t { private: bool sub_dft_is_done; bool cb_is_enqueued; public: struct init_failure : public std::exception { }; hipStream_t hip_stream; sub_dft_stream_t() : sub_dft_is_done(false) , cb_is_enqueued(false) { auto ret = hipStreamCreate(&hip_stream); if(ret != hipSuccess) throw init_failure(); } hipError_t enqueue_host_callback() { if(cb_is_enqueued) throw std::runtime_error("a callback is already enqueued for this stream"); 
auto mark_stream_work_done_callback = [](hipStream_t, hipError_t, void* work_done_ptr) { *(static_cast(work_done_ptr)) = true; // raise flag }; const auto hip_status = hipStreamAddCallback( hip_stream, mark_stream_work_done_callback, &sub_dft_is_done, 0 /* must be 0 */); if(hip_status == hipSuccess) cb_is_enqueued = true; return hip_status; } ~sub_dft_stream_t() { if(cb_is_enqueued) { (void)hipStreamSynchronize(hip_stream); } (void)hipStreamDestroy(hip_stream); } bool done() const { return sub_dft_is_done; } void reset_flags() { sub_dft_is_done = cb_is_enqueued = false; } }; // create the test streams std::vector test_streams; try { test_streams.resize(ParamsForMultiStreamDFT::num_streams); } catch(const sub_dft_stream_t::init_failure& e) { n_hip_failures++; info.str(""); info << "Test failed to create " << ParamsForMultiStreamDFT::num_streams << " streams"; if(skip_runtime_fails) { GTEST_SKIP() << info.str(); } else { GTEST_FAIL() << info.str(); } } catch(...) { GTEST_FAIL() << "Issue caught when creating " << ParamsForMultiStreamDFT::num_streams << " streams"; } // construct plans for each step and stream: hipfft_params sub_dft[ParamsForMultiStreamDFT::num_steps][ParamsForMultiStreamDFT::num_streams]; try { for(size_t stream_id = 0; stream_id < ParamsForMultiStreamDFT::num_streams; stream_id++) { for(size_t step_id = 0; step_id < ParamsForMultiStreamDFT::num_steps; step_id++) { hipfft_params& dft_op = sub_dft[step_id][stream_id]; dft_op = parameters.make_sub_dft_params(step_id, stream_id); fft_error_code = dft_op.create_plan(); if(fft_error_code != fft_status_success) { GTEST_FAIL() << "Failed to create hipfft plan for step id " << step_id << ", stream id " << stream_id << " (sub-DFT token : " << dft_op.token() << ")"; } hipfft_error_code = dft_op.set_stream(test_streams[stream_id].hip_stream); if(hipfft_error_code != HIPFFT_SUCCESS) { GTEST_FAIL() << "Failed to set stream for step id " << step_id << ", stream id " << stream_id; } if(verbose > 1) { std::cout << 
"token of sub-DFT created for step id " << step_id << " and stream id " << stream_id << ": " << dft_op.token() << std::endl; } } } } catch(fft_params::work_buffer_alloc_failure& e) { info.str(""); info << "Allocation failure detected during the creation of the sub-DFT plans"; ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << info.str(); } else { GTEST_FAIL() << info.str(); } } catch(...) { GTEST_FAIL() << "The test failed to create the required sub-DFT plans"; } // compute the full DFT described by parameters by decomposing work in // ParamsForMultiStreamDFT::num_streams different streams const size_t isize = parameters.get_data_size(); const size_t osize = parameters.get_data_size(); gpubuf_t input_buf, output_buf; allocate_buffer(input_buf, isize); if(parameters.placement != fft_placement_inplace) { allocate_buffer(output_buf, osize); // avoid false negatives via nonzero initialization: hip_error_code = hipMemset(output_buf.data(), 1, osize); if(hip_error_code != hipSuccess) { GTEST_FAIL() << "Non-zero initialization of output buffer failed"; } } hostbuf hostbuffer; try { hostbuffer.alloc(std::max(isize, osize)); } catch(HOSTBUF_MEM_USAGE& e) { info.str(""); info << "could not allocate host buffer"; GTEST_SKIP() << info.str(); } std::vector expected_harmonic(parameters.dim()); for(size_t i = 0; i < parameters.dim(); i++) { const size_t max_harmonic = i == parameters.dim() - 1 && parameters.transform_type == fft_transform_type_real_forward ? 
parameters.lengths[i] / 2 + 1 : parameters.lengths[i]; expected_harmonic[i] = harmonic_rng(gen) % max_harmonic; if(verbose > 0) { std::cout << "Chosen harmonic component: " << expected_harmonic[i] << std::endl; } } if(parameters.precision == fft_precision_double) init_data(hostbuffer, parameters, expected_harmonic); else init_data(hostbuffer, parameters, expected_harmonic); // data copy to device (synchronizing) hip_error_code = hipMemcpy(input_buf.data(), hostbuffer.data(), isize, hipMemcpyHostToDevice); if(hip_error_code != hipSuccess) { n_hip_failures++; info.str(""); info << "Test failed to copy initialized data set from host to device"; if(skip_runtime_fails) { GTEST_SKIP() << info.str(); } else { GTEST_FAIL() << info.str(); } } // Computation of the full DFT described by parameters in 2 steps, each involving // ParamsForMultiStreamDFT::num_streams different streams. Synchronization done via // a (host) callback invoked upon completion of every stream's task. void* step_stream_input_data_ptr = nullptr; void* step_stream_output_data_ptr = nullptr; for(size_t step_id = 0; step_id < ParamsForMultiStreamDFT::num_steps; step_id++) { const size_t elementary_itype_size = parameters.real_scalar_type_size() * (parameters.transform_type == fft_transform_type_real_forward && step_id == 0 ? 1 : 2); const size_t elementary_otype_size = parameters.real_scalar_type_size() * (parameters.transform_type == fft_transform_type_real_inverse && step_id == 1 ? 1 : 2); if(parameters.placement == fft_placement_inplace) { // every input/output data for the subproblems is in input_buf step_stream_input_data_ptr = input_buf.data(); step_stream_output_data_ptr = input_buf.data(); } else { // stream's input/output data depends on step and type of global problem step_stream_input_data_ptr = step_id == 0 || parameters.is_inverse() ? input_buf.data() : output_buf.data(); step_stream_output_data_ptr = step_id == 1 || !parameters.is_inverse() ? 
output_buf.data() : input_buf.data(); } for(size_t stream_id = 0; stream_id < ParamsForMultiStreamDFT::num_streams; stream_id++) { hipfft_params& dft_op = sub_dft[step_id][stream_id]; fft_error_code = dft_op.execute(step_stream_input_data_ptr, step_stream_output_data_ptr); if(fft_error_code != fft_status_success) { GTEST_FAIL() << "execution failed for step id " << step_id << " and stream id " << stream_id; } hip_error_code = test_streams[stream_id].enqueue_host_callback(); if(hip_error_code != hipSuccess) { n_hip_failures++; info.str(""); info << "Test failed to add callback function forstep id " << step_id << " and stream id " << stream_id; if(skip_runtime_fails) { GTEST_SKIP() << info.str(); } else { GTEST_FAIL() << info.str(); } } // increment stream data pointers for next submissions step_stream_input_data_ptr = static_cast(step_stream_input_data_ptr) + dft_op.nbatch * dft_op.idist * elementary_itype_size; step_stream_output_data_ptr = static_cast(step_stream_output_data_ptr) + dft_op.nbatch * dft_op.odist * elementary_otype_size; } // Check if the callbacks get invoked within 10 s. If not, the stream set above was likely // ignored in the sub_dft operations (integer "times" in us below) size_t time_waited_us = 0; constexpr size_t sleep_time_us = 1000; // 1 ms constexpr size_t failure_time_threshold_us = 10000000; // 10^7 us := 10 s while(std::any_of(test_streams.begin(), test_streams.end(), [](const sub_dft_stream_t& stream) { return !stream.done(); }) && time_waited_us <= failure_time_threshold_us) { #ifdef WIN32 Sleep(sleep_time_us / 1000); // argument in ms #else usleep(sleep_time_us); #endif time_waited_us += sleep_time_us; } if(time_waited_us > failure_time_threshold_us) { // The added callback probably was never invoked, i.e., the above set_stream // was not taken into consideration by some sub_dft plan. 
GTEST_FAIL() << "Time limit exceeded"; } else { for(auto& stream : test_streams) { stream.reset_flags(); } } } // verify results: hip_error_code = hipMemcpy( hostbuffer.data(), (parameters.placement == fft_placement_inplace ? input_buf.data() : output_buf.data()), osize, hipMemcpyDeviceToHost); if(hip_error_code != hipSuccess) { n_hip_failures++; info.str(""); info << "Test failed to copy results from device back to host"; if(skip_runtime_fails) { GTEST_SKIP() << info.str(); } else { GTEST_FAIL() << info.str(); } } // always using doubles for measured max error and error thresholds for convenience (no data loss) const double error_threshold = (parameters.precision == fft_precision_single ? single_epsilon : double_epsilon) * log(product(parameters.lengths.begin(), parameters.lengths.end())); double measured_max_error = 0.0; if(parameters.precision == fft_precision_single) measured_max_error = max_error(hostbuffer, parameters, expected_harmonic); else measured_max_error = max_error(hostbuffer, parameters, expected_harmonic); ASSERT_LE(measured_max_error, error_threshold); } static std::vector generate_full_scope_for(const std::vector>& set_of_test_lengths) { std::vector ret; ParamsForMultiStreamDFT to_add; // set_of_lengths assumed not to contain duplicates for(const auto& test_lengths : set_of_test_lengths) { to_add.lengths = test_lengths; for(auto type : {fft_transform_type_complex_forward, fft_transform_type_real_forward, fft_transform_type_complex_inverse, fft_transform_type_real_inverse}) { to_add.transform_type = type; for(auto prec : {fft_precision_single, fft_precision_double}) { to_add.precision = prec; for(auto place : {fft_placement_inplace, fft_placement_notinplace}) { to_add.placement = place; if(!to_add.is_supported()) continue; const double roll = hash_prob(random_seed, to_add.get_test_name()); const double run_prob = test_prob * (to_add.is_real() ? 
real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped: (roll=" << roll << " > " << run_prob << ")\n"; } continue; } ret.push_back(to_add); } } } } return ret; } // note: generate_full_scope_for will create 16 instance for every created size // --> 16*set_size tests generated in the end template static std::vector> create_random_set_of_sizes() { // limiting to lengths <= 512 per dimension std::ranlux24_base gen(random_seed); std::uniform_int_distribution size_rng(3 * ParamsForMultiStreamDFT::num_streams, 512); // lexicographically-sorted set to return std::vector> ret; while(ret.size() < set_size) { // alternate between 2D and 3D sizes const size_t dim = ret.size() % 2 == 0 ? 2 : 3; std::vector to_add(dim, 0); for(auto& length : to_add) length = size_rng(gen); if(2 * product(to_add.begin(), to_add.end()) * sizeof(double) > max_double_data_byte_size) continue; auto it = std::lower_bound(ret.begin(), ret.end(), to_add); if(it == ret.end() || *it != to_add) { ret.insert(it, to_add); } } return ret; } static constexpr size_t max_byte_size = 128 * 1024 * 1024; // limit data sets to 128 MiB max INSTANTIATE_TEST_SUITE_P( StreamDivision, multiStreamTest, ::testing::ValuesIn(generate_full_scope_for(create_random_set_of_sizes())), multiStreamTest::TestName); // The list of test parameters dynamically generated in the instantiation above may be empty // if low test probabilities are used. The following ensures such cases do not make gtest // report an error due to uninstantiated multiStreamTest, e.g., with option smoketest. GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(multiStreamTest); hipFFT-rocm-7.1.0/clients/tests/simple_test.cpp000066400000000000000000000706741506642153200214560ustar00rootroot00000000000000// Copyright (c) 2018 - 2022 Advanced Micro Devices, Inc. All rights // reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "hipfft/hipfft.h" #include #include #include #include #include #include "../hipfft_params.h" DISABLE_WARNING_PUSH DISABLE_WARNING_DEPRECATED_DECLARATIONS DISABLE_WARNING_RETURN_TYPE #include DISABLE_WARNING_POP // Function to return maximum error for float and double types for // the simple tests below template inline double type_epsilon_simple(); template <> inline double type_epsilon_simple() { return 1e-6; } template <> inline double type_epsilon_simple() { return 1e-7; } /* Static utility class template helping with the definition of valid/invalid values for arguments of (un)scoped enumeration types in API testing. Usage: for an enumeration type of interest, say "enum_of_interest", define template <> const std::vector enum_helper::valid_values = {all, the, known, valid, values, of, type, enum_of_interest}; before using any of this class' self-explanatory public member functions. 
*/ template , bool> = true> class enum_helper { using base_t = typename std::underlying_type::type; // static class cannot be instantiated, copied, or moved enum_helper() = delete; ~enum_helper() = delete; enum_helper(const enum_helper&) = delete; enum_helper(enum_helper&& other) = delete; enum_helper& operator=(const enum_helper&) = delete; enum_helper& operator=(enum_helper&& other) = delete; public: static const std::vector valid_values; static bool has_value(const T& val) { return std::any_of( valid_values.begin(), valid_values.end(), [&](const T& v) { return v == val; }); } static size_t num_valid_values() { return valid_values.size(); } static T get_any_valid_value(size_t prng_seed = 0) { const size_t nvals = num_valid_values(); if(nvals == 0) throw std::runtime_error( "enum_helper::get_any_valid_value: no valid value is defined."); std::ranlux24_base gen(prng_seed); return valid_values[static_cast(gen()) % nvals]; } static T get_invalid_value(size_t prng_seed = 0) { constexpr base_t max_base_val = std::numeric_limits::max(); constexpr base_t min_base_val = std::numeric_limits::min(); std::ranlux24_base gen(prng_seed); std::uniform_int_distribution dis(min_base_val, max_base_val); // limit number of attempts to 10x the number of possible value size_t num_attempts = 0; T made_up_value; auto generate_candidate = [&]() { num_attempts++; made_up_value = static_cast(dis(gen)); return; }; generate_candidate(); while(has_value(made_up_value) && num_attempts < 10 * num_valid_values()) { generate_candidate(); } if(has_value(made_up_value)) throw std::runtime_error( "enum_helper::get_invalid_value failed to generate an invalid valid"); return made_up_value; } }; // definition of valid values for various enum types template <> const std::vector enum_helper::valid_values = {hipfftLibraryPropertyType::HIPFFT_MAJOR_VERSION, hipfftLibraryPropertyType::HIPFFT_MINOR_VERSION, hipfftLibraryPropertyType::HIPFFT_PATCH_LEVEL}; TEST(hipfftTest, Create1dPlan) { hipfftHandle plan = 
hipfft_params::INVALID_PLAN_HANDLE; size_t length = 1024; ASSERT_EQ(hipfftPlan1d(&plan, length, HIPFFT_C2C, 1), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CreatePlanMany) { int const rank = 3; int const nX = 64; int const nY = 128; int const nZ = 23; int n[3] = {nX, nY, nZ}; int inembed[3] = {nX, nY, nZ}; int* inembed_null = nullptr; int const istride = 1; int const idist = nX * nY * nZ; int onembed[3] = {nX, nY, nZ}; int* onembed_null = nullptr; int const ostride = 1; int const odist = nX * nY * nZ; hipfftType type = HIPFFT_C2C; int const batch = 1000; size_t workSize; // Tests plan creation with null and not null // combinations of inembed and onembed. // // Valid combinations: // inembed == null && onembed == null // or // inembed != null && onembed != null // // otherwise HIPFFT_INVALID_VALUE should be // returned to maintain compatibility with cuFFT // inembed == null && onembed == null { hipfftHandle plan_valid_1 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_valid_1), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany(plan_valid_1, rank, (int*)n, inembed_null, istride, idist, onembed_null, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS) << "inembed == null && onembed == null failed: " << hipfftResult_string(ret_hipfft); ASSERT_EQ(hipfftSetAutoAllocation(plan_valid_1, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_valid_1), HIPFFT_SUCCESS); } // inembed != null && onembed != null { hipfftHandle plan_valid_2 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_valid_2), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany(plan_valid_2, rank, (int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS) << "inembed != null && onembed != null failed: " << hipfftResult_string(ret_hipfft); ASSERT_EQ(hipfftSetAutoAllocation(plan_valid_2, 0), HIPFFT_SUCCESS); 
ASSERT_EQ(hipfftDestroy(plan_valid_2), HIPFFT_SUCCESS); } // inembed != null && onembed == null { hipfftHandle plan_invalid_1 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_1), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany(plan_invalid_1, rank, (int*)n, inembed, istride, idist, onembed_null, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_INVALID_VALUE) << "inembed != null && onembed == null failed: " << hipfftResult_string(ret_hipfft); ASSERT_EQ(hipfftDestroy(plan_invalid_1), HIPFFT_SUCCESS); } // inembed == null && onembed != null { hipfftHandle plan_invalid_2 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_2), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany(plan_invalid_2, rank, (int*)n, inembed_null, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_INVALID_VALUE) << "inembed == null && onembed != null failed: " << hipfftResult_string(ret_hipfft); ASSERT_EQ(hipfftDestroy(plan_invalid_2), HIPFFT_SUCCESS); } } TEST(hipfftTest, CreatePlanMany64) { int const rank = 3; long long int const nX = 64; long long int const nY = 128; long long int const nZ = 23; long long int n[3] = {nX, nY, nZ}; long long int inembed[3] = {nX, nY, nZ}; long long int const istride = 1; long long int const idist = nX * nY * nZ; long long int onembed[3] = {nX, nY, nZ}; long long int onembed_invalid[3] = {nX, nY, -nZ}; long long int const ostride = 1; long long int const odist = nX * nY * nZ; hipfftType type = HIPFFT_C2C; long long int const batch = 1000; long long int const batch_invalid = -2; size_t workSize; // Tests the 64-bit version of plan creation // with valid/invalid data layouts. 
// First test with a valid data layout { hipfftHandle plan_valid = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_valid), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany64(plan_valid, rank, (long long int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS); ASSERT_EQ(hipfftSetAutoAllocation(plan_valid, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_valid), HIPFFT_SUCCESS); } // invalid data layout (n array has a negative entry). only test rocFFT // backend, since it's more strict #ifdef __HIP_PLATFORM_AMD__ long long int n_invalid[3] = {nX, -nY, nZ}; { hipfftHandle plan_invalid_1 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_1), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany64(plan_invalid_1, rank, (long long int*)n_invalid, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_INVALID_VALUE); ASSERT_EQ(hipfftSetAutoAllocation(plan_invalid_1, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_invalid_1), HIPFFT_SUCCESS); } #endif // invalid data layout (onembed array has a negative entry) { hipfftHandle plan_invalid_2 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_2), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany64(plan_invalid_2, rank, (long long int*)n, inembed, istride, idist, onembed_invalid, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_INVALID_SIZE); ASSERT_EQ(hipfftSetAutoAllocation(plan_invalid_2, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_invalid_2), HIPFFT_SUCCESS); } // invalid data layout (batch is negative) { hipfftHandle plan_invalid_3 = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan_invalid_3), HIPFFT_SUCCESS); auto ret_hipfft = hipfftMakePlanMany64(plan_invalid_3, rank, (long long int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch_invalid, &workSize); ASSERT_EQ(ret_hipfft, 
HIPFFT_INVALID_SIZE); ASSERT_EQ(hipfftSetAutoAllocation(plan_invalid_3, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan_invalid_3), HIPFFT_SUCCESS); } } TEST(hipfftTest, hipfftGetSizeMany) { int const rank = 3; int const nX = 33; int const nY = 128; int const nZ = 100; int n[3] = {nX, nY, nZ}; int inembed[3] = {nX, nY, nZ}; int const istride = 1; int const idist = nX * nY * nZ; int onembed[3] = {nX, nY, nZ}; int const ostride = 1; int const odist = nX * nY * nZ; hipfftType type = HIPFFT_C2C; int const batch = 1; size_t workSize; hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); auto ret_hipfft = hipfftGetSizeMany(plan, rank, (int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS); ASSERT_EQ(hipfftSetAutoAllocation(plan, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, hipfftGetSizeMany64) { int const rank = 3; long long int const nX = 133; long long int const nY = 354; long long int const nZ = 256; long long int n[3] = {nX, nY, nZ}; long long int inembed[3] = {nX, nY, nZ}; long long int const istride = 1; long long int const idist = nX * nY * nZ; long long int onembed[3] = {nX, nY, nZ}; long long int const ostride = 1; long long int const odist = nX * nY * nZ; hipfftType type = HIPFFT_C2C; long long int const batch = 2; size_t workSize; hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); auto ret_hipfft = hipfftGetSizeMany64(plan, rank, (long long int*)n, inembed, istride, idist, onembed, ostride, odist, type, batch, &workSize); ASSERT_EQ(ret_hipfft, HIPFFT_SUCCESS); ASSERT_EQ(hipfftSetAutoAllocation(plan, 0), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeC2C) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t n = 1024; size_t workSize = 0; 
ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_C2C, 1, &workSize), HIPFFT_SUCCESS); #ifdef __HIP_PLATFORM_AMD__ // No extra work buffer for C2C EXPECT_EQ(workSize, 0); #endif ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeR2C) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); // real forward transform cannot modify input, so we need to pick // a sufficiently small N such that rocFFT can fuse // post-processing into one kernel and avoid a temp buffer size_t n = 256; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_R2C, 1, &workSize), HIPFFT_SUCCESS); #ifdef __HIP_PLATFORM_AMD__ // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(n % 2 == 0) { EXPECT_EQ(workSize, 0); } else { EXPECT_EQ(workSize, 2 * n * sizeof(float)); } #endif EXPECT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeC2R) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t n = 2048; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_C2R, 1, &workSize), HIPFFT_SUCCESS); #ifdef __HIP_PLATFORM_AMD__ // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(n % 2 == 0) { EXPECT_EQ(workSize, 0); } else { EXPECT_EQ(workSize, 2 * n * sizeof(float)); } #endif ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeD2Z) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); // real forward transform cannot modify input, so we need to pick // a sufficiently small N such that rocFFT can fuse // post-processing into one kernel and avoid a temp buffer size_t n = 256; size_t batch = 1000; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_D2Z, batch, &workSize), HIPFFT_SUCCESS); #ifdef 
__HIP_PLATFORM_AMD__ // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(n % 2 == 0) { EXPECT_EQ(workSize, 0); } else { EXPECT_EQ(workSize, 2 * n * sizeof(double)); } #endif ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } TEST(hipfftTest, CheckBufferSizeZ2D) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t n = 2048; size_t batch = 1000; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_Z2D, batch, &workSize), HIPFFT_SUCCESS); #ifdef __HIP_PLATFORM_AMD__ // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(n % 2 == 0) { EXPECT_EQ(workSize, 0); } else { EXPECT_EQ(workSize, 2 * n * sizeof(double)); } #endif ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } #ifdef __HIP_PLATFORM_AMD__ TEST(hipfftTest, CheckNullWorkBuffer) { hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t n = 2048; size_t batch = 1000; size_t workSize = 0; ASSERT_EQ(hipfftMakePlan1d(plan, n, HIPFFT_Z2D, batch, &workSize), HIPFFT_SUCCESS); EXPECT_EQ(hipfftSetWorkArea(plan, nullptr), HIPFFT_SUCCESS); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); } #endif TEST(hipfftTest, RunR2C) { const size_t N = 4096; float in[N]; for(size_t i = 0; i < N; i++) in[i] = i + (i % 3) - (i % 7); hipfftReal* d_in; hipfftComplex* d_out; ASSERT_EQ(hipMalloc(&d_in, N * sizeof(hipfftReal)), hipSuccess); ASSERT_EQ(hipMalloc(&d_out, (N / 2 + 1) * sizeof(hipfftComplex)), hipSuccess); ASSERT_EQ(hipMemcpy(d_in, in, N * sizeof(hipfftReal), hipMemcpyHostToDevice), hipSuccess); hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); size_t workSize; ASSERT_EQ(hipfftMakePlan1d(plan, N, HIPFFT_R2C, 1, &workSize), HIPFFT_SUCCESS); EXPECT_EQ(hipfftExecR2C(plan, d_in, d_out), HIPFFT_SUCCESS); std::vector out(N / 2 + 1); 
ASSERT_EQ(hipMemcpy(&out[0], d_out, (N / 2 + 1) * sizeof(hipfftComplex), hipMemcpyDeviceToHost), hipSuccess); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); ASSERT_EQ(hipFree(d_in), hipSuccess); ASSERT_EQ(hipFree(d_out), hipSuccess); ; // NOTE: keep this condition for ease of changing n for ad-hoc tests // // cppcheck-suppress knownConditionTrueFalse if(N % 2 != 0) { EXPECT_TRUE(workSize != 0); } double ref_in[N]; for(size_t i = 0; i < N; i++) ref_in[i] = in[i]; fftw_complex* ref_out; fftw_plan ref_p; ref_out = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * (N / 2 + 1)); ref_p = fftw_plan_dft_r2c_1d(N, ref_in, ref_out, FFTW_ESTIMATE); fftw_execute(ref_p); double maxv = 0; double nrmse = 0; // normalized root mean square error for(size_t i = 0; i < (N / 2 + 1); i++) { // printf("element %d: FFTW result %f, %f; hipFFT result %f, %f \n", (int)i, ref_out[i][0], ref_out[i][1], out[i].x, out[i].y); double dr = ref_out[i][0] - out[i].x; double di = ref_out[i][1] - out[i].y; maxv = fabs(ref_out[i][0]) > maxv ? fabs(ref_out[i][0]) : maxv; maxv = fabs(ref_out[i][1]) > maxv ? fabs(ref_out[i][1]) : maxv; nrmse += ((dr * dr) + (di * di)); } nrmse /= (double)((N / 2 + 1)); nrmse = sqrt(nrmse); nrmse /= maxv; EXPECT_LT(nrmse, type_epsilon_simple()); fftw_destroy_plan(ref_p); fftw_free(ref_out); } // ask for a transform whose parameters are only valid out-of-place. // since hipFFT generates both in-place and out-place plans up front // (because it's not told about the placement until exec time), this // ensures that a failure to create an in-place plan doesn't prevent // the out-place plan from working. 
TEST(hipfftTest, OutplaceOnly) { static const int N_in_const = 4; static const int N_out_const = N_in_const / 2 + 1; // mutable sizes for passing to hipFFT int N_in = N_in_const; int N_out = N_out_const; float in[N_in_const]; for(int i = 0; i < N_in; i++) in[i] = i + (i % 3) - (i % 7); hipfftReal* d_in; hipfftComplex* d_out; ASSERT_EQ(hipMalloc(&d_in, N_in * sizeof(hipfftReal)), hipSuccess); ASSERT_EQ(hipMalloc(&d_out, N_out * sizeof(hipfftComplex)), hipSuccess); ASSERT_EQ(hipMemcpy(d_in, in, N_in * sizeof(hipfftReal), hipMemcpyHostToDevice), hipSuccess); hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; ASSERT_EQ(hipfftCreate(&plan), HIPFFT_SUCCESS); ASSERT_EQ(hipfftPlanMany(&plan, 1, &N_in, &N_in, 1, N_in, &N_out, 1, N_out, HIPFFT_R2C, 1), HIPFFT_SUCCESS); ASSERT_EQ(plan == hipfft_params::INVALID_PLAN_HANDLE, false); ASSERT_EQ(hipfftExecR2C(plan, d_in, d_out), HIPFFT_SUCCESS) << "hipfftExecR2C failed"; std::vector out(N_out); ASSERT_EQ(hipMemcpy(out.data(), d_out, N_out * sizeof(hipfftComplex), hipMemcpyDeviceToHost), hipSuccess); // in-place transform isn't really *supposed* to work - this // might or might not fail but we can at least check that it // doesn't blow up. 
//hipfftExecR2C(plan, reinterpret_cast(d_out), d_out); ASSERT_EQ(hipfftDestroy(plan), HIPFFT_SUCCESS); ASSERT_EQ(hipFree(d_in), hipSuccess); ASSERT_EQ(hipFree(d_out), hipSuccess); double ref_in[N_in_const]; for(int i = 0; i < N_in_const; i++) ref_in[i] = in[i]; fftw_complex* ref_out; fftw_plan ref_p; ref_out = (fftw_complex*)fftw_malloc(sizeof(fftw_complex) * N_out); ref_p = fftw_plan_dft_r2c_1d(N_in, ref_in, ref_out, FFTW_ESTIMATE); fftw_execute(ref_p); double maxv = 0; double nrmse = 0; // normalized root mean square error for(int i = 0; i < N_out; i++) { // printf("element %d: FFTW result %f, %f; hipFFT result %f, %f \n", (int)i, ref_out[i][0], ref_out[i][1], out[i].x, out[i].y); double dr = ref_out[i][0] - out[i].x; double di = ref_out[i][1] - out[i].y; maxv = fabs(ref_out[i][0]) > maxv ? fabs(ref_out[i][0]) : maxv; maxv = fabs(ref_out[i][1]) > maxv ? fabs(ref_out[i][1]) : maxv; nrmse += ((dr * dr) + (di * di)); } nrmse /= (double)(N_out); nrmse = sqrt(nrmse); nrmse /= maxv; ASSERT_LT(nrmse, type_epsilon_simple()); fftw_destroy_plan(ref_p); fftw_free(ref_out); } static constexpr int absurd_version_or_property = std::numeric_limits::min(); TEST(hipfftTest, GetVersion) { // valid use case(s) int tmp = absurd_version_or_property; EXPECT_EQ(hipfftGetVersion(&tmp), HIPFFT_SUCCESS); EXPECT_NE(tmp, absurd_version_or_property); EXPECT_EQ(hipfftGetVersion(nullptr), HIPFFT_INVALID_VALUE); } TEST(hipfftTest, GetProperty) { // valid use case(s) int tmp; for(auto prop_type : enum_helper::valid_values) { tmp = absurd_version_or_property; EXPECT_EQ(hipfftGetProperty(prop_type, &tmp), HIPFFT_SUCCESS); EXPECT_NE(tmp, absurd_version_or_property); } // invalid use case(s) const auto valid_property_type = enum_helper::get_any_valid_value(); EXPECT_EQ(hipfftGetProperty(valid_property_type, nullptr), HIPFFT_INVALID_VALUE); const auto invalid_property_type = enum_helper::get_invalid_value(); EXPECT_EQ(hipfftGetProperty(invalid_property_type, &tmp), HIPFFT_INVALID_VALUE); } 
hipFFT-rocm-7.1.0/cmake/000077500000000000000000000000001506642153200146615ustar00rootroot00000000000000hipFFT-rocm-7.1.0/cmake/dependencies.cmake000066400000000000000000000071011506642153200203100ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# # ############################################################################# # HIP if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) if( NOT BUILD_WITH_LIB STREQUAL "CUDA" ) if( WIN32 ) find_package( HIP CONFIG REQUIRED ) else() find_package( HIP REQUIRED ) endif() list( APPEND HIP_INCLUDE_DIRS "${HIP_ROOT_DIR}/include" ) endif() else() if( BUILD_WITH_LIB STREQUAL "CUDA" ) set(HIP_INCLUDE_DIRS "${HIP_ROOT_DIR}/include") else() if( WIN32 ) find_package( HIP CONFIG REQUIRED ) else() find_package( HIP REQUIRED ) endif() endif() endif() # Either rocfft or cufft is required if(NOT BUILD_WITH_LIB STREQUAL "CUDA") if( HIPFFT_MPI_ENABLE ) find_package( MPI REQUIRED ) endif() find_package(rocfft REQUIRED) else() # cufft may be in the HPC SDK or ordinary CUDA if( HIPFFT_MPI_ENABLE ) if( NOT BUILD_SHARED_LIBS ) message( FATAL_ERROR "cufftMp is shared-only, static build is not possible" ) endif() # MPI support is only in HPC SDK find_package(NVHPC REQUIRED COMPONENTS CUDA MATH MPI) else() find_package(NVHPC QUIET COMPONENTS CUDA MATH) endif() set(CUDA_USE_STATIC_CUDA_RUNTIME OFF) find_package(CUDAToolkit REQUIRED) endif() # ROCm find_package( ROCmCMakeBuildTools CONFIG PATHS /opt/rocm ) if(NOT ROCmCMakeBuildTools_FOUND) set( PROJECT_EXTERN_DIR "${CMAKE_CURRENT_BINARY_DIR}/extern" ) include( FetchContent ) FetchContent_Declare( rocm_cmake_local GIT_REPOSITORY https://github.com/ROCm/rocm-cmake GIT_TAG rocm-6.4.1 GIT_SHALLOW ON ) FetchContent_MakeAvailable( rocm_cmake_local ) execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake . 
WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR} ) execute_process( COMMAND ${CMAKE_COMMAND} --build ${rocm_cmake_local_SOURCE_DIR} --target install WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR} ) find_package( ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake ) endif() if( ROCmCMakeBuildTools_FOUND ) message(STATUS "Found ROCm") include(ROCMSetupVersion) include(ROCMCreatePackage) include(ROCMInstallTargets) include(ROCMPackageConfigHelpers) include(ROCMInstallSymlinks) include(ROCMCheckTargetIds) include(ROCMClients) include(ROCMHeaderWrapper) else() message(WARNING "Could not find rocm-cmake, packaging will fail.") endif( ) hipFFT-rocm-7.1.0/cmake/get-cli-arguments.cmake000066400000000000000000000044221506642153200212140ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # # ############################################################################# # Attempt (best effort) to return a list of user specified parameters cmake was invoked with # NOTE: Even if the user specifies CMAKE_INSTALL_PREFIX on the command line, the parameter is # not returned because it does not have the matching helpstring function( append_cmake_cli_arguments initial_cli_args return_cli_args ) # Retrieves the contents of CMakeCache.txt get_cmake_property( cmake_properties CACHE_VARIABLES ) foreach( property ${cmake_properties} ) get_property(help_string CACHE ${property} PROPERTY HELPSTRING ) # Properties specified on the command line have boilerplate text if( help_string MATCHES "variable specified on the command line" ) # message( STATUS "property: ${property}") # message( STATUS "value: ${${property}}") list( APPEND cli_args "-D${property}=${${property}}") endif( ) endforeach( ) # message( STATUS "get_command_line_arguments: ${cli_args}") set( ${return_cli_args} ${${initial_cli_args}} ${cli_args} PARENT_SCOPE ) endfunction( )hipFFT-rocm-7.1.0/cmake/package-functions.cmake000066400000000000000000000043271506642153200212720ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# # ############################################################################# # ######################################################################## # A helper function to generate packaging scripts to register libraries with system # ######################################################################## function( write_rocm_package_script_files scripts_write_dir library_name library_link_name ) set( ld_conf_file "/etc/ld.so.conf.d/${library_name}-dev.conf" ) file( WRITE ${scripts_write_dir}/postinst "#!/bin/bash set -e do_ldconfig() { echo ${CPACK_PACKAGING_INSTALL_PREFIX}/${LIB_INSTALL_DIR} > ${ld_conf_file} && ldconfig } case \"\$1\" in configure) do_ldconfig ;; abort-upgrade|abort-remove|abort-deconfigure) echo \"\$1\" ;; *) exit 0 ;; esac " ) file( WRITE ${scripts_write_dir}/prerm "#!/bin/bash set -e rm_ldconfig() { rm -f ${ld_conf_file} && ldconfig } case \"\$1\" in remove|purge) rm_ldconfig ;; *) exit 0 ;; esac " ) endfunction( ) hipFFT-rocm-7.1.0/cmake/verbose.cmake000066400000000000000000000062641506642153200173400ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
#
# #############################################################################

# Diagnostic dump of the build configuration.
# Every line is a STATUS message of the form "<label> : <value>"; nothing is
# set or modified here. Each label names the variable whose value is printed.

# --- General build settings --------------------------------------------------
message(STATUS "hipfft_VERSION : ${hipfft_VERSION}")
message(STATUS "\t==>CMAKE_BUILD_TYPE : ${CMAKE_BUILD_TYPE}")
message(STATUS "\t==>BUILD_SHARED_LIBS : ${BUILD_SHARED_LIBS}")
message(STATUS "\t==>CMAKE_INSTALL_PREFIX link : ${CMAKE_INSTALL_PREFIX}")
message(STATUS "\t==>CMAKE_MODULE_PATH link : ${CMAKE_MODULE_PATH}")
message(STATUS "\t==>CMAKE_PREFIX_PATH link : ${CMAKE_PREFIX_PATH}")

# --- Platform, compiler and per-configuration compile flags -------------------
message(STATUS "==============")
message(STATUS "\t==>CMAKE_SYSTEM_NAME : ${CMAKE_SYSTEM_NAME}")
message(STATUS "\t==>HIP_ROOT_DIR : ${HIP_ROOT_DIR}")
message(STATUS "\t==>CMAKE_CXX_COMPILER : ${CMAKE_CXX_COMPILER}")
message(STATUS "\t==>CMAKE_CXX_COMPILER_VERSION : ${CMAKE_CXX_COMPILER_VERSION}")
# These lines were previously labelled "CMAKE_CXX_COMPILER ..." although they
# print the CMAKE_CXX_FLAGS* variables.
message(STATUS "\t==>CMAKE_CXX_FLAGS : ${CMAKE_CXX_FLAGS}")
message(STATUS "\t==>CMAKE_CXX_FLAGS_DEBUG : ${CMAKE_CXX_FLAGS_DEBUG}")
message(STATUS "\t==>CMAKE_CXX_FLAGS_RELEASE : ${CMAKE_CXX_FLAGS_RELEASE}")
message(STATUS "\t==>CMAKE_CXX_FLAGS_RELWITHDEBINFO : ${CMAKE_CXX_FLAGS_RELWITHDEBINFO}")
message(STATUS "\t==>CMAKE_EXE_LINKER_FLAGS : ${CMAKE_EXE_LINKER_FLAGS}")
message(STATUS "\t==>CMAKE_EXE_LINKER_FLAGS_RELEASE : ${CMAKE_EXE_LINKER_FLAGS_RELEASE}")

# --- Shared-library flags ------------------------------------------------------
# CMAKE_SHARED_LINKER_FLAGS and CMAKE_SHARED_LINKER_FLAGS_RELEASE used to be
# printed in the previous group as well; they are reported once, here.
message(STATUS "==============" )
message(STATUS "\t==>CMAKE_SHARED_LIBRARY_C_FLAGS : ${CMAKE_SHARED_LIBRARY_C_FLAGS}")
message(STATUS "\t==>CMAKE_SHARED_LIBRARY_CXX_FLAGS : ${CMAKE_SHARED_LIBRARY_CXX_FLAGS}")
message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS : ${CMAKE_SHARED_LINKER_FLAGS}")
message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_DEBUG : ${CMAKE_SHARED_LINKER_FLAGS_DEBUG}")
message(STATUS "\t==>CMAKE_SHARED_LINKER_FLAGS_RELEASE : ${CMAKE_SHARED_LINKER_FLAGS_RELEASE}")
hipFFT-rocm-7.1.0/deps/000077500000000000000000000000001506642153200145345ustar00rootroot00000000000000hipFFT-rocm-7.1.0/deps/CMakeLists.txt000066400000000000000000000102741506642153200173000ustar00rootroot00000000000000# #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

# Helper cmake script to automate building dependencies for hipfft
# This script can be invoked manually by the user with 'cmake -P'
#
# Super-build layout: each enabled dependency (BUILD_GTEST, BUILD_BOOST) is
# pulled in through an external-*.cmake module that declares an
# ExternalProject with an empty INSTALL_COMMAND. Installation is performed by
# the custom `install` target declared at the bottom of this file, which runs
# one install command per enabled dependency.

# The ROCm platform requires Ubuntu 16.04 or Fedora 24, which has cmake 3.5
cmake_minimum_required( VERSION 3.5 )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../cmake )

# Consider removing this in the future
# It can be annoying for visual studio developers to build a project that tries to install into 'program files'
if( WIN32 AND CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# The superbuild does not build anything itself; all compiling is done in external projects
project( hipfft-dependencies NONE )

option( BUILD_BOOST "Download and build boost library" ON )
option( BUILD_GTEST "Download and build googletest library" ON )
# option( BUILD_VERBOSE "Print helpful build debug information" OFF )

# if( BUILD_VERBOSE )
#   message( STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}" )
#   message( STATUS "CMAKE_BINARY_DIR: ${CMAKE_BINARY_DIR}" )
#   message( STATUS "CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}" )
#   message( STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}" )
#   message( STATUS "CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}" )
#   message( STATUS "CMAKE_CURRENT_LIST_DIR: ${CMAKE_CURRENT_LIST_DIR}" )
#   message( STATUS "CMAKE_CURRENT_LIST_FILE: ${CMAKE_CURRENT_LIST_FILE}" )
# endif( )

# This module scrapes the CMakeCache.txt file and attempts to get all the cli options the user specified to cmake invocation
include( get-cli-arguments )

# The following is a series of super-build projects; this cmake project will download and build
if( BUILD_GTEST )
  include( external-gtest )

  list( APPEND hipfft_dependencies googletest )
  # Install googletest by building the `install` target of its build tree.
  # The build directory is passed straight to `cmake --build`, so no `cd` or
  # shell command separator is needed and the command runs on every platform.
  set( gtest_custom_target COMMAND ${CMAKE_COMMAND} --build ${GTEST_BINARY_ROOT} --target install )
endif( )

if( BUILD_BOOST )
  include( external-boost )

  list( APPEND hipfft_dependencies boost )
  # Run `b2 ... install` from the boost build directory. `cmake -E chdir`
  # changes the working directory for the command portably; the b2 command
  # line (Boost.Command) is relative to that directory.
  set( boost_custom_target COMMAND ${CMAKE_COMMAND} -E chdir ${BOOST_BINARY_ROOT} ${Boost.Command} install )
endif( )

# POLICY CMP0037 - "Target names should not be reserved and should match a validity pattern"
# Familiar target names like 'install' should be OK at the super-build level
# NOTE(review): this relies on the OLD behaviour of CMP0037; confirm it is
# still accepted by the CMake versions this project is expected to support.
if( POLICY CMP0037 )
  cmake_policy( SET CMP0037 OLD )
endif( )

# `install` first builds every enabled dependency (DEPENDS), then runs the
# per-dependency install commands collected above.
add_custom_target( install
  ${boost_custom_target}
  ${gtest_custom_target}
  DEPENDS ${hipfft_dependencies}
)
hipFFT-rocm-7.1.0/deps/external-boost.cmake000066400000000000000000000170471506642153200205150ustar00rootroot00000000000000# #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

# Declares the `boost` ExternalProject: downloads a Boost source archive,
# bootstraps b2 and builds the `stage` target in-source. Installation is left
# to the including project, which can run `${Boost.Command} install` from
# BOOST_BINARY_ROOT.
#
# Outputs for the including project:
#   Boost.Command       - list holding the full b2 command line
#   BOOST_INSTALL_ROOT  - ExternalProject install_dir of the boost target
#   BOOST_BINARY_ROOT   - ExternalProject binary_dir of the boost target

message( STATUS "Configuring boost external dependency" )
include( ExternalProject )

set( PREFIX_BOOST ${CMAKE_INSTALL_PREFIX} CACHE PATH "Location where boost should install, defaults to /usr/local" )

# We need to detect the compiler the user is attempting to invoke with CMake,
# we do our best to translate cmake parameters into bjam parameters
enable_language( CXX )
include( build-bitness )

# TODO: Options should be added to allow downloading Boost straight from github

# This file is used to add Boost as a library dependency to another project
# This sets up boost to download from sourceforge, and builds it as a cmake
# ExternalProject

# Change this one line to upgrade to newer versions of boost
# (the SHA256 values further down are tied to this version and must be updated with it)
set( ext.Boost_VERSION "1.64.0" CACHE STRING "Boost version to download/use" )
mark_as_advanced( ext.Boost_VERSION )
string( REPLACE "." "_" ext.Boost_Version_Underscore ${ext.Boost_VERSION} )

message( STATUS "ext.Boost_VERSION: " ${ext.Boost_VERSION} )

if( WIN32 )
  # For newer cmake versions, 7z archives are much smaller to download
  if( CMAKE_VERSION VERSION_LESS "3.1.0" )
    set( Boost_Ext "zip" )
  else( )
    set( Boost_Ext "7z" )
  endif( )
else( )
  set( Boost_Ext "tar.bz2" )
endif( )

if( WIN32 )
  set( Boost.Command b2 --prefix=${PREFIX_BOOST} )
else( )
  set( Boost.Command ./b2 --prefix=${PREFIX_BOOST} )
endif( )

# NOTE(review): each list element becomes its own b2 argument, so only the
# first flag after `cxxflags=` is attached to the cxxflags feature; the
# following `-std=...` / `-stdlib=...` elements reach b2 as stand-alone
# options. Confirm whether `cxxflags=` should be repeated per flag before
# changing this, as it alters which flags the Boost build really uses.
if( CMAKE_COMPILER_IS_GNUCXX )
  list( APPEND Boost.Command cxxflags=-fPIC -std=c++11 )
elseif( XCODE_VERSION OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang") )
  list( APPEND Boost.Command cxxflags=-std=c++11 -stdlib=libc++ linkflags=-stdlib=libc++ )
endif( )

include( ProcessorCount )
ProcessorCount( Cores )
if( NOT Cores EQUAL 0 )
  # Travis can fail to build Boost sporadically; uses 32 cores, reduce stress on VM
  if( DEFINED ENV{TRAVIS} )
    if( Cores GREATER 8 )
      set( Cores 8 )
    endif( )
  endif( )

  # Add build thread in addition to the number of cores that we have
  math( EXPR Cores "${Cores} + 1 " )
else( )
  # If we could not detect # of cores, assume 1 core and add an additional build thread
  set( Cores "2" )
endif( )

message( STATUS "ExternalBoost using ( " ${Cores} " ) cores to build with" )
message( STATUS "ExternalBoost building [ serialization, filesystem, system, regex ] components" )

list( APPEND Boost.Command -j ${Cores} --with-serialization --with-filesystem --with-system --with-regex )

if( BUILD_64 )
  list( APPEND Boost.Command address-model=64 )
else( )
  list( APPEND Boost.Command address-model=32 )
endif( )

if( MSVC10 )
  list( APPEND Boost.Command toolset=msvc-10.0 )
elseif( MSVC11 )
  list( APPEND Boost.Command toolset=msvc-11.0 )
elseif( MSVC12 )
  list( APPEND Boost.Command toolset=msvc-12.0 )
elseif( MSVC14 )
  list( APPEND Boost.Command toolset=msvc-14.0 )
elseif( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) )
  list( APPEND Boost.Command toolset=clang )
elseif( CMAKE_COMPILER_IS_GNUCXX )
  list( APPEND Boost.Command toolset=gcc )
endif( )

if( WIN32 AND (ext.Boost_VERSION VERSION_LESS "1.60.0") )
  list( APPEND Boost.Command define=BOOST_LOG_USE_WINNT6_API )
endif( )

if( NOT DEFINED ext.Boost_LINK )
  # Test the variable by name: the previous `${BUILD_SHARED_LIBS} MATCHES "ON"`
  # expanded to `if( MATCHES "ON" )` when BUILD_SHARED_LIBS was not set, which
  # is a configure-time error. Any true value (ON, TRUE, 1, YES) selects shared.
  if( BUILD_SHARED_LIBS )
    set( ext.Boost_LINK "shared" CACHE STRING "Which boost link method? static | shared | static,shared" )
  else( )
    set( ext.Boost_LINK "static" CACHE STRING "Which boost link method? static | shared | static,shared" )
  endif( )
endif()
mark_as_advanced( ext.Boost_LINK )

if( WIN32 )
  # Versioned is the default on windows
  set( ext.Boost_LAYOUT "versioned" CACHE STRING "Which boost layout method? versioned | tagged | system" )

  # For windows, default to build both variants to support the VS IDE
  set( ext.Boost_VARIANT "debug,release" CACHE STRING "Which boost variant? debug | release | debug,release" )
else( )
  # Tagged builds provide unique enough names to be able to build both variants
  set( ext.Boost_LAYOUT "tagged" CACHE STRING "Which boost layout method? versioned | tagged | system" )

  # For Linux, typically a build tree only needs one variant
  # (variable referenced by name so an unset CMAKE_BUILD_TYPE does not break the condition)
  if( CMAKE_BUILD_TYPE MATCHES "Debug" )
    set( ext.Boost_VARIANT "debug" CACHE STRING "Which boost variant? debug | release | debug,release" )
  else( )
    set( ext.Boost_VARIANT "release" CACHE STRING "Which boost variant? debug | release | debug,release" )
  endif( )
endif( )
mark_as_advanced( ext.Boost_LAYOUT )
mark_as_advanced( ext.Boost_VARIANT )

list( APPEND Boost.Command --layout=${ext.Boost_LAYOUT} link=${ext.Boost_LINK} variant=${ext.Boost_VARIANT} )

message( STATUS "Boost.Command: ${Boost.Command}" )

# If the user has a cached local copy stored somewhere, they can define the full path to the package in a BOOST_URL environment variable
if( DEFINED ENV{BOOST_URL} )
  set( ext.Boost_URL "$ENV{BOOST_URL}" CACHE STRING "URL to download Boost from" )
else( )
  set( ext.Boost_URL "http://sourceforge.net/projects/boost/files/boost/${ext.Boost_VERSION}/boost_${ext.Boost_Version_Underscore}.${Boost_Ext}/download" CACHE STRING "URL to download Boost from" )
endif( )
mark_as_advanced( ext.Boost_URL )

set( Boost.Bootstrap "" )
set( ext.HASH "" )
if( WIN32 )
  set( Boost.Bootstrap "bootstrap.bat" )

  if( CMAKE_VERSION VERSION_LESS "3.1.0" )
    # .zip file
    set( ext.HASH "b99973c805f38b549dbeaf88701c0abeff8b0e8eaa4066df47cac10a32097523" )
  else( )
    # .7z file
    set( ext.HASH "49c6abfeb5b480f6a86119c0d57235966b4690ee6ff9e6401ee868244808d155" )
  endif( )
else( )
  set( Boost.Bootstrap "./bootstrap.sh" )

  # .tar.bz2
  set( ext.HASH "7bcc5caace97baa948931d712ea5f37038dbb1c5d89b43ad4def4ed7cb683332" )

  if( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) )
    list( APPEND Boost.Bootstrap --with-toolset=clang )
  endif( )
endif( )

# Below is a fancy CMake command to download, build and install Boost on the users computer
# The bootstrap script runs as the update step; b2 then builds `stage` in the
# source tree. INSTALL_COMMAND is empty on purpose: installing is a separate,
# explicit action of the including project.
ExternalProject_Add(
  boost
  PREFIX ${CMAKE_BINARY_DIR}/boost
  URL ${ext.Boost_URL}
  URL_HASH SHA256=${ext.HASH}
  UPDATE_COMMAND ${Boost.Bootstrap}
  LOG_UPDATE 1
  CONFIGURE_COMMAND ""
  BUILD_COMMAND ${Boost.Command} stage
  BUILD_IN_SOURCE 1
  LOG_BUILD 1
  INSTALL_COMMAND ""
)

set_property( TARGET boost PROPERTY FOLDER "extern" )
ExternalProject_Get_Property( boost install_dir )
ExternalProject_Get_Property( boost binary_dir )

# For use by the user of external-boost.cmake
set( BOOST_INSTALL_ROOT ${install_dir} )
set( BOOST_BINARY_ROOT ${binary_dir} )
hipFFT-rocm-7.1.0/deps/external-gtest.cmake000066400000000000000000000115061506642153200205070ustar00rootroot00000000000000# #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

# Declares the `googletest` ExternalProject: clones the configured git tag,
# configures it with the forwarded command-line arguments and builds it.
# INSTALL_COMMAND is empty; installation is driven by the including project.
#
# Outputs for the including project:
#   GTEST_INSTALL_ROOT - ExternalProject install_dir of the googletest target
#   GTEST_BINARY_ROOT  - ExternalProject binary_dir of the googletest target
#
# <BINARY_DIR> and <SOURCE_DIR> below are ExternalProject placeholders that
# are substituted with the project's build and source directories.

message( STATUS "Configuring gtest external dependency" )
include( ExternalProject )

# set( gtest_cmake_args -DCMAKE_INSTALL_PREFIX=<INSTALL_DIR>/package )
set( PREFIX_GTEST ${CMAKE_INSTALL_PREFIX} CACHE PATH "Location where gtest should install, defaults to /usr/local" )
set( gtest_cmake_args -DCMAKE_INSTALL_PREFIX=${PREFIX_GTEST} )

# Forward the options the user passed on the cmake command line
# (helper provided by the get-cli-arguments module included by the caller)
append_cmake_cli_arguments( gtest_cmake_args gtest_cmake_args )

set( gtest_git_repository "https://github.com/google/googletest.git" CACHE STRING "URL to download gtest from" )
set( gtest_git_tag "release-1.8.0" CACHE STRING "Git tag of gtest to check out" )

if( MSVC )
  list( APPEND gtest_cmake_args -Dgtest_force_shared_crt=ON -DCMAKE_DEBUG_POSTFIX=d )
# else( )
  # GTEST_USE_OWN_TR1_TUPLE necessary to compile with hipcc
  # list( APPEND gtest_cmake_args -DGTEST_USE_OWN_TR1_TUPLE=1 )
endif( )

if( CMAKE_CONFIGURATION_TYPES )
  # Multi-config generators: build both configurations of the gtest tree.
  # `cmake --build` requires the build directory as its first argument.
  set( gtest_make
    COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --config Release
    COMMAND ${CMAKE_COMMAND} --build <BINARY_DIR> --config Debug
  )
else( )
  # Add build thread in addition to the number of cores that we have
  include( ProcessorCount )
  ProcessorCount( Cores )

  # If we are not using an IDE, assume nmake with visual studio
  if( MSVC )
    set( gtest_make "nmake" )
  else( )
    set( gtest_make "make" )

    # The -j paramter does not work with nmake
    if( NOT Cores EQUAL 0 )
      math( EXPR Cores "${Cores} + 1 " )
      list( APPEND gtest_make -j ${Cores} )
    else( )
      # If we could not detect # of cores, assume 1 core and add an additional build thread
      list( APPEND gtest_make -j 2 )
    endif( )
  endif( )

  message( STATUS "ExternalGmock using ( " ${Cores} " ) cores to build with" )
endif( )

# message( STATUS "gtest_make ( " ${gtest_make} " ) " )
# message( STATUS "gtest_cmake_args ( " ${gtest_cmake_args} " ) " )

# Master branch has a new structure that combines googletest with googlemock
ExternalProject_Add(
  googletest
  PREFIX ${CMAKE_BINARY_DIR}/gtest
  GIT_REPOSITORY ${gtest_git_repository}
  GIT_TAG ${gtest_git_tag}
  CMAKE_ARGS ${gtest_cmake_args}
  BUILD_COMMAND ${gtest_make}
  LOG_BUILD 1
  INSTALL_COMMAND ""
  LOG_INSTALL 1
)

ExternalProject_Get_Property( googletest source_dir )

# For visual studio, the path 'debug' is hardcoded because that is the default VS configuration for a build.
# Doesn't matter if its the gtest or gtestd project above
set( package_dir "${PREFIX_GTEST}" )
if( CMAKE_CONFIGURATION_TYPES )
  # Create a package by bundling libraries and header files
  if( BUILD_64 )
    set( LIB_DIR lib64 )
  else( )
    set( LIB_DIR lib )
  endif( )

  # Per-configuration library output directories live under the build tree
  set( gtest_lib_dir "<BINARY_DIR>/${LIB_DIR}" )

  # Each library directory is copied once; the previous version listed the
  # Debug and Release copies twice with identical arguments.
  # NOTE(review): the header paths are relative to the gtest source tree;
  # confirm they match the directory layout of the configured gtest_git_tag.
  ExternalProject_Add_Step( googletest createPackage
    COMMAND ${CMAKE_COMMAND} -E copy_directory ${gtest_lib_dir}/Debug ${package_dir}/${LIB_DIR}
    COMMAND ${CMAKE_COMMAND} -E copy_directory ${gtest_lib_dir}/Release ${package_dir}/${LIB_DIR}
    COMMAND ${CMAKE_COMMAND} -E copy_directory <SOURCE_DIR>/include ${package_dir}/include
    COMMAND ${CMAKE_COMMAND} -E copy_directory <SOURCE_DIR>/gtest/include/gtest ${package_dir}/include/gtest
    DEPENDEES install
  )
endif( )

set_property( TARGET googletest PROPERTY FOLDER "extern")
ExternalProject_Get_Property( googletest install_dir )
ExternalProject_Get_Property( googletest binary_dir )

# For use by the user of external-gtest.cmake
set( GTEST_INSTALL_ROOT ${install_dir} )
set( GTEST_BINARY_ROOT ${binary_dir} )
hipFFT-rocm-7.1.0/docs/000077500000000000000000000000001506642153200145315ustar00rootroot00000000000000hipFFT-rocm-7.1.0/docs/.gitignore000066400000000000000000000000621506642153200165170ustar00rootroot00000000000000_doxygen/ doxygen/html/ doxygen/rtf/ doxygen/xml/ hipFFT-rocm-7.1.0/docs/conceptual/000077500000000000000000000000001506642153200166665ustar00rootroot00000000000000hipFFT-rocm-7.1.0/docs/conceptual/overview.rst000066400000000000000000000533761506642153200213040ustar00rootroot00000000000000.. 
meta:: :description: hipFFT documentation and API reference library :keywords: FFT, hipFFT, rocFFT, ROCm, API, documentation .. _hipfft-overview: ******************************************************************** hipFFT overview ******************************************************************** hipFFT is a GPU FFT marshalling library that supports either :doc:`rocFFT <rocfft:index>` or NVIDIA CUDA `cuFFT`_ as the backend. hipFFT exports an interface that does not require the client to change, regardless of the chosen backend. It sits between the application and the backend FFT library, marshalling inputs into the backend and results back to the application. ===================== Basic hipFFT usage ===================== To use hipFFT, follow this step-by-step process: #. Create a transform plan for the FFT. To create a plan, use the functions :cpp:func:`hipfftPlan1d`, :cpp:func:`hipfftPlan2d`, or :cpp:func:`hipfftPlan3d`, depending on the dimensions of the FFT. For a 1D FFT, use the following code: .. code-block:: cpp hipfftHandle plan; hipfftPlan1d(&plan, N, HIPFFT_C2C, 1); For higher-dimension plans, use :cpp:func:`hipfftPlan2d` or :cpp:func:`hipfftPlan3d`. #. Allocate a work buffer (optional) hipFFT generally handles memory allocation internally, so work buffers aren't explicitly required. However, to manually manage memory, you can still allocate buffers before execution. You might want to do this, for example, if you have multiple plans that need work buffers and you want them to share a single buffer. Otherwise, each plan will allocate its own work memory, which might be wasteful. #. Execute the plan To execute the FFT computation, use :cpp:func:`hipfftExecC2C`, :cpp:func:`hipfftExecR2C`, or :cpp:func:`hipfftExecC2R`, depending on the type of transform. You can reuse the same plan for multiple executions, changing the data pointers as necessary. .. code-block:: cpp hipfftExecC2C(plan, x, x, HIPFFT_FORWARD); #.
Destroy the plan After you are done with the plan, destroy it to free the associated resources: .. code-block:: cpp hipfftDestroy(plan); #. Free any device memory (if applicable) If you allocated any buffers for storing input/output data or intermediate results, free them using ``hipFree``: .. code-block:: cpp hipFree(x); #. Terminate the library No specific cleanup function is required for hipFFT, but ensure that any HIP memory is freed and the HIP runtime is cleaned up appropriately after all computations are done. The following code sample illustrates how to apply these steps: .. code-block:: cpp #include <iostream> #include <vector> #include "hip/hip_runtime_api.h" #include "hip/hip_vector_types.h" #include "hipfft/hipfft.h" int main() { hipfftHandle plan; size_t N = 16; size_t Nbytes = N * sizeof(hipfftComplex); // Create HIP device buffer hipfftComplex *x; hipMalloc(&x, Nbytes); // Initialize data std::vector<hipfftComplex> cx(N); for (size_t i = 0; i < N; i++) { cx[i].x = 1; cx[i].y = -1; } // Copy data to device hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice); // Create hipFFT plan hipfftPlan1d(&plan, N, HIPFFT_C2C, 1); // Execute plan hipfftExecC2C(plan, x, x, HIPFFT_FORWARD); // Wait for execution to finish hipDeviceSynchronize(); // Copy result back to host std::vector<hipfftComplex> y(N); hipMemcpy(y.data(), x, Nbytes, hipMemcpyDeviceToHost); // Print results for (size_t i = 0; i < N; i++) { std::cout << y[i].x << ", " << y[i].y << std::endl; } // Free device buffer hipFree(x); // Destroy plan hipfftDestroy(plan); return 0; } ======================== Advanced hipFFT usage ======================== For transforms that require advanced input layouts, use the :cpp:func:`hipfftPlanMany` function, setting these parameters: * ``int rank``: The number of dimensions for the FFT (1D, 2D, or 3D). * ``int* n``: Array specifying the size of the FFT in each dimension. * ``int* inembed``: The dimensions of the input data layout in memory. * ``int istride``: Stride between elements in the input array.
* ``int idist``: Distance between consecutive FFTs in the input array. * ``int* onembed``: The dimensions of the output data layout in memory. * ``int ostride``: Stride between elements in the output array. * ``int odist``: Distance between consecutive FFTs in the output array. * ``hipfftType type``: Type of FFT (for example, ``HIPFFT_C2C`` or ``HIPFFT_R2C``). * ``int batch``: Number of FFTs to compute in parallel. Here's an example of a 2D single-precision real-to-complex transform using the hipFFT advanced interface: .. code-block:: cpp #include <complex> #include <iostream> #include <vector> #include <hip/hip_runtime_api.h> #include <hipfft/hipfft.h> int main() { // Define the parameters for the 2D FFT int rank = 2; // Rank of the transform (2D FFT) int n[2] = {4, 5}; // Dimensions of the FFT (4 rows, 5 columns) int howmany = 3; // Number of transforms to compute (batch size) // Derived parameters for handling real-to-complex output int n1_complex_elements = n[1] / 2 + 1; // Number of complex elements in the last dimension int n1_padding_real_elements = n1_complex_elements * 2; // Adjusted real elements to account for padding // Strides and distances int istride = 1; // Stride between elements in input int ostride = istride; // Stride between elements in output int inembed[2] = {istride * n[0], istride * n1_padding_real_elements}; // Input layout int onembed[2] = {ostride * n[0], ostride * n1_complex_elements}; // Output layout int idist = inembed[0] * inembed[1]; // Distance between batches in input int odist = onembed[0] * onembed[1]; // Distance between batches in output // Print the layout parameters std::cout << "n: " << n[0] << " " << n[1] << "\n" << "howmany: " << howmany << "\n" << "istride: " << istride << "\tostride: " << ostride << "\n" << "inembed: " << inembed[0] << " " << inembed[1] << "\n" << "onembed: " << onembed[0] << " " << onembed[1] << "\n" << "idist: " << idist << "\todist: " << odist << "\n" << std::endl; // Initialize input data std::vector<float> data(howmany * idist); // Allocate space for batched input const auto
total_bytes = data.size() * sizeof(decltype(data)::value_type); std::cout << "input:\n"; std::fill(data.begin(), data.end(), 0.0); // Fill data with zeros for(int ibatch = 0; ibatch < howmany; ++ibatch) { for(int i = 0; i < n[0]; i++) // Loop over rows { for(int j = 0; j < n[1]; j++) // Loop over columns { // Calculate the position in the input array const auto pos = ibatch * idist + istride * (i * inembed[1] + j); data[pos] = i + ibatch + j; // Populate data with unique values for clarity } } } // Print the input data for each batch for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < inembed[0]; i++) { for(int j = 0; j < inembed[1]; j++) { const auto pos = ibatch * idist + i * inembed[1] + j; std::cout << data[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; // Create the hipFFT plan for batched 2D real-to-complex transforms hipfftHandle hipForwardPlan; hipfftResult hipfft_rt = hipfftPlanMany(&hipForwardPlan, rank, n, inembed, istride, idist, onembed, ostride, odist, HIPFFT_R2C, // Transform type (real-to-complex) howmany); // Number of transforms in the batch if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); // Allocate GPU memory for input and output hipfftReal* gpu_data; hipError_t hip_rt = hipMalloc((void**)&gpu_data, total_bytes); if(hip_rt != hipSuccess) throw std::runtime_error("hipMalloc failed"); // Copy input data to the GPU hip_rt = hipMemcpy(gpu_data, (void*)data.data(), total_bytes, hipMemcpyHostToDevice); if(hip_rt != hipSuccess) throw std::runtime_error("hipMemcpy failed"); // Execute the FFT on the GPU hipfft_rt = hipfftExecR2C(hipForwardPlan, gpu_data, (hipfftComplex*)gpu_data); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("failed to execute plan"); // Copy the output data back to the host hip_rt = hipMemcpy((void*)data.data(), gpu_data, total_bytes, hipMemcpyDeviceToHost); if(hip_rt != hipSuccess) throw 
std::runtime_error("hipMemcpy failed"); // Display the output data std::cout << "output:\n"; const std::complex<float>* output = (std::complex<float>*)data.data(); for(int ibatch = 0; ibatch < howmany; ++ibatch) { std::cout << "batch: " << ibatch << "\n"; for(int i = 0; i < onembed[0]; i++) { for(int j = 0; j < onembed[1]; j++) { const auto pos = ibatch * odist + i * onembed[1] + j; std::cout << output[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } std::cout << std::endl; // Clean up resources hipfftDestroy(hipForwardPlan); // Destroy the FFT plan hip_rt = hipFree(gpu_data); // Free the GPU memory if(hip_rt != hipSuccess) throw std::runtime_error("hipFree failed"); return 0; } ====================== Overlapping input data ====================== There are signal processing tasks, such as sliding window FFTs, where overlapping data can improve computational efficiency. Care must be taken to ensure proper memory management and alignment when using overlapping input layouts. The following example demonstrates the use of overlapping input data by configuring the ``inembed``, ``istride``, and ``idist`` parameters in the :cpp:func:`hipfftPlanMany` function. Set these parameters to create a memory layout where portions of the input data are reused across multiple FFT batches: * ``inembed`` specifies the physical layout of the input data in memory, with extra padding to accommodate overlapping rows (for example, ``2240``). * ``istride`` ensures continuous reading of data within each row (if set to ``1``). * ``idist`` defines the distance between the starting points of consecutive batches (for example, ``432``), which is smaller than the total memory implied by ``xformSz`` and ``inembed``. ..
code-block:: cpp #include <complex> #include <cstdlib> #include <iostream> #include <vector> #include <hip/hip_runtime_api.h> #include <hipfft/hipfft.h> int main() { std::cout << "hipFFT 2D batched complex-to-complex transform example\n"; // FFT configuration int rank = 2; int xformSz[2] = {512, 512}; // 2D FFT size: 512x512 int inEmbed[2] = {1, 2240}; // Input data layout int onEmbed[2] = {1, 512}; // Output data layout int istride = 1, ostride = 1; // Stride for input and output int idist = 432, odist = 262144; // Batch distance for input and output int batch = 5; // Number of FFTs to compute in parallel // Calculate input and output sizes in bytes size_t inSize = idist * batch * sizeof(std::complex<float>); size_t outSize = odist * batch * sizeof(std::complex<float>); // Initialize HIP and hipFFT resources hipSetDevice(0); hipStream_t stream; hipStreamCreateWithFlags(&stream, hipStreamNonBlocking); hipfftHandle handleF; if (hipfftPlanMany(&handleF, rank, xformSz, inEmbed, istride, idist, onEmbed, ostride, odist, HIPFFT_C2C, batch) != HIPFFT_SUCCESS) { std::cerr << "Failed to create hipFFT plan" << std::endl; return EXIT_FAILURE; } hipfftSetStream(handleF, stream); // Allocate device memory std::complex<float>* miTD; // Input buffer std::complex<float>* miFD; // Output buffer if (hipMalloc(&miTD, inSize) != hipSuccess || hipMalloc(&miFD, outSize) != hipSuccess) { std::cerr << "hipMalloc failed" << std::endl; return EXIT_FAILURE; } // Initialize input data on the host std::vector<std::complex<float>> inputData(idist * batch, {0.0f, 0.0f}); for (int ibatch = 0; ibatch < batch; ++ibatch) { for (int i = 0; i < xformSz[0]; ++i) { for (int j = 0; j < xformSz[1]; ++j) { int pos = ibatch * idist + i * inEmbed[1] + j; inputData[pos] = std::complex<float>(i + j, ibatch); } } } // Copy input data to device if (hipMemcpy(miTD, inputData.data(), inSize, hipMemcpyHostToDevice) != hipSuccess) { std::cerr << "hipMemcpy failed" << std::endl; return EXIT_FAILURE; } // Execute FFT if (hipfftExecC2C(handleF, reinterpret_cast<hipfftComplex*>(miTD), reinterpret_cast<hipfftComplex*>(miFD), HIPFFT_FORWARD) != HIPFFT_SUCCESS) { std::cerr <<
"Failed to execute hipFFT" << std::endl; return EXIT_FAILURE; } // Synchronize stream hipStreamSynchronize(stream); // Copy results back to host std::vector<std::complex<float>> outputData(odist * batch); if (hipMemcpy(outputData.data(), miFD, outSize, hipMemcpyDeviceToHost) != hipSuccess) { std::cerr << "hipMemcpy failed" << std::endl; return EXIT_FAILURE; } // Display results std::cout << "Output data:\n"; for (int ibatch = 0; ibatch < batch; ++ibatch) { std::cout << "Batch " << ibatch << ":\n"; for (int i = 0; i < xformSz[0]; ++i) { for (int j = 0; j < xformSz[1] / 2 + 1; ++j) { int pos = ibatch * odist + i * onEmbed[1] + j; std::cout << outputData[pos] << " "; } std::cout << "\n"; } std::cout << "\n"; } // Clean up resources hipfftDestroy(handleF); hipStreamDestroy(stream); hipFree(miTD); hipFree(miFD); return EXIT_SUCCESS; } ================= Multi-GPU example ================= The following example demonstrates a multi-GPU 2D double-precision complex-to-complex transform using the hipFFT library. It showcases how to perform a 2D Fast Fourier Transform (FFT) in double precision (complex-to-complex) across two GPUs. The following concepts and API calls are used: * ``hipfftXt``: This API lets users execute FFTs across multiple GPUs by managing multi-GPU plans. ``hipfftXt`` provides an extended version of the hipFFT API to handle GPU-specific operations, such as memory allocation and execution across multiple devices. For more details, see the :doc:`API reference <../reference/fft-api-usage>`. * :cpp:func:`hipfftCreate`: Creates a hipFFT plan that contains the FFT configuration. This plan is used to configure the FFT transform operation. * ``hipStreamCreate``: Creates a stream for managing GPU work concurrently. This enables execution of the multi-GPU plan in parallel on multiple GPUs. For more details, see :doc:`HIP <hip:index>`. * :cpp:func:`hipfftXtSetGPUs`: Assigns the GPUs (in this case, two GPUs) to the hipFFT plan, enabling multi-GPU computation for the FFT.
* :cpp:func:`hipfftMakePlan2d`: Creates a 2D FFT plan for the specified input/output size (``Nx``, ``Ny``), specifying the transform type (complex-to-complex in this case). * :cpp:func:`hipfftXtMalloc`: Allocates memory on the GPUs for storing the FFT input and output data. * :cpp:func:`hipfftXtMemcpy`: Copies data between the host and GPU memory, supporting both host-to-device and device-to-host operations. * :cpp:func:`hipfftXtExecDescriptor`: Executes the FFT operation based on the input descriptor ``desc``, which holds the input data and transform configuration. * :cpp:func:`hipfftXtFree`: Frees the memory allocated for the input/output descriptors after the computation is completed. For detailed API usage, see :ref:`hipfft-api-usage`. .. code-block:: cpp #include #include #include #include #include #include "../hipfft_params.h" int main() { // Define FFT dimensions const int Nx = 512; const int Ny = 512; int direction = HIPFFT_FORWARD; // forward = -1, backward = 1 // Initialize input data (complex numbers) for FFT computation int verbose = 0; std::vector> cinput(Nx * Ny); for(size_t i = 0; i < Nx * Ny; i++) { cinput[i] = i; // Initialize the data with some values } // Optionally, print the input data if(verbose) { std::cout << "Input:\n"; for(int i = 0; i < Nx; i++) { for(int j = 0; j < Ny; j++) { int pos = i * Ny + j; std::cout << cinput[pos] << " "; } std::cout << "\n"; } std::cout << std::endl; } // Specify the GPUs you want to use for multi-GPU setup std::array gpus = {0, 1}; // Use GPU 0 and GPU 1 // Create a multi-GPU plan hipLibXtDesc* desc; // Input descriptor for the Xt format hipfftHandle plan = hipfft_params::INVALID_PLAN_HANDLE; // Initialize plan handle // Create the FFT plan if(hipfftCreate(&plan) != HIPFFT_SUCCESS) throw std::runtime_error("failed to create plan"); // Create a GPU stream and assign it to the plan for asynchronous operations hipStream_t stream{}; if(hipStreamCreate(&stream) != hipSuccess) throw 
std::runtime_error("hipStreamCreate failed."); if(hipfftSetStream(plan, stream) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftSetStream failed."); // Assign GPUs to the plan (this is where multi-GPU is specified) hipfftResult hipfft_rt = hipfftXtSetGPUs(plan, gpus.size(), gpus.data()); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtSetGPUs failed."); // Make the 2D plan for FFT (this defines the 2D FFT using the specified dimensions) size_t workSize[gpus.size()]; hipfft_rt = hipfftMakePlan2d(plan, Nx, Ny, HIPFFT_Z2Z, workSize); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftMakePlan2d failed."); // Allocate memory for input data on the GPUs (Xt format handles the data distribution) hipfftXtSubFormat_t format = HIPFFT_XT_FORMAT_INPLACE_SHUFFLED; hipfft_rt = hipfftXtMalloc(plan, &desc, format); // Allocate memory for the descriptor if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMalloc failed."); // Copy the input data to the GPUs (device memory) hipfft_rt = hipfftXtMemcpy(plan, reinterpret_cast(desc), reinterpret_cast(cinput.data()), HIPFFT_COPY_HOST_TO_DEVICE); // Copy from host to device if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy failed."); // Execute the FFT computation using the Xt descriptor hipfft_rt = hipfftXtExecDescriptor(plan, desc, desc, direction); if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtExecDescriptor failed."); // Optionally, print the output data (copy the results back to the host) if(verbose) { // Copy the output data back to the host hipfft_rt = hipfftXtMemcpy(plan, reinterpret_cast(cinput.data()), reinterpret_cast(desc), HIPFFT_COPY_DEVICE_TO_HOST); // Copy from device to host if(hipfft_rt != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtMemcpy D2H failed."); std::cout << "Output:\n"; for(size_t i = 0; i < Nx; i++) { for(size_t j = 0; j < Ny; j++) { auto pos = i * Ny + j; std::cout << cinput[pos] << " "; // Print the output FFT 
results } std::cout << "\n"; } std::cout << std::endl; } // Clean up memory and resources if(hipfftXtFree(desc) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftXtFree failed."); if(hipfftDestroy(plan) != HIPFFT_SUCCESS) throw std::runtime_error("hipfftDestroy failed."); if(hipStreamDestroy(stream) != hipSuccess) throw std::runtime_error("hipStreamDestroy failed."); return 0; } .. _rocFFT: https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html .. _cuFFT: https://developer.nvidia.com/cufft hipFFT-rocm-7.1.0/docs/conf.py000066400000000000000000000021401506642153200160250ustar00rootroot00000000000000# Configuration file for the Sphinx documentation builder. # # This file only contains a selection of the most common options. For a full # list see the documentation: # https://www.sphinx-doc.org/en/master/usage/configuration.html import re from rocm_docs import ROCmDocs with open('../CMakeLists.txt', encoding='utf-8') as f: match = re.search(r'set\( VERSION_STRING \"?([0-9.]+)[^0-9.]+', f.read()) if not match: raise ValueError("VERSION not found!") version_number = match[1] left_nav_title = f"hipFFT {version_number} Documentation" # for PDF output on Read the Docs project = "hipFFT Documentation" author = "Advanced Micro Devices, Inc." copyright = "Copyright (c) 2025 Advanced Micro Devices, Inc. All rights reserved." 
version = version_number release = version_number external_toc_path = "./sphinx/_toc.yml" docs_core = ROCmDocs(left_nav_title) docs_core.run_doxygen(doxygen_root="doxygen", doxygen_path="doxygen/xml") docs_core.setup() external_projects_current_project = "hipfft" for sphinx_var in ROCmDocs.SPHINX_VARS: globals()[sphinx_var] = getattr(docs_core, sphinx_var) hipFFT-rocm-7.1.0/docs/doxygen/000077500000000000000000000000001506642153200162065ustar00rootroot00000000000000hipFFT-rocm-7.1.0/docs/doxygen/Doxyfile000066400000000000000000003221051506642153200177170ustar00rootroot00000000000000# Doxyfile 1.8.10 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. See http://www.gnu.org/software/libiconv # for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. 
This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = "hipFFT" # The PROJECT_NUMBER tag can be used to enter a project or revision number. This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = v1.0.21 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give the viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HIP" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = . # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise cause # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044.
# The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written. Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. 
If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = ../../library/include/hipfft # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. 
If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES.
INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO. SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. 
# The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the later case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibilities issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. 
Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.) but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match functions declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO.
DISTRIBUTE_GROUP_DOC = YES # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group. By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. 
This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. TYPEDEF_HIDES_STRUCT = YES # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- SHOW_NAMESPACES = NO # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = YES # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. 
EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. 
If set to NO, these declarations will be # included in the documentation. # The default value is: NO. HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. 
SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. 
If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. 
GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if ... \endif and \cond # ... \endcond blocks. ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. 
Whatever the program writes to standard output is used as the file # version. For an example see the documentation. FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. 
# The default value is: YES. WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. # The default value is: NO. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = # If the WARN_AS_ERROR tag is set to YES then doxygen will immediately stop when # a warning is encountered. If the WARN_AS_ERROR tag is set to FAIL_ON_WARNINGS # then doxygen will continue running as if WARN_AS_ERROR tag is set to NO, but # at the end of the doxygen process doxygen will return with a non-zero status. 
# Possible values are: NO, YES and FAIL_ON_WARNINGS. # The default value is: NO. WARN_AS_ERROR = YES #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../../library/include/hipfft # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, # *.vhdl, *.ucf, *.qsf, *.as and *.js. 
FILE_PATTERNS = *.c \ *.cc \ *.cxx \ *.cpp \ *.c++ \ *.java \ *.ii \ *.ixx \ *.ipp \ *.i++ \ *.inl \ *.idl \ *.ddl \ *.odl \ *.h \ *.hh \ *.hxx \ *.hpp \ *.h++ \ *.cs \ *.d \ *.php \ *.php4 \ *.php5 \ *.phtml \ *.inc \ *.m \ *.markdown \ *.md \ *.mm \ *.dox \ *.py \ *.f90 \ *.f \ *.for \ *.tcl \ *.vhd \ *.vhdl \ *.ucf \ *.qsf \ *.as \ *.js # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. 
Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match.
The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern. A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = ../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. 
INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. 
# # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES. VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # compiled with the --with-libclang option. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES.
CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. 
HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML, the header file must include any scripts and style sheets # that doxygen needs, which depend on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses.
# Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # http://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 # purple, and 360 is red again. 
# Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated. Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on.
Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries to 1 will produce a fully collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a fully expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: http://developer.apple.com/tools/xcode/), introduced with # OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES.
DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). 
If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. 
For more information please see Qt Help Project / Namespace # (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. 
QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). 
For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 1 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. 
# Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes take effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # http://www.mathjax.org) which uses client-side JavaScript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want the formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = YES # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/latest/output.html) for more details. # Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax.
The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. # The default value is: http://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use <access key> + S # (what the <access key> is depends on the OS and browser, but it is typically # <CTRL>, <ALT>/<option>