pax_global_header 0000666 0000000 0000000 00000000064 15066521634 0014522 g ustar 00root root 0000000 0000000 52 comment=fece8692bafd8117e2ad50036ca646ba91c5d3ba
rocFFT-rocm-7.1.0/ 0000775 0000000 0000000 00000000000 15066521634 0013610 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/.azuredevops/ 0000775 0000000 0000000 00000000000 15066521634 0016235 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/.azuredevops/rocm-ci.yml 0000664 0000000 0000000 00000001240 15066521634 0020306 0 ustar 00root root 0000000 0000000 resources:
repositories:
- repository: pipelines_repo
type: github
endpoint: ROCm
name: ROCm/ROCm
variables:
- group: common
- template: /.azuredevops/variables-global.yml@pipelines_repo
trigger:
batch: true
branches:
include:
- develop
- mainline
paths:
exclude:
- .githooks
- .github
- .jenkins
- docs
- '.*.y*ml'
- '*.md'
pr:
autoCancel: true
branches:
include:
- develop
- mainline
paths:
exclude:
- .githooks
- .github
- .jenkins
- docs
- '.*.y*ml'
- '*.md'
drafts: false
jobs:
- template: ${{ variables.CI_COMPONENT_PATH }}/rocFFT.yml@pipelines_repo
rocFFT-rocm-7.1.0/.clang-format 0000664 0000000 0000000 00000006542 15066521634 0016172 0 ustar 00root root 0000000 0000000 # Style file for MLSE Libraries based on the modified rocBLAS style
# Common settings
BasedOnStyle: WebKit
TabWidth: 4
IndentWidth: 4
UseTab: Never
ColumnLimit: 100
# Other languages JavaScript, Proto
---
Language: Cpp
# http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code
# int formatted_code;
# // clang-format off
# void unformatted_code ;
# // clang-format on
# void formatted_code_again;
DisableFormat: false
Standard: Cpp11
AccessModifierOffset: -4
AlignAfterOpenBracket: Align
AlignConsecutiveAssignments: true
AlignConsecutiveDeclarations: true
AlignEscapedNewlines: Left
AlignOperands: true
AlignTrailingComments: false
AllowAllArgumentsOnNextLine: true
AllowAllConstructorInitializersOnNextLine: true
AllowAllParametersOfDeclarationOnNextLine: true
AllowShortBlocksOnASingleLine: false
AllowShortCaseLabelsOnASingleLine: false
AllowShortFunctionsOnASingleLine: Empty
AllowShortIfStatementsOnASingleLine: false
AllowShortLoopsOnASingleLine: false
AlwaysBreakAfterDefinitionReturnType: false
AlwaysBreakAfterReturnType: None
AlwaysBreakBeforeMultilineStrings: false
AlwaysBreakTemplateDeclarations: true
BinPackArguments: false
BinPackParameters: false
# Configure each individual brace in BraceWrapping
BreakBeforeBraces: Custom
# Control of individual brace wrapping cases
BraceWrapping: {
AfterCaseLabel: 'true'
AfterClass: 'true'
AfterControlStatement: 'true'
AfterEnum : 'true'
AfterFunction : 'true'
AfterNamespace : 'true'
AfterStruct : 'true'
AfterUnion : 'true'
BeforeCatch : 'true'
BeforeElse : 'true'
IndentBraces : 'false'
# AfterExternBlock : 'true'
}
#BreakAfterJavaFieldAnnotations: true
#BreakBeforeInheritanceComma: false
#BreakBeforeBinaryOperators: None
#BreakBeforeTernaryOperators: true
#BreakConstructorInitializersBeforeComma: true
#BreakStringLiterals: true
CommentPragmas: '^ IWYU pragma:'
#CompactNamespaces: false
ConstructorInitializerAllOnOneLineOrOnePerLine: false
ConstructorInitializerIndentWidth: 4
ContinuationIndentWidth: 4
Cpp11BracedListStyle: true
SpaceBeforeCpp11BracedList: false
DerivePointerAlignment: false
ExperimentalAutoDetectBinPacking: false
ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ]
IndentCaseLabels: false
IndentPPDirectives: None
#FixNamespaceComments: true
IndentWrappedFunctionNames: true
KeepEmptyLinesAtTheStartOfBlocks: true
MacroBlockBegin: ''
MacroBlockEnd: ''
#JavaScriptQuotes: Double
MaxEmptyLinesToKeep: 1
NamespaceIndentation: All
ObjCBlockIndentWidth: 4
#ObjCSpaceAfterProperty: true
#ObjCSpaceBeforeProtocolList: true
PenaltyBreakBeforeFirstCallParameter: 19
PenaltyBreakComment: 300
PenaltyBreakFirstLessLess: 120
PenaltyBreakString: 1000
PenaltyExcessCharacter: 1000000
PenaltyReturnTypeOnItsOwnLine: 60
PointerAlignment: Left
SpaceAfterCStyleCast: false
SpaceBeforeAssignmentOperators: true
SpaceBeforeParens: Never
SpaceInEmptyBlock: false
SpaceInEmptyParentheses: false
SpacesBeforeTrailingComments: 1
SpacesInAngles: false
SpacesInContainerLiterals: true
SpacesInCStyleCastParentheses: false
SpacesInParentheses: false
SpacesInSquareBrackets: false
#SpaceAfterTemplateKeyword: true
#SpaceBeforeInheritanceColon: true
#SortUsingDeclarations: true
SortIncludes: true
# Comments are for developers, they should arrange them
ReflowComments: false
#IncludeBlocks: Preserve
---
rocFFT-rocm-7.1.0/.githooks/ 0000775 0000000 0000000 00000000000 15066521634 0015515 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/.githooks/install 0000775 0000000 0000000 00000000222 15066521634 0017105 0 ustar 00root root 0000000 0000000 #!/usr/bin/env bash
# Install the repository's pre-commit hook as a symlink inside the local
# git hooks directory.
#
# Every step is checked: if the git directory cannot be found or entered,
# the script stops instead of creating the symlink in the wrong directory.

# Resolve the git directory first so a failure of git itself is detected.
git_dir=$(git rev-parse --git-dir) || exit 1

# Quote the path: the repository may live in a directory containing spaces.
cd "$git_dir" || exit 1

# The hooks directory can be absent (e.g. repositories created from a
# template without hooks), so create it when needed.
mkdir -p hooks || exit 1
cd hooks || exit 1

echo "Installing hooks..."
# The link target is relative to <git-dir>/hooks, i.e. it points at
# <worktree>/.githooks/pre-commit for a standard repository layout.
ln -s ../../.githooks/pre-commit pre-commit || exit 1
echo "Done!"
rocFFT-rocm-7.1.0/.githooks/pre-commit 0000775 0000000 0000000 00000001767 15066521634 0017532 0 ustar 00root root 0000000 0000000 #!/bin/sh
#
# This pre-commit hook checks whether clang-format is installed at the
# expected ROCm location, and if so, uses it to format (in place) the
# C/C++/OpenCL source files that are staged for commit.

base=/opt/rocm/llvm/bin/clang-format
format=""

# Redirect output to stderr.
exec 1>&2

# check if clang-format is installed
type "$base" >/dev/null 2>&1 && format="$base"

# clang-format is not installed: skip the hook without blocking the commit
if [ -z "$format" ]
then
    echo "$base is not installed. Pre-commit hook will not be executed."
    exit 0
fi

# Do everything from the top-level directory: the paths printed by
# git diff-index below are relative to it. Stop if it cannot be resolved
# or entered rather than operating relative to the wrong directory.
toplevel=$(git rev-parse --show-toplevel) || exit 1
cd "$toplevel" || exit 1

if git rev-parse --verify HEAD >/dev/null 2>&1
then
    against=HEAD
else
    # Initial commit: diff against an empty tree object
    against=4b825dc642cb6eb9a060e54bf8d69288fbee4904
fi

# do the formatting
# Read one path per line instead of word-splitting a command substitution,
# so staged file names that contain spaces or glob characters are handled.
git diff-index --cached --name-only "$against" |
    grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$' |
    while IFS= read -r file
    do
        if [ -e "$file" ]
        then
            echo "$format $file"
            # stdin is detached so the formatter cannot consume the file list
            "$format" -i -style=file "$file" </dev/null
        fi
    done
rocFFT-rocm-7.1.0/.github/ 0000775 0000000 0000000 00000000000 15066521634 0015150 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/.github/CODEOWNERS 0000664 0000000 0000000 00000000574 15066521634 0016551 0 ustar 00root root 0000000 0000000 * @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd
# Documentation files
docs/ @ROCm/rocm-documentation
*.md @ROCm/rocm-documentation
*.rst @ROCm/rocm-documentation
.readthedocs.yaml @ROCm/rocm-documentation
# Header directory for Doxygen documentation
library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd
rocFFT-rocm-7.1.0/.github/CONTRIBUTING.md 0000664 0000000 0000000 00000014633 15066521634 0017410 0 ustar 00root root 0000000 0000000
# Contributing to rocFFT #
We welcome contributions to rocFFT. Please follow these guidelines to help ensure your contributions will be successfully accepted.
## Issue Discussion ##
Please use the GitHub Issues tab to notify us of issues.
* Use your best judgment for issue creation. If your issue is already listed, upvote the issue and
comment or post to provide additional details, such as how you reproduced this issue.
* If you're not sure if your issue is the same, err on the side of caution and file your issue.
You can add a comment to include the issue number (and link) for the similar issue. If we evaluate
your issue as being the same as the existing issue, we'll close the duplicate.
* If your issue doesn't exist, use the issue template to file a new issue.
* When filing an issue, be sure to provide as much information as possible, including script output so
we can collect information about your configuration. This helps reduce the time required to
reproduce your issue.
* Check your issue regularly, as we may require additional information to successfully reproduce the
issue.
* You may also open an issue to ask questions to the maintainers about whether a proposed change
meets the acceptance criteria, or to discuss an idea pertaining to the library.
## Acceptance Criteria ##
When a contribution is submitted via a pull request, a number of automated checks are run in order to verify compilation correctness and prevent performance regressions.
These checks include:
* Building and testing the change on various OS platforms (Ubuntu, RHEL, etc.)
* Running on different GPU architectures (MI-series, Radeon series cards, etc.)
* Running benchmarks to check for performance degradation
In order for a submission to be accepted:
* It must pass all of the automated checks
* It must undergo a code review
Users can view our continuous integration infrastructure in `rocFFT/.jenkins`.
The GitHub "Issues" tab may also be used to discuss ideas surrounding particular features or changes before raising pull requests.
## Code Structure ##
Broadly, the rocFFT library is structured as follows:
├── docs/: contains rocFFT documentation
├── library/: contains main source code and headers
├── clients/:
│ ├── bench/ : contains benchmarking code
│ ├── samples/ : contains examples
│ ├── tests/ : contains our test infrastructure
├── shared/: contains important global headers and those for linking to other applications
## Coding Style ##
* All public APIs are C89 compatible; all other library code should use c++17.
* Our minimum supported compiler is clang 3.6.
* Avoid CamelCase: rule applies specifically to publicly visible APIs, but is encouraged (not mandated) for internal code.
* C and C++ code should be formatted using `clang-format`, with the style configuration provided in `rocFFT/.clang-format`.
To format a C/C++ file, use:
```
clang-format -style=file -i <path-to-source-file>
```
* Python code should use:
```
yapf --style pep8
```
## Pull Request Guidelines ##
Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/).
This repository follows the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code.
Note that a [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user.
The following guidelines apply:
* When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch.
* Note that releases are cut to release/rocm-rel-x.y, where x and y refer to the release major and minor numbers.
* Ensure code builds successfully.
* Do not break existing test cases
* Code must also have benchmark tests, and performance must approach the compute bound limit or memory bound limit.
### Deliverables ###
New changes should include test coverage. Our testing infrastructure is located in `clients/tests/`, and can be used as a reference.
The following guidelines apply:
* New functionality will only be merged with new unit tests.
* New unit tests should integrate within the existing [googletest framework](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md).
* Tests must have good code coverage.
### Process ###
All pull requests must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged.
Once a contribution is ready to be submitted, consider the following:
* Before you create a PR, ensure that all changed files have been formatted with clang-format: `clang-format -style=file -i <path-to-source-file>`
* While creating a PR, you can take a look at a `diff` of the changes you made using the PR's "Files" tab, and verify that no unintentional changes are being submitted.
* Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table to view logs associated with a check if it fails.
* During code reviews, another developer will take a look through your proposed change. If any modifications are requested (or further discussion about anything is needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas.
* When a modification request has been completed, the conversation thread about it will be marked as resolved.
* To update the code in your PR (e.g., in response to a code review discussion), you can simply push another commit to the branch used in your pull request.
* Once your contribution is approved, we will use the *squash merge* option from GitHub to integrate it to the corresponding branch.
## Code License ##
All code contributed to this project will be licensed under the license identified in the [LICENSE.md](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md). Your contribution will be accepted under the same license.
rocFFT-rocm-7.1.0/.github/ISSUE_TEMPLATE.md 0000664 0000000 0000000 00000000461 15066521634 0017656 0 ustar 00root root 0000000 0000000 ### What is the expected behavior
-
### What actually happens
-
### How to reproduce
-
### Environment
| Hardware | description |
|-----|-----|
| GPU | device string |
| CPU | device string |
| Software | version |
|-----|-----|
| ROCK | v0.0 |
| ROCR | v0.0 |
| HCC | v0.0 |
| Library | v0.0 |
rocFFT-rocm-7.1.0/.github/PULL_REQUEST_TEMPLATE.md 0000664 0000000 0000000 00000000070 15066521634 0020746 0 ustar 00root root 0000000 0000000 resolves #___
Summary of proposed changes:
-
-
-
rocFFT-rocm-7.1.0/.github/dependabot.yml 0000664 0000000 0000000 00000001223 15066521634 0017776 0 ustar 00root root 0000000 0000000 # To get started with Dependabot version updates, you'll need to specify which
# package ecosystems to update and where the package manifests are located.
# Please see the documentation for all configuration options:
# https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates
version: 2
updates:
- package-ecosystem: "pip" # See documentation for possible values
directory: "/docs/sphinx" # Location of package manifests
open-pull-requests-limit: 10
schedule:
interval: "daily"
labels:
- "documentation"
- "dependencies"
- "ci:docs-only"
reviewers:
- "samjwu"
rocFFT-rocm-7.1.0/.gitignore 0000664 0000000 0000000 00000000554 15066521634 0015604 0 ustar 00root root 0000000 0000000 # Compiled Object files
*.slo
*.lo
*.o
*.obj
# Precompiled Headers
*.gch
*.pch
# Compiled Dynamic libraries
*.so
*.dylib
*.dll
# Fortran module files
*.mod
# Compiled Static libraries
*.lai
*.la
*.a
*.lib
# Executables
*.exe
*.out
*.app
# vim tags
tags
.tags
.*.swp
# Visual Studio Code
.vscode
# install.sh build dir
build/
# python bytecode
__pycache__
rocFFT-rocm-7.1.0/.readthedocs.yaml 0000664 0000000 0000000 00000000572 15066521634 0017043 0 ustar 00root root 0000000 0000000 # Read the Docs configuration file
# See https://docs.readthedocs.io/en/stable/config-file/v2.html for details
version: 2
sphinx:
configuration: docs/conf.py
formats: [htmlzip, pdf, epub]
python:
install:
- requirements: docs/sphinx/requirements.txt
build:
os: ubuntu-22.04
tools:
python: "mambaforge-22.9"
conda:
environment: docs/environment.yml
rocFFT-rocm-7.1.0/CHANGELOG.md 0000664 0000000 0000000 00000051446 15066521634 0015433 0 ustar 00root root 0000000 0000000 # Changelog for rocFFT
Documentation for rocFFT is available at
[https://rocm.docs.amd.com/projects/rocFFT/en/latest/](https://rocm.docs.amd.com/projects/rocFFT/en/latest/).
## rocFFT 1.0.35 for ROCm 7.1.0
### Optimized
* Implemented single-kernel plans for some 2D problem sizes, on devices with at least 160KiB of LDS.
* Improved performance of unit-strided, complex-interleaved, forward/inverse FFTs for lengths:
- (64,64,128)
- (64,64,52)
- (60,60,60)
- (32,32,128)
- (32,32,64)
- (64,32,128)
* Improved performance of 3D MPI pencil decompositions by using sub-communicators for global transpose operations.
## rocFFT 1.0.34 for ROCm 7.0.0
### Added
* Added gfx950 support.
### Removed
* Removed rocfft-rider legacy compatibility from clients
* Removed support for the gfx940 and gfx941 targets from the client programs.
### Optimized
* Removed unnecessary HIP event/stream allocation and synchronization during MPI transforms.
* Implemented single-precision 1D kernels for lengths:
- 4704
- 5488
- 6144
- 6561
- 8192
* Implemented single-kernel plans for some large 1D problem sizes, on devices with at least 160KiB of LDS.
### Resolved issues
* Fixed kernel faults on multi-device transforms that gather to a single device, when the input/output bricks are not
contiguous.
## rocFFT 1.0.32 for ROCm 6.4.0
### Changed
* Building with the address sanitizer option sets xnack+ on relevant GPU
architectures and adds address-sanitizer support to runtime-compiled
kernels.
* The `AMDGPU_TARGETS` build variable should be replaced with `GPU_TARGETS`. `AMDGPU_TARGETS` is deprecated.
### Removed
* Removed ahead-of-time compiled kernels for the gfx906, gfx940, and gfx941 architectures. These architectures still
function the same, but kernels for them are now compiled at runtime.
* Removed consumer GPU architectures from the precompiled kernel cache that ships with
rocFFT. rocFFT continues to ship with a cache of precompiled RTC kernels for data-center
and workstation architectures. As before, user-level caches can be enabled by setting the
environment variable ROCFFT_RTC_CACHE_PATH to a writeable file location.
### Optimized
* Improved MPI transform performance by using all-to-all communication for global transpose operations.
Point-to-point communications are still used when all-to-all is not possible.
* Improved the performance of unit-strided, complex interleaved, forward and inverse, length (64,64,64) FFTs.
### Resolved issues
* Fixed incorrect results from 2-kernel 3D FFT plans that used non-default output strides. For more information, see the [rocFFT GitHub issue](https://github.com/ROCm/rocFFT/issues/507).
* Plan descriptions can be reused with different strides for different plans. For more information, see the [rocFFT GitHub issue](https://github.com/ROCm/rocFFT/issues/504).
* Fixed client packages to depend on hipRAND instead of rocRAND.
* Fixed potential integer overflows during large MPI transforms.
## rocFFT 1.0.31 for ROCm 6.3.0
### Added
* rocfft-test now includes a --smoketest option.
* Support for the gfx1151, gfx1200, and gfx1201 architectures.
* Implemented experimental APIs to allow computing FFTs on data
distributed across multiple MPI ranks. These APIs can be enabled with the
`ROCFFT_MPI_ENABLE` CMake option. This option defaults to `OFF`.
When `ROCFFT_MPI_ENABLE` is set to `ON`:
* `rocfft_plan_description_set_comm` can be called to provide an
MPI communicator to a plan description, which can then be passed
to `rocfft_plan_create`. Each rank calls
`rocfft_field_add_brick` to specify the layout of data bricks on
that rank.
* An MPI library with ROCm acceleration enabled is required at
build time and at runtime.
### Changed
* Compilation uses amdclang++ instead of hipcc.
* CLI11 replaces Boost Program Options as the command line parser for clients and samples.
## rocFFT 1.0.30 for ROCm 6.2.4
### Optimizations
* Implemented 1D kernels for factorizable sizes > 1024 and < 2048.
### Fixes
* Fixed plan creation failure on some even-length real-complex transforms that use Bluestein's algorithm.
### Additions
* GFX1151 Support
## rocFFT 1.0.29 for ROCm 6.2.1
### Optimizations
* Implemented 1D kernels for factorizable sizes < 1024
## rocFFT 1.0.28 for ROCm 6.2.0
### Optimizations
* Implemented multi-device transform for 3D pencil decomposition. Contiguous dimensions on input and output bricks
are transformed locally, with global transposes to make remaining dimensions contiguous.
### Changes
* Add option in dyna-bench to load the libs in forward and then reverse order for benchmark tests.
* Randomly generated accuracy tests are now disabled by default; these can be enabled using
the --nrand option (which defaults to 0).
* Use Bonferroni multi-hypothesis testing framework by default for benchmark tests.
## rocFFT 1.0.27 for ROCm 6.1.1
### Fixes
* Fixed kernel launch failure on execute of very large odd-length real-complex transforms.
### Additions
* Enable multi-gpu testing on systems without direct GPU-interconnects
## rocFFT 1.0.26 for ROCm 6.1.0
### Changes
* Multi-device FFTs now allow batch greater than 1
* Multi-device, real-complex FFTs are now supported
* rocFFT now statically links libstdc++ when only `std::experimental::filesystem` is available (to guard
against ABI incompatibilities with newer libstdc++ libraries that include `std::filesystem`)
## rocFFT 1.0.25 for ROCm 6.0.0
### Additions
* Implemented experimental APIs to allow computing FFTs on data distributed across multiple devices
in a single process
* `rocfft_field` is a new type that can be added to a plan description to describe the layout of FFT
input or output
* `rocfft_field_add_brick` can be called to describe the brick decomposition of an FFT field, where each
brick can be assigned a different device
These interfaces are still experimental and subject to change. We are interested in getting feedback.
You can raise questions and concerns by opening issues in the
[rocFFT issue tracker](https://github.com/ROCmSoftwarePlatform/rocFFT/issues).
Note that multi-device FFTs currently have several limitations (we plan to address these in future
releases):
* Real-complex (forward or inverse) FFTs are not supported
* Planar format fields are not supported
* Batch (the `number_of_transforms` provided to `rocfft_plan_create`) must be 1
* FFT input is gathered to the current device at run time, so all FFT data must fit on that device
### Optimizations
* Improved the performance of several 2D/3D real FFTs supported by `2D_SINGLE` kernel. Offline
tuning provides more optimization for gfx90a
* Removed an extra kernel launch from even-length, real-complex FFTs that use callbacks
### Changes
* Built kernels in a solution map to the library kernel cache
* Real forward transforms (real-to-complex) no longer overwrite input; rocFFT may still overwrite real
inverse (complex-to-real) input, as this allows for faster performance
* `rocfft-rider` and `dyna-rocfft-rider` have been renamed to `rocfft-bench` and `dyna-rocfft-bench`;
these are controlled by the `BUILD_CLIENTS_BENCH` CMake option
* Links for the former file names are installed, and the former `BUILD_CLIENTS_RIDER` CMake option
is accepted for compatibility, but both will be removed in a future release
* Binaries in debug builds no longer have a `-d` suffix
### Fixes
* rocFFT now correctly handles load callbacks that convert data from a smaller data type (e.g., 16-bit
integers -> 32-bit float)
## rocFFT 1.0.24 for ROCm 5.7.0
### Optimizations
* Improved the performance of complex forward/inverse 1D FFTs (2049 <= length <= 131071) that use
Bluestein's algorithm
### Additions
* Implemented a solution map version converter and finished the first conversion from ver.0 to ver.1
* Version 1 removes some incorrect kernels (sbrc/sbcr using `half_lds`)
### Changes
* Moved `rocfft_rtc_helper` executable to the `lib/rocFFT` directory on Linux
* Moved library kernel cache to the `lib/rocFFT` directory
## rocFFT 1.0.23 for ROCm 5.6.0
### Additions
* Implemented half-precision transforms; these can be requested by passing `rocfft_precision_half` to
`rocfft_plan_create`
* Implemented a hierarchical solution map that saves information on how to decompose a problem
and the kernels that are used
* Implemented a first version of offline-tuner to support tuning kernels for C2C and Z2Z problems
### Changes
* Replaced `std::complex` with hipComplex data types for the data generator
* FFT plan dimensions are now sorted to be row-major internally where possible, which produces
better plans if the dimensions were accidentally specified in a different order (column-major, for
example)
* Added the `--precision` argument to benchmark and test clients (`--double` is still accepted but is
deprecated as a method to request a double-precision transform)
* Improved performance test suite statistical framework
### Fixes
* Fixed over-allocation of LDS in some real-complex kernels, which was resulting in kernel launch
failure
## rocFFT 1.0.22 for ROCm 5.5.0
### Optimizations
* Improved the performance of 1D lengths < 2048 that use Bluestein's algorithm
* Reduced code generation time during plan creation
* Optimized 3D R2C and C2R lengths 32, 84, 128
* Optimized batched small 1D R2C and C2R cases
### Additions
* Added gfx1101 to default `AMDGPU_TARGETS`
### Changes
* Moved client programs to C++17
* Moved planar kernels and infrequently used Stockham kernels to be runtime-compiled
* Moved transpose, real-complex, Bluestein, and Stockham kernels to the library kernel cache
### Fixes
* Removed zero-length twiddle table allocations, which fixes errors from `hipMallocManaged`
* Fixed incorrect freeing of HIP stream handles during twiddle computation when multiple devices are
present
## rocFFT 1.0.21 for ROCm 5.4.3
### Fixes
* Removed the source directory from `rocm_install_targets` to prevent the installation of `rocfft.h` in an
unintended location
## rocFFT 1.0.20 for ROCm 5.4.1
### Fixes
* Fixed incorrect results on strided large 1D FFTs where batch size does not equal the stride
## rocFFT 1.0.19 for ROCm 5.4.0
### Optimizations
* Optimized some strided large 1D plans
### Additions
* Added the `rocfft_plan_description_set_scale_factor` API to efficiently multiply each output element of
an FFT by a given scaling factor
* Created a `rocfft_kernel_cache.db` file next to the installed library; SBCC, CR, and RC kernels are
moved to this file when built with the library, and are runtime-compiled for new GPU architectures
* Added gfx1100 and gfx1102 to default `AMDGPU_TARGETS`
### Changes
* Moved the runtime compilation cache to in-memory by default
* A default on-disk cache can encounter contention problems on multi-node clusters with a shared
filesystem
* rocFFT can still use an on-disk cache by setting the `ROCFFT_RTC_CACHE_PATH` environment
variable
## rocFFT 1.0.18 for ROCm 5.3.0
### Changes
* The runtime compilation cache now looks for environment variables `XDG_CACHE_HOME` (on Linux)
and `LOCALAPPDATA` (on Windows) before falling back to `HOME`
* Moved computation of the twiddle table from the host to the device
### Optimizations
* Optimized 2D R2C and C2R to use 2-kernel plans where possible
* Improved performance of the Bluestein algorithm
* Optimized sbcc-168 and 100 by using half-LDS
* Optimized length-280 2D and 3D transforms
* Added kernels for factorizable 1D lengths < 128
### Fixes
* Fixed occasional failures to parallelize runtime compilation of kernels (failures would be retried
serially and ultimately succeed, but this would take extra time)
* Fixed failures of some R2C 3D transforms that use the unsupported `TILE_UNALIGNED` SBRC kernels
(an example is 98^3 R2C out-of-place)
* Fixed bugs in the `SBRC_ERC` type
## rocFFT 1.0.17 for ROCm 5.2.0
### Additions
* Packages for test and benchmark executables on all supported operating systems using CPack
* Added file and folder reorganization changes, with backward compatibility support, using
`rocm-cmake` wrapper functions
### Changes
* Improved reuse of twiddle memory between plans
* Set a default load/store callback when only one callback type is set via the API (for improved
performance)
* Updated the GoogleTest dependency to version 1.11
### Optimizations
* Introduced a new access pattern of LDS (non-linear) and applied it on sbcc kernels len 64 and 81 for a
performance improvement
* Applied `lds-non-linear`, `direct-load-to-register`, and `direct-store-from-register` on sbcr kernels for
a performance improvement
### Fixes
* Correctness of certain transforms with unusual strides
* Incorrect handling of user-specified stream for runtime-compiled kernels
* Incorrect buffer allocation in `rocfft-test` on in-place transforms with different input and output sizes
## rocFFT 1.0.16 for ROCm 5.1.0
### Changes
* Supported unaligned tile dimension for `SBRC_2D` kernels
* Improved test and benchmark infrastructure by adding RAII
* Enabled runtime compilation of length-2304 FFT kernel during plan creation
* Added tokenizer for test suite
* Reduce twiddle memory requirements for even-length, real-complex transforms
* Clients can now be built separately from the main library
### Optimizations
* Optimized more large 1D cases by using `L1D_CC` plan
* Optimized the 3D 200^3 C2R case
* Optimized the 1D 2^30 double precision on MI200
* Added padding to work buffer sizes to improve performance in many cases
### Fixes
* Fixed the correctness of some R2C transforms with unusual strides
### Removals
* The hipFFT API (header) has been removed; use the
[hipFFT](https://github.com/ROCmSoftwarePlatform/hipFFT) package or repository to obtain the API
## rocFFT 1.0.15 for ROCm 5.0.0
### Changes
* Enabled runtime compilation of single FFT kernels > length 1024
* Re-aligned the split device library into four roughly equal libraries
* Implemented the FuseShim framework to replace the original OptimizePlan
* Implemented the generic buffer-assignment framework
* The buffer assignment is no longer performed by each node--we designed a generic algorithm to
test and pick the best assignment path
* With the help of FuseShim, we can achieve the most kernel-fusions possible
* Don't read the imaginary part of the DC and Nyquist modes for even-length complex-to-real
transforms
### Optimizations
* Optimized twiddle conjugation; complex-to-complex inverse transforms should now have similar
performance to forward transforms
* Improved performance of single-kernel, small 2D transforms
## rocFFT 1.0.14 for ROCm 4.5.0
### Optimizations
* Optimized SBCC kernels of lengths 52, 60, 72, 80, 84, 96, 104, 108, 112, 160, 168, 208, 216, 224, and
240 with a new kernel generator
### Additions
* Added support for Windows 10 as a build target
### Changes
* Packaging has been split into a runtime package (`rocfft`) and a development package
(`rocfft-devel`):
The development package depends on the runtime package. When installing the runtime package,
the package manager will suggest the installation of the development package to aid users
transitioning from the previous version's combined package. This suggestion by package manager is
for all supported operating systems (except CentOS 7) to aid in the transition. The `suggestion`
feature in the runtime package is introduced as a deprecated feature and will be removed in a future
ROCm release.
### Fixes
* Fixed validation failures for even-length R2C inplace 2D and 3D cubic sizes, such as 100^2 (or ^3),
200^2 (or ^3), and 256^2 (or ^3)
* We combine two kernels (`r2c-transpose`) instead of combining the three kernels
(`stockham-r2c-transpose`)
### Changes
* Split 2D device code into separate libraries
## rocFFT 1.0.13 for ROCm 4.4.0
### Optimizations
* Improved plans by removing unnecessary transpose steps
* Optimized scheme selection for 3D problems
* Imposed fewer restrictions on `3D_BLOCK_RC` selection (more problems can use `3D_BLOCK_RC` and
have performance gains)
* Enabled `3D_RC`; some 3D problems with SBCC-supported z-dim can use fewer kernels to get
benefits
* Forced `--length` 336 336 56 (dp) to use faster `3D_RC` to prevent it from being skipped by a
conservative threshold test
* Optimized some even-length R2C/C2R cases by doing more in-place operations and combining
pre- and post-processing into Stockham kernels
* Added radix-17
### Additions
* Added a new kernel generator for select fused 2D transforms
### Fixes
* Improved large 1D transform decompositions
## rocFFT 1.0.12 for ROCm 4.3.0
### Changes
* Re-split device code into single-precision, double-precision, and miscellaneous kernels
### Fixes
* Fixed potential crashes in double-precision planar->planar transpose
* Fixed potential crashes in 3D transforms with unusual strides for SBCC-optimized sizes
* Improved buffer placement logic
### Additions
* Added a new kernel generator for select lengths; new kernels have improved performance
* Added public `rocfft_execution_info_set_load_callback` and `rocfft_execution_info_set_store_callback`
API functions to allow running extra logic when loading data from and storing data to global
memory during a transform
### Removals
* Removed R2C pair schemes and kernels
### Optimizations
* Optimized 2D and 3D R2C 100 and 1D Z2Z 2500
* Reduced number of kernels for 2D/3D sizes where higher dimension is 64, 128, 256
### Fixes
* Fixed potential crashes in 3D transforms with unusual strides, for SBCC-optimized sizes
## rocFFT 1.0.11 for ROCm 4.2.0
### Changes
* Moved device code into the main library
### Optimizations
* Improved performance for single-precision kernels exercising all except radix-2/7 butterfly ops
* Minor optimization for C2R 3D 100 and 200 cube sizes
* Optimized some C2C and R2C 3D 64, 81, 100, 128, 200, and 256 rectangular sizes
* When factoring, test to see if the remaining length is explicitly supported
* Explicitly added radix-7 lengths 14, 21, and 224 to list of supported lengths
* Optimized R2C 2D and 3D 128, 200, and 256 cube sizes
### Fixes
* Fixed potential crashes in small 3D transforms with unusual strides
([issue 311](https://github.com/ROCmSoftwarePlatform/rocFFT/issues/311))
* Fixed potential crashes when running transforms on multiple devices
([issue 310](https://github.com/ROCmSoftwarePlatform/rocFFT/issues/310))
## rocFFT 1.0.10 for ROCm 4.1.0
### Additions
* Explicitly specify `MAX_THREADS_PER_BLOCK` through `__launch_bounds__` for all kernels
* Switched to a new syntax for specifying AMD GPU architecture names and features
### Optimizations
* Optimized C2C and R2C 3D 64, 81, 100, 128, 200, and 256 cube sizes
* Improved the performance of the standalone out-of-place transpose kernel
* Optimized the 1D length 40000 C2C case
* Enabled radix-7 for size 336
* New radix-11 and radix-13 kernels; used in length 11 and 13 (and some of their multiples)
transforms
### Changes
* rocFFT now automatically allocates a work buffer if the plan requires one and none is provided
* An explicit `rocfft_status_invalid_work_buffer` error is now returned when a work buffer of insufficient
size is provided
* Updated online documentation
* Updated Debian package name version with separated underscore ( _ )
* Adjusted accuracy test tolerances and how they are compared
### Fixes
* Fixed a 4x4x8192 accuracy failure
## rocFFT 1.0.8 for ROCm 3.10.0
### Optimizations
* Optimized the 1D length 10000 C2C case
### Changes
* Added the `BUILD_CLIENTS_ALL` CMake option
### Fixes
* Fixed the correctness of SBCC and SBRC kernels with non-unit strides
* Fixed fused C2R kernel when a Bluestein transform follows it
## rocFFT 1.0.7 for ROCm 3.9.0
### Optimizations
* New R2C and C2R fused kernels to combine pre- and post-processing steps with transpose
* Enabled diagonal transpose for 1D and 2D power-of-2 cases
* New single kernels for small power-of-2, 3, and 5 sizes
* Added more radix-7 kernels
### Changes
* Explicitly disabled XNACK and SRAM-ECC features on AMDGPU hardware
### Fixes
* Fixed 2D C2R transform with length 1 on one dimension
* Fixed a potential thread unsafety in logging
## rocFFT 1.0.6 for ROCm 3.8.0
### Optimizations
* Improved the performance of 1D batch-paired R2C transforms of odd length
* Added some radix-7 kernels
* Improved the performance for 1D length 6561 and 10000
* Improved the performance for certain 2D transform sizes
### Changes
* Allowed a static library build with `BUILD_SHARED_LIBS=OFF` CMake option
* Updated GoogleTest dependency to version 1.10
### Fixes
* Correctness of certain large 2D sizes
## rocFFT 1.0.5 for ROCm 3.7.0
### Optimizations
* Optimized C2C power-of-2 middle sizes
### Changes
* Parallelized work in unit tests and eliminated duplicate cases
### Fixes
* Correctness of certain large 1D, and 2D power-of-3 and 5 sizes
* Incorrect buffer assignment for some even-length R2C transforms
* `<cstddef>` inclusion on C compilers
* Incorrect results on non-unit strides with SBCC/SBRC kernels
rocFFT-rocm-7.1.0/CMakeLists.txt 0000664 0000000 0000000 00000024526 15066521634 0016361 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2025 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
# Oldest CMake release this build description supports.
cmake_minimum_required( VERSION 3.16 )

# rocFFT uses C++17 features; this adds the compile option -std=c++17.
set( CMAKE_CXX_STANDARD 17 )

# Default install prefix. The entry is written to the cache without FORCE,
# so it has to be created before project() and a value given with -D wins.
# PROJECT_BINARY_DIR is not defined until project() has run, so the Windows
# default is derived from CMAKE_BINARY_DIR (the same directory for the
# top-level project) instead of expanding to the bare path "/package".
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()

# Default the build type to Release unless the user chose one with -D.
# This also has to happen before project(). Multi-configuration generators
# (which define CMAKE_CONFIGURATION_TYPES) do not use CMAKE_BUILD_TYPE.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Marks that the configure run started from the library's top-level project;
# the client CMake files read this flag.
set( ROCFFT_BUILD_SCOPE ON )

project( rocfft LANGUAGES CXX C )
# Locate rocm-cmake, which provides the common CMake helpers ROCm projects
# use for setup, installation and packaging. If no installed copy is found,
# fetch the sources, install them into a private prefix under the build
# tree, and look again there.
set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )

find_package( ROCmCMakeBuildTools PATHS ${ROCM_PATH} /opt/rocm )
if( NOT ROCmCMakeBuildTools_FOUND )
  include( FetchContent )
  FetchContent_Declare( rocm_cmake_local
    GIT_REPOSITORY https://github.com/ROCm/rocm-cmake
    GIT_TAG        rocm-6.4.1
    GIT_SHALLOW    ON
  )
  FetchContent_MakeAvailable( rocm_cmake_local )

  # Configure the fetched sources with the private install prefix.
  execute_process(
    COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
    WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR}
    RESULT_VARIABLE rocm_cmake_configure_result )
  # Report the failing step directly rather than leaving it to surface
  # later as a "package not found" error.
  if( NOT "${rocm_cmake_configure_result}" STREQUAL "0" )
    message( FATAL_ERROR
      "Configuring the fetched rocm-cmake failed: ${rocm_cmake_configure_result}" )
  endif()

  # Install the fetched copy into the private prefix.
  execute_process(
    COMMAND ${CMAKE_COMMAND} --build ${rocm_cmake_local_SOURCE_DIR} --target install
    WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR}
    RESULT_VARIABLE rocm_cmake_install_result )
  if( NOT "${rocm_cmake_install_result}" STREQUAL "0" )
    message( FATAL_ERROR
      "Installing the fetched rocm-cmake failed: ${rocm_cmake_install_result}" )
  endif()

  find_package( ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
endif()
# Pull in the rocm-cmake helper modules used by this project, in the same
# order as they have always been included.
foreach( rocm_module IN ITEMS
    ROCMSetupVersion
    ROCMCreatePackage
    ROCMInstallTargets
    ROCMPackageConfigHelpers
    ROCMInstallSymlinks
    ROCMCheckTargetIds
    ROCMClients
    ROCMHeaderWrapper )
  include( ${rocm_module} )
endforeach()

# When a ROCm location was supplied, add its lib directory to the rpath of
# binaries in the build tree.
if( ROCM_PATH )
  list( APPEND CMAKE_BUILD_RPATH ${ROCM_PATH}/lib )
endif()

# Library version, handed to rocm-cmake's standardized versioning helper.
set( VERSION_STRING "1.0.35" )
rocm_setup_version( VERSION ${VERSION_STRING} )

# Make the project's own helper modules in ./cmake findable. Appending
# (rather than prepending) lets users override the HIP path with their own
# CMAKE_MODULE_PATH entries.
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )
# ---- User-facing build switches ----

# Extra build output.
option( BUILD_VERBOSE "Output additional build information" OFF )

# BUILD_SHARED_LIBS is a CMake built-in; declaring it as an option makes it
# show up in cmake-gui.
option( BUILD_SHARED_LIBS "Build rocFFT as a shared library" ON )

option( WERROR "Treat warnings as errors" OFF )
option( BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF )
option( ROCFFT_RUNTIME_COMPILE_DEFAULT "Compile kernels at runtime by default" OFF )

# Pass -DROCFFT_BUILD_OFFLINE_TUNER=ON to build the tuner executable.
# It is OFF by default because most users will not tune.
option( ROCFFT_BUILD_OFFLINE_TUNER "Build with offline tuner executable rocfft_offline_tuner" OFF )

# Allows the hipRAND dependency to be switched off.
option( USE_HIPRAND "Use hipRAND to provide device-side input generation" ON )

# The function pool is compiled as N separate files so its build can run in
# parallel.
set( ROCFFT_FUNCTION_POOL_N 8 CACHE STRING "Number of files to split function_pool into for compilation" )

# Warning flags shared by the targets; -Werror is appended only on request.
set( WARNING_FLAGS
  -Wall
  -Wno-unused-function
  -Wimplicit-fallthrough
  -Wunreachable-code
  -Wsign-compare
  -Wno-deprecated-declarations )
if( WERROR )
  list( APPEND WARNING_FLAGS -Werror )
endif()
# GPU architectures built when the user does not specify any.
set( DEFAULT_GPUS
  gfx803 gfx900 gfx906 gfx908 gfx90a gfx942 gfx950
  gfx1030
  gfx1100 gfx1101 gfx1102 gfx1151
  gfx1200 gfx1201 )

# Address-sanitizer builds: instrument compile and link, switch the default
# target list to the xnack+ variants, link with lld, and turn the kernel
# cache off.
if( BUILD_ADDRESS_SANITIZER )
  add_compile_options( -fsanitize=address )
  add_link_options( -fsanitize=address -shared-libasan )
  set( DEFAULT_GPUS
    gfx908:xnack+
    gfx90a:xnack+
    gfx942:xnack+ )
  add_link_options( -fuse-ld=lld )
  set( ROCFFT_KERNEL_CACHE_ENABLE off )
  add_compile_definitions( ADDRESS_SANITIZER )
endif()

# Optionally narrow the default list to the GPUs present on this machine.
# rocm_local_targets is only called when the command exists; otherwise the
# default list is kept and a warning is printed.
if( BUILD_LOCAL_GPU_TARGET_ONLY )
  message( STATUS "Building only for local GPU target" )
  if( COMMAND rocm_local_targets )
    rocm_local_targets( DEFAULT_GPUS )
  else()
    message( WARNING "Unable to determine local GPU targets. Falling back to default GPUs." )
  endif()
endif()

# AMDGPU_TARGETS is the legacy spelling; warn when it is used on its own.
if( AMDGPU_TARGETS AND NOT GPU_TARGETS )
  message( DEPRECATION "AMDGPU_TARGETS use is deprecated. Use GPU_TARGETS." )
endif()

set( AMDGPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined. (Deprecated, prefer GPU_TARGETS)" )
rocm_check_target_ids( AMDGPU_TARGETS TARGETS "${AMDGPU_TARGETS}" )

# Not forced: a GPU_TARGETS value given on the command line takes precedence.
set( GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for" )
# Both the library and its clients reach the device through HIP, so HIP and
# hipRTC are hard requirements.
find_package( hip REQUIRED CONFIG PATHS /opt/rocm/lib/cmake/hip/ )
find_package( hiprtc REQUIRED CONFIG PATHS /opt/rocm/lib/cmake/hiprtc/ )

# CUDA devices can be targeted through the nvidia backend. That mode needs
# CUDA_PREFIX (the CUDA install location) and CUDA_ARCH (e.g. sm_75).
if( USE_CUDA )
  if( NOT DEFINED CUDA_PREFIX )
    message( FATAL_ERROR "CUDA_PREFIX variable is required (e.g. /usr/local/cuda-11.4)" )
  endif()
  if( NOT DEFINED CUDA_ARCH )
    message( FATAL_ERROR "CUDA_ARCH variable is required. (e.g. sm_75)" )
  endif()
  add_compile_options(
    -I${HIP_ROOT_DIR}/include
    -I${CUDA_PREFIX}/include
    -D__HIP_PLATFORM_NVIDIA__ )
  add_link_options( -L${CUDA_PREFIX}/lib64 -pthread )
endif()

# hipcc supplies the HIP include directories and platform macro on its own;
# any other compiler (plain clang) has to be given them explicitly.
if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
  include_directories( ${HIP_INCLUDE_DIRS} )
  if( USE_CUDA )
    add_compile_definitions( __HIP_PLATFORM_NVIDIA__ )
  else()
    add_compile_definitions( __HIP_PLATFORM_AMD__ )
  endif()
endif()

# Optional MPI support in rocFFT.
option( ROCFFT_MPI_ENABLE "Enable MPI" OFF )
option( ROCFFT_CRAY_MPI_ENABLE "Cray MPI" OFF )
if( ROCFFT_MPI_ENABLE )
  find_package( MPI REQUIRED )
  include_directories( SYSTEM ${MPI_INCLUDE_PATH} )
endif()
# The library itself.
add_subdirectory( library )

include( clients/cmake/build-options.cmake )

# BUILD_CLIENTS is shorthand for switching every client kind on.
if( BUILD_CLIENTS )
  foreach( client_kind IN ITEMS BENCH SAMPLES TESTS )
    set( BUILD_CLIENTS_${client_kind} ON )
  endforeach()
endif()

# BUILD_CLIENTS_RIDER is the old name for BUILD_CLIENTS_BENCH.
if( BUILD_CLIENTS_RIDER )
  set( BUILD_CLIENTS_BENCH ${BUILD_CLIENTS_RIDER} )
endif()

# Everything below applies only when at least one client kind is requested.
if( BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCH )
  if( NOT CLIENTS_OS )
    rocm_set_os_id( CLIENTS_OS )
  endif()

  # Package names for the OpenMP and FFTW dependencies of the tests
  # package. They are set when tests are built and OpenMP use is either
  # unspecified or enabled. The RPM names differ on SLES.
  if( BUILD_CLIENTS_TESTS AND (NOT DEFINED BUILD_CLIENTS_TESTS_OPENMP OR BUILD_CLIENTS_TESTS_OPENMP) )
    set( OPENMP_DEB "libgomp1" )
    set( FFTW_DEB "libfftw3-bin" )
    set( OPENMP_RPM "libgomp" )
    set( FFTW_RPM "fftw-libs" )
    if( CLIENTS_OS STREQUAL "sles" )
      set( OPENMP_RPM "libgomp1" )
      set( FFTW_RPM "libfftw3-3" )
    endif()
  endif()

  rocm_package_setup_component( clients )

  # hipRAND is a package dependency only when it is in use.
  if( USE_HIPRAND )
    set( HIPRAND_DEP hiprand )
  endif()

  if( BUILD_CLIENTS_TESTS )
    rocm_package_setup_client_component(
      tests
      DEPENDS
        DEB ${OPENMP_DEB} ${FFTW_DEB} ${HIPRAND_DEP}
        RPM ${OPENMP_RPM} ${FFTW_RPM} ${HIPRAND_DEP}
    )
  endif()

  if( BUILD_CLIENTS_BENCH )
    rocm_package_setup_client_component(
      benchmarks
      DEPENDS
        DEB ${HIPRAND_DEP}
        RPM ${HIPRAND_DEP}
    )
  endif()

  add_subdirectory( clients )
endif()
# Windows packaging: ZIP archives rooted at C:/hipSDK.
if( WIN32 )
  set( CPACK_SOURCE_GENERATOR "ZIP" )
  set( CPACK_GENERATOR "ZIP" )
  # Replace the install prefix only when CMake reports it was left at its
  # built-in default.
  if( CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT )
    set( CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE )
  endif()
  set( INSTALL_PREFIX "C:/hipSDK" )
  set( CPACK_SET_DESTDIR OFF )
  set( CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "" )
  set( CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF )
endif()

# Package-specific CPack variables.
# With the rocclr HIP runtime the package depends on the HIP runtime
# package; address-sanitizer builds depend on its -asan variant.
string( TOLOWER "${HIP_RUNTIME}" HIP_RUNTIME_LOWER )
if( HIP_RUNTIME_LOWER STREQUAL "rocclr" )
  if( BUILD_ADDRESS_SANITIZER )
    set( DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" )
  else()
    set( DEPENDS_HIP_RUNTIME "hip-runtime-amd" )
  endif()
  rocm_package_add_dependencies( "${DEPENDS_HIP_RUNTIME} >= 4.5.0" )
endif()

set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" )
set( CPACK_RPM_PACKAGE_LICENSE "MIT" )

# The escaped \${...} keeps the reference unexpanded here so that it is
# written into the value literally.
set( CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" )
set( ROCFFT_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file" )

set( package_name rocfft )
rocm_create_package(
  NAME         ${package_name}
  DESCRIPTION  "ROCm FFT library"
  MAINTAINER   "rocfft-maintainer@amd.com"
  LDCONFIG
  LDCONFIG_DIR ${ROCFFT_CONFIG_DIR}
)

option( BUILD_CODE_COVERAGE "Build with code coverage flags (clang only)" OFF )
rocFFT-rocm-7.1.0/CppCheckSuppressions.txt 0000664 0000000 0000000 00000000342 15066521634 0020466 0 ustar 00root root 0000000 0000000 // generator uses implicit constructors for convenience
noExplicitConstructor:library/src/device/generator/generator.h
// has some false positives and isn't hard to run manually for periodic
// dead code sweeps
unusedFunction
rocFFT-rocm-7.1.0/LICENSE.md 0000664 0000000 0000000 00000005360 15066521634 0015220 0 ustar 00root root 0000000 0000000 MIT License
Copyright (C) Advanced Micro Devices, Inc.
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in all
copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
SOFTWARE.
---
This product includes software from copyright holders as shown below, and distributed under their license terms as specified.
CLI11 2.2 Copyright (c) 2017-2024 University of Cincinnati, developed by Henry
Schreiner under NSF AWARD 1414736. All rights reserved.
Redistribution and use in source and binary forms of CLI11, with or without
modification, are permitted provided that the following conditions are met:
1. Redistributions of source code must retain the above copyright notice, this
list of conditions and the following disclaimer.
2. Redistributions in binary form must reproduce the above copyright notice,
this list of conditions and the following disclaimer in the documentation
and/or other materials provided with the distribution.
3. Neither the name of the copyright holder nor the names of its contributors
may be used to endorse or promote products derived from this software without
specific prior written permission.
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED
WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR
ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES
(INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES;
LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON
ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
(INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
rocFFT-rocm-7.1.0/README.md 0000664 0000000 0000000 00000010756 15066521634 0015100 0 ustar 00root root 0000000 0000000 # rocFFT
rocFFT is a software library for computing fast Fourier transforms (FFTs) written in the HIP
programming language. It's part of the AMD software ecosystem based on
[ROCm](https://github.com/ROCm/ROCm). The rocFFT library can be used with AMD GPUs.
## Documentation
> [!NOTE]
> The published rocFFT documentation is available at [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the projects/rocfft/docs folder of the rocm-libraries repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html).
To build our documentation locally, use the following code:
```Bash
cd projects/rocfft/docs
pip3 install -r sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```
## Build and install
You can install rocFFT using pre-built packages or by building from source.
* Installing pre-built packages:
1. Download the pre-built packages from the
[ROCm package servers](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) or use the
GitHub releases tab to download the source (this may give you a more recent version than the
pre-built packages).
2. Run: `sudo apt update && sudo apt install rocfft`
* Building from source:
rocFFT is compiled with AMD's clang++ and uses CMake. You can specify several options to customize your
build. The following commands build a shared library for supported AMD GPUs. Run these commands from the `rocm-libraries/projects/rocfft` directory:
```bash
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_C_COMPILER=amdclang ..
make -j
```
You can compile a static library using the `-DBUILD_SHARED_LIBS=off` option.
With rocFFT, you can use indirect function calls by default; this requires ROCm 4.3 or higher. You can
use `-DROCFFT_CALLBACKS_ENABLED=off` with CMake to prevent these calls on older ROCm
compilers. Note that with this configuration, callbacks won't work correctly.
rocFFT includes the following clients:
* `rocfft-bench`: Runs general transforms and is useful for performance analysis
* `rocfft-test`: Runs various regression tests
* Various small samples
| Client | CMake option | Dependencies |
|:------|:-----------------|:-----------------|
| `rocfft-bench` | `-DBUILD_CLIENTS_BENCH=on` | hipRAND |
| `rocfft-test` | `-DBUILD_CLIENTS_TESTS=on` | hipRAND, FFTW, GoogleTest |
| samples | `-DBUILD_CLIENTS_SAMPLES=on` | None |
| coverage | `-DBUILD_CODE_COVERAGE=ON` | clang, llvm-cov |
Clients are not built by default. To build them, use `-DBUILD_CLIENTS=on`. The build process
downloads and builds GoogleTest and FFTW if they are not already installed.
Clients can be built separately from the main library. For example, you can build all the clients with
an existing rocFFT library by invoking CMake from within the `rocFFT-src/clients` folder:
```bash
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_PREFIX_PATH=/path/to/rocFFT-lib ..
make -j
```
To install client dependencies on Ubuntu, run:
```bash
sudo apt install libgtest-dev libfftw3-dev libboost-dev
```
rocFFT uses version 1.11 of GoogleTest.
You can generate a test coverage report with the following:
```bash
cmake -DCMAKE_CXX_COMPILER=amdclang++ -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CODE_COVERAGE=ON ..
make -j coverage
```
The above will output the coverage report to the terminal and also save an html coverage report to `$PWD/coverage-report`.
## Examples
A summary of the latest functionality and workflow to compute an FFT with rocFFT is available on the
[rocFFT documentation portal](https://rocm.docs.amd.com/projects/rocFFT/en/latest/).
You can find additional examples in the `clients/samples` subdirectory.
## Support
You can report bugs and feature requests through the rocm-libraries GitHub
[issue tracker](https://github.com/ROCm/rocm-libraries/issues).
## Contribute
If you want to contribute to rocFFT, you must follow the [contribution guidelines](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocfft/.github/CONTRIBUTING.md).
rocFFT-rocm-7.1.0/ValgrindSuppressions.txt 0000664 0000000 0000000 00000000267 15066521634 0020562 0 ustar 00root root 0000000 0000000 {
Memcheck:Param
sched_setaffinity(mask)
...
fun:hipMalloc
}
{
Memcheck:Param
sched_setaffinity(mask)
...
fun:hipMemGetInfo
} rocFFT-rocm-7.1.0/clients/ 0000775 0000000 0000000 00000000000 15066521634 0015251 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/clients/CMakeLists.txt 0000664 0000000 0000000 00000010673 15066521634 0020020 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
# Oldest CMake release this build description supports.
cmake_minimum_required( VERSION 3.16 )

# Default install and packaging prefixes. They are cache entries written
# without FORCE, so they have to be created before project() and values
# given with -D (or already cached by an enclosing project) win.
# PROJECT_BINARY_DIR is undefined until a project() call has run, which is
# the case when this directory is configured on its own; CMAKE_BINARY_DIR
# is always defined and names the top-level build directory in both the
# standalone and the in-tree case.
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()

# Default the build type to Release unless the user chose one with -D.
# This also has to happen before project(). Multi-configuration generators
# (which define CMAKE_CONFIGURATION_TYPES) do not use CMAKE_BUILD_TYPE.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Marks that the clients project has been entered; the per-client CMake
# files read this flag.
set( ROCFFT_CLIENTS_BUILD_SCOPE ON )

# This project may compile dependencies for clients.
project( rocfft-clients LANGUAGES CXX C )

set( CMAKE_CXX_STANDARD 17 )
# Make the helper modules in clients/cmake findable.
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# rocm-cmake may already have been located by an enclosing project.
if( NOT ROCmCMakeBuildTools_FOUND )
  find_package( ROCmCMakeBuildTools REQUIRED )
endif()
include( ROCMInstallTargets )

# Write a version file and install it with the clients component, so the
# rocfft-client package is never empty.
file( WRITE "${PROJECT_BINARY_DIR}/package/client-version"
  "${rocfft_VERSION_MAJOR}.${rocfft_VERSION_MINOR}.${rocfft_VERSION_PATCH}-${BUILD_ID}\n" )
rocm_install(
  FILES       ${PROJECT_BINARY_DIR}/package/client-version
  DESTINATION .info
  COMPONENT   clients )

# Ask CMake for a compile_commands.json, usable with clang tooling or vim.
# Only the make/nmake and ninja generators honor it, but there is no reason
# not to leave it on all the time.
set( CMAKE_EXPORT_COMPILE_COMMANDS ON )

# When this directory is not being built as part of the library build and
# no client kind was selected, build all of them.
if( NOT ( ROCFFT_BUILD_SCOPE
          OR BUILD_CLIENTS_SAMPLES
          OR BUILD_CLIENTS_TESTS
          OR BUILD_CLIENTS_BENCH ) )
  set( BUILD_CLIENTS_SAMPLES ON )
  set( BUILD_CLIENTS_TESTS ON )
  set( BUILD_CLIENTS_BENCH ON )
endif()
# Host and device code link against different libraries depending on the
# backend in use.
if( USE_CUDA )
  if( NOT DEFINED CUDA_PREFIX )
    message( FATAL_ERROR "CUDA_PREFIX variable is required." )
  endif()
  if( NOT DEFINED CUDA_ARCH )
    message( FATAL_ERROR "CUDA_ARCH variable is required." )
  endif()
  add_compile_options(
    -I${HIP_ROOT_DIR}/include
    -I${CUDA_PREFIX}/include
    -D__HIP_PLATFORM_NVIDIA__ )
  add_link_options( -L${CUDA_PREFIX}/lib64 -pthread )
  add_compile_options(
    --cuda-path=${CUDA_PREFIX}
    --cuda-gpu-arch=${CUDA_ARCH}
    -xcuda )
  set( ROCFFT_CLIENTS_HOST_LINK_LIBS -lcudart -ldl -lrt )
else()
  set( ROCFFT_CLIENTS_HOST_LINK_LIBS hip::host )
  set( ROCFFT_CLIENTS_DEVICE_LINK_LIBS hip::device )
endif()

if( ROCFFT_MPI_ENABLE )
  find_package( MPI REQUIRED )
endif()

# Descend into each client directory that was requested.
if( BUILD_CLIENTS_SAMPLES )
  add_subdirectory( samples )
endif()

if( BUILD_CLIENTS_TESTS )
  add_subdirectory( tests )
endif()

if( BUILD_CLIENTS_BENCH )
  add_subdirectory( bench )
endif()
rocFFT-rocm-7.1.0/clients/bench/ 0000775 0000000 0000000 00000000000 15066521634 0016330 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/clients/bench/CMakeLists.txt 0000664 0000000 0000000 00000012201 15066521634 0021064 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
# Oldest CMake release this build description supports.
cmake_minimum_required( VERSION 3.16 )

# Default install prefix. The entry is written to the cache without FORCE,
# so it has to be created before project() and a value given with -D (or
# already cached by an enclosing project) wins.
# PROJECT_BINARY_DIR is undefined until a project() call has run, which is
# the case when this directory is configured on its own; CMAKE_BINARY_DIR
# is always defined, so the Windows default no longer expands to the bare
# path "/package".
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else()
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif()

# Default the build type to Release unless the user chose one with -D.
# This also has to happen before project(). Multi-configuration generators
# (which define CMAKE_CONFIGURATION_TYPES) do not use CMAKE_BUILD_TYPE.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-bench LANGUAGES CXX )

set( CMAKE_CXX_STANDARD 17 )
# Make helper modules in this directory's cmake folder findable.
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Each dependency is looked up only when an enclosing project has not
# already provided it.

# rocFFT: skip the lookup when the rocfft target already exists.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif()

# HIP runtime.
if( NOT HIP_FOUND )
  find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()

# rocm-cmake helpers, needed for rocm_install.
if( NOT ROCmCMakeBuildTools_FOUND )
  find_package( ROCmCMakeBuildTools REQUIRED )
endif()

# hipRAND, only when its use is enabled.
if( USE_HIPRAND AND NOT hiprand_FOUND )
  find_package( hiprand REQUIRED )
endif()

include( ROCMInstallTargets )
# Two benchmark executables are built from this directory. Both share
# array_validator.cpp and bench.h; they differ in their main source file
# and in whether they link against rocFFT.
set( bench_list rocfft-bench dyna-rocfft-bench )
foreach( bench ${bench_list})
# rocfft-bench is built from bench.cpp, dyna-rocfft-bench from dyna-bench.cpp.
if(${bench} STREQUAL "rocfft-bench")
add_executable( ${bench} ../../shared/array_validator.cpp bench.cpp bench.h )
else()
add_executable( ${bench} ../../shared/array_validator.cpp dyna-bench.cpp bench.h )
endif()
# Project-wide warning flags, plus -Wno-cpp for these targets.
target_compile_options( ${bench} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
# NB: hip-clang includes omp.h, so we need to specify the location
# of ROCM_CLANG_ROOT at cmake config time if we are using clang++.
target_include_directories( ${bench}
PRIVATE
# NOTE(review): the next entry is a bare "$", which looks like a truncated
# generator expression - confirm the intended include path.
$
${HIP_CLANG_ROOT}/include
${ROCM_CLANG_ROOT}/include
)
# Only rocfft-bench links roc::rocfft directly. dyna-rocfft-bench links
# CMAKE_DL_LIBS instead - presumably it loads rocFFT at run time; confirm
# against dyna-bench.cpp.
if(${bench} STREQUAL "rocfft-bench")
target_link_libraries( ${bench}
PRIVATE
hip::device
roc::rocfft
)
else()
target_link_libraries( ${bench}
PRIVATE
${CMAKE_DL_LIBS}
hip::device
)
endif()
# hipRAND is linked, and the USE_HIPRAND macro defined, only when enabled.
if( USE_HIPRAND )
target_link_libraries( ${bench}
PRIVATE
hip::hiprand
)
target_compile_definitions( ${bench} PRIVATE USE_HIPRAND )
endif()
# We need to include both rocfft.h and rocfft-export.h
target_include_directories( ${bench}
PRIVATE
${CMAKE_BINARY_DIR}/include
${CMAKE_CURRENT_SOURCE_DIR}/../../library/include/
${HIP_CLANG_ROOT}/include
)
# Host-side link libraries; the variable is set by the clients CMakeLists.txt
# according to the backend.
target_link_libraries( ${bench} PUBLIC
${ROCFFT_CLIENTS_HOST_LINK_LIBS}
)
# Optional MPI support. Cray MPI additionally links mpi_gtl_hsa, searched
# for in a gtl/lib directory located relative to the MPI library.
if( ROCFFT_MPI_ENABLE )
target_link_libraries( ${bench}
PRIVATE
MPI::MPI_CXX
)
if ( ROCFFT_CRAY_MPI_ENABLE)
target_link_libraries( ${bench}
PRIVATE
"mpi_gtl_hsa"
)
get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
target_link_directories( ${bench}
PRIVATE
${MPI_LIBDIR}/../../../../gtl/lib )
endif()
endif()
# Require the selected C++ standard rather than allowing a fallback.
set_target_properties( ${bench} PROPERTIES
CXX_STANDARD_REQUIRED ON
)
# Choose the output directory relative to PROJECT_BINARY_DIR: "staging"
# one level up for a full library build, "bin" one level up for a
# clients-only build, and a local "bin" when this directory is built alone.
if( ROCFFT_BUILD_SCOPE )
set( BENCH_OUT_DIR "/../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( BENCH_OUT_DIR "/../bin" )
else()
set( BENCH_OUT_DIR "/bin")
endif()
string( CONCAT BENCH_OUT_DIR "${PROJECT_BINARY_DIR}" ${BENCH_OUT_DIR} )
set_target_properties(${bench}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${BENCH_OUT_DIR} )
# Install the executable as part of the benchmarks package component.
rocm_install(TARGETS ${bench} COMPONENT benchmarks)
endforeach()
# Link dyna-rocfft-bench to the experimental filesystem library if
# it's not available in the standard library.
include( ../../cmake/std-filesystem.cmake )
target_link_std_experimental_filesystem( dyna-rocfft-bench )
rocFFT-rocm-7.1.0/clients/bench/bench.cpp 0000664 0000000 0000000 00000036210 15066521634 0020115 0 ustar 00root root 0000000 0000000 // Copyright (C) 2016 - 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include
#include
#include
#include
#include "../../shared/CLI11.hpp"
#include "../../shared/arithmetic.h"
#include "../../shared/gpubuf.h"
#include "../../shared/hip_object_wrapper.h"
#include "../../shared/rocfft_params.h"
#include "bench.h"
#include "rocfft/rocfft.h"
// Entry point for rocfft-bench.
//
// Parses the command line (either individual options or a single --token that
// describes the whole problem), creates a plan through params.create_plan(),
// allocates and initializes the input/output buffers, performs one warm-up
// execution followed by `ntrial` timed executions, and prints the per-trial
// GPU time and a gflops estimate.
//
// Returns the CLI11 exit code on a parse error, EXIT_FAILURE when the token
// cannot be parsed, and EXIT_SUCCESS when the problem is skipped because it
// does not fit in device memory.
// NOTE(review): the `throw` statements and the *_V_THROW macros used directly
// in this function are not enclosed in any try block visible here, so those
// exceptions propagate out of main().
// NOTE(review): several declarations below read `std::vector name;` with no
// element type; the template arguments appear to have been lost from this
// copy of the file - confirm against upstream before compiling.
int main(int argc, char* argv[])
{
// This helps with mixing output of both wide and narrow characters to the screen
std::ios::sync_with_stdio(false);
// Control output verbosity:
int verbose{};
// number of GPUs to use:
int ngpus{};
// hip Device number for running tests:
int deviceId{};
// Ignore runtime failures.
// eg: hipMalloc failing when there isn't enough free vram.
bool ignore_hip_runtime_failures{true};
// Number of performance trial samples
int ntrial{};
// FFT parameters:
rocfft_params params;
// input/output FFT grids
std::vector ingrid;
std::vector outgrid;
// Token string to fully specify fft params.
std::string token;
CLI::App app{"rocfft-bench command line options"};
// Declare the supported options. Some option pointers are declared to track passed opts.
// NOTE(review): the --version callback only prints the version string; nothing
// in it stops parsing or exits, and --length is marked required below - confirm
// that `--version` on its own behaves as intended.
app.add_flag("--version", "Print queryable version information from the rocfft library")
->each([](const std::string&) {
char v[256];
rocfft_get_version_string(v, 256);
std::cout << "version " << v << std::endl;
return EXIT_SUCCESS;
});
CLI::Option* opt_token
= app.add_option("--token", token, "Token to read FFT params from")->default_val("");
// Group together options that conflict with --token
auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token");
non_token
->add_flag("--double", "Double precision transform (deprecated: use --precision double)")
->each([&](const std::string&) { params.precision = fft_precision_double; });
non_token->excludes(opt_token);
non_token
->add_option("-t, --transformType",
params.transform_type,
"Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
"forward\n3) real inverse")
->default_val(fft_transform_type_complex_forward);
non_token
->add_option("--auto_allocation",
params.auto_allocate,
"rocFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
->default_val("default");
non_token
->add_option(
"--precision", params.precision, "Transform precision: single (default), double, half")
->excludes("--double");
// The option pointers kept below are tested after parsing to decide which
// settings to echo back to the user.
CLI::Option* opt_not_in_place
= non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)")
->each([&](const std::string&) { params.placement = fft_placement_notinplace; });
non_token
->add_option("--itype",
params.itype,
"Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) "
"hermitian interleaved\n4) hermitian planar")
->default_val(fft_array_type_unset);
non_token
->add_option("--otype",
params.otype,
"Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) "
"hermitian interleaved\n4) hermitian planar")
->default_val(fft_array_type_unset);
CLI::Option* opt_length
= non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3);
non_token->add_option("--ngpus", ngpus, "Number of GPUs to use")
->default_val(1)
->check(CLI::NonNegativeNumber);
// define multi-GPU grids for FFT computation,
CLI::Option* opt_ingrid
= non_token->add_option("--ingrid", ingrid, "Single-process grid of GPUs at input")
->expected(1, 3)
->needs("--ngpus");
CLI::Option* opt_outgrid
= non_token->add_option("--outgrid", outgrid, "Single-process grid of GPUs at output")
->expected(1, 3)
->needs("--ngpus");
non_token
->add_option("-b, --batchSize",
params.nbatch,
"If this value is greater than one, arrays will be used")
->default_val(1);
CLI::Option* opt_istride = non_token->add_option("--istride", params.istride, "Input strides");
CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides");
non_token->add_option("--idist", params.idist, "Logical distance between input batches")
->default_val(0)
->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; });
non_token->add_option("--odist", params.odist, "Logical distance between output batches")
->default_val(0)
->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; });
CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset");
CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset");
app.add_flag("--ignore_runtime_failures,!--no-ignore_runtime_failures",
ignore_hip_runtime_failures,
"Ignore hip runtime failures");
app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0);
// NOTE(review): unlike --ngpus, --ntrial has no range check; ntrial is a signed
// int that is later used as an element count (see gpu_time below).
app.add_option("-N, --ntrial", ntrial, "Trial size for the problem")
->default_val(1)
->each([&](const std::string& val) {
std::cout << "Running profile with " << val << " samples\n";
});
// Default value is set in fft_params.h based on if device-side PRNG was enabled.
app.add_option("-g, --inputGen",
params.igen,
"Input data generation:\n0) PRNG sequence (device)\n"
"1) PRNG sequence (host)\n"
"2) linearly-spaced sequence (device)\n"
"3) linearly-spaced sequence (host)");
app.add_option("--isize", params.isize, "Logical size of input buffer");
app.add_option("--osize", params.osize, "Logical size of output buffer");
app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output");
// Parse args and catch any errors here
try
{
app.parse(argc, argv);
}
catch(const CLI::ParseError& e)
{
return app.exit(e);
}
// A non-empty token replaces the individual options: all FFT parameters are
// read from it.
if(!token.empty())
{
std::cout << "Reading fft params from token:\n" << token << std::endl;
try
{
params.from_token(token);
}
catch(...)
{
std::cout << "Unable to parse token." << std::endl;
return EXIT_FAILURE;
}
std::cout << std::flush;
}
else // generate token
{
if(ngpus > 1)
{
// set default GPU grids in case none were given
params.set_default_grid(ngpus, ingrid, outgrid);
// split the problem among ngpus
params.mp_lib = fft_params::fft_mp_lib_none;
int localDeviceCount = 0;
if(hipGetDeviceCount(&localDeviceCount) != hipSuccess)
{
throw std::runtime_error("hipGetDeviceCount failed");
}
// start with all-ones in grids
std::vector input_grid(params.length.size() + 1, 1);
std::vector output_grid(params.length.size() + 1, 1);
// create input and output grids and distribute it according to user requirements
// NOTE(review): the destination holds length.size() + 1 entries and the copy
// starts at index 1, so a grid with more entries than --length would write
// past the end unless something earlier rejects it - confirm.
std::copy(ingrid.begin(), ingrid.end(), input_grid.begin() + 1);
std::copy(outgrid.begin(), outgrid.end(), output_grid.begin() + 1);
params.distribute_input(localDeviceCount, input_grid);
params.distribute_output(localDeviceCount, output_grid);
}
// Echo the settings that were given on the command line.
if(*opt_not_in_place)
{
std::cout << "out-of-place\n";
}
else
{
std::cout << "in-place\n";
}
if(*opt_length)
{
std::cout << "length:";
for(auto& i : params.length)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_istride)
{
std::cout << "istride:";
for(auto& i : params.istride)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ostride)
{
std::cout << "ostride:";
for(auto& i : params.ostride)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ioffset)
{
std::cout << "ioffset:";
for(auto& i : params.ioffset)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ooffset)
{
std::cout << "ooffset:";
for(auto& i : params.ooffset)
std::cout << " " << i;
std::cout << "\n";
}
// The grids are also printed when they were filled in by set_default_grid
// rather than given explicitly.
if(*opt_ingrid || !ingrid.empty())
{
std::cout << "input grid:";
for(auto& i : ingrid)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_outgrid || !outgrid.empty())
{
std::cout << "output grid:";
for(auto& i : outgrid)
std::cout << " " << i;
std::cout << "\n";
}
std::cout << "\n";
}
std::cout << std::flush;
// NOTE(review): the early `return` statements between here and the end of the
// function leave without calling rocfft_cleanup().
rocfft_setup();
// Set GPU for single-device FFT computation
rocfft_scoped_device dev(deviceId);
params.validate();
if(!params.valid(verbose))
{
throw std::runtime_error("Invalid parameters, add --verbose=1 for detail");
}
std::cout << "Token: " << params.token() << std::endl;
if(verbose)
{
std::cout << params.str(" ") << std::endl;
}
// Check free and total available memory:
size_t free = 0;
size_t total = 0;
try
{
HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed");
}
// NOTE(review): the exception is caught by value here; catching by const
// reference would avoid the copy.
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
// First check: parameter footprint plus twiddle table footprint.
const auto raw_vram_footprint
= params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
if(!vram_fits_problem(raw_vram_footprint, free))
{
std::cout << "SKIPPED: Problem size (" << raw_vram_footprint
<< ") raw data too large for device.\n";
return EXIT_SUCCESS;
}
// Second check: the footprint reported by params.vram_footprint().
// NOTE(review): the message below is identical to the one above ("raw data")
// although a different footprint is being tested - confirm the wording.
const auto vram_footprint = params.vram_footprint();
if(!vram_fits_problem(vram_footprint, free))
{
std::cout << "SKIPPED: Problem size (" << vram_footprint
<< ") raw data too large for device.\n";
return EXIT_SUCCESS;
}
auto ret = params.create_plan();
if(ret != fft_status_success)
LIB_V_THROW(rocfft_status_failure, "Plan creation failed");
// GPU input buffer:
std::vector ibuffer;
std::vector pibuffer;
// CPU-side input buffer
std::vector ibuffer_cpu;
auto is_host_gen = (params.igen == fft_input_generator_host
|| params.igen == fft_input_random_generator_host);
auto ibricks = get_input_bricks(params);
auto obricks = get_output_bricks(params);
std::vector obuffer_data;
// obuffer is pointed at either ibuffer (in-place) or obuffer_data
// (out-of-place) by alloc_bench_bricks.
std::vector* obuffer = nullptr;
alloc_bench_bricks(
params, ibricks, obricks, ibuffer, obuffer_data, obuffer, ibuffer_cpu, is_host_gen);
// Collect the raw data pointers of the input buffers for params.execute().
pibuffer.resize(ibuffer.size());
for(unsigned int i = 0; i < ibuffer.size(); ++i)
{
pibuffer[i] = ibuffer[i].data();
}
// print input if requested
// NOTE(review): this runs before the first init_bench_input call below, so
// the buffers printed here have been allocated but not yet filled in this
// function - confirm that is intended.
if(verbose > 1)
{
if(is_host_gen)
{
// data is already on host
params.print_ibuffer(ibuffer_cpu);
}
else
{
print_device_buffer(params, ibuffer, true);
}
}
// Collect the raw data pointers of the output buffers.
std::vector pobuffer(obuffer->size());
for(unsigned int i = 0; i < obuffer->size(); ++i)
{
pobuffer[i] = obuffer->at(i).data();
}
init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen);
// Execute a warm-up call
params.execute(pibuffer.data(), pobuffer.data());
// Run the transform several times and record the execution time:
// NOTE(review): ntrial is a signed int with no range check; a negative value
// would presumably be converted to a very large element count here - confirm.
std::vector gpu_time(ntrial);
hipEvent_wrapper_t start, stop;
start.alloc();
stop.alloc();
for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial)
{
// Create input at every iteration to avoid overflow
if(is_host_gen)
{
copy_host_input_to_dev(ibuffer_cpu, ibuffer);
}
else
{
init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen);
}
// Only the execute call sits between the two recorded events; input
// regeneration above is outside the timed region.
HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed");
params.execute(pibuffer.data(), pobuffer.data());
HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed");
HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed");
float time;
HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed");
gpu_time[itrial] = time;
// Print result after FFT transform
if(verbose > 2)
{
print_device_buffer(params, *obuffer, false);
}
}
std::cout << "\nExecution gpu time:";
for(const auto& i : gpu_time)
{
std::cout << " " << i;
}
std::cout << " ms" << std::endl;
std::cout << "Execution gflops: ";
// Operation count estimate: nbatch * k * N * log2(N), where N is the product
// of the lengths and k is 2.5 when either side is real, 5.0 otherwise.
const double totsize = product(params.length.begin(), params.length.end());
const double k
= ((params.itype == fft_array_type_real) || (params.otype == fft_array_type_real)) ? 2.5
: 5.0;
const double opscount = (double)params.nbatch * k * totsize * log(totsize) / log(2.0);
for(const auto& i : gpu_time)
{
// i is printed above with the unit "ms"; dividing by 1e6 * i scales the
// operation count to gflops.
std::cout << " " << opscount / (1e6 * i);
}
std::cout << std::endl;
rocfft_cleanup();
}
rocFFT-rocm-7.1.0/clients/bench/bench.h 0000664 0000000 0000000 00000025063 15066521634 0017566 0 ustar 00root root 0000000 0000000 // Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef ROCFFT_BENCH_H
#define ROCFFT_BENCH_H
#include "../../shared/fft_params.h"
#include "../../shared/rocfft_hip.h"
#include "rocfft/rocfft.h"
#include
#include
// Exception type thrown by hip_V_Throw when a HIP status value is not hipSuccess.
// It carries nothing beyond the std::runtime_error message.
class rocfft_hip_runtime_error : public std::runtime_error
{
public:
    // msg: text later returned by what(); defaults to the empty string.
    rocfft_hip_runtime_error(const std::string& msg = "")
        : std::runtime_error(msg)
    {
    }
};
// Checks a HIP status value, either the direct result of a HIP call or a
// variable holding one. Anything other than hipSuccess is treated as an error:
// a diagnostic containing the status code, file name, line number and the
// caller's message is written to stdout and then thrown as a
// rocfft_hip_runtime_error.
// Note: std::runtime_error does not take unicode strings as input, so only
// narrow strings are supported.
inline void
    hip_V_Throw(hipError_t res, const std::string& msg, size_t lineno, const std::string& fileName)
{
    // Nothing to report on success.
    if(res == hipSuccess)
        return;

    std::stringstream report;
    report << "HIP_V_THROWERROR< " << res << " > (" << fileName << " Line: " << lineno
           << "): " << msg;

    const std::string text = report.str();
    std::cout << text << std::endl;
    throw rocfft_hip_runtime_error(text);
}
// Exception type thrown by lib_V_Throw when a rocFFT status value is not
// rocfft_status_success. It carries nothing beyond the std::runtime_error message.
class rocfft_runtime_error : public std::runtime_error
{
public:
    // msg: text later returned by what(); defaults to the empty string.
    rocfft_runtime_error(const std::string& msg = "")
        : std::runtime_error(msg)
    {
    }
};
// Checks a rocFFT status value. Anything other than rocfft_status_success is
// treated as an error: a diagnostic containing the status code, file name, line
// number and the caller's message is written to stdout and then thrown as a
// rocfft_runtime_error.
inline void lib_V_Throw(rocfft_status      res,
                        const std::string& msg,
                        size_t             lineno,
                        const std::string& fileName)
{
    // Nothing to report on success.
    if(res == rocfft_status_success)
        return;

    std::stringstream report;
    report << "LIB_V_THROWERROR< " << res << " > (" << fileName << " Line: " << lineno
           << "): " << msg;

    const std::string text = report.str();
    std::cout << text << std::endl;
    throw rocfft_runtime_error(text);
}
// Convenience wrappers around hip_V_Throw / lib_V_Throw that pass the call
// site's __LINE__ and __FILE__ so the diagnostic reports where the check was made.
#define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__)
#define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__)
// return input bricks for params, or one big brick covering the
// input field if no bricks are specified
//
// When params.ifields is non-empty, the bricks of the first input field are
// returned. Otherwise a single brick is built whose leading index is the batch
// dimension (upper bound nbatch, stride idist), followed by the input lengths
// and input strides.
// NOTE(review): the template parameter list and the element type of the
// returned vector appear to have been lost from this copy of the file.
template
std::vector get_input_bricks(const Tparams& params)
{
std::vector bricks;
if(!params.ifields.empty())
bricks = params.ifields[0].bricks;
else
{
auto len = params.ilength();
// just make one big brick covering the whole input field
bricks.resize(1);
// one extra leading entry for the batch dimension
bricks.front().lower.resize(len.size() + 1);
bricks.front().upper.resize(len.size() + 1);
bricks.front().stride.resize(len.size() + 1);
bricks.front().upper.front() = params.nbatch;
std::copy(len.begin(), len.end(), bricks.front().upper.begin() + 1);
bricks.front().stride.front() = params.idist;
// NOTE(review): assumes params.istride has no more entries than len - confirm.
std::copy(params.istride.begin(), params.istride.end(), bricks.front().stride.begin() + 1);
}
return bricks;
}
// return output bricks for params, or one big brick covering the
// output field if no bricks are specified
//
// When params.ofields is non-empty, the bricks of the first output field are
// returned. Otherwise a single brick is built whose leading index is the batch
// dimension (upper bound nbatch, stride odist), followed by the output lengths
// and output strides.
// NOTE(review): the template parameter list and the element type of the
// returned vector appear to have been lost from this copy of the file.
template
std::vector get_output_bricks(const Tparams& params)
{
std::vector bricks;
if(!params.ofields.empty())
bricks = params.ofields[0].bricks;
else
{
auto len = params.olength();
// just make one big brick covering the whole output field
bricks.resize(1);
// one extra leading entry for the batch dimension
bricks.front().lower.resize(len.size() + 1);
bricks.front().upper.resize(len.size() + 1);
bricks.front().stride.resize(len.size() + 1);
bricks.front().upper.front() = params.nbatch;
std::copy(len.begin(), len.end(), bricks.front().upper.begin() + 1);
bricks.front().stride.front() = params.odist;
// NOTE(review): assumes params.ostride has no more entries than len - confirm.
std::copy(params.ostride.begin(), params.ostride.end(), bricks.front().stride.begin() + 1);
}
return bricks;
}
// Allocate input/output buffers for a bench run.
//
// ibricks/obricks: brick descriptions for the input and output side.
// ibuffers:        receives the device buffers for the input.
// obuffer_data:    receives the device buffers for the output (out-of-place only).
// obuffers:        set to point at ibuffers (in-place) or obuffer_data (out-of-place).
// host_buffers:    receives matching host buffers when is_host_gen is true.
// is_host_gen:     whether input is generated on the host.
//
// Throws std::runtime_error when a device allocation fails or when an in-place
// transform is requested for a brick layout this function does not accept.
// NOTE(review): the template parameter list and the vector element types appear
// to have been lost from this copy of the file.
template
void alloc_bench_bricks(const Tparams& params,
const std::vector& ibricks,
const std::vector& obricks,
std::vector& ibuffers,
std::vector& obuffer_data,
std::vector*& obuffers,
std::vector& host_buffers,
bool is_host_gen)
{
// Helper: append one buffer per brick (two for planar types) to `output`, and
// matching host buffers to host_buffers when is_host_gen is set.
// NOTE(review): "¶ms" in the capture list looks like an encoding-mangled
// "&params" - confirm against upstream.
auto alloc_buffers = [¶ms, &host_buffers](const std::vector& bricks,
fft_array_type type,
std::vector& output,
bool is_host_gen) {
auto elem_size = var_size(params.precision, type);
const bool is_planar
= type == fft_array_type_complex_planar || type == fft_array_type_hermitian_planar;
// alloc 2x buffers, each half size for planar
if(is_planar)
elem_size /= 2;
for(const auto& b : bricks)
{
// allocate on the device named by the brick
rocfft_scoped_device dev(b.device);
size_t brick_size_bytes = compute_ptrdiff(b.length(), b.stride, 0, 0) * elem_size;
output.emplace_back();
if(output.back().alloc(brick_size_bytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed");
if(is_planar)
{
output.emplace_back();
if(output.back().alloc(brick_size_bytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed");
}
if(is_host_gen)
{
// NOTE(review): unlike the device allocations above, the result of
// these host allocations is not checked.
host_buffers.emplace_back();
host_buffers.back().alloc(brick_size_bytes);
if(is_planar)
{
host_buffers.emplace_back();
host_buffers.back().alloc(brick_size_bytes);
}
}
}
};
// If brick shape differs, inplace is only allowed for single
// bricks. e.g. in-place real-complex
if(params.placement == fft_placement_inplace)
{
// NOTE(review): with && joining all three tests this only throws when BOTH
// sides have a brick count other than one; if differing shapes require a
// single brick on each side, confirm whether || was intended between the
// two size tests.
if(ibricks.size() != 1 && obricks.size() != 1 && ibricks != obricks)
throw std::runtime_error(
"in-place transform to different brick shapes only allowed for single bricks");
// allocate the larger of the two bricks
// (only the first brick on each side is compared)
auto isize_bytes = compute_ptrdiff(ibricks.front().length(), ibricks.front().stride, 0, 0)
* var_size(params.precision, params.itype);
auto osize_bytes = compute_ptrdiff(obricks.front().length(), obricks.front().stride, 0, 0)
* var_size(params.precision, params.otype);
alloc_buffers(isize_bytes > osize_bytes ? ibricks : obricks,
isize_bytes > osize_bytes ? params.itype : params.otype,
ibuffers,
is_host_gen);
// in-place: the output aliases the input buffers
obuffers = &ibuffers;
}
else
{
alloc_buffers(ibricks, params.itype, ibuffers, is_host_gen);
// host buffers are only ever created for the input side
alloc_buffers(obricks, params.otype, obuffer_data, false);
obuffers = &obuffer_data;
}
}
// Copy each host buffer to the device buffer with the same index using a
// host-to-device hipMemcpy of host_buffers[i].size() bytes. Throws
// std::runtime_error if any copy fails.
// NOTE(review): the loop is bounded by buffers.size() but indexes host_buffers
// as well, so it assumes host_buffers has at least as many entries - confirm.
// NOTE(review): this is a non-template, non-inline function defined in a
// header; including the header from more than one translation unit of the same
// program would produce duplicate definitions - confirm whether `inline` is wanted.
// NOTE(review): the vector element types appear to have been lost from this
// copy of the file.
void copy_host_input_to_dev(std::vector& host_buffers, std::vector& buffers)
{
for(size_t i = 0; i < buffers.size(); ++i)
{
if(hipMemcpy(buffers[i].data(),
host_buffers[i].data(),
host_buffers[i].size(),
hipMemcpyHostToDevice)
!= hipSuccess)
throw std::runtime_error("hipMemcpy failure");
}
}
// Fill the benchmark input buffers.
//
// With is_host_gen set, init_local_input is run on the host buffers and the
// result is copied to the device buffers. Otherwise init_local_input is run
// directly on the device buffers, but only when USE_HIPRAND is defined; without
// it this branch does nothing.
// NOTE(review): the template parameter list and the vector element types appear
// to have been lost from this copy of the file.
template
void init_bench_input(const Tparams& params,
const std::vector& bricks,
std::vector& buffers,
std::vector& host_buffers,
bool is_host_gen)
{
auto elem_size = var_size(params.precision, params.itype);
if(is_host_gen)
{
// gather raw host pointers for the generator
std::vector ptrs;
ptrs.reserve(host_buffers.size());
for(auto& buf : host_buffers)
ptrs.push_back(buf.data());
init_local_input(0, params, bricks, elem_size, ptrs);
copy_host_input_to_dev(host_buffers, buffers);
}
else
{
#ifdef USE_HIPRAND
// gather raw device pointers for the generator
std::vector ptrs;
ptrs.reserve(buffers.size());
for(auto& buf : buffers)
ptrs.push_back(buf.data());
init_local_input(0, params, bricks, elem_size, ptrs);
#endif
}
}
// Copy every device buffer in `buffer` to a freshly allocated host buffer of
// the same size and print the result through params.print_ibuffer (input true)
// or params.print_obuffer (input false). Throws std::runtime_error if a
// device-to-host copy fails.
// NOTE(review): the template parameter list and the vector element types appear
// to have been lost from this copy of the file.
template
void print_device_buffer(const Tparams& params, std::vector& buffer, bool input)
{
// copy data back to host
std::vector print_buffer;
for(auto& buf : buffer)
{
print_buffer.emplace_back();
// NOTE(review): the result of this host allocation is not checked.
print_buffer.back().alloc(buf.size());
if(hipMemcpy(print_buffer.back().data(), buf.data(), buf.size(), hipMemcpyDeviceToHost)
!= hipSuccess)
throw std::runtime_error("hipMemcpy failed");
}
if(input)
params.print_ibuffer(print_buffer);
else
params.print_obuffer(print_buffer);
}
#endif // ROCFFT_BENCH_H
rocFFT-rocm-7.1.0/clients/bench/dyna-bench.cpp 0000664 0000000 0000000 00000073144 15066521634 0021055 0 ustar 00root root 0000000 0000000 // Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
// This file allows one to run tests against multiple different rocFFT libraries at the same time.
// This allows one to randomize the execution order for a better experimental setup
// which produces fewer type 1 errors where one incorrectly rejects the null hypothesis.
#include
#if __has_include()
#include
#else
#include
namespace std
{
namespace filesystem = experimental::filesystem;
}
#endif
#include
#include
#include
#include
#ifdef WIN32
#include
// psapi.h requires windows.h to be included first
#include
#else
#include
#include
#endif
#include "../../shared/CLI11.hpp"
#include "../../shared/gpubuf.h"
#include "../../shared/hip_object_wrapper.h"
#include "../../shared/rocfft_params.h"
#include "bench.h"
#include "rocfft/rocfft.h"
#ifdef WIN32
typedef HMODULE ROCFFT_LIB;
#else
typedef void* ROCFFT_LIB;
#endif
// Open the rocfft shared library found at `path` and return its handle.
// Uses LoadLibraryA on Windows and dlopen with RTLD_LAZY elsewhere; the handle
// is returned exactly as the platform call produced it.
ROCFFT_LIB rocfft_lib_load(const std::string& path)
{
    const char* libpath = path.c_str();
#ifdef WIN32
    ROCFFT_LIB handle = LoadLibraryA(libpath);
#else
    ROCFFT_LIB handle = dlopen(libpath, RTLD_LAZY);
#endif
    return handle;
}
// Return a string describing the error loading rocfft.
//
// On Windows this is the decimal value of GetLastError(), held in a
// function-local static string that is overwritten by each call. Elsewhere it
// is the text from dlerror().
//
// The result is never null: dlerror() returns NULL when no error is pending,
// and callers stream the result straight into an ostream, which must not be
// given a null character pointer. A fixed fallback text is returned instead.
const char* rocfft_lib_load_error()
{
#ifdef WIN32
    // just return the error number
    static std::string error_str;
    error_str = std::to_string(GetLastError());
    return error_str.c_str();
#else
    const char* err = dlerror();
    return err != NULL ? err : "unknown error";
#endif
}
// Get symbol from rocfft lib
//
// Looks up `sym` in the library referred to by `libhandle`, via GetProcAddress
// on Windows and dlsym elsewhere, and returns the result as an untyped pointer.
// Callers in this file cast it to the matching function-pointer type.
// NOTE(review): the reinterpret_cast below has no target type; it appears to
// have been lost from this copy of the file.
void* rocfft_lib_symbol(ROCFFT_LIB libhandle, const char* sym)
{
#ifdef WIN32
return reinterpret_cast(GetProcAddress(libhandle, sym));
#else
return dlsym(libhandle, sym);
#endif
}
// Release a library handle obtained from rocfft_lib_load.
//
// A null handle (what rocfft_lib_load yields when loading failed) is ignored,
// because dlclose requires a handle that came from a successful dlopen.
void rocfft_lib_close(ROCFFT_LIB libhandle)
{
    if(libhandle == NULL)
        return;
#ifdef WIN32
    FreeLibrary(libhandle);
#else
    dlclose(libhandle);
#endif
}
// Given a libhandle from dload, return a rocFFT plan with the given parameters.
//
// All rocFFT entry points are resolved from `libhandle` rather than linked
// directly, so the plan is created by whichever library the handle refers to.
// The function calls rocfft_setup, creates a plan description carrying the data
// layout (array types, offsets, strides and distances), creates the plan from
// it, and destroys the description again.
//
// Throws rocfft_runtime_error if a required symbol cannot be resolved or if any
// of the rocFFT calls reports a status other than rocfft_status_success.
rocfft_plan make_plan(ROCFFT_LIB libhandle, const fft_params& params)
{
    // Resolve a required entry point. Calling through a null function pointer is
    // undefined behaviour, so a missing symbol is reported up front.
    auto require_symbol = [libhandle](const char* name) {
        void* sym = rocfft_lib_symbol(libhandle, name);
        if(sym == NULL)
            throw rocfft_runtime_error(std::string("symbol not found: ") + name);
        return sym;
    };

    auto procfft_setup = (decltype(&rocfft_setup))require_symbol("rocfft_setup");
    auto procfft_plan_description_create = (decltype(&rocfft_plan_description_create))
        require_symbol("rocfft_plan_description_create");
    auto procfft_plan_description_destroy = (decltype(&rocfft_plan_description_destroy))
        require_symbol("rocfft_plan_description_destroy");
    auto procfft_plan_description_set_data_layout
        = (decltype(&rocfft_plan_description_set_data_layout))require_symbol(
            "rocfft_plan_description_set_data_layout");
    auto procfft_plan_create = (decltype(&rocfft_plan_create))require_symbol("rocfft_plan_create");

    LIB_V_THROW(procfft_setup(), "rocfft_setup failed");

    rocfft_plan_description desc = NULL;
    LIB_V_THROW(procfft_plan_description_create(&desc), "rocfft_plan_description_create failed");

    rocfft_plan plan = NULL;
    try
    {
        LIB_V_THROW(
            procfft_plan_description_set_data_layout(desc,
                                                     rocfft_array_type_from_fftparams(params.itype),
                                                     rocfft_array_type_from_fftparams(params.otype),
                                                     params.ioffset.data(),
                                                     params.ooffset.data(),
                                                     params.istride.size(),
                                                     params.istride.data(),
                                                     params.idist,
                                                     params.ostride.size(),
                                                     params.ostride.data(),
                                                     params.odist),
            "rocfft_plan_description_set_data_layout failed");

        LIB_V_THROW(procfft_plan_create(&plan,
                                        rocfft_result_placement_from_fftparams(params.placement),
                                        rocfft_transform_type_from_fftparams(params.transform_type),
                                        rocfft_precision_from_fftparams(params.precision),
                                        params.length.size(),
                                        params.length.data(),
                                        params.nbatch,
                                        desc),
                    "rocfft_plan_create failed");
    }
    catch(...)
    {
        // Best effort: do not leak the description when layout or plan creation
        // fails. The original error is the one worth reporting.
        procfft_plan_description_destroy(desc);
        throw;
    }

    LIB_V_THROW(procfft_plan_description_destroy(desc), "rocfft_plan_description_destroy failed");
    return plan;
}
// Given a libhandle from dload and a rocFFT plan, destroy the plan.
//
// After destroying the plan, rocfft_cleanup is called as well if the library
// exports it. Throws rocfft_runtime_error if rocfft_plan_destroy cannot be
// resolved or if either call reports failure.
void destroy_plan(ROCFFT_LIB libhandle, rocfft_plan& plan)
{
    auto procfft_plan_destroy
        = (decltype(&rocfft_plan_destroy))rocfft_lib_symbol(libhandle, "rocfft_plan_destroy");
    // Calling through a null function pointer is undefined behaviour.
    if(procfft_plan_destroy == NULL)
        throw rocfft_runtime_error("symbol not found: rocfft_plan_destroy");
    LIB_V_THROW(procfft_plan_destroy(plan), "rocfft_plan_destroy failed");

    // Cleanup is optional: it is skipped when the symbol is absent.
    auto procfft_cleanup
        = (decltype(&rocfft_cleanup))rocfft_lib_symbol(libhandle, "rocfft_cleanup");
    if(procfft_cleanup)
        LIB_V_THROW(procfft_cleanup(), "rocfft_cleanup failed");
}
// Given a libhandle from dload and a rocFFT execution info structure, destroy the info.
//
// Throws rocfft_runtime_error if rocfft_execution_info_destroy cannot be
// resolved from the library or if the call reports failure.
void destroy_info(ROCFFT_LIB libhandle, rocfft_execution_info& info)
{
    auto procfft_execution_info_destroy
        = (decltype(&rocfft_execution_info_destroy))rocfft_lib_symbol(
            libhandle, "rocfft_execution_info_destroy");
    // Calling through a null function pointer is undefined behaviour.
    if(procfft_execution_info_destroy == NULL)
        throw rocfft_runtime_error("symbol not found: rocfft_execution_info_destroy");
    LIB_V_THROW(procfft_execution_info_destroy(info), "rocfft_execution_info_destroy failed");
}
// Given a libhandle from dload, and a corresponding rocFFT plan, return how much work
// buffer is required.
//
// The value is whatever rocfft_plan_get_work_buffer_size reports for the plan.
// Throws rocfft_runtime_error if that symbol cannot be resolved from the
// library or if the call reports failure.
size_t get_wbuffersize(ROCFFT_LIB libhandle, const rocfft_plan& plan)
{
    auto procfft_plan_get_work_buffer_size
        = (decltype(&rocfft_plan_get_work_buffer_size))rocfft_lib_symbol(
            libhandle, "rocfft_plan_get_work_buffer_size");
    // Calling through a null function pointer is undefined behaviour.
    if(procfft_plan_get_work_buffer_size == NULL)
        throw rocfft_runtime_error("symbol not found: rocfft_plan_get_work_buffer_size");

    // Get the buffersize
    size_t workBufferSize = 0;
    LIB_V_THROW(procfft_plan_get_work_buffer_size(plan, &workBufferSize),
                "rocfft_plan_get_work_buffer_size failed");
    return workBufferSize;
}
// Given a libhandle from dload and a corresponding rocFFT plan, print the plan information.
//
// Printing is delegated to the library's rocfft_plan_get_print. Throws
// rocfft_runtime_error if that symbol cannot be resolved or if the call
// reports failure.
void show_plan(ROCFFT_LIB libhandle, const rocfft_plan& plan)
{
    auto procfft_plan_get_print
        = (decltype(&rocfft_plan_get_print))rocfft_lib_symbol(libhandle, "rocfft_plan_get_print");
    // Calling through a null function pointer is undefined behaviour.
    if(procfft_plan_get_print == NULL)
        throw rocfft_runtime_error("symbol not found: rocfft_plan_get_print");
    LIB_V_THROW(procfft_plan_get_print(plan), "rocfft_plan_get_print failed");
}
// Given a libhandle from dload, create and return a rocFFT execution info object
// using that library's rocfft_execution_info_create.
//
// Throws rocfft_runtime_error if the symbol cannot be resolved or if the call
// reports failure. The returned object can be released with destroy_info.
rocfft_execution_info make_execinfo(ROCFFT_LIB libhandle)
{
    auto procfft_execution_info_create = (decltype(&rocfft_execution_info_create))rocfft_lib_symbol(
        libhandle, "rocfft_execution_info_create");
    // Calling through a null function pointer is undefined behaviour.
    if(procfft_execution_info_create == NULL)
        throw rocfft_runtime_error("symbol not found: rocfft_execution_info_create");

    rocfft_execution_info info = NULL;
    LIB_V_THROW(procfft_execution_info_create(&info), "rocfft_execution_info_create failed");
    return info;
}
// FIXME: doc
void set_work_buffer(const ROCFFT_LIB& libhandle,
rocfft_execution_info& info,
const size_t wbuffersize,
void* wbuffer)
{
if(wbuffersize > 0 && wbuffer != NULL)
{
auto procfft_execution_info_set_work_buffer
= (decltype(&rocfft_execution_info_set_work_buffer))rocfft_lib_symbol(
libhandle, "rocfft_execution_info_set_work_buffer");
LIB_V_THROW(procfft_execution_info_set_work_buffer(info, wbuffer, wbuffersize),
"rocfft_execution_info_set_work_buffer failed");
}
}
// Given a libhandle from dload and a corresponding rocFFT plan and execution info,
// execute a transform on the given input and output buffers and return the kernel
// execution time.
//
// The time is the value hipEventElapsedTime reports between two events recorded
// immediately before and after the rocfft_execute call.
//
// Throws rocfft_runtime_error if rocfft_execute cannot be resolved from the
// library, rocfft_hip_runtime_error (through HIP_V_THROW) if an event operation
// fails, and std::runtime_error if the execution itself reports failure.
float run_plan(
    ROCFFT_LIB libhandle, rocfft_plan plan, rocfft_execution_info info, void** in, void** out)
{
    auto procfft_execute
        = (decltype(&rocfft_execute))rocfft_lib_symbol(libhandle, "rocfft_execute");
    // Calling through a null function pointer is undefined behaviour; check
    // before any timing events are recorded.
    if(procfft_execute == NULL)
        throw rocfft_runtime_error("symbol not found: rocfft_execute");

    hipEvent_wrapper_t start, stop;
    start.alloc();
    stop.alloc();

    HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed");
    auto rcfft = procfft_execute(plan, in, out, info);
    HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed");
    HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed");

    // The status is examined only after the stop event has been synchronized.
    if(rcfft != rocfft_status_success)
    {
        throw std::runtime_error("execution failed");
    }

    float time;
    HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed");
    return time;
}
// Load the rocFFT library at `libstring`, create a plan for `params` with it,
// and return the (library handle, plan) pair.
//
// Throws std::runtime_error, including the loader's error text, when the
// library cannot be opened; exceptions from make_plan propagate to the caller.
// NOTE(review): if make_plan throws, the handle opened here is not closed
// before the exception leaves this function.
// NOTE(review): the template arguments of the std::pair return type appear to
// have been lost from this copy of the file.
std::pair create_handleplan(const std::string& libstring,
const fft_params& params)
{
auto libhandle = rocfft_lib_load(libstring);
if(libhandle == NULL)
{
std::stringstream ss;
ss << "Failed to open " << libstring << ", error: " << rocfft_lib_load_error();
throw std::runtime_error(ss.str());
}
auto plan = make_plan(libhandle, params);
return std::make_pair(libhandle, plan);
}
int main(int argc, char* argv[])
{
// Control output verbosity:
int verbose{};
// number of GPUs to use:
int ngpus{};
// hip Device number for running tests:
int deviceId{};
// Ignore runtime failures.
// eg: hipMalloc failing when there isn't enough free vram.
bool ignore_hip_runtime_failures{true};
// Number of performance trial samples:
int ntrial{};
// Bool to specify whether the libs are loaded in forward or forward+reverse order.
int reverse{};
// Test sequence choice:
int test_sequence{};
// Vector of test target libraries
std::vector lib_strings;
// FFT parameters:
fft_params params;
// input/output FFT grids
std::vector ingrid;
std::vector outgrid;
// Token string to fully specify fft params.
std::string token;
CLI::App app{"dyna-rocfft-bench command line options"};
// Declare the supported options. Some option pointers are declared to track passed opts.
// FIXME: version needs to be implemented
app.add_flag("--version",
"Print queryable version information from the rocfft library and exit");
app.add_flag("--reverse", reverse, "Load libs in forward and reverse order")->default_val(1);
app.add_option(
"--sequence", test_sequence, "Test sequence:\n0) random\n1) alternating\n2) sequential")
->default_val(0);
app.add_option("--lib", lib_strings, "Set test target library full path (appendable)");
CLI::Option* opt_token
= app.add_option("--token", token, "Token to read FFT params from")->default_val("");
// Group together options that conflict with --token
auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token");
non_token
->add_flag("--double", "Double precision transform (deprecated: use --precision double)")
->each([&](const std::string&) { params.precision = fft_precision_double; });
non_token->excludes(opt_token);
non_token
->add_option("-t, --transformType",
params.transform_type,
"Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
"forward\n3) real inverse")
->default_val(fft_transform_type_complex_forward);
non_token
->add_option(
"--precision", params.precision, "Transform precision: single (default), double, half")
->excludes("--double");
CLI::Option* opt_not_in_place
= non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)")
->each([&](const std::string&) { params.placement = fft_placement_notinplace; });
non_token
->add_option("--itype",
params.itype,
"Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) "
"hermitian interleaved\n4) hermitian planar")
->default_val(fft_array_type_unset);
non_token
->add_option("--otype",
params.otype,
"Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) "
"hermitian interleaved\n4) hermitian planar")
->default_val(fft_array_type_unset);
CLI::Option* opt_length
= non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3);
non_token->add_option("--ngpus", ngpus, "Number of GPUs to use")
->default_val(1)
->check(CLI::NonNegativeNumber);
// define multi-GPU grids for FFT computation,
CLI::Option* opt_ingrid
= non_token->add_option("--ingrid", ingrid, "Single-process grid of GPUs at input")
->expected(1, 3)
->needs("--ngpus");
CLI::Option* opt_outgrid
= non_token->add_option("--outgrid", outgrid, "Single-process grid of GPUs at output")
->expected(1, 3)
->needs("--ngpus");
non_token
->add_option("-b, --batchSize",
params.nbatch,
"If this value is greater than one, arrays will be used")
->default_val(1);
CLI::Option* opt_istride = non_token->add_option("--istride", params.istride, "Input strides");
CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides");
non_token->add_option("--idist", params.idist, "Logical distance between input batches")
->default_val(0)
->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; });
non_token->add_option("--odist", params.odist, "Logical distance between output batches")
->default_val(0)
->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; });
CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset");
CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset");
app.add_flag("--ignore_runtime_failures,!--no-ignore_runtime_failures",
ignore_hip_runtime_failures,
"Ignore hip runtime failures");
app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0);
app.add_option("-N, --ntrial", ntrial, "Trial size for the problem")
->default_val(1)
->each([&](const std::string& val) {
std::cout << "Running profile with " << val << " samples\n";
});
// Default value is set in fft_params.h based on if device-side PRNG was enabled.
app.add_option("-g, --inputGen",
params.igen,
"Input data generation:\n0) PRNG sequence (device)\n"
"1) PRNG sequence (host)\n"
"2) linearly-spaced sequence (device)\n"
"3) linearly-spaced sequence (host)");
app.add_option("--isize", params.isize, "Logical size of input buffer");
app.add_option("--osize", params.osize, "Logical size of output buffer");
app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output");
// Parse args and catch any errors here
try
{
app.parse(argc, argv);
}
catch(const CLI::ParseError& e)
{
return app.exit(e);
}
// Check if all the provided libraries are actually there:
for(const auto& lib_string : lib_strings)
{
if(!std::filesystem::exists(lib_string))
{
std::cerr << "Error: lib " << lib_string << " does not exist\n";
return EXIT_FAILURE;
}
}
if(!token.empty())
{
std::cout << "Reading fft params from token:\n" << token << std::endl;
try
{
params.from_token(token);
}
catch(...)
{
std::cout << "Unable to parse token." << std::endl;
return EXIT_FAILURE;
}
}
else
{
if(ngpus > 1)
{
// set default GPU grids in case none were given
params.set_default_grid(ngpus, ingrid, outgrid);
// split the problem among ngpus
params.mp_lib = fft_params::fft_mp_lib_none;
int localDeviceCount = 0;
if(hipGetDeviceCount(&localDeviceCount) != hipSuccess)
{
throw std::runtime_error("hipGetDeviceCount failed");
}
// start with all-ones in grids
std::vector input_grid(params.length.size() + 1, 1);
std::vector output_grid(params.length.size() + 1, 1);
// create input and output grids and distribute it according to user requirements
std::copy(ingrid.begin(), ingrid.end(), input_grid.begin() + 1);
std::copy(outgrid.begin(), outgrid.end(), output_grid.begin() + 1);
params.distribute_input(localDeviceCount, input_grid);
params.distribute_output(localDeviceCount, output_grid);
}
if(*opt_not_in_place)
{
std::cout << "out-of-place\n";
}
else
{
std::cout << "in-place\n";
}
if(*opt_length)
{
std::cout << "length:";
for(auto& i : params.length)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_istride)
{
std::cout << "istride:";
for(auto& i : params.istride)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ostride)
{
std::cout << "ostride:";
for(auto& i : params.ostride)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ioffset)
{
std::cout << "ioffset:";
for(auto& i : params.ioffset)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ooffset)
{
std::cout << "ooffset:";
for(auto& i : params.ooffset)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_ingrid || !ingrid.empty())
{
std::cout << "input grid:";
for(auto& i : ingrid)
std::cout << " " << i;
std::cout << "\n";
}
if(*opt_outgrid || !outgrid.empty())
{
std::cout << "output grid:";
for(auto& i : outgrid)
std::cout << " " << i;
std::cout << "\n";
}
}
std::cout << std::flush;
// Set GPU for single-device FFT computation
rocfft_scoped_device dev(deviceId);
params.validate();
if(!params.valid(verbose))
{
throw rocfft_runtime_error("Invalid parameters, add --verbose=1 for detail");
}
std::cout << "Token: " << params.token() << std::endl;
if(verbose)
{
std::cout << params.str() << std::endl;
}
// Check free and total available memory:
size_t free = 0;
size_t total = 0;
try
{
HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
const auto raw_vram_footprint
= params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);
if(!vram_fits_problem(raw_vram_footprint, free))
{
std::cout << "SKIPPED: Problem size (" << raw_vram_footprint
<< ") raw data too large for device.\n";
return EXIT_SUCCESS;
}
// GPU input buffer:
std::vector ibuffer;
std::vector pibuffer;
// CPU-side input buffer
std::vector ibuffer_cpu;
auto is_host_gen = (params.igen == fft_input_generator_host
|| params.igen == fft_input_random_generator_host);
auto ibricks = get_input_bricks(params);
auto obricks = get_output_bricks(params);
std::vector obuffer_data;
std::vector* obuffer = nullptr;
alloc_bench_bricks(
params, ibricks, obricks, ibuffer, obuffer_data, obuffer, ibuffer_cpu, is_host_gen);
init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen);
for(unsigned int i = 0; i < ibuffer.size(); ++i)
{
pibuffer.push_back(ibuffer[i].data());
}
// print input if requested
if(verbose > 1)
{
if(is_host_gen)
{
// data is already on host
params.print_ibuffer(ibuffer_cpu);
}
else
{
print_device_buffer(params, ibuffer, true);
}
}
std::vector pobuffer(obuffer->size());
for(unsigned int i = 0; i < obuffer->size(); ++i)
{
pobuffer[i] = obuffer->at(i).data();
}
// Execution times for loaded libraries:
std::vector> time(lib_strings.size());
// If we are doing a reverse-run, then we need two ntrials; otherwise, just one.
std::vector ntrial_runs;
if(reverse == 0)
{
ntrial_runs.push_back(ntrial);
}
else
{
ntrial_runs.push_back((ntrial + 1) / 2);
ntrial_runs.push_back(ntrial / 2);
}
for(size_t ridx = 0; ridx < ntrial_runs.size(); ++ridx)
{
std::vector> index_lib_string;
for(size_t i = 0; i < lib_strings.size(); ++i)
{
index_lib_string.push_back(std::make_pair(i, lib_strings[i]));
}
if(ridx == 1)
{
std::reverse(index_lib_string.begin(), index_lib_string.end());
}
// Create the handles to the libs and the associated fft plans.
std::vector handle;
std::vector plan;
// Allocate the work buffer: just one, big enough for any dloaded library.
std::vector info;
size_t wbuffer_size = 0;
for(unsigned int idx = 0; idx < lib_strings.size(); ++idx)
{
std::cout << idx << ": " << lib_strings[idx] << "\n";
auto libhandle = rocfft_lib_load(lib_strings[idx]);
if(libhandle == NULL)
{
std::cout << "Failed to open " << lib_strings[idx]
<< ", error: " << rocfft_lib_load_error() << "\n";
return 1;
}
handle.push_back(libhandle);
plan.push_back(make_plan(handle[idx], params));
show_plan(handle[idx], plan[idx]);
wbuffer_size = std::max(wbuffer_size, get_wbuffersize(handle[idx], plan[idx]));
info.push_back(make_execinfo(handle[idx]));
}
std::cout << "Work buffer size: " << wbuffer_size << std::endl;
if(!vram_fits_problem(raw_vram_footprint + wbuffer_size, free))
{
std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << " + " << +wbuffer_size
<< " = " << raw_vram_footprint + wbuffer_size
<< " ) data too large for device.\n";
return EXIT_SUCCESS;
}
gpubuf wbuffer;
if(wbuffer_size)
{
try
{
HIP_V_THROW(wbuffer.alloc(wbuffer_size), "Creating intermediate Buffer failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
// Associate the work buffer to the individual libraries:
for(unsigned int idx = 0; idx < lib_strings.size(); ++idx)
{
set_work_buffer(handle[idx], info[idx], wbuffer_size, wbuffer.data());
}
// Run the plan using its associated rocFFT library:
for(unsigned int idx = 0; idx < handle.size(); ++idx)
{
try
{
run_plan(handle[idx], plan[idx], info[idx], pibuffer.data(), pobuffer.data());
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
std::vector testcase(ntrial_runs[ridx] * index_lib_string.size());
switch(test_sequence)
{
case 0:
{
// Random order:
for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial)
{
for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib)
{
testcase[index_lib_string.size() * itrial + ilib] = ilib;
}
}
std::random_device rd;
std::mt19937 g(rd());
std::shuffle(testcase.begin(), testcase.end(), g);
break;
}
case 1:
// Alternating order:
for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial)
{
for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib)
{
testcase[index_lib_string.size() * itrial + ilib] = ilib;
}
}
break;
case 2:
// Sequential order:
for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial)
{
for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib)
{
testcase[ilib * ntrial + itrial] = ilib;
}
}
break;
default:
throw std::runtime_error("Invalid test sequence choice.");
}
if(verbose > 3)
{
std::cout << "Test case order:";
for(const auto val : testcase)
std::cout << " " << val;
std::cout << "\n";
}
std::cout << "Running the tests...\n";
for(size_t itest = 0; itest < testcase.size(); ++itest)
{
const int tidx = testcase[itest];
if(verbose > 3)
{
std::cout << "running test case " << tidx << " with lib "
<< index_lib_string[tidx].second << "\n";
}
#ifdef USE_HIPRAND
if(!is_host_gen)
params.compute_input(ibuffer);
#endif
if(is_host_gen)
{
for(unsigned int bidx = 0; bidx < ibuffer_cpu.size(); ++bidx)
{
try
{
HIP_V_THROW(hipMemcpy(pibuffer[bidx],
ibuffer_cpu[bidx].data(),
ibuffer_cpu[bidx].size(),
hipMemcpyHostToDevice),
"hipMemcpy failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
}
// Run the plan using its associated rocFFT library:
try
{
time[tidx].push_back(run_plan(
handle[tidx], plan[tidx], info[tidx], pibuffer.data(), pobuffer.data()));
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
if(verbose > 2)
{
auto output = allocate_host_buffer(params.precision, params.otype, params.osize);
for(unsigned int iout = 0; iout < output.size(); ++iout)
{
try
{
HIP_V_THROW(hipMemcpy(output[iout].data(),
pobuffer[iout],
output[iout].size(),
hipMemcpyDeviceToHost),
"hipMemcpy failed");
}
catch(rocfft_hip_runtime_error)
{
return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
}
}
std::cout << "GPU output:\n";
params.print_obuffer(output);
}
}
// Clean up:
for(unsigned int hidx = 0; hidx < handle.size(); ++hidx)
{
destroy_info(handle[hidx], info[hidx]);
destroy_plan(handle[hidx], plan[hidx]);
rocfft_lib_close(handle[hidx]);
}
}
std::cout << "Execution times in ms:\n";
for(unsigned int idx = 0; idx < time.size(); ++idx)
{
std::cout << "\nExecution gpu time:";
for(auto& i : time[idx])
{
std::cout << " " << i;
}
std::cout << " ms" << std::endl;
}
return EXIT_SUCCESS;
}
rocFFT-rocm-7.1.0/clients/cmake/ 0000775 0000000 0000000 00000000000 15066521634 0016331 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/clients/cmake/build-gtest.cmake 0000664 0000000 0000000 00000004604 15066521634 0021562 0 ustar 00root root 0000000 0000000 # Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# Locate GoogleTest, or arrange for it to be downloaded and built as an
# external project when it is not available (or when BUILD_GTEST asks for it).
include( ExternalProject )

option( BUILD_GTEST "Download and build GoogleTest" OFF )

# Only look for an installed GoogleTest when a from-source build was not requested.
if( NOT BUILD_GTEST )
  find_package( GTest 1.11.0 )
endif( )

# Fall back to building GoogleTest ourselves, unless some other part of the
# build has already declared a "gtest" target.
if( (NOT TARGET gtest) AND (BUILD_GTEST OR NOT GTEST_FOUND) )

  # Source archive location and checksum; both are cache entries so they can
  # be overridden from the command line.
  set( GTEST_SRC_URL
    https://github.com/google/googletest/archive/release-1.11.0.tar.gz
    CACHE STRING "Location of GTest source code" )
  set( GTEST_SRC_SHA256
    b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5
    CACHE STRING "SHA256 hash of GTest source code" )

  # Headers and static libraries are referenced inside the external project's
  # source and build trees under CMAKE_CURRENT_BINARY_DIR (the PREFIX used below).
  set( GTEST_INCLUDE_DIRS
    ${CMAKE_CURRENT_BINARY_DIR}/src/gtest/googletest/include )
  set( GTEST_LIBRARIES
    ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}
    ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX} )

  # No install step: consumers use the build tree directly. The libraries are
  # listed as byproducts so the generator knows which step produces them.
  ExternalProject_Add( gtest
    URL               ${GTEST_SRC_URL}
    URL_HASH          SHA256=${GTEST_SRC_SHA256}
    PREFIX            ${CMAKE_CURRENT_BINARY_DIR}
    CMAKE_ARGS        -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}
    INSTALL_COMMAND   ""
    BUILD_BYPRODUCTS  ${GTEST_LIBRARIES} )

  # Publishes the external project's directories as source_dir / binary_dir
  # in the including scope.
  ExternalProject_Get_Property( gtest source_dir binary_dir )

endif( )
rocFFT-rocm-7.1.0/clients/cmake/build-options.cmake 0000664 0000000 0000000 00000003600 15066521634 0022122 0 ustar 00root root 0000000 0000000 # Copyright(C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# This file is intended to be used in two ways; independently in a stand alone PROJECT
# and as part of a superbuild. If the file is included in a stand alone project, the
# variables are not expected to be preset, and this will produce options() in the GUI
# for the user to examine. If this file is included in a superbuild, the options will be
# presented in the superbuild GUI, but then passed into the ExternalProject as -D
# parameters, which would already define them.
# Declare the client build switches (tests, benchmarks, samples). Each option
# is only declared when the corresponding variable is not already set to a
# true value, e.g. by a superbuild passing it in with -D.
set( _rocfft_client_doc_TESTS   "Build rocFFT unit tests" )
set( _rocfft_client_doc_BENCH   "Build rocFFT benchmarks" )
set( _rocfft_client_doc_SAMPLES "Build rocFFT samples" )

foreach( _rocfft_client_kind IN ITEMS TESTS BENCH SAMPLES )
  if( NOT BUILD_CLIENTS_${_rocfft_client_kind} )
    option( BUILD_CLIENTS_${_rocfft_client_kind}
      "${_rocfft_client_doc_${_rocfft_client_kind}}" OFF )
  endif( )
  # The description variable is a loop-local helper; do not leak it.
  unset( _rocfft_client_doc_${_rocfft_client_kind} )
endforeach( )
rocFFT-rocm-7.1.0/clients/samples/ 0000775 0000000 0000000 00000000000 15066521634 0016715 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/clients/samples/CMakeLists.txt 0000664 0000000 0000000 00000005053 15066521634 0021460 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install prefix. This has to appear before the project() command:
# it is a non-FORCE cache entry, so it only takes effect while the cache does
# not yet hold a value (a user-supplied -DCMAKE_INSTALL_PREFIX still wins).
#
# CMAKE_BINARY_DIR is used rather than PROJECT_BINARY_DIR because the latter is
# only defined once a project() command has run; in a stand-alone configure it
# would expand to an empty string here and yield the prefix "/package".
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif( )

# This also has to be initialized before the project() command appears.
# Default CMAKE_BUILD_TYPE to Release unless the user specifies one with -D.
# Multi-config generators (CMAKE_CONFIGURATION_TYPES) do not use CMAKE_BUILD_TYPE.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif( )

# Marks that the build was entered at the samples level; the per-sample
# CMakeLists consult this when choosing their output directory.
set( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ON )

project( rocfft-clients-samples LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Sample subdirectories to build; the MPI samples are opt-in.
list( APPEND samples_subdirs "fixed-16" "fixed-large" "rocfft" "multi_gpu" )
if( ROCFFT_MPI_ENABLE )
  list( APPEND samples_subdirs "mpi" )
endif( )

foreach( client IN LISTS samples_subdirs )
  add_subdirectory( ${client} )
endforeach( )
rocFFT-rocm-7.1.0/clients/samples/fixed-16/ 0000775 0000000 0000000 00000000000 15066521634 0020240 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/clients/samples/fixed-16/CMakeLists.txt 0000664 0000000 0000000 00000007327 15066521634 0023011 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
cmake_minimum_required( VERSION 3.16 )

# Default install prefix. This has to appear before the project() command:
# it is a non-FORCE cache entry, so it only takes effect while the cache does
# not yet hold a value (a user-supplied -DCMAKE_INSTALL_PREFIX still wins).
#
# CMAKE_BINARY_DIR is used rather than PROJECT_BINARY_DIR because the latter is
# only defined once a project() command has run; in a stand-alone configure it
# would expand to an empty string here and yield the prefix "/package".
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${CMAKE_BINARY_DIR}/package" CACHE PATH
    "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
    "Install path prefix, prepended onto install directories" )
endif( )

# This also has to be initialized before the project() command appears.
# Default CMAKE_BUILD_TYPE to Release unless the user specifies one with -D.
# Multi-config generators (CMAKE_CONFIGURATION_TYPES) do not use CMAKE_BUILD_TYPE.
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING
    "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif( )

project( rocfft-clients-samples-fixed-16 LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Only search for rocFFT / HIP when an enclosing build has not provided them.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif( )

if( NOT HIP_FOUND )
  find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif( )

# Directory that receives the sample executables. The relative hop depends on
# which level the build was entered at (whole library, clients, samples, or
# this directory alone). It is the same for every sample, so it is computed
# once here rather than on each loop iteration.
if( ROCFFT_BUILD_SCOPE )
  set( FIXED_16_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
  set( FIXED_16_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
  set( FIXED_16_OUT_DIR "/../bin" )
else( )
  set( FIXED_16_OUT_DIR "/bin" )
endif( )
string( CONCAT FIXED_16_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_16_OUT_DIR} )

# One executable per precision; each is built from <name>.cpp.
set( sample_list fixed-16-float fixed-16-double fixed-16-half )

foreach( sample ${sample_list} )

  add_executable( ${sample} ${sample}.cpp )

  target_include_directories( ${sample}
    PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
    )

  target_link_libraries( ${sample} PRIVATE roc::rocfft hip::device )

  target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} )

  set_target_properties( ${sample} PROPERTIES
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    RUNTIME_OUTPUT_DIRECTORY ${FIXED_16_OUT_DIR}
    )

  # Extra include paths and platform define when a CUDA toolchain was found.
  if( CUDA_FOUND )
    target_include_directories( ${sample}
      PRIVATE
        $<BUILD_INTERFACE:${CUDA_INCLUDE_DIRS}>
        $<BUILD_INTERFACE:${hip_INCLUDE_DIRS}>
      )
    target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
  endif( )

  target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} )

endforeach( )
rocFFT-rocm-7.1.0/clients/samples/fixed-16/fixed-16-double.cpp 0000664 0000000 0000000 00000011434 15066521634 0023542 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include "rocfft/rocfft.h"
#include
#include
#include
#include
int main()
{
const size_t N = 16;
std::vector cx(N);
for(size_t i = 0; i < N; i++)
{
cx[i].x = i + (i % 3) - (i % 7);
cx[i].y = 0;
}
// rocfft gpu compute
// ========================================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(double2);
// Create HIP device object.
double2* x;
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = NULL;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_double,
1,
&length,
1,
NULL)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Check if the plan requires a work buffer
size_t work_buf_size = 0;
if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
void* work_buf = nullptr;
rocfft_execution_info info = nullptr;
if(work_buf_size)
{
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan
if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
// Clean up work buffer
if(work_buf_size)
{
if(hipFree(work_buf) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
}
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
// Copy result back to host
std::vector y(N);
if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-7.1.0/clients/samples/fixed-16/fixed-16-float.cpp 0000664 0000000 0000000 00000011430 15066521634 0023371 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include "rocfft/rocfft.h"
#include
#include
#include
#include
int main()
{
const size_t N = 16;
std::vector cx(N);
for(size_t i = 0; i < N; i++)
{
cx[i].x = i + (i % 3) - (i % 7);
cx[i].y = 0;
}
// rocfft gpu compute
// ========================================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(float2);
// Create HIP device object.
float2* x;
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = NULL;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_single,
1,
&length,
1,
NULL)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Check if the plan requires a work buffer
size_t work_buf_size = 0;
if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
void* work_buf = nullptr;
rocfft_execution_info info = nullptr;
if(work_buf_size)
{
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan
if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
// Clean up work buffer
if(work_buf_size)
{
if(hipFree(work_buf) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
}
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
// Copy result back to host
std::vector y(N);
if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-7.1.0/clients/samples/fixed-16/fixed-16-half.cpp 0000664 0000000 0000000 00000011660 15066521634 0023203 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include "rocfft/rocfft.h"
#include
#include
#include
#include
int main()
{
const size_t N = 16;
std::vector<_Float16_2> cx(N);
for(size_t i = 0; i < N; i++)
{
cx[i].x = static_cast<_Float16>(i + (i % 3) - (i % 7));
cx[i].y = 0;
}
// rocfft gpu compute
// ========================================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(_Float16_2);
// Create HIP device object.
_Float16_2* x = nullptr;
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = NULL;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_half,
1,
&length,
1,
NULL)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Check if the plan requires a work buffer
size_t work_buf_size = 0;
if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
void* work_buf = nullptr;
rocfft_execution_info info = nullptr;
if(work_buf_size)
{
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan
if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
// Clean up work buffer
if(work_buf_size)
{
if(hipFree(work_buf) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
}
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
// Copy result back to host
std::vector<_Float16_2> y(N);
if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << static_cast(cx[i].x) << ","
<< static_cast(cx[i].y) << ")"
<< " output: (" << static_cast(y[i].x) << ","
<< static_cast(y[i].y) << ")" << std::endl;
}
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-7.1.0/clients/samples/fixed-large/ 0000775 0000000 0000000 00000000000 15066521634 0021104 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/clients/samples/fixed-large/CMakeLists.txt 0000664 0000000 0000000 00000007333 15066521634 0023652 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
# Build script for the fixed-large rocFFT samples (fixed-large-float and
# fixed-large-double).  Each sample is a single .cpp file linked against
# roc::rocfft.
cmake_minimum_required( VERSION 3.16 )
# This should appear before the project command, because it does not
# use FORCE
# NOTE(review): PROJECT_BINARY_DIR is referenced here before this file's own
# project() call; presumably it is inherited from an enclosing project --
# confirm for stand-alone Windows builds.
if( WIN32 )
set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH
"Install path prefix, prepended onto install directories" )
else( )
set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
"Install path prefix, prepended onto install directories" )
endif( )
# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
set( CMAKE_BUILD_TYPE Release CACHE STRING
"Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()
project( rocfft-clients-samples-fixed-large LANGUAGES CXX )
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )
# Only look for an installed rocfft package when no rocfft target already
# exists in this build.
# NOTE(review): PATHS is given without any entries -- confirm this is intended.
if( NOT TARGET rocfft )
find_package( rocfft REQUIRED CONFIG PATHS )
endif( )
if( NOT HIP_FOUND )
find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()
# One executable per entry; the source file is <name>.cpp.
set( sample_list fixed-large-float fixed-large-double )
foreach( sample ${sample_list} )
add_executable( ${sample} ${sample}.cpp )
# NOTE(review): the bare `$` below looks like a generator expression whose
# `<...>` part is missing in this copy of the file -- restore it from upstream.
target_include_directories( ${sample}
PRIVATE $
)
target_link_libraries( ${sample} PRIVATE roc::rocfft )
target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} )
set_target_properties( ${sample} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
)
# Choose the runtime output directory relative to PROJECT_BINARY_DIR; the
# relative part depends on which *_BUILD_SCOPE variable is set.
if( ROCFFT_BUILD_SCOPE )
set( FIXED_LARGE_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( FIXED_LARGE_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
set( FIXED_LARGE_OUT_DIR "/../bin" )
else()
set( FIXED_LARGE_OUT_DIR "/bin" )
endif()
# Prefix the relative part chosen above with the project binary directory.
string( CONCAT FIXED_LARGE_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_LARGE_OUT_DIR} )
set_target_properties(${sample}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${FIXED_LARGE_OUT_DIR})
# Extra include directories and platform define when CUDA was found.
# NOTE(review): the two bare `$` entries below also look like truncated
# generator expressions -- restore them from upstream.
if( CUDA_FOUND )
target_include_directories( ${sample}
PRIVATE
$
$
)
target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} )
endforeach( )
rocFFT-rocm-7.1.0/clients/samples/fixed-large/fixed-large-double.cpp 0000664 0000000 0000000 00000011654 15066521634 0025256 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include
#include
#include
#include "rocfft/rocfft.h"
#include
#include
int main()
{
// For size N >= 8192, temporary buffer is required to allocated
const size_t N = 64 * 2048;
std::vector cx(N);
for(size_t i = 0; i < N; i++)
{
cx[i].x = i + (i % 3) - (i % 7);
cx[i].y = 0;
}
// rocfft gpu compute
// ========================================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(double2);
// Create HIP device object.
double2* x;
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = nullptr;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_double,
1,
&length,
1,
nullptr)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Setup work buffer
void* workBuffer = nullptr;
size_t workBufferSize = 0;
if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
// Setup exec info to pass work buffer to the library
rocfft_execution_info info = nullptr;
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(workBufferSize > 0)
{
printf("size of workbuffer=%d\n", (int)workBufferSize);
if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan
if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
if(workBuffer)
if(hipFree(workBuffer) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
// Copy result back to host
std::vector y(N);
if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-7.1.0/clients/samples/fixed-large/fixed-large-float.cpp 0000664 0000000 0000000 00000011650 15066521634 0025105 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include
#include
#include
#include "rocfft/rocfft.h"
#include
#include
int main()
{
// For size N >= 8192, temporary buffer is required to allocated
const size_t N = 64 * 2048;
std::vector cx(N);
for(size_t i = 0; i < N; i++)
{
cx[i].x = i + (i % 3) - (i % 7);
cx[i].y = 0;
}
// rocfft gpu compute
// ========================================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(float2);
// Create HIP device object.
float2* x;
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = nullptr;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_single,
1,
&length,
1,
nullptr)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Setup work buffer
void* workBuffer = nullptr;
size_t workBufferSize = 0;
if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
// Setup exec info to pass work buffer to the library
rocfft_execution_info info = nullptr;
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(workBufferSize > 0)
{
printf("size of workbuffer=%d\n", (int)workBufferSize);
if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan
if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
if(workBuffer)
if(hipFree(workBuffer) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
// Copy result back to host
std::vector y(N);
if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-7.1.0/clients/samples/mpi/ 0000775 0000000 0000000 00000000000 15066521634 0017502 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/clients/samples/mpi/CMakeLists.txt 0000664 0000000 0000000 00000010721 15066521634 0022243 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
# Build script for the rocFFT MPI sample (rocfft_mpi_example).  The sample is
# a single .cpp file linked against roc::rocfft and MPI::MPI_CXX, optionally
# with hipRAND and the Cray GTL library.
cmake_minimum_required( VERSION 3.16 )
# This should appear before the project command, because it does not
# use FORCE
# NOTE(review): PROJECT_BINARY_DIR is referenced here before this file's own
# project() call; presumably it is inherited from an enclosing project --
# confirm for stand-alone Windows builds.
if( WIN32 )
set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH
"Install path prefix, prepended onto install directories" )
else( )
set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
"Install path prefix, prepended onto install directories" )
endif( )
# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
set( CMAKE_BUILD_TYPE Release CACHE STRING
"Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()
# NOTE(review): the project name does not mention "mpi"; it may have been
# carried over from another sample directory -- confirm it is intended.
project( rocfft-clients-samples-rocfft LANGUAGES CXX )
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )
# Only look for an installed rocfft package when no rocfft target already
# exists in this build.
# NOTE(review): PATHS is given without any entries -- confirm this is intended.
if( NOT TARGET rocfft )
find_package( rocfft REQUIRED CONFIG PATHS )
endif( )
if( NOT HIP_FOUND )
find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()
if( NOT MPI_FOUND )
find_package( MPI REQUIRED )
endif()
# hipRAND is only required when the USE_HIPRAND option is set.
if( USE_HIPRAND AND NOT hiprand_FOUND )
find_package( hiprand REQUIRED )
endif()
# One executable per entry; the source file is <name>.cpp.
set( sample_list rocfft_mpi_example )
foreach( sample ${sample_list} )
add_executable( ${sample} ${sample}.cpp )
# NOTE(review): the bare `$` below looks like a generator expression whose
# `<...>` part is missing in this copy of the file -- restore it from upstream.
target_include_directories(
${sample}
PRIVATE
$
${MPI_CXX_INCLUDE_PATH}
)
target_link_libraries(
${sample}
PRIVATE roc::rocfft
MPI::MPI_CXX
)
# Printed on every configure, once per sample.
message( "MPI_CXX_LIB_NAMES: ${MPI_CXX_LIB_NAMES}")
# With ROCFFT_CRAY_MPI_ENABLE set, also link mpi_gtl_hsa and add a library
# search directory located relative to the directory of MPI_LIBRARY.
if ( ROCFFT_CRAY_MPI_ENABLE )
target_link_libraries( ${sample}
PRIVATE
"mpi_gtl_hsa"
)
get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
target_link_directories( ${sample}
PRIVATE
${MPI_LIBDIR}/../../../../gtl/lib )
endif()
# Optional hipRAND support: link the library and define USE_HIPRAND for the
# sample's sources.
if ( USE_HIPRAND )
target_link_libraries(
${sample}
PRIVATE
hip::hiprand
)
target_compile_definitions( ${sample} PRIVATE USE_HIPRAND )
endif()
target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
set_target_properties( ${sample} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
)
# Choose the runtime output directory relative to PROJECT_BINARY_DIR; the
# relative part depends on which *_BUILD_SCOPE variable is set.
if( ROCFFT_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../bin" )
else()
set( SAMPLES_ROCFFT_OUT_DIR "/bin" )
endif()
# Prefix the relative part chosen above with the project binary directory.
string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} )
set_target_properties(${sample}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${SAMPLES_ROCFFT_OUT_DIR})
# Extra include directories and platform define when CUDA was found.
# NOTE(review): the two bare `$` entries below also look like truncated
# generator expressions -- restore them from upstream.
if( CUDA_FOUND )
target_include_directories( ${sample}
PRIVATE
$
$
)
target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )
endforeach( )
rocFFT-rocm-7.1.0/clients/samples/mpi/rocfft_mpi_example.cpp 0000664 0000000 0000000 00000040341 15066521634 0024053 0 ustar 00root root 0000000 0000000
/******************************************************************************
* Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include
#include
#include
#include
#include
#include
#include
#include
#include "rocfft.h"
// Check all ranks for an rocFFT non-success status.
auto rocfft_status_sync(const rocfft_status fftrc, const MPI_Comm comm)
{
// Since hipSuccess is the lowest enum value, we can find if there are any errors
// by getting the maximum value of the return code over all procs.
// Guarantee that the enum is an unsigned int so that we can send this via MPI:
static_assert(std::is_same_v, unsigned int>);
auto global_fftrc = rocfft_status_success;
const auto mpirc = MPI_Allreduce(&fftrc, &global_fftrc, 1, MPI_UNSIGNED, MPI_MAX, comm);
if(mpirc != MPI_SUCCESS)
{
return rocfft_status_failure;
}
return global_fftrc;
}
// Check all ranks for an hip runtime non-success status.
auto hip_status_sync(const hipError_t hiprc, const MPI_Comm comm)
{
// Since rocfft_status_success is the lowest enum value, we can find if there are any errors
// by getting the maximum value of the return code over all procs.
// Guarantee that the enum is an unsigned int so that we can send this via MPI:
static_assert(std::is_same_v, unsigned int>);
auto global_hiprc = hipSuccess;
const auto mpirc = MPI_Allreduce(&hiprc, &global_hiprc, 1, MPI_UNSIGNED, MPI_MAX, comm);
if(mpirc != MPI_SUCCESS)
{
return hipErrorUnknown;
}
return global_hiprc;
}
int main(int argc, char** argv)
{
MPI_Init(&argc, &argv);
MPI_Comm mpi_comm = MPI_COMM_WORLD;
int mpi_size = 0;
MPI_Comm_size(mpi_comm, &mpi_size);
int mpi_rank = 0;
MPI_Comm_rank(mpi_comm, &mpi_rank);
if(mpi_rank == 0)
{
std::cout << "rocFFT MPI example\n";
std::cout << "MPI size: " << mpi_size << "\n";
}
// General FFT parameters:
std::vector length = {8, 8};
const rocfft_transform_type direction = rocfft_transform_type_complex_forward;
const rocfft_result_placement place = rocfft_placement_notinplace;
auto fftrc = rocfft_status_success;
auto hiprc = hipSuccess;
fftrc = rocfft_setup();
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed to set up rocFFT");
rocfft_plan_description description = nullptr;
rocfft_plan_description_create(&description);
fftrc = rocfft_plan_description_set_comm(description, rocfft_comm_mpi, &mpi_comm);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed add communicator to description");
// Do not set stride information via the descriptor, they are to be defined during field
// creation below
fftrc = rocfft_plan_description_set_data_layout(description,
rocfft_array_type_complex_interleaved,
rocfft_array_type_complex_interleaved,
nullptr,
nullptr,
0,
nullptr,
0,
0,
nullptr,
0);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed to create description");
// This example is unbatched, so the batch stride is not used
// for anything. For batched examples, this would be
// distance in elements between consecutive batches.
const size_t batch_stride = 0;
if(mpi_rank == 0)
{
std::cout << "input data decomposition:\n";
}
std::vector gpu_in = {nullptr};
{
rocfft_field infield = nullptr;
rocfft_field_create(&infield);
std::vector inbrick_stride = {1, length[1], batch_stride};
const size_t inbrick_length1 = length[1] / (size_t)mpi_size
+ ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0);
const size_t inbrick_lower1
= mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size);
const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1;
std::vector inbrick_lower = {0, inbrick_lower1, 0};
std::vector inbrick_upper = {length[0], inbrick_upper1, 1};
rocfft_brick inbrick = nullptr;
rocfft_brick_create(&inbrick,
inbrick_lower.data(),
inbrick_upper.data(),
inbrick_stride.data(),
inbrick_lower.size(),
0);
rocfft_field_add_brick(infield, inbrick);
rocfft_brick_destroy(inbrick);
inbrick = nullptr;
const size_t memSize = length[0] * inbrick_length1 * sizeof(std::complex);
std::vector> host_in(length[0] * inbrick_length1);
for(auto idx0 = inbrick_lower[0]; idx0 < inbrick_upper[0]; ++idx0)
{
for(auto idx1 = inbrick_lower[1]; idx1 < inbrick_upper[1]; ++idx1)
{
const auto pos = (idx0 - inbrick_lower[0]) * inbrick_stride[0]
+ (idx1 - inbrick_lower[1]) * inbrick_stride[1];
host_in[pos] = std::complex(idx0, idx1);
}
}
// Serialize output:
for(int irank = 0; irank < mpi_size; ++irank)
{
if(mpi_rank == irank)
{
std::cout << "in-brick rank " << irank;
std::cout << "\n\tlower indices:";
for(const auto val : inbrick_lower)
std::cout << " " << val;
std::cout << "\n\tupper indices:";
for(const auto val : inbrick_upper)
std::cout << " " << val;
std::cout << "\n\tstrides:";
for(const auto val : inbrick_stride)
std::cout << " " << val;
std::cout << "\n";
std::cout << "\tbuffer size: " << memSize << "\n";
for(auto idx0 = inbrick_lower[0]; idx0 < inbrick_upper[0]; ++idx0)
{
for(auto idx1 = inbrick_lower[1]; idx1 < inbrick_upper[1]; ++idx1)
{
const auto pos = (idx0 - inbrick_lower[0]) * inbrick_stride[0]
+ (idx1 - inbrick_lower[1]) * inbrick_stride[1];
std::cout << host_in[pos] << " ";
}
std::cout << "\n";
}
}
MPI_Barrier(mpi_comm);
}
hiprc = hipMalloc(&gpu_in[0], memSize);
if(hiprc != hipSuccess)
throw std::runtime_error("inbrick hipMalloc failed");
hiprc = hipMemcpy(gpu_in[0], host_in.data(), memSize, hipMemcpyHostToDevice);
if(hiprc != hipSuccess)
throw std::runtime_error("inbrick hipMemcpy failed");
rocfft_plan_description_add_infield(description, infield);
fftrc = rocfft_field_destroy(infield);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed destroy infield");
}
if(mpi_rank == 0)
{
std::cout << "output data decomposition:\n";
}
std::vector gpu_out = {nullptr};
std::vector outbrick_lower;
std::vector outbrick_upper;
std::vector outbrick_stride = {1, length[1], batch_stride};
{
const size_t outbrick_length1 = length[1] / (size_t)mpi_size
+ ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0);
const size_t outbrick_lower1
= mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size);
const size_t outbrick_upper1 = outbrick_lower1 + outbrick_length1;
outbrick_lower = {0, outbrick_lower1, 0};
outbrick_upper = {length[0], outbrick_upper1, 1};
const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex);
for(int irank = 0; irank < mpi_size; ++irank)
{
if(mpi_rank == irank)
{
std::cout << "out-brick rank " << irank;
std::cout << "\n\tlower indices:";
for(const auto val : outbrick_lower)
std::cout << " " << val;
std::cout << "\n\tupper indices:";
for(const auto val : outbrick_upper)
std::cout << " " << val;
std::cout << "\n\tstrides:";
for(const auto val : outbrick_stride)
std::cout << " " << val;
std::cout << "\n";
std::cout << "\tbuffer size: " << memSize << "\n";
}
MPI_Barrier(mpi_comm);
}
rocfft_field outfield = nullptr;
rocfft_field_create(&outfield);
rocfft_brick outbrick = nullptr;
outbrick_lower = {0, outbrick_lower1, 0};
outbrick_upper = {length[0], outbrick_lower1 + outbrick_length1, 1};
rocfft_brick_create(&outbrick,
outbrick_lower.data(),
outbrick_upper.data(),
outbrick_stride.data(),
outbrick_lower.size(),
0);
rocfft_field_add_brick(outfield, outbrick);
rocfft_brick_destroy(outbrick);
outbrick = nullptr;
hiprc = hipMalloc(&gpu_out[0], memSize);
if(hiprc != hipSuccess)
throw std::runtime_error("outbrick hipMalloc failed");
rocfft_plan_description_add_outfield(description, outfield);
fftrc = rocfft_field_destroy(outfield);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed destroy outfield");
}
// In order still handle non-success return codes without killing all of the MPI processes, we
// put object creation in a try/catch block and destroy non-nullptr objects.
// Serialize output:
for(int irank = 0; irank < mpi_size; ++irank)
{
if(mpi_rank == irank)
{
std::cout << "rank " << irank << "\n";
std::cout << "input ";
for(const auto& b : gpu_in)
std::cout << " " << b;
std::cout << "\n";
std::cout << "output ";
for(const auto& b : gpu_out)
std::cout << " " << b;
std::cout << "\n";
}
MPI_Barrier(mpi_comm);
}
fftrc = rocfft_status_sync(fftrc, mpi_comm);
hiprc = hip_status_sync(hiprc, mpi_comm);
if(mpi_rank == 0)
{
if(fftrc == rocfft_status_success && hiprc == hipSuccess)
{
std::cout << "so far so good, trying to make a plan....\n";
}
else
{
std::cout << "failure: will not make a plan....\n";
}
}
// Create a multi-process plan:
rocfft_plan gpu_plan = nullptr;
if(fftrc == rocfft_status_success && hiprc == hipSuccess)
{
fftrc = rocfft_plan_create(&gpu_plan,
place,
direction,
rocfft_precision_double,
length.size(), // Dimension
length.data(), // lengths
1, // Number of transforms
description); // Description
}
fftrc = rocfft_status_sync(fftrc, mpi_comm);
if(mpi_rank == 0)
{
if(fftrc == rocfft_status_success)
{
std::cout << "so far so good, we have a plan....\n";
}
else
{
std::cout << "failure: we do not have a plan....\n";
}
}
// Execute plan:
if(fftrc == rocfft_status_success)
{
fftrc = rocfft_execute(gpu_plan, (void**)gpu_in.data(), (void**)gpu_out.data(), nullptr);
}
fftrc = rocfft_status_sync(fftrc, mpi_comm);
if(mpi_rank == 0)
{
if(fftrc == rocfft_status_success)
{
std::cout << "The FFT was succesful....\n";
}
else
{
std::cout << "The FFT execution failed....\n";
}
}
// Output the data:
for(int irank = 0; irank < mpi_size; ++irank)
{
if(mpi_rank == irank)
{
std::cout << "out brick rank " << irank << "\n";
const size_t outcount
= (outbrick_upper[0] - outbrick_lower[0]) * (outbrick_upper[1] - outbrick_lower[1]);
std::vector> host_out(outcount);
hiprc = hipMemcpy(host_out.data(),
gpu_out[0],
outcount * sizeof(std::complex),
hipMemcpyDeviceToHost);
if(hiprc != hipSuccess)
throw std::runtime_error("hipMemcpy failed");
for(auto idx0 = outbrick_lower[0]; idx0 < outbrick_upper[0]; ++idx0)
{
for(auto idx1 = outbrick_lower[1]; idx1 < outbrick_upper[1]; ++idx1)
{
const auto pos = (idx0 - outbrick_lower[0]) * outbrick_stride[0]
+ (idx1 - outbrick_lower[1]) * outbrick_stride[1];
std::cout << host_out[pos] << " ";
}
std::cout << "\n";
}
}
MPI_Barrier(mpi_comm);
}
// Cleanup anything plan-generation structs (that aren't null pointers):
if(description != nullptr)
{
if(rocfft_plan_description_destroy(description) != rocfft_status_success)
{
std::cerr << "description descruction failed\n";
}
else
{
description = nullptr;
}
}
// Clean up the plan and rocfft:
try
{
if(gpu_plan != nullptr)
{
if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
gpu_plan = nullptr;
}
}
catch(const std::exception&)
{
std::cerr << "rank " << mpi_rank << " plan destroy failed\n";
}
for(auto& buf : gpu_in)
{
if(buf != nullptr)
{
hiprc = hipFree(buf);
if(hiprc != hipSuccess)
std::cerr << "hipFree failed\n";
buf = nullptr;
}
}
for(auto& buf : gpu_out)
{
if(buf != nullptr)
{
hiprc = hipFree(buf);
if(hiprc != hipSuccess)
std::cerr << "hipFree failed\n";
buf = nullptr;
}
}
fftrc = rocfft_cleanup();
MPI_Finalize();
return 0;
}
rocFFT-rocm-7.1.0/clients/samples/multi_gpu/ 0000775 0000000 0000000 00000000000 15066521634 0020722 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/clients/samples/multi_gpu/CMakeLists.txt 0000664 0000000 0000000 00000010023 15066521634 0023456 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
# Build script for the multi-GPU rocFFT sample(s): configures defaults, locates
# rocfft/hip (and optionally hiprand), then creates one executable per entry in
# sample_list.
cmake_minimum_required( VERSION 3.16 )
# This should appear before the project command, because it does not
# use FORCE
# NOTE(review): PROJECT_BINARY_DIR is referenced here before project() is called in
# this file; confirm it is provided by an enclosing project when WIN32 is set.
if( WIN32 )
set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH
"Install path prefix, prepended onto install directories" )
else( )
set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
"Install path prefix, prepended onto install directories" )
endif( )
# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
set( CMAKE_BUILD_TYPE Release CACHE STRING
"Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()
project( rocfft-clients-samples-multi_gpu LANGUAGES CXX )
# Make find modules in this directory's cmake/ folder visible.
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )
# Only search for an installed rocfft when it is not already a target of this build.
# NOTE(review): PATHS is given with no entries here.
if( NOT TARGET rocfft )
find_package( rocfft REQUIRED CONFIG PATHS )
endif( )
if( NOT HIP_FOUND )
find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()
# hiprand is optional and only looked up when USE_HIPRAND is set.
if( USE_HIPRAND AND NOT hiprand_FOUND )
find_package( hiprand REQUIRED )
endif()
# Each name in sample_list is built from <name>.cpp into an executable <name>.
set( sample_list mgpu_complex)
foreach( sample ${sample_list} )
add_executable( ${sample} ${sample}.cpp )
# NOTE(review): the include-directory argument below is a bare "$" in this copy;
# it looks like a truncated generator expression - verify against the upstream file.
target_include_directories(
${sample}
PRIVATE
$
)
target_link_libraries(
${sample}
PRIVATE roc::rocfft
)
if( USE_HIPRAND )
target_link_libraries(
${sample}
PRIVATE
hip::hiprand
)
target_compile_definitions( ${sample} PRIVATE USE_HIPRAND )
endif()
target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
set_target_properties( ${sample} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
)
# Pick the output directory relative to PROJECT_BINARY_DIR according to which
# *_BUILD_SCOPE variable is set (i.e. how deep in the build tree this project sits).
if( ROCFFT_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../bin" )
else()
set( SAMPLES_ROCFFT_OUT_DIR "/bin" )
endif()
# Prefix the relative suffix chosen above with the binary directory.
string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} )
set_target_properties(${sample}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${SAMPLES_ROCFFT_OUT_DIR})
# Extra include directories and platform define when CUDA was found.
# NOTE(review): the two include arguments below are also bare "$" in this copy.
if( CUDA_FOUND )
target_include_directories( ${sample}
PRIVATE
$
$
)
target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )
endforeach( )
rocFFT-rocm-7.1.0/clients/samples/multi_gpu/mgpu_complex.cpp 0000664 0000000 0000000 00000032345 15066521634 0024134 0 ustar 00root root 0000000 0000000 // Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include
#include
#include
#include
#include
#include "../../../shared/CLI11.hpp"
#include "rocfft/rocfft.h"
#include
#include
#include
// Sample entry point: runs a not-in-place, double-precision, forward complex-to-complex
// FFT whose input and output are each split into one brick per selected GPU device.
// Options: --length (FFT lengths) and --devices (device ids).
// Returns 0 on success, or app.exit(e) when command-line parsing fails. Most failures
// throw std::runtime_error, which is not caught inside this function.
// NOTE(review): template argument lists (e.g. on std::vector / std::complex /
// static_cast) appear to be missing throughout this copy of the file.
int main(int argc, char* argv[])
{
// NOTE(review): the banner says "3D" while the default length and the --length help
// text below are 2-D.
std::cout << "rocfft single-node multi-gpu complex-to-complex 3D FFT example\n";
// Length of transform, first dimension must be greater than number of GPU devices
std::vector length = {8, 8};
// Gpu device ids:
std::vector devices = {0, 1};
// Command-line options:
CLI::App app{"rocfft sample command line options"};
app.add_option("--length", length, "2-D FFT size (eg: --length 256 256)");
app.add_option(
"--devices", devices, "List of devices to use separated by spaces (eg: --devices 1 3)");
try
{
app.parse(argc, argv);
}
catch(const CLI::ParseError& e)
{
return app.exit(e);
}
int deviceCount = devices.size();
std::cout << "Using " << deviceCount << " device(s)\n";
// The result of hipGetDeviceCount is deliberately discarded; nDevices is not
// initialized beforehand.
int nDevices;
(void)hipGetDeviceCount(&nDevices);
std::cout << "Number of available GPUs: " << nDevices << " \n";
// Reject requests whose largest device id is not below the available device count.
// NOTE(review): max_element is dereferenced without checking that devices is non-empty.
if(nDevices <= static_cast(*std::max_element(devices.begin(), devices.end())))
throw std::runtime_error("device ID greater than number of available devices");
// Initialize the rocFFT library
auto fftrc = rocfft_status_success;
fftrc = rocfft_setup();
if(fftrc != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
// Placeness for the transform
const rocfft_result_placement place = rocfft_placement_notinplace;
// Direction of transform
const rocfft_transform_type direction = rocfft_transform_type_complex_forward;
// Plan description; the return values of the create/set calls below are not checked.
rocfft_plan_description description = nullptr;
rocfft_plan_description_create(&description);
// Do not set stride information via the descriptor, they are to be defined during field
// creation below
rocfft_plan_description_set_data_layout(description,
rocfft_array_type_complex_interleaved,
rocfft_array_type_complex_interleaved,
nullptr,
nullptr,
0,
nullptr,
0,
0,
nullptr,
0);
auto hiprc = hipSuccess;
std::cout << "input data decomposition:\n";
// One device buffer pointer per input brick.
std::vector gpu_in(devices.size());
{
// Row-major stride for brick data layout in memory
// NOTE(review): the second stride is length[1] while each brick spans length[0]
// entries along index 0; for length[0] != length[1] the positions computed below may
// not match the length[0] * inbrick_length1 buffer size - confirm.
std::vector inbrick_stride = {1, length[1]};
rocfft_field infield = nullptr;
rocfft_field_create(&infield);
std::vector> inbrick_lower(gpu_in.size());
std::vector> inbrick_upper(gpu_in.size());
for(size_t idx = 0; idx < gpu_in.size(); ++idx)
{
// Split length[1] across the bricks; the first (length[1] % nbricks) bricks get one
// extra entry.
const size_t inbrick_length1
= length[1] / gpu_in.size() + (idx < length[1] % gpu_in.size() ? 1 : 0);
const size_t inbrick_lower1
= idx * (length[1] / gpu_in.size()) + std::min(idx, length[1] % gpu_in.size());
const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1;
// Each brick covers the full range of index 0 and [lower1, upper1) of index 1.
inbrick_lower[idx] = {0, inbrick_lower1};
inbrick_upper[idx] = {length[0], inbrick_upper1};
// The brick handle is destroyed right after being added to the field.
rocfft_brick inbrick = nullptr;
rocfft_brick_create(&inbrick,
inbrick_lower[idx].data(),
inbrick_upper[idx].data(),
inbrick_stride.data(),
inbrick_lower[idx].size(),
devices[idx]);
rocfft_field_add_brick(infield, inbrick);
rocfft_brick_destroy(inbrick);
inbrick = nullptr;
const size_t memSize = length[0] * inbrick_length1 * sizeof(std::complex);
// Report the brick geometry.
std::cout << "in-brick " << idx;
std::cout << "\n\tlower indices:";
for(const auto val : inbrick_lower[idx])
std::cout << " " << val;
std::cout << "\n\tupper indices:";
for(const auto val : inbrick_upper[idx])
std::cout << " " << val;
std::cout << "\n\tstrides:";
for(const auto val : inbrick_stride)
std::cout << " " << val;
std::cout << "\n";
std::cout << "\tbuffer size: " << memSize << "\n";
// Allocate this brick's buffer on its own device.
hiprc = hipSetDevice(devices[idx]);
if(hiprc != hipSuccess)
throw std::runtime_error("hipSetDevice failed");
hiprc = hipMalloc(&gpu_in[idx], memSize);
if(hiprc != hipSuccess)
throw std::runtime_error("hipMalloc failed");
// Fill a host copy so that the element at global index (idx0, idx1) holds the value
// (idx0, idx1), echo it, then upload it to the device buffer.
std::vector> host_in(length[0] * inbrick_length1);
for(auto idx0 = inbrick_lower[idx][0]; idx0 < inbrick_upper[idx][0]; ++idx0)
{
for(auto idx1 = inbrick_lower[idx][1]; idx1 < inbrick_upper[idx][1]; ++idx1)
{
const auto pos = (idx0 - inbrick_lower[idx][0]) * inbrick_stride[0]
+ (idx1 - inbrick_lower[idx][1]) * inbrick_stride[1];
host_in[pos] = std::complex(idx0, idx1);
std::cout << host_in[pos] << " ";
}
std::cout << "\n";
}
hiprc = hipMemcpy(gpu_in[idx], host_in.data(), memSize, hipMemcpyHostToDevice);
if(hiprc != hipSuccess)
throw std::runtime_error("hipMemcpy failed");
}
// Attach the input field to the plan description, then release the field handle.
rocfft_plan_description_add_infield(description, infield);
fftrc = rocfft_field_destroy(infield);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed destroy infield");
}
std::cout << "output data decomposition:\n";
// Output brick bounds and stride are kept outside the scope below because they are
// needed again when printing the results.
std::vector gpu_out(devices.size());
std::vector> outbrick_lower(gpu_out.size());
std::vector> outbrick_upper(gpu_out.size());
std::vector outbrick_stride = {1, length[1]};
{
rocfft_field outfield = nullptr;
rocfft_field_create(&outfield);
for(size_t idx = 0; idx < gpu_out.size(); ++idx)
{
// Same split of length[1] as used for the input bricks.
// NOTE(review): the remainder term uses gpu_in.size() while the rest of this loop
// uses gpu_out.size(); both vectors are sized from devices.size().
const size_t outbrick_length1
= length[1] / gpu_out.size() + (idx < length[1] % gpu_in.size() ? 1 : 0);
const size_t outbrick_lower1
= idx * (length[1] / gpu_out.size()) + std::min(idx, length[1] % gpu_out.size());
rocfft_brick outbrick = nullptr;
outbrick_lower[idx] = {0, outbrick_lower1};
outbrick_upper[idx] = {length[0], outbrick_lower1 + outbrick_length1};
rocfft_brick_create(&outbrick,
outbrick_lower[idx].data(),
outbrick_upper[idx].data(),
outbrick_stride.data(),
outbrick_lower[idx].size(),
devices[idx]);
rocfft_field_add_brick(outfield, outbrick);
rocfft_brick_destroy(outbrick);
outbrick = nullptr;
const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex);
// Report the brick geometry.
std::cout << "out-brick " << idx;
std::cout << "\n\tlower indices:";
for(const auto val : outbrick_lower[idx])
std::cout << " " << val;
std::cout << "\n\tupper indices:";
for(const auto val : outbrick_upper[idx])
std::cout << " " << val;
std::cout << "\n\tstrides:";
for(const auto val : outbrick_stride)
std::cout << " " << val;
std::cout << "\n";
std::cout << "\tbuffer size: " << memSize << "\n";
// Unlike the input loop, the hipSetDevice result is discarded here.
(void)hipSetDevice(devices[idx]);
if(hipMalloc(&gpu_out[idx], memSize) != hipSuccess)
throw std::runtime_error("hipMalloc failed");
}
// Attach the output field to the plan description, then release the field handle.
rocfft_plan_description_add_outfield(description, outfield);
fftrc = rocfft_field_destroy(outfield);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed destroy outfield");
}
// Create a multi-gpu plan:
// The first listed device is made current before the plan is created.
(void)hipSetDevice(devices[0]);
rocfft_plan gpu_plan = nullptr;
fftrc = rocfft_plan_create(&gpu_plan,
place,
direction,
rocfft_precision_double,
length.size(), // Dimension
length.data(), // lengths
1, // Number of transforms
description); // Description
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed to create plan");
// Get execution information and allocate work buffer
rocfft_execution_info planinfo = nullptr;
size_t work_buf_size = 0;
if(rocfft_plan_get_work_buffer_size(gpu_plan, &work_buf_size) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
// An execution-info object and work buffer are only created when the plan reports a
// non-zero work buffer size; otherwise planinfo stays nullptr.
// NOTE(review): work_buf is allocated with hipMalloc below, but no matching hipFree
// for it is visible in this function.
void* work_buf = nullptr;
if(work_buf_size)
{
if(rocfft_execution_info_create(&planinfo) != rocfft_status_success)
throw std::runtime_error("failed to create execution info");
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
throw std::runtime_error("hipMalloc failed");
if(rocfft_execution_info_set_work_buffer(planinfo, work_buf, work_buf_size)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Execute plan:
// The per-brick buffer pointer arrays are passed as the input and output buffer lists.
fftrc = rocfft_execute(gpu_plan, (void**)gpu_in.data(), (void**)gpu_out.data(), planinfo);
if(fftrc != rocfft_status_success)
throw std::runtime_error("failed to execute.");
// Output the data.
// Each output brick is copied back to the host and printed using the same
// lower/upper/stride indexing that defined the brick.
for(size_t idx = 0; idx < gpu_out.size(); ++idx)
{
std::cout << "out brick " << idx << "\n";
const auto nbrick = (outbrick_upper[idx][0] - outbrick_lower[idx][0])
* (outbrick_upper[idx][1] - outbrick_lower[idx][1]);
std::vector> host_out(nbrick);
hiprc = hipMemcpy(host_out.data(),
gpu_out[idx],
nbrick * sizeof(std::complex),
hipMemcpyDeviceToHost);
if(hiprc != hipSuccess)
throw std::runtime_error("hipMemcpy failed");
for(auto idx0 = outbrick_lower[idx][0]; idx0 < outbrick_upper[idx][0]; ++idx0)
{
for(auto idx1 = outbrick_lower[idx][1]; idx1 < outbrick_upper[idx][1]; ++idx1)
{
const auto pos = (idx0 - outbrick_lower[idx][0]) * outbrick_stride[0]
+ (idx1 - outbrick_lower[idx][1]) * outbrick_stride[1];
std::cout << host_out[pos] << " ";
}
std::cout << "\n";
}
}
// Destroy the execution info (if one was created), the description and the plan,
// then shut down rocFFT.
if(planinfo != nullptr)
{
if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
planinfo = nullptr;
}
if(rocfft_plan_description_destroy(description) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_description_destroy failed.");
description = nullptr;
if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
gpu_plan = nullptr;
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
// Release the per-brick device buffers; hipFree results are deliberately ignored.
for(size_t idx = 0; idx < gpu_in.size(); ++idx)
{
(void)hipFree(gpu_in[idx]);
}
for(size_t idx = 0; idx < gpu_out.size(); ++idx)
{
(void)hipFree(gpu_out[idx]);
}
return 0;
}
rocFFT-rocm-7.1.0/clients/samples/rocfft/ 0000775 0000000 0000000 00000000000 15066521634 0020200 5 ustar 00root root 0000000 0000000 rocFFT-rocm-7.1.0/clients/samples/rocfft/CMakeLists.txt 0000664 0000000 0000000 00000010613 15066521634 0022741 0 ustar 00root root 0000000 0000000 # #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################
# Build script for the single-device rocFFT samples: configures defaults, locates
# rocfft/hip (and optionally hiprand), then creates one executable per entry in
# sample_list.
cmake_minimum_required( VERSION 3.16 )
# This should appear before the project command, because it does not
# use FORCE
# NOTE(review): PROJECT_BINARY_DIR is referenced here before project() is called in
# this file; confirm it is provided by an enclosing project when WIN32 is set.
if( WIN32 )
set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH
"Install path prefix, prepended onto install directories" )
else( )
set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH
"Install path prefix, prepended onto install directories" )
endif( )
# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
set( CMAKE_BUILD_TYPE Release CACHE STRING
"Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()
project( rocfft-clients-samples-rocfft LANGUAGES CXX )
# Make find modules in this directory's cmake/ folder visible.
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )
# Only search for an installed rocfft when it is not already a target of this build.
# NOTE(review): PATHS is given with no entries here.
if( NOT TARGET rocfft )
find_package( rocfft REQUIRED CONFIG PATHS )
endif( )
if( NOT HIP_FOUND )
find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()
# hiprand is optional and only looked up when USE_HIPRAND is set.
if( USE_HIPRAND AND NOT hiprand_FOUND )
find_package( hiprand REQUIRED )
endif()
# Each name in sample_list is built from <name>.cpp into an executable <name>.
set( sample_list rocfft_example_complexcomplex rocfft_example_realcomplex rocfft_example_set_stream
rocfft_example_callback )
foreach( sample ${sample_list} )
add_executable( ${sample} ${sample}.cpp )
# NOTE(review): the include-directory argument below is a bare "$" in this copy;
# it looks like a truncated generator expression - verify against the upstream file.
target_include_directories(
${sample}
PRIVATE
$
)
target_link_libraries(
${sample}
PRIVATE roc::rocfft
)
if( USE_HIPRAND )
target_link_libraries(
${sample}
PRIVATE
hip::hiprand
)
target_compile_definitions( ${sample} PRIVATE USE_HIPRAND )
endif()
target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )
set_target_properties( ${sample} PROPERTIES
CXX_STANDARD 17
CXX_STANDARD_REQUIRED ON
)
# Pick the output directory relative to PROJECT_BINARY_DIR according to which
# *_BUILD_SCOPE variable is set (i.e. how deep in the build tree this project sits).
if( ROCFFT_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
set( SAMPLES_ROCFFT_OUT_DIR "/../bin" )
else()
set( SAMPLES_ROCFFT_OUT_DIR "/bin" )
endif()
# Prefix the relative suffix chosen above with the binary directory.
string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} )
set_target_properties(${sample}
PROPERTIES
RUNTIME_OUTPUT_DIRECTORY
${SAMPLES_ROCFFT_OUT_DIR})
# Extra include directories and platform define when CUDA was found.
# NOTE(review): the two include arguments below are also bare "$" in this copy.
if( CUDA_FOUND )
target_include_directories( ${sample}
PRIVATE
$
$
)
target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )
endforeach( )
# callback functions need to be built as relocatable device code
# (causes failure at link stage on Windows)
# The -fgpu-rdc flag is therefore applied to both compile and link steps of the callback
# sample on non-Windows platforms only.
if (NOT WIN32)
target_compile_options( rocfft_example_callback PRIVATE -fgpu-rdc )
target_link_options( rocfft_example_callback PRIVATE -fgpu-rdc )
endif()
rocFFT-rocm-7.1.0/clients/samples/rocfft/examplekernels.h 0000664 0000000 0000000 00000036177 15066521634 0023406 0 ustar 00root root 0000000 0000000 // Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef EXAMPLEKERNELS_H
#define EXAMPLEKERNELS_H
#include "../../../shared/data_gen_device.h"
#include
#include
#include
// Device kernel: fills a strided 1D real array. The thread with global index i
// (i < Nx) writes the value i + 1 to x[i * xstride].
__global__ void initrdata1(double* x, const size_t Nx, const size_t xstride)
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the end of the data have nothing to write.
    if(i >= Nx)
        return;
    x[i * xstride] = i + 1;
}
// Device kernel: fills a strided 2D real array. The thread with global indices (ix, iy)
// inside the Nx-by-Ny extent writes ix + iy to x[ix * xstride + iy * ystride].
__global__ void initrdata2(
    double* x, const size_t Nx, const size_t Ny, const size_t xstride, const size_t ystride)
{
    const size_t ix = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t iy = blockIdx.y * blockDim.y + threadIdx.y;
    // Threads outside the data extent have nothing to write.
    if(ix >= Nx || iy >= Ny)
        return;
    x[ix * xstride + iy * ystride] = ix + iy;
}
// Device kernel: fills a strided 3D real array. The thread with global indices
// (ix, iy, iz) inside the Nx-by-Ny-by-Nz extent writes
// cos(cos(ix + 2)) * sin(iy * iy + 1) / (iz + 1) to its strided position.
__global__ void initrdata3(double*      x,
                           const size_t Nx,
                           const size_t Ny,
                           const size_t Nz,
                           const size_t xstride,
                           const size_t ystride,
                           const size_t zstride)
{
    const size_t ix = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t iy = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t iz = blockIdx.z * blockDim.z + threadIdx.z;
    // Threads outside the data extent have nothing to write.
    if(ix >= Nx || iy >= Ny || iz >= Nz)
        return;
    const auto offset = ix * xstride + iy * ystride + iz * zstride;
    x[offset]         = cos(cos(ix + 2)) * sin(iy * iy + 1) / (iz + 1);
}
// Device kernel: fills a strided 1D complex array. The thread with global index i
// (i < Nx) sets both the real and imaginary parts of x[i * xstride] to 1 + i.
__global__ void initcdata1(hipDoubleComplex* x, const size_t Nx, const size_t xstride)
{
    const size_t i = blockIdx.x * blockDim.x + threadIdx.x;
    // Threads past the end of the data have nothing to write.
    if(i >= Nx)
        return;
    const auto offset = i * xstride;
    x[offset].x       = 1 + i;
    x[offset].y       = 1 + i;
}
// Device kernel: fills a strided 2D complex array. The thread with global indices
// (ix, iy) inside the Nx-by-Ny extent sets the real part to ix + 1 and the imaginary
// part to iy + 1 at x[ix * xstride + iy * ystride].
__global__ void initcdata2(hipDoubleComplex* x,
                           const size_t      Nx,
                           const size_t      Ny,
                           const size_t      xstride,
                           const size_t      ystride)
{
    const auto ix = blockIdx.x * blockDim.x + threadIdx.x;
    const auto iy = blockIdx.y * blockDim.y + threadIdx.y;
    // Threads outside the data extent have nothing to write.
    if(ix >= Nx || iy >= Ny)
        return;
    const auto offset = ix * xstride + iy * ystride;
    x[offset].x       = ix + 1;
    x[offset].y       = iy + 1;
}
// Device kernel: fills a strided 3D complex array. The thread with global indices
// (ix, iy, iz) inside the Nx-by-Ny-by-Nz extent sets the real part to
// ix + 10.0 * iz + 1 and the imaginary part to iy + 10 at its strided position.
__global__ void initcdata3(hipDoubleComplex* x,
                           const size_t      Nx,
                           const size_t      Ny,
                           const size_t      Nz,
                           const size_t      xstride,
                           const size_t      ystride,
                           const size_t      zstride)
{
    const size_t ix = blockIdx.x * blockDim.x + threadIdx.x;
    const size_t iy = blockIdx.y * blockDim.y + threadIdx.y;
    const size_t iz = blockIdx.z * blockDim.z + threadIdx.z;
    // Threads outside the data extent have nothing to write.
    if(ix >= Nx || iy >= Ny || iz >= Nz)
        return;
    const auto offset = ix * xstride + iy * ystride + iz * zstride;
    x[offset].x       = ix + 10.0 * iz + 1;
    x[offset].y       = iy + 10;
}
// Helper function for determining grid dimensions.
// Computes (nominator + denominator - 1) / denominator, i.e. the quotient rounded up
// for positive integer arguments; the result is returned as Tint1.
// NOTE(review): the template parameter list after `template` appears to be missing in
// this copy (presumably it declares Tint1 and Tint2) - verify against upstream.
template
Tint1 ceildiv(const Tint1 nominator, const Tint2 denominator)
{
return (nominator + denominator - 1) / denominator;
}
// The following functions call the above kernels to initialize the input data for the transform.
//
// initcomplex_cm: launches initcdata1/2/3 (chosen by length_cm.size()) to fill the
// complex device buffer gpu_in using the given lengths and strides.
//   length_cm - extent per dimension (1 to 3 entries)
//   stride_cm - stride per dimension, indexed the same way as length_cm
//   gpu_in    - device buffer, reinterpreted as hipDoubleComplex*
// Prints a message and calls exit(1) for any other number of dimensions; throws
// std::runtime_error if hipGetLastError reports an error after the launch.
// NOTE(review): the element types of the std::vector parameters appear to be missing
// in this copy.
void initcomplex_cm(const std::vector& length_cm,
const std::vector& stride_cm,
void* gpu_in)
{
// The block is built from a single value, so only its x extent is set explicitly
// (presumably y and z default to 1 - confirm against dim3).
size_t blockSize = DATA_GEN_THREADS;
const dim3 blockdim(blockSize);
switch(length_cm.size())
{
case 1:
{
// Enough blocks to cover length_cm[0] elements.
const dim3 griddim(ceildiv(length_cm[0], blockdim.x));
hipLaunchKernelGGL(initcdata1,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
length_cm[0],
stride_cm[0]);
break;
}
case 2:
{
const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y));
hipLaunchKernelGGL(initcdata2,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
length_cm[0],
length_cm[1],
stride_cm[0],
stride_cm[1]);
break;
}
case 3:
{
const dim3 griddim(ceildiv(length_cm[0], blockdim.x),
ceildiv(length_cm[1], blockdim.y),
ceildiv(length_cm[2], blockdim.z));
hipLaunchKernelGGL(initcdata3,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
length_cm[0],
length_cm[1],
length_cm[2],
stride_cm[0],
stride_cm[1],
stride_cm[2]);
break;
}
default:
std::cout << "invalid dimension!\n";
exit(1);
}
// Surface any launch error as an exception carrying the HIP error name.
auto err = hipGetLastError();
if(err != hipSuccess)
throw std::runtime_error("init_complex_data kernel launch failure: "
+ std::string(hipGetErrorName(err)));
}
// Initialize the real input buffer where the data has lengths given in length and stride given in
// stride. The device buffer is assumed to have been allocated.
//   length_cm - extent per dimension (1 to 3 entries); selects initrdata1/2/3
//   stride_cm - stride per dimension, indexed the same way as length_cm
//   gpu_in    - device buffer, reinterpreted as double*
// Prints a message and calls exit(1) for any other number of dimensions; throws
// std::runtime_error if hipGetLastError reports an error after the launch.
// NOTE(review): the element types of the std::vector parameters appear to be missing
// in this copy.
void initreal_cm(const std::vector& length_cm,
const std::vector& stride_cm,
void* gpu_in)
{
// The block is built from a single value, so only its x extent is set explicitly
// (presumably y and z default to 1 - confirm against dim3).
size_t blockSize = DATA_GEN_THREADS;
const dim3 blockdim(blockSize);
switch(length_cm.size())
{
case 1:
{
// Enough blocks to cover length_cm[0] elements.
const dim3 griddim(ceildiv(length_cm[0], blockdim.x));
hipLaunchKernelGGL(
initrdata1, griddim, blockdim, 0, 0, (double*)gpu_in, length_cm[0], stride_cm[0]);
break;
}
case 2:
{
const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y));
hipLaunchKernelGGL(initrdata2,
griddim,
blockdim,
0,
0,
(double*)gpu_in,
length_cm[0],
length_cm[1],
stride_cm[0],
stride_cm[1]);
break;
}
case 3:
{
const dim3 griddim(ceildiv(length_cm[0], blockdim.x),
ceildiv(length_cm[1], blockdim.y),
ceildiv(length_cm[2], blockdim.z));
hipLaunchKernelGGL(initrdata3,
griddim,
blockdim,
0,
0,
(double*)gpu_in,
length_cm[0],
length_cm[1],
length_cm[2],
stride_cm[0],
stride_cm[1],
stride_cm[2]);
break;
}
default:
std::cout << "invalid dimension!\n";
exit(1);
}
// Surface any launch error as an exception carrying the HIP error name.
auto err = hipGetLastError();
if(err != hipSuccess)
throw std::runtime_error("init_real_data kernel launch failure: "
+ std::string(hipGetErrorName(err)));
}
// Imposes Hermitian symmetry for the input device buffer.
// Note: input parameters are in column-major ordering.
//   length  - extent per dimension (1 to 3 entries); selects the 1D/2D/3D kernel
//   ilength - second set of extents, used for some kernel arguments in the 2D/3D cases
//   stride  - stride per dimension, indexed the same way as length
//   gpu_in  - device buffer, reinterpreted as hipDoubleComplex*
// Throws std::runtime_error for any other number of dimensions or if hipGetLastError
// reports an error after the launch.
// NOTE(review): the element types of the std::vector parameters appear to be missing
// in this copy.
void impose_hermitian_symmetry_cm(const std::vector& length,
const std::vector& ilength,
const std::vector& stride,
void* gpu_in)
{
// A single batch is processed, so batch and dist are fixed to 1.
size_t batch = 1;
size_t dist = 1;
size_t blockSize = DATA_GEN_THREADS;
auto inputDim = length.size();
// Launch impose_hermitian_symmetry kernels.
// NOTE: input parameters must be in row-major
// ordering for these kernels.
// Hence lengths and strides are passed below in reverse index order.
switch(inputDim)
{
case 1:
{
const auto gridDim = dim3(DivRoundingUp(batch, blockSize));
const auto blockDim = dim3(blockSize);
hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel,
gridDim,
blockDim,
0,
0,
(hipDoubleComplex*)gpu_in,
length[0],
stride[0],
dist,
batch,
length[0] % 2 == 0);
break;
}
case 2:
{
// NOTE(review): the grid is sized from length[1] while the matching kernel argument
// below is computed from ilength[1] - confirm this mix is intended.
const auto gridDim = dim3(DivRoundingUp(batch, blockSize),
DivRoundingUp((length[1] + 1) / 2 - 1, blockSize));
const auto blockDim = dim3(blockSize, blockSize);
hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel,
gridDim,
blockDim,
0,
0,
(hipDoubleComplex*)gpu_in,
length[1],
length[0],
stride[1],
stride[0],
dist,
batch,
(ilength[1] + 1) / 2 - 1,
length[1] % 2 == 0,
length[0] % 2 == 0);
break;
}
case 3:
{
// NOTE(review): as in the 2D case, the grid uses length[] while several kernel
// arguments below use ilength[] - confirm against the kernel's signature.
const auto gridDim = dim3(DivRoundingUp(batch, blockSize),
DivRoundingUp((length[2] + 1) / 2 - 1, blockSize),
DivRoundingUp(length[1] - 1, blockSize));
const auto blockDim = dim3(blockSize, blockSize, blockSize);
hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel,
gridDim,
blockDim,
0,
0,
(hipDoubleComplex*)gpu_in,
length[2],
length[1],
length[0],
stride[2],
stride[1],
stride[0],
dist,
batch,
(ilength[2] + 1) / 2 - 1,
ilength[1] - 1,
(ilength[1] + 1) / 2 - 1,
length[2] % 2 == 0,
length[1] % 2 == 0,
length[0] % 2 == 0);
break;
}
default:
throw std::runtime_error("Invalid dimension");
}
// Surface any launch error as an exception carrying the HIP error name.
auto err = hipGetLastError();
if(err != hipSuccess)
throw std::runtime_error("impose_hermitian_symmetry_interleaved kernel launch failure: "
+ std::string(hipGetErrorName(err)));
}
// Initialize the Hermitian complex input buffer: the buffer is filled over the extents
// given in ilength with the strides given in stride, and Hermitian symmetry is then
// imposed via impose_hermitian_symmetry_cm(length, ilength, stride, gpu_in).
// The device buffer is assumed to have been allocated.
//   length  - extents whose count selects the 1D/2D/3D fill kernel (presumably the
//             transform lengths - confirm against callers)
//   ilength - extents of the data that is filled
//   stride  - stride per dimension
//   gpu_in  - device buffer, reinterpreted as hipDoubleComplex*
// Throws std::runtime_error for any other number of dimensions or if hipGetLastError
// reports an error after the launch.
// NOTE(review): the element types of the std::vector parameters appear to be missing
// in this copy.
void init_hermitiancomplex_cm(const std::vector& length,
const std::vector& ilength,
const std::vector& stride,
void* gpu_in)
{
// NOTE(review): the block size is hard-coded to 256 here, whereas the other
// initializers in this file use DATA_GEN_THREADS.
size_t blockSize = 256;
const dim3 blockdim(blockSize);
switch(length.size())
{
case 1:
{
const dim3 griddim(ceildiv(ilength[0], blockSize));
hipLaunchKernelGGL(
initcdata1, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], stride[0]);
break;
}
case 2:
{
const dim3 griddim(ceildiv(ilength[0], blockdim.x), ceildiv(ilength[1], blockdim.y));
hipLaunchKernelGGL(initcdata2,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
ilength[0],
ilength[1],
stride[0],
stride[1]);
break;
}
case 3:
{
const dim3 griddim(ceildiv(ilength[0], blockdim.x),
ceildiv(ilength[1], blockdim.y),
ceildiv(ilength[2], blockdim.z));
hipLaunchKernelGGL(initcdata3,
griddim,
blockdim,
0,
0,
(hipDoubleComplex*)gpu_in,
ilength[0],
ilength[1],
ilength[2],
stride[0],
stride[1],
stride[2]);
break;
}
default:
throw std::runtime_error("Invalid dimension");
}
// Surface any launch error from the fill kernel before imposing the symmetry.
auto err = hipGetLastError();
if(err != hipSuccess)
throw std::runtime_error("init_complex_data kernel launch failure: "
+ std::string(hipGetErrorName(err)));
impose_hermitian_symmetry_cm(length, ilength, stride, gpu_in);
}
#endif /* EXAMPLEKERNELS_H */
rocFFT-rocm-7.1.0/clients/samples/rocfft/exampleutils.h 0000664 0000000 0000000 00000013644 15066521634 0023075 0 ustar 00root root 0000000 0000000 // Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#ifndef EXAMPLEUTILS_H
#define EXAMPLEUTILS_H
// Stream insertion for hipDoubleComplex: writes the value as "(x,y)" and returns the
// stream so that insertions can be chained.
// Declared inline because this function is defined in a header; without it, including
// the header from more than one translation unit produces multiple definitions.
inline std::ostream& operator<<(std::ostream& stream, hipDoubleComplex c)
{
    stream << "(" << c.x << "," << c.y << ")";
    return stream;
}
// Increment the index (column-major) for looping over arbitrary dimensional loops with
// dimensions length.
// The first entry of index varies fastest: an entry that reaches its length is reset to
// zero and the increment carries into the next entry.
// Returns true while there are more positions to visit, and false once index has
// wrapped back to all zeros.
// NOTE(review): the template parameter list and the std::vector element types appear
// to be missing in this copy.
template
bool increment_cm(std::vector& index, const std::vector& length)
{
for(unsigned int idim = 0; idim < length.size(); ++idim)
{
if(index[idim] < length[idim])
{
if(++index[idim] == length[idim])
{
// This dimension overflowed: reset it and carry into the next one.
index[idim] = 0;
continue;
}
// No carry needed, so the increment is complete.
break;
}
}
// End the loop when we get back to the start:
// (each entry is compared to zero through an int lambda parameter)
return !std::all_of(index.begin(), index.end(), [](int i) { return i == 0; });
}
// Output a formatted general-dimensional array with given length and stride in batches
// separated by dist, in column-major order.
//   data   - flat buffer holding all batches
//   length - extent per dimension
//   stride - stride per dimension, indexed the same way as length
//   nbatch - number of batches to print
//   dist   - offset between the starts of consecutive batches
// Elements are written to std::cout separated by spaces, with a blank line
// (std::endl) after each batch.
// NOTE(review): the template parameter list and the std::vector element types appear
// to be missing in this copy.
template
void printbuffer_cm(const std::vector& data,
const std::vector& length,
const std::vector& stride,
const size_t nbatch,
const size_t dist)
{
for(size_t b = 0; b < nbatch; b++)
{
// Start each batch at the all-zero index.
std::vector index(length.size());
std::fill(index.begin(), index.end(), 0);
do
{
// Flat position: batch offset plus the dot product of index and stride.
const auto i = std::inner_product(index.begin(), index.end(), stride.begin(), b * dist);
assert(i >= 0);
assert(i < data.size());
std::cout << data[i] << " ";
// Emit one newline for each leading dimension that is at its last index, stopping at
// the first dimension that is not.
for(size_t idx = 0; idx < index.size(); ++idx)
{
if(index[idx] == (length[idx] - 1))
{
std::cout << "\n";
}
else
{
break;
}
}
} while(increment_cm(index, length));
std::cout << std::endl;
}
}
// Check that a multi-dimensional array of complex values with dimensions length
// and strides stride, with nbatch copies separated by dist, is Hermitian-symmetric.
// Column-major version.
// data      - flat buffer of complex values exposing .x and .y members.
// length_cm - extent of each dimension, column-major.
// stride_cm - element stride of each dimension, column-major.
// nbatch    - number of batches to check.
// dist      - offset between the first elements of consecutive batches.
// verbose   - when true, every mismatching pair is reported on std::cout.
//
// Returns true when every compared element equals the complex conjugate of the element
// at the negated index (taken modulo the length). Only pairs whose first coordinate and
// mirrored first coordinate are both at most length_cm[0] / 2 are compared.
template <class Tcomplex, class Tint1, class Tint2>
bool check_symmetry_cm(const std::vector<Tcomplex>& data,
                       const std::vector<Tint1>&    length_cm,
                       const std::vector<Tint2>&    stride_cm,
                       const size_t                 nbatch,
                       const size_t                 dist,
                       const bool                   verbose = true)
{
    // Prints an index tuple as "(a,b,...)->".
    const auto print_tuple = [](const std::vector<int>& tuple) {
        std::cout << "(";
        const char* sep = "";
        for(const auto coord : tuple)
        {
            std::cout << sep << coord;
            sep = ",";
        }
        std::cout << ")->";
    };

    bool all_conjugate = true;
    for(size_t batch = 0; batch < nbatch; ++batch)
    {
        std::vector<int> pos(length_cm.size(), 0);
        do
        {
            // Build the mirrored index; give up as soon as pos lies outside the
            // half of the first dimension that is being checked.
            std::vector<int> mirror(pos.size());
            bool             in_range = true;
            for(size_t dim = 0; dim < pos.size(); ++dim)
            {
                if(pos[0] > length_cm[0] / 2)
                {
                    in_range = false;
                    break;
                }
                mirror[dim] = (length_cm[dim] - pos[dim]) % length_cm[dim];
            }
            if(mirror[0] > length_cm[0] / 2)
            {
                in_range = false;
            }
            if(!in_range)
            {
                // In a do-while loop, continue jumps to the loop condition, so the
                // index is still advanced.
                continue;
            }

            const auto i
                = std::inner_product(pos.begin(), pos.end(), stride_cm.begin(), batch * dist);
            const auto j = std::inner_product(
                mirror.begin(), mirror.end(), stride_cm.begin(), batch * dist);

            const bool conjugate = (data[i].x == data[j].x) && (data[i].y == -data[j].y);
            if(conjugate)
            {
                continue;
            }

            if(verbose)
            {
                print_tuple(pos);
                std::cout << i << "\t";
                print_tuple(mirror);
                std::cout << j << ":\t";
                std::cout << data[i] << " " << data[j];
                std::cout << "\tnot conjugate!" << std::endl;
            }
            all_conjugate = false;
        } while(increment_cm(pos, length_cm));
    }
    return all_conjugate;
}
#endif /* EXAMPLEUTILS_H */
rocFFT-rocm-7.1.0/clients/samples/rocfft/rocfft_example_callback.cpp 0000664 0000000 0000000 00000016120 15066521634 0025516 0 ustar 00root root 0000000 0000000 /******************************************************************************
* Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/
#include
#ifndef WIN32
#include "rocfft/rocfft.h"
#include
#include
#include
#include
#include
#include
// example of using load/store callbacks with rocfft
// User data handed to load_callback through its cbdata pointer. main() fills a host
// copy and then copies the struct byte-for-byte into device memory.
struct load_cbdata
{
    // Filter values; load_callback reads filter[offset] for each element it loads.
    // main() sets this to a device allocation.
    double2* filter;
    // Scale value; load_callback uses it for both components of its complex factor.
    double scale;
};
// Load callback: returns the element at input[offset] after filtering and scaling.
//
// input     - buffer the element is read from.
// offset    - index of the element within input (also used to index the filter).
// cbdata    - pointer to a load_cbdata describing the filter and the scale.
// sharedMem - unused by this callback.
__device__ double2 load_callback(double2* input, size_t offset, void* cbdata, void* sharedMem)
{
    const load_cbdata* params = static_cast<load_cbdata*>(cbdata);

    // Multiply the element by its filter element first...
    const double2 filtered = hipCmul(input[offset], params->filter[offset]);

    // ...then by the scale factor. Note that the factor is the complex value
    // (scale, scale), not the purely real (scale, 0).
    const double2 factor = make_hipDoubleComplex(params->scale, params->scale);
    return hipCmul(filtered, factor);
}
// Device-side variable initialized with the address of load_callback. main() copies
// its value to the host with hipMemcpyFromSymbol to get the pointer it hands to
// rocfft_execution_info_set_load_callback.
__device__ auto load_callback_dev = load_callback;
#endif
int main()
{
#ifdef WIN32
std::cout << "This sample is temporarily disabled on Windows" << std::endl;
return EXIT_SUCCESS;
#else
const size_t N = 8;
std::vector cx(N), filter(N);
// initialize data and filter
for(size_t i = 0; i < N; i++)
{
cx[i].x = i;
cx[i].y = i;
filter[i].x = rand() / static_cast(RAND_MAX);
filter[i].y = 0;
}
// rocfft gpu compute
// ==================
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
size_t Nbytes = N * sizeof(double2);
// Create HIP device object.
double2 *x, *filter_dev;
// create buffers
if(hipMalloc(&x, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(hipMalloc(&filter_dev, Nbytes) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Copy data to device
hipError_t hip_status = hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
hip_status = hipMemcpy(filter_dev, filter.data(), Nbytes, hipMemcpyHostToDevice);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Create plan
rocfft_plan plan = nullptr;
size_t length = N;
if(rocfft_plan_create(&plan,
rocfft_placement_inplace,
rocfft_transform_type_complex_forward,
rocfft_precision_double,
1,
&length,
1,
nullptr)
!= rocfft_status_success)
throw std::runtime_error("rocfft_plan_create failed.");
// Check if the plan requires a work buffer
size_t work_buf_size = 0;
if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_get_work_buffer_size failed.");
void* work_buf = nullptr;
rocfft_execution_info info = nullptr;
if(rocfft_execution_info_create(&info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_create failed.");
if(work_buf_size)
{
if(hipMalloc(&work_buf, work_buf_size) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_work_buffer failed.");
}
// Prepare callback
load_cbdata cbdata_host;
cbdata_host.filter = filter_dev;
cbdata_host.scale = 1.0 / static_cast(N);
void* cbdata_dev;
if(hipMalloc(&cbdata_dev, sizeof(load_cbdata)) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
hip_status = hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
// Get a properly-typed host pointer to the device function, as
// rocfft_execution_info_set_load_callback expects void*.
void* cbptr_host = nullptr;
hip_status = hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*));
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpyFromSymbol failed.");
// set callback
if(rocfft_execution_info_set_load_callback(info, &cbptr_host, &cbdata_dev, 0)
!= rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_set_load_callback failed.");
// Execute plan
if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success)
throw std::runtime_error("rocfft_execute failed.");
// Clean up work buffer
if(work_buf_size)
{
if(hipFree(work_buf) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_execution_info_destroy(info) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
info = nullptr;
}
// Destroy plan
if(rocfft_plan_destroy(plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
plan = nullptr;
// Copy result back to host
std::vector y(N);
hip_status = hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
for(size_t i = 0; i < N; i++)
{
std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")"
<< " output: (" << y[i].x << "," << y[i].y << ")" << std::endl;
}
if(hipFree(cbdata_dev) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(hipFree(filter_dev) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(hipFree(x) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
#endif
}
rocFFT-rocm-7.1.0/clients/samples/rocfft/rocfft_example_complexcomplex.cpp 0000664 0000000 0000000 00000024541 15066521634 0027027 0 ustar 00root root 0000000 0000000 // Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include
#include
#include
#include
#include
#include
#include
#include
#include "../../../shared/CLI11.hpp"
#include "examplekernels.h"
#include "exampleutils.h"
#include
int main(int argc, char* argv[])
{
std::cout << "rocfft double-precision complex-to-complex transform\n" << std::endl;
// Length of transform:
std::vector length = {8};
// Gpu device id:
size_t deviceId = 0;
// Command-line options:
CLI::App app{"rocfft sample command line options"};
app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
CLI::Option* opt_outofplace
= app.add_flag("-o, --outofplace", "Perform an out-of-place transform");
CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform");
app.add_option(
"--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)");
try
{
app.parse(argc, argv);
}
catch(const CLI::ParseError& e)
{
return app.exit(e);
}
// Placeness for the transform
if(rocfft_setup() != rocfft_status_success)
throw std::runtime_error("rocfft_setup failed.");
const rocfft_result_placement place
= *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace;
const bool inplace = place == rocfft_placement_inplace;
// Direction of transform
const rocfft_transform_type direction = *opt_inverse ? rocfft_transform_type_complex_forward
: rocfft_transform_type_complex_inverse;
// Set up the strides and buffer size for the input:
std::vector istride = {1};
for(unsigned int i = 1; i < length.size(); ++i)
{
istride.push_back(length[i - 1] * istride[i - 1]);
}
const size_t isize = length[length.size() - 1] * istride[istride.size() - 1];
// Set up the strides and buffer size for the output:
std::vector ostride = {1};
for(unsigned int i = 1; i < length.size(); ++i)
{
ostride.push_back(length[i - 1] * ostride[i - 1]);
}
const size_t osize = length[length.size() - 1] * ostride[ostride.size() - 1];
// Print information about the transform:
std::cout << "direction: ";
if(direction == rocfft_transform_type_complex_forward)
std::cout << "forward\n";
else
std::cout << "inverse\n";
std::cout << "length:";
for(const auto i : length)
std::cout << " " << i;
std::cout << "\n";
if(inplace)
std::cout << "in-place transform\n";
else
std::cout << "out-of-place transform\n";
std::cout << "deviceID: " << deviceId << "\n";
std::cout << "input strides:";
for(auto i : istride)
std::cout << " " << i;
std::cout << "\n";
std::cout << "output strides:";
for(auto i : ostride)
std::cout << " " << i;
std::cout << "\n";
std::cout << "input size: " << isize << "\n";
std::cout << "output size: " << isize << "\n";
std::cout << std::endl;
// Set the device:
if(hipSetDevice(deviceId) != hipSuccess)
throw std::runtime_error("hipSetDevice failed.");
// Create HIP device object and allocate data
hipDoubleComplex* gpu_in = nullptr;
if(hipMalloc(&gpu_in, isize * sizeof(hipDoubleComplex)) != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
// Inititalize the data on the device
initcomplex_cm(length, istride, gpu_in);
if(hipDeviceSynchronize() != hipSuccess)
throw std::runtime_error("hipDeviceSynchronize failed.");
hipError_t hip_status = hipGetLastError();
if(hip_status != hipSuccess)
throw std::runtime_error("device error");
std::cout << "input:\n";
std::vector idata(isize);
hip_status
= hipMemcpy(idata.data(), gpu_in, isize * sizeof(hipDoubleComplex), hipMemcpyDefault);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
printbuffer_cm(idata, length, istride, 1, isize);
// Create the a descrition struct to set data layout:
rocfft_plan_description gpu_description = nullptr;
// rocfft_status can be used to capture API status info
rocfft_status rc = rocfft_plan_description_create(&gpu_description);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to create plan description");
rc = rocfft_plan_description_set_data_layout(gpu_description,
rocfft_array_type_complex_interleaved,
rocfft_array_type_complex_interleaved,
nullptr,
nullptr,
istride.size(), // input stride length
istride.data(), // input stride data
0, // input batch distance
ostride.size(), // output stride length
ostride.data(), // output stride data
0); // ouptut batch distance
if(rc != rocfft_status_success)
throw std::runtime_error("failed to set data layout");
// We can also pass "nullptr" instead of a description; rocFFT will use reasonable
// default parameters. If the data isn't contiguous, we need to set strides, etc,
// using the description.
// Create the plan
rocfft_plan gpu_plan = nullptr;
rc = rocfft_plan_create(&gpu_plan,
place,
direction,
rocfft_precision_double,
length.size(), // Dimension
length.data(), // lengths
1, // Number of transforms
gpu_description); // Description
if(rc != rocfft_status_success)
throw std::runtime_error("failed to create plan");
// Get the execution info for the fft plan (in particular, work memory requirements):
rocfft_execution_info planinfo = nullptr;
rc = rocfft_execution_info_create(&planinfo);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to create execution info");
size_t workbuffersize = 0;
rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to get work buffer size");
// If the transform requires work memory, allocate a work buffer:
void* wbuffer = nullptr;
if(workbuffersize > 0)
{
hip_status = hipMalloc(&wbuffer, workbuffersize);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize);
if(rc != rocfft_status_success)
throw std::runtime_error("failed to set work buffer.");
}
// If the transform is out-of-place, allocate the output buffer as well:
double2* gpu_out = inplace ? gpu_in : nullptr;
if(!inplace)
{
hip_status = hipMalloc(&gpu_out, osize * sizeof(hipDoubleComplex));
if(hip_status != hipSuccess)
throw std::runtime_error("hipMalloc failed.");
}
// Execute the GPU transform:
rc = rocfft_execute(gpu_plan, // plan
(void**)&gpu_in, // in_buffer
(void**)&gpu_out, // out_buffer
planinfo); // execution info
if(rc != rocfft_status_success)
throw std::runtime_error("failed to execute.");
// Get the output from the device and print to cout:
std::cout << "output:\n";
std::vector odata(osize);
hip_status
= hipMemcpy(odata.data(), gpu_out, osize * sizeof(hipDoubleComplex), hipMemcpyDeviceToHost);
if(hip_status != hipSuccess)
throw std::runtime_error("hipMemcpy failed.");
printbuffer_cm(odata, length, istride, 1, isize);
// Clean up: free GPU memory:
if(hipFree(gpu_in) != hipSuccess)
throw std::runtime_error("hipFree failed.");
if(!inplace)
{
if(hipFree(gpu_out) != hipSuccess)
throw std::runtime_error("hipFree failed.");
}
if(wbuffer != nullptr)
{
if(hipFree(wbuffer) != hipSuccess)
throw std::runtime_error("hipFree failed.");
}
// Clean up: destroy plans:
if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success)
throw std::runtime_error("rocfft_execution_info_destroy failed.");
planinfo = nullptr;
if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_description_destroy failed.");
gpu_description = nullptr;
if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success)
throw std::runtime_error("rocfft_plan_destroy failed.");
gpu_plan = nullptr;
if(rocfft_cleanup() != rocfft_status_success)
throw std::runtime_error("rocfft_cleanup failed.");
return 0;
}
rocFFT-rocm-7.1.0/clients/samples/rocfft/rocfft_example_realcomplex.cpp 0000664 0000000 0000000 00000027731 15066521634 0026307 0 ustar 00root root 0000000 0000000 // Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include
#include
#include