pax_global_header00006660000000000000000000000064150665216340014522gustar00rootroot0000000000000052 comment=fece8692bafd8117e2ad50036ca646ba91c5d3ba rocFFT-rocm-7.1.0/000077500000000000000000000000001506652163400136105ustar00rootroot00000000000000rocFFT-rocm-7.1.0/.azuredevops/000077500000000000000000000000001506652163400162355ustar00rootroot00000000000000rocFFT-rocm-7.1.0/.azuredevops/rocm-ci.yml000066400000000000000000000012401506652163400203060ustar00rootroot00000000000000resources: repositories: - repository: pipelines_repo type: github endpoint: ROCm name: ROCm/ROCm variables: - group: common - template: /.azuredevops/variables-global.yml@pipelines_repo trigger: batch: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .jenkins - docs - '.*.y*ml' - '*.md' pr: autoCancel: true branches: include: - develop - mainline paths: exclude: - .githooks - .github - .jenkins - docs - '.*.y*ml' - '*.md' drafts: false jobs: - template: ${{ variables.CI_COMPONENT_PATH }}/rocFFT.yml@pipelines_repo rocFFT-rocm-7.1.0/.clang-format000066400000000000000000000065421506652163400161720ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true 
AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: { AfterCaseLabel: 'true' AfterClass: 'true' AfterControlStatement: 'true' AfterEnum : 'true' AfterFunction : 'true' AfterNamespace : 'true' AfterStruct : 'true' AfterUnion : 'true' BeforeCatch : 'true' BeforeElse : 'true' IndentBraces : 'false' # AfterExternBlock : 'true' } #BreakAfterJavaFieldAnnotations: true #BreakBeforeInheritanceComma: false #BreakBeforeBinaryOperators: None #BreakBeforeTernaryOperators: true #BreakConstructorInitializersBeforeComma: true #BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' #CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true SpaceBeforeCpp11BracedList: false DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IndentCaseLabels: false IndentPPDirectives: None #FixNamespaceComments: true IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true MacroBlockBegin: '' MacroBlockEnd: '' #JavaScriptQuotes: Double MaxEmptyLinesToKeep: 1 NamespaceIndentation: All ObjCBlockIndentWidth: 4 #ObjCSpaceAfterProperty: true #ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left SpaceAfterCStyleCast: false 
SpaceBeforeAssignmentOperators: true SpaceBeforeParens: Never SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false #SpaceAfterTemplateKeyword: true #SpaceBeforeInheritanceColon: true #SortUsingDeclarations: true SortIncludes: true # Comments are for developers, they should arrange them ReflowComments: false #IncludeBlocks: Preserve --- rocFFT-rocm-7.1.0/.githooks/000077500000000000000000000000001506652163400155155ustar00rootroot00000000000000rocFFT-rocm-7.1.0/.githooks/install000077500000000000000000000002221506652163400171050ustar00rootroot00000000000000#!/usr/bin/env bash cd $(git rev-parse --git-dir) cd hooks echo "Installing hooks..." ln -s ../../.githooks/pre-commit pre-commit echo "Done!" rocFFT-rocm-7.1.0/.githooks/pre-commit000077500000000000000000000017671506652163400175320ustar00rootroot00000000000000#!/bin/sh # # This pre-commit hook checks if any versions of clang-format # are installed, and if so, uses the installed version to format # the staged changes. base=/opt/rocm/llvm/bin/clang-format format="" # Redirect output to stderr. exec 1>&2 # check if clang-format is installed type "$base" >/dev/null 2>&1 && format="$base" # no versions of clang-format are installed if [ -z "$format" ] then echo "$base is not installed. Pre-commit hook will not be executed." 
exit 0 fi # Do everything from top - level cd $(git rev-parse --show-toplevel) if git rev-parse --verify HEAD >/dev/null 2>&1 then against=HEAD else # Initial commit: diff against an empty tree object against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 fi # do the formatting for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') do if [ -e "$file" ] then echo "$format $file" "$format" -i -style=file "$file" fi done rocFFT-rocm-7.1.0/.github/000077500000000000000000000000001506652163400151505ustar00rootroot00000000000000rocFFT-rocm-7.1.0/.github/CODEOWNERS000066400000000000000000000005741506652163400165510ustar00rootroot00000000000000* @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd # Documentation files docs/ @ROCm/rocm-documentation *.md @ROCm/rocm-documentation *.rst @ROCm/rocm-documentation .readthedocs.yaml @ROCm/rocm-documentation # Header directory for Doxygen documentation library/include/ @ROCm/rocm-documentation @af-ayala @eng-flavio-teixeira @evetsso @malcolmroberts @regan-amd rocFFT-rocm-7.1.0/.github/CONTRIBUTING.md000066400000000000000000000146331506652163400174100ustar00rootroot00000000000000 # Contributing to rocFFT # We welcome contributions to rocFFT. Please follow these details to help ensure your contributions will be successfully accepted. ## Issue Discussion ## Please use the GitHub Issues tab to notify us of issues. * Use your best judgment for issue creation. If your issue is already listed, upvote the issue and comment or post to provide additional details, such as how you reproduced this issue. * If you're not sure if your issue is the same, err on the side of caution and file your issue. You can add a comment to include the issue number (and link) for the similar issue. If we evaluate your issue as being the same as the existing issue, we'll close the duplicate. * If your issue doesn't exist, use the issue template to file a new issue. 
* When filing an issue, be sure to provide as much information as possible, including script output so we can collect information about your configuration. This helps reduce the time required to reproduce your issue. * Check your issue regularly, as we may require additional information to successfully reproduce the issue. * You may also open an issue to ask questions to the maintainers about whether a proposed change meets the acceptance criteria, or to discuss an idea pertaining to the library. ## Acceptance Criteria ## When a contribution is submitted via a pull request, a number of automated checks are run in order to verify compilation correctness and prevent performance regressions. These checks include: * Building and testing the change on various OS platforms (Ubuntu, RHEL, etc.) * Running on different GPU architectures (MI-series, Radeon series cards, etc.) * Running benchmarks to check for performance degradation In order for a submission to be accepted: * It must pass all of the automated checks * It must undergo a code review Users can visualize our continuous integration infrastructure in: `rocFFT/.jenkins`. The GitHub "Issues" tab may also be used to discuss ideas surrounding particular features or changes before raising pull requests. ## Code Structure ## In a broad view, rocFFT library is structured as follows: ├── docs/: contains rocFFT documentation ├── library/: contains main source code and headers ├── clients/: │   ├── bench/ : contains benchmarking code │   ├── samples/ : contains examples │   ├── tests/ : contains our test infrastructure ├── shared/: contains important global headers and those for linking to other applications ## Coding Style ## * All public APIs are C89 compatible; all other library code should use c++17. * Our minimum supported compiler is clang 3.6. * Avoid CamelCase: rule applies specifically to publicly visible APIs, but is encouraged (not mandated) for internal code. 
* C and C++ code should be formatted using `clang-format`. You can use the clang-format version available in `rocFFT/.clang-format`. To format a C/C++ file, use: ``` clang-format -style=file -i ``` * Python code should use: ``` yapf --style pep8 ``` ## Pull Request Guidelines ## Our code contribution guidelines closely follow the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). This repository follows the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code. Note that a [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user. The following guidelines apply: * When you create a pull request, you should target the default branch. Our current default branch is the **develop** branch. * Note that releases are cut to release/rocm-rel-x.y, where x and y refer to the release major and minor numbers. * Ensure code builds successfully. * Do not break existing test cases * Code must also have benchmark tests, and performance must approach the compute bound limit or memory bound limit. ### Deliverables ### New changes should include test coverage. Our testing infrastructure is located in `clients/tests/`, and can be used as a reference. The following guidelines apply: * New functionality will only be merged with new unit tests. * New unit tests should integrate within the existing [googletest framework](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md). * Tests must have good code coverage. ### Process ### All pull requests must pass through the checks and the code review described in the [Acceptance Criteria](#acceptance-criteria) section before they can be merged. 
Once a contribution is ready to be submitted, consider the following: * Before you create a PR, ensure that all files have been gone through the clang formatting: clang-format -i * While creating a PR, you can take a look at a `diff` of the changes you made using the PR's "Files" tab, and verify that no unintentional changes are being submitted. * Checks may take some time to complete. You can view their progress in the table near the bottom of the pull request page. You may also be able to use the links in the table to view logs associated with a check if it fails. * During code reviews, another developer will take a look through your proposed change. If any modifications are requested (or further discussion about anything is needed), they may leave a comment. You can follow up and respond to the comment, and/or create comments of your own if you have questions or ideas. * When a modification request has been completed, the conversation thread about it will be marked as resolved. * To update the code in your PR (eg. in response to a code review discussion), you can simply push another commit to the branch used in your pull request. * Once your contribution is approved, we will use the *squash merge* option from GitHub to integrate it to the corresponding branch. ## Code License ## All code contributed to this project will be licensed under the license identified in the [LICENSE.md](https://github.com/ROCm/rocFFT/blob/develop/LICENSE.md). Your contribution will be accepted under the same license. 
rocFFT-rocm-7.1.0/.github/ISSUE_TEMPLATE.md000066400000000000000000000004611506652163400176560ustar00rootroot00000000000000### What is the expected behavior - ### What actually happens - ### How to reproduce - ### Environment | Hardware | description | |-----|-----| | GPU | device string | | CPU | device string | | Software | version | |-----|-----| | ROCK | v0.0 | | ROCR | v0.0 | | HCC | v0.0 | | Library | v0.0 | rocFFT-rocm-7.1.0/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000000701506652163400207460ustar00rootroot00000000000000resolves #___ Summary of proposed changes: - - - rocFFT-rocm-7.1.0/.github/dependabot.yml000066400000000000000000000012231506652163400177760ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" labels: - "documentation" - "dependencies" - "ci:docs-only" reviewers: - "samjwu" rocFFT-rocm-7.1.0/.gitignore000066400000000000000000000005541506652163400156040ustar00rootroot00000000000000# Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # vim tags tags .tags .*.swp # Visual Studio Code .vscode # install.sh build dir build/ # python bytecode __pycache__ rocFFT-rocm-7.1.0/.readthedocs.yaml000066400000000000000000000005721506652163400170430ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html 
for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip, pdf, epub] python: install: - requirements: docs/sphinx/requirements.txt build: os: ubuntu-22.04 tools: python: "mambaforge-22.9" conda: environment: docs/environment.yml rocFFT-rocm-7.1.0/CHANGELOG.md000066400000000000000000000514461506652163400154330ustar00rootroot00000000000000# Changelog for rocFFT Documentation for rocFFT is available at [https://rocm.docs.amd.com/projects/rocFFT/en/latest/](https://rocm.docs.amd.com/projects/rocFFT/en/latest/). ## rocFFT 1.0.35 for ROCM 7.1.0 ### Optimized * Implemented single-kernel plans for some 2D problem sizes, on devices with at least 160KiB of LDS. * Improved performance of unit-strided, complex-interleaved, forward/inverse FFTs for lengths: - (64,64,128) - (64,64,52) - (60,60,60) - (32,32,128) - (32,32,64) - (64,32,128) * Improved performance of 3D MPI pencil decompositions by using sub-communicators for global transpose operations. ## rocFFT 1.0.34 for ROCm 7.0.0 ### Added * Added gfx950 support. ### Removed * Removed rocfft-rider legacy compatibility from clients * Removed support for the gfx940 and gfx941 targets from the client programs. ### Optimized * Removed unnecessary HIP event/stream allocation and synchronization during MPI transforms. * Implemented single-precision 1D kernels for lengths: - 4704 - 5488 - 6144 - 6561 - 8192 * Implemented single-kernel plans for some large 1D problem sizes, on devices with at least 160KiB of LDS. ### Resolved issues * Fixed kernel faults on multi-device transforms that gather to a single device, when the input/output bricks are not contiguous. ## rocFFT 1.0.32 for ROCm 6.4.0 ### Changed * Building with the address sanitizer option sets xnack+ on relevant GPU architectures and adds address-sanitizer support to runtime-compiled kernels. * The `AMDGPU_TARGETS` build variable should be replaced with `GPU_TARGETS`. `AMDGPU_TARGETS` is deprecated. 
### Removed * Removed ahead-of-time compiled kernels for the gfx906, gfx940, and gfx941 architectures. These architectures still function the same, but kernels for them are now compiled at runtime. * Removed consumer GPU architectures from the precompiled kernel cache that ships with rocFFT. rocFFT continues to ship with a cache of precompiled RTC kernels for data-center and workstation architectures. As before, user-level caches can be enabled by setting the environment variable ROCFFT_RTC_CACHE_PATH to a writeable file location. ### Optimized * Improved MPI transform performance by using all-to-all communication for global transpose operations. Point-to-point communications are still used when all-to-all is not possible. * Improved the performance of unit-strided, complex interleaved, forward and inverse, length (64,64,64) FFTs. ### Resolved issues * Fixed incorrect results from 2-kernel 3D FFT plans that used non-default output strides. For more information, see the [rocFFT GitHub issue](https://github.com/ROCm/rocFFT/issues/507). * Plan descriptions can be reused with different strides for different plans. For more information, see the [rocFFT GitHub issue](https://github.com/ROCm/rocFFT/issues/504). * Fixed client packages to depend on hipRAND instead of rocRAND. * Fixed potential integer overflows during large MPI transforms. ## rocFFT 1.0.31 for ROCm 6.3.0 ### Added * rocfft-test now includes a --smoketest option. * Support for the gfx1151, gfx1200, and gfx1201 architectures. * Implemented experimental APIs to allow computing FFTs on data distributed across multiple MPI ranks. These APIs can be enabled with the `ROCFFT_MPI_ENABLE` CMake option. This option defaults to `OFF`. When `ROCFFT_MPI_ENABLE` is set to `ON`: * `rocfft_plan_description_set_comm` can be called to provide an MPI communicator to a plan description, which can then be passed to `rocfft_plan_create`. Each rank calls `rocfft_field_add_brick` to specify the layout of data bricks on that rank. 
* An MPI library with ROCm acceleration enabled is required at build time and at runtime. ### Changed * Compilation uses amdclang++ instead of hipcc. * CLI11 replaces Boost Program Options as the command line parser for clients and samples. ## rocFFT 1.0.30 for ROCm 6.2.4 ### Optimizations * Implemented 1D kernels for factorizable sizes > 1024 and < 2048. ### Fixes * Fixed plan creation failure on some even-length real-complex transforms that use Bluestein's algorithm. ### Additions * GFX1151 Support ## rocFFT 1.0.29 for ROCm 6.2.1 ### Optimizations * Implemented 1D kernels for factorizable sizes < 1024 ## rocFFT 1.0.28 for ROCm 6.2.0 ### Optimizations * Implemented multi-device transform for 3D pencil decomposition. Contiguous dimensions on input and output bricks are transformed locally, with global transposes to make remaining dimensions contiguous. ### Changes * Add option in dyna-bench to load the libs in forward and then reverse order for benchmark tests. * Randomly generated accuracy tests are now disabled by default; these can be enabled using the --nrand option (which defaults to 0). * Use Bonferroni multi-hypothesis testing framework by default for benchmark tests. ## rocFFT 1.0.27 for ROCm 6.1.1 ### Fixes * Fixed kernel launch failure on execute of very large odd-length real-complex transforms. 
### Additions * Enable multi-gpu testing on systems without direct GPU-interconnects ## rocFFT 1.0.26 for ROCm 6.1.0 ### Changes * Multi-device FFTs now allow batch greater than 1 * Multi-device, real-complex FFTs are now supported * rocFFT now statically links libstdc++ when only `std::experimental::filesystem` is available (to guard against ABI incompatibilities with newer libstdc++ libraries that include `std::filesystem`) ## rocFFT 1.0.25 for ROCm 6.0.0 ### Additions * Implemented experimental APIs to allow computing FFTs on data distributed across multiple devices in a single process * `rocfft_field` is a new type that can be added to a plan description to describe the layout of FFT input or output * `rocfft_field_add_brick` can be called to describe the brick decomposition of an FFT field, where each brick can be assigned a different device These interfaces are still experimental and subject to change. We are interested in getting feedback. You can raise questions and concerns by opening issues in the [rocFFT issue tracker](https://github.com/ROCmSoftwarePlatform/rocFFT/issues). Note that multi-device FFTs currently have several limitations (we plan to address these in future releases): * Real-complex (forward or inverse) FFTs are not supported * Planar format fields are not supported * Batch (the `number_of_transforms` provided to `rocfft_plan_create`) must be 1 * FFT input is gathered to the current device at run time, so all FFT data must fit on that device ### Optimizations * Improved the performance of several 2D/3D real FFTs supported by `2D_SINGLE` kernel. 
Offline tuning provides more optimization for fx90a * Removed an extra kernel launch from even-length, real-complex FFTs that use callbacks ### Changes * Built kernels in a solution map to the library kernel cache * Real forward transforms (real-to-complex) no longer overwrite input; rocFFT may still overwrite real inverse (complex-to-real) input, as this allows for faster performance * `rocfft-rider` and `dyna-rocfft-rider` have been renamed to `rocfft-bench` and `dyna-rocfft-bench`; these are controlled by the `BUILD_CLIENTS_BENCH` CMake option * Links for the former file names are installed, and the former `BUILD_CLIENTS_RIDER` CMake option is accepted for compatibility, but both will be removed in a future release * Binaries in debug builds no longer have a `-d` suffix ### Fixes * rocFFT now correctly handles load callbacks that convert data from a smaller data type (e.g., 16-bit integers -> 32-bit float) ## rocFFT 1.0.24 for ROCm 5.7.0 ### Optimizations * Improved the performance of complex forward/inverse 1D FFTs (2049 <= length <= 131071) that use Bluestein's algorithm ### Additions * Implemented a solution map version converter and finished the first conversion from ver.0 to ver.1 * Version 1 removes some incorrect kernels (sbrc/sbcr using `half_lds`) ### Changes * Moved `rocfft_rtc_helper` executable to the `lib/rocFFT` directory on Linux * Moved library kernel cache to the `lib/rocFFT` directory ## rocFFT 1.0.23 for ROCm 5.6.0 ### Additions * Implemented half-precision transforms; these can be requested by passing `rocfft_precision_half` to `rocfft_plan_create` * Implemented a hierarchical solution map that saves information on how to decompose a problem and the kernels that are used * Implemented a first version of offline-tuner to support tuning kernels for C2C and Z2Z problems ### Changes * Replaced `std::complex` with hipComplex data types for the data generator * FFT plan dimensions are now sorted to be row-major internally where possible, which 
produces better plans if the dimensions were accidentally specified in a different order (column-major, for example) * Added the `--precision` argument to benchmark and test clients (`--double` is still accepted but is deprecated as a method to request a double-precision transform) * Improved performance test suite statistical framework ### Fixes * Fixed over-allocation of LDS in some real-complex kernels, which was resulting in kernel launch failure ## rocFFT 1.0.22 for ROCm 5.5.0 ### Optimizations * Improved the performance of 1D lengths < 2048 that use Bluestein's algorithm * Reduced code generation time during plan creation * Optimized 3D R2C and C2R lengths 32, 84, 128 * Optimized batched small 1D R2C and C2R cases ### Additions * Added gfx1101 to default `AMDGPU_TARGETS` ### Changes * Moved client programs to C++17 * Moved planar kernels and infrequently used Stockham kernels to be runtime-compiled * Moved transpose, real-complex, Bluestein, and Stockham kernels to the library kernel cache ### Fixes * Removed zero-length twiddle table allocations, which fixes errors from `hipMallocManaged` * Fixed incorrect freeing of HIP stream handles during twiddle computation when multiple devices are present ## rocFFT 1.0.21 for ROCm 5.4.3 ### Fixes * Removed the source directory from `rocm_install_targets` to prevent the installation of `rocfft.h` in an unintended location ## rocFFT 1.0.20 for ROCm 5.4.1 ### Fixes * Fixed incorrect results on strided large 1D FFTs where batch size does not equal the stride ## rocFFT 1.0.19 for ROCm 5.4.0 ### Optimizations * Optimized some strided large 1D plans ### Additions * Added the `rocfft_plan_description_set_scale_factor` API to efficiently multiply each output element of an FFT by a given scaling factor * Created a `rocfft_kernel_cache.db` file next to the installed library; SBCC, CR, and RC kernels are moved to this file when built with the library, and are runtime-compiled for new GPU architectures * Added gfx1100 and gfx1102 
to default `AMDGPU_TARGETS` ### Changes * Moved the runtime compilation cache to in-memory by default * A default on-disk cache can encounter contention problems on multi-node clusters with a shared filesystem * rocFFT can still use an on-disk cache by setting the `ROCFFT_RTC_CACHE_PATH` environment variable ## rocFFT 1.0.18 for ROCm 5.3.0 ### Changes * The runtime compilation cache now looks for environment variables `XDG_CACHE_HOME` (on Linux) and `LOCALAPPDATA` (on Windows) before falling back to `HOME` * Moved computation of the twiddle table from the host to the device ### Optimizations * Optimized 2D R2C and C2R to use 2-kernel plans where possible * Improved performance of the Bluestein algorithm * Optimized sbcc-168 and 100 by using half-LDS * Optimized length-280 2D and 3D transforms * Added kernels for factorizable 1D lengths < 128 ### Fixes * Fixed occasional failures to parallelize runtime compilation of kernels (failures would be retried serially and ultimately succeed, but this would take extra time) * Fixed failures of some R2C 3D transforms that use the unsupported `TILE_UNALGNED` SBRC kernels (an example is 98^3 R2C out-of-place) * Fixed bugs in the `SBRC_ERC` type ## rocFFT 1.0.17 for ROCm 5.2.0 ### Additions * Packages for test and benchmark executables on all supported operating systems using CPack * Added file and folder reorganization changes, with backward compatibility support, using `rocm-cmake` wrapper functions ### Changes * Improved reuse of twiddle memory between plans * Set a default load/store callback when only one callback type is set via the API (for improved performance) * Updated the GoogleTest dependency to version 1.11 ### Optimizations * Introduced a new access pattern of LDS (non-linear) and applied it on sbcc kernels len 64 and 81 for a performance improvement * Applied `lds-non-linear`, `direct-load-to-register`, and `direct-store-from-register` on sbcr kernels for a performance improvement ### Fixes * Correctness of 
certain transforms with unusual strides * Incorrect handling of user-specified stream for runtime-compiled kernels * Incorrect buffer allocation in `rocfft-test` on in-place transforms with different input and output sizes ## rocFFT 1.0.16 for ROCm 5.1.0 ### Changes * Supported unaligned tile dimension for `SBRC_2D` kernels * Improved test and benchmark infrastructure by adding RAII * Enabled runtime compilation of length-2304 FFT kernel during plan creation * Added tokenizer for test suite * Reduce twiddle memory requirements for even-length, real-complex transforms * Clients can now be built separately from the main library ### Optimizations * Optimized more large 1D cases by using `L1D_CC` plan * Optimized the 3D 200^3 C2R case * Optimized the 1D 2^30 double precision on MI200 * Added padding to work buffer sizes to improve performance in many cases ### Fixes * Fixed the correctness of some R2C transforms with unusual strides ### Removals * The hipFFT API (header) has been removed; use the [hipFFT](https://github.com/ROCmSoftwarePlatform/hipFFT) package or repository to obtain the API ## rocFFT 1.0.15 for ROCm 5.0.0 ### Changes * Enabled runtime compilation of single FFT kernels > length 1024 * Re-aligned the split device library into four roughly equal libraries * Implemented the FuseShim framework to replace the original OptimizePlan * Implemented the generic buffer-assignment framework * The buffer assignment is no longer performed by each node--we designed a generic algorithm to test and pick the best assignment path * With the help of FuseShim, we can achieve the most kernel-fusions possible * Don't read the imaginary part of the DC and Nyquist modes for even-length complex-to-real transforms ### Optimizations * Optimized twiddle conjugation; complex-to-complex inverse transforms should now have similar performance to forward transforms * Improved performance of single-kernel, small 2D transforms ## rocFFT 1.0.14 for ROCm 4.5.0 ### Optimizations * Optimized 
SBCC kernels of lengths 52, 60, 72, 80, 84, 96, 104, 108, 112, 160, 168, 208, 216, 224, and 240 with a new kernel generator ### Additions * Added support for Windows 10 as a build target ### Changes * Packaging has been split into a runtime package (`rocfft`) and a development package (`rocfft-devel`): The development package depends on the runtime package. When installing the runtime package, the package manager will suggest the installation of the development package to aid users transitioning from the previous version's combined package. This suggestion by package manager is for all supported operating systems (except CentOS 7) to aid in the transition. The `suggestion` feature in the runtime package is introduced as a deprecated feature and will be removed in a future ROCm release. ### Fixes * Fixed validation failures for even-length R2C inplace 2D and 3D cubics sizes, such as 100^2 (or ^3), 200^2 (or ^3), and 256^2 (or ^3) * We combine two kernels (`r2c-transpose`) instead of combining the three kernels (`stockham-r2c-transpose`) ### Changes * Split 2D device code into separate libraries ## rocFFT 1.0.13 for ROCm 4.4.0 ### Optimizations * Improved plans by removing unnecessary transpose steps * Optimized scheme selection for 3D problems * Imposed fewer restrictions on `3D_BLOCK_RC` selection (more problems can use `3D_BLOCK_RC` and have performance gains) * Enabled `3D_RC`; some 3D problems with SBCC-supported z-dim can use fewer kernels to get benefits * Forced `--length` 336 336 56 (dp) to use faster `3D_RC` to prevent it from being skipped by a conservative threshold test * Optimized some even-length R2C/C2R cases by doing more in-place operations and combining pre- and post-processing into Stockham kernels * Added radix-17 ### Additions * Added a new kernel generator for select fused 2D transforms ### Fixes * Improved large 1D transform decompositions ## rocFFT 1.0.12 for ROCm 4.3.0 ### Changes * Re-split device code into single-precision, 
double-precision, and miscellaneous kernels ### Fixes * Fixed potential crashes in double-precision planar->planar transpose * Fixed potential crashes in 3D transforms with unusual strides for SBCC-optimized sizes * Improved buffer placement logic ### Additions * Added a new kernel generator for select lengths; new kernels have improved performance * Added public `rocfft_execution_info_set_load_callback` and`rocfft_execution_info_set_store_callback` API functions to allow running extra logic when loading data from and storing data to global memory during a transform ### Removals * Removed R2C pair schemes and kernels ### Optimizations * Optimized 2D and 3D R2C 100 and 1D Z2Z 2500 * Reduced number of kernels for 2D/3D sizes where higher dimension is 64, 128, 256 ### Fixes * Fixed potential crashes in 3D transforms with unusual strides, for SBCC-optimized sizes ## rocFFT 1.0.11 for ROCm 4.2.0 ### Changes * Move device code into the main library ### Optimizations * Improved performance for single-precision kernels exercising all except radix-2/7 butterfly ops * Minor optimization for C2R 3D 100 and 200 cube sizes * Optimized some C2C and R2C 3D 64, 81, 100, 128, 200, and 256 rectangular sizes * When factoring, test to see if the remaining length is explicitly supported * Explicitly added radix-7 lengths 14, 21, and 224 to list of supported lengths * Optimized R2C 2D and 3D 128, 200, and 256 cube sizes ### Known issues * Fixed potential crashes in small 3D transforms with unusual strides ([issue 311](https://github.com/ROCmSoftwarePlatform/rocFFT/issues/311)) * Fixed potential crashes when running transforms on multiple devices ([issue 310](https://github.com/ROCmSoftwarePlatform/rocFFT/issues/310)) ## rocFFT 1.0.10 for ROCm 4.1.0 ### Additions * Explicitly specify `MAX_THREADS_PER_BLOCK` through `__launch_bounds_` for all kernels * Switched to a new syntax for specifying AMD GPU architecture names and features ### Optimizations * Optimized C2C and R2C 3D 64, 81, 100, 
128, 200, and 256 cube sizes * Improved the performance of the standalone out-of-place transpose kernel * Optimized the 1D length 40000 C2C case * Enabled radix-7 for size 336 * New radix-11 and radix-13 kernels; used in length 11 and 13 (and some of their multiples) transforms ### Changes * rocFFT now automatically allocates a work buffer if the plan requires one and none is provided * An explicit `rocfft_status_invalid_work_buffer` error is now returned when a work buffer of insufficient size is provided * Updated online documentation * Updated the Debian package naming so that the package name and version are separated by an underscore ( _ ) * Adjusted accuracy test tolerances and how they are compared ### Fixes * Fixed a 4x4x8192 accuracy failure ## rocFFT 1.0.8 for ROCm 3.10.0 ### Optimizations * Optimized the 1D length 10000 C2C case ### Changes * Added the `BUILD_CLIENTS_ALL` CMake option ### Fixes * Fixed the correctness of SBCC and SBRC kernels with non-unit strides * Fixed fused C2R kernel when a Bluestein transform follows it ## rocFFT 1.0.7 for ROCm 3.9.0 ### Optimizations * New R2C and C2R fused kernels to combine pre- and post-processing steps with transpose * Enabled diagonal transpose for 1D and 2D power-of-2 cases * New single kernels for small power-of-2, 3, and 5 sizes * Added more radix-7 kernels ### Changes * Explicitly disabled XNACK and SRAM-ECC features on AMDGPU hardware ### Fixes * Fixed 2D C2R transform with length 1 on one dimension * Fixed a potential thread-safety issue in logging ## rocFFT 1.0.6 for ROCm 3.8.0 ### Optimizations * Improved the performance of 1D batch-paired R2C transforms of odd length * Added some radix-7 kernels * Improved the performance for 1D length 6561 and 10000 * Improved the performance for certain 2D transform sizes ### Changes * Allowed a static library build with `BUILD_SHARED_LIBS=OFF` CMake option * Updated GoogleTest dependency to version 1.10 ### Fixes * Correctness of certain large 2D sizes ## rocFFT 1.0.5 for ROCm 3.7.0 ### Optimizations *
Optimized C2C power-of-2 middle sizes ### Changes * Parallelized work in unit tests and eliminated duplicate cases ### Fixes * Correctness of certain large 1D, and 2D power-of-3 and 5 sizes * Incorrect buffer assignment for some even-length R2C transforms * `<cstddef>` inclusion on C compilers * Incorrect results on non-unit strides with SBCC/SBRC kernels rocFFT-rocm-7.1.0/CMakeLists.txt000066400000000000000000000245261506652163400163610ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2025 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE.
# #############################################################################

cmake_minimum_required( VERSION 3.16 )

# We use C++17 features, this will add compile option: -std=c++17
# The standard is marked as required so that configuration fails instead of
# silently falling back to an older standard.
set( CMAKE_CXX_STANDARD 17 )
set( CMAKE_CXX_STANDARD_REQUIRED ON )

# This should appear before the project command, because it does not
# use FORCE
#
# PROJECT_BINARY_DIR is only defined once a project() command has run.  When
# this file is the top-level CMakeLists.txt no project() has run yet, so fall
# back to CMAKE_CURRENT_BINARY_DIR (the value PROJECT_BINARY_DIR takes after
# the project() call below) instead of expanding to "/package".
if( PROJECT_BINARY_DIR )
  set( _rocfft_default_prefix_root "${PROJECT_BINARY_DIR}" )
else( )
  set( _rocfft_default_prefix_root "${CMAKE_CURRENT_BINARY_DIR}" )
endif( )

if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${_rocfft_default_prefix_root}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Set before project(); clients/CMakeLists.txt checks this variable.
set( ROCFFT_BUILD_SCOPE ON )

project( rocfft LANGUAGES CXX C )

# This finds the rocm-cmake project, and installs it if not found
# rocm-cmake contains common cmake code for rocm projects to help setup and install
set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern )
find_package( ROCmCMakeBuildTools PATHS ${ROCM_PATH} /opt/rocm )
if( NOT ROCmCMakeBuildTools_FOUND )
  include( FetchContent )
  FetchContent_Declare(
    rocm_cmake_local
    GIT_REPOSITORY https://github.com/ROCm/rocm-cmake
    GIT_TAG        rocm-6.4.1
    GIT_SHALLOW    ON
  )
  FetchContent_MakeAvailable( rocm_cmake_local )
  # Configure and install the fetched rocm-cmake into PROJECT_EXTERN_DIR so
  # that the find_package() below can pick it up.
  execute_process(
    COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake .
    WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR} )
  execute_process(
    COMMAND ${CMAKE_COMMAND} --build ${rocm_cmake_local_SOURCE_DIR} --target install
    WORKING_DIRECTORY ${rocm_cmake_local_SOURCE_DIR} )
  find_package( ROCmCMakeBuildTools REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake )
endif( )

include( ROCMSetupVersion )
include( ROCMCreatePackage )
include( ROCMInstallTargets )
include( ROCMPackageConfigHelpers )
include( ROCMInstallSymlinks )
include( ROCMCheckTargetIds )
include( ROCMClients )
include( ROCMHeaderWrapper )

if( ROCM_PATH )
  list( APPEND CMAKE_BUILD_RPATH ${ROCM_PATH}/lib )
endif()

# Using standardized versioning from rocm-cmake
set ( VERSION_STRING "1.0.35" )
rocm_setup_version( VERSION ${VERSION_STRING} )

# Append our library helper cmake path and the cmake path for hip (for
# convenience).
# Users may override HIP path by specifying their own in CMAKE_MODULE_PATH
list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Enable verbose output
option( BUILD_VERBOSE "Output additional build information" OFF )

# BUILD_SHARED_LIBS is a cmake built-in; we make it an explicit option
# such that it shows in cmake-gui
option( BUILD_SHARED_LIBS "Build rocFFT as a shared library" ON )

option( WERROR "Treat warnings as errors" OFF )

option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF)

option(ROCFFT_RUNTIME_COMPILE_DEFAULT "Compile kernels at runtime by default" OFF)

# Using -DROCFFT_BUILD_OFFLINE_TUNER=ON to compile an executable,
# Set default to OFF since users are not likely to tune
option(ROCFFT_BUILD_OFFLINE_TUNER "Build with offline tuner executable rocfft_offline_tuner" OFF)

# Provide ability to disable hipRAND dependency
option(USE_HIPRAND "Use hipRAND to provide device-side input generation" ON)

# Split up function pool compilation across N files to parallelize its build
set(ROCFFT_FUNCTION_POOL_N 8 CACHE STRING "Number of files to split function_pool into for compilation")

set( WARNING_FLAGS
  -Wall
  -Wno-unused-function
  -Wimplicit-fallthrough
  -Wunreachable-code
  -Wsign-compare
  -Wno-deprecated-declarations
  )
if( WERROR )
  set( WARNING_FLAGS ${WARNING_FLAGS} -Werror )
endif( )

set(DEFAULT_GPUS
  gfx803
  gfx900
  gfx906
  gfx908
  gfx90a
  gfx942
  gfx950
  gfx1030
  gfx1100
  gfx1101
  gfx1102
  gfx1151
  gfx1200
  gfx1201)

if(BUILD_ADDRESS_SANITIZER)
  add_compile_options(-fsanitize=address)
  add_link_options(-fsanitize=address)
  add_link_options(-shared-libasan)
  # Sanitizer builds replace the default target list with xnack+ targets only.
  set(DEFAULT_GPUS
    gfx908:xnack+
    gfx90a:xnack+
    gfx942:xnack+)
  add_link_options(-fuse-ld=lld)
  set(ROCFFT_KERNEL_CACHE_ENABLE off)
  add_compile_definitions(ADDRESS_SANITIZER)
endif()

# Build only for local GPU architecture
if (BUILD_LOCAL_GPU_TARGET_ONLY)
  message(STATUS "Building only for local GPU target")
  if (COMMAND rocm_local_targets)
    rocm_local_targets(DEFAULT_GPUS)
  else()
    message(WARNING "Unable to determine local GPU targets. Falling back to default GPUs.")
  endif()
endif()

if(AMDGPU_TARGETS AND NOT GPU_TARGETS)
  message( DEPRECATION "AMDGPU_TARGETS use is deprecated. Use GPU_TARGETS." )
endif()
set(AMDGPU_TARGETS "${DEFAULT_GPUS}" CACHE STRING "Target default GPUs if AMDGPU_TARGETS is not defined. (Deprecated, prefer GPU_TARGETS)")
rocm_check_target_ids(AMDGPU_TARGETS TARGETS "${AMDGPU_TARGETS}")
# Don't force, users should be able to override GPU_TARGETS at the command line if desired
set(GPU_TARGETS "${AMDGPU_TARGETS}" CACHE STRING "GPU architectures to build for")

# HIP is required - library and clients use HIP to access the device
# ROCM_PATH (when set) is offered as an additional search prefix, matching
# the ROCmCMakeBuildTools lookup above; it expands to nothing when unset.
find_package( hip REQUIRED CONFIG PATHS ${ROCM_PATH} /opt/rocm/lib/cmake/hip/ )
find_package( hiprtc REQUIRED CONFIG PATHS ${ROCM_PATH} /opt/rocm/lib/cmake/hiprtc/ )

# The nvidia backend can be used to compile for CUDA devices.
# Specify the CUDA prefix in the CUDA_PREFIX variable.
# CUDA_ARCH (e.g. sm_75) is also required.
if( USE_CUDA )
  if( NOT DEFINED CUDA_PREFIX )
    message( FATAL_ERROR "CUDA_PREFIX variable is required (e.g. /usr/local/cuda-11.4)" )
  endif()
  if( NOT DEFINED CUDA_ARCH )
    message( FATAL_ERROR "CUDA_ARCH variable is required. (e.g. sm_75)" )
  endif()
  add_compile_options(-I${HIP_ROOT_DIR}/include -I${CUDA_PREFIX}/include -D__HIP_PLATFORM_NVIDIA__)
  add_link_options(-L${CUDA_PREFIX}/lib64 -pthread)
endif( )

# hipcc automatically provides HIP include dirs and HIP platform,
# but plain clang needs to be told
if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" )
  include_directories( ${HIP_INCLUDE_DIRS} )
  if( USE_CUDA )
    add_compile_definitions( __HIP_PLATFORM_NVIDIA__ )
  else()
    add_compile_definitions( __HIP_PLATFORM_AMD__ )
  endif()
endif()

# Enable MPI support in rocFFT:
option(ROCFFT_MPI_ENABLE "Enable MPI" OFF)
option(ROCFFT_CRAY_MPI_ENABLE "Cray MPI" OFF)

if( ROCFFT_MPI_ENABLE )
  find_package( MPI REQUIRED )
  include_directories(SYSTEM ${MPI_INCLUDE_PATH})
endif()

add_subdirectory( library )

include( clients/cmake/build-options.cmake )

# Build clients of the library
if( BUILD_CLIENTS )
  set( BUILD_CLIENTS_BENCH ON )
  set( BUILD_CLIENTS_SAMPLES ON )
  set( BUILD_CLIENTS_TESTS ON )
endif( )

# old name for BUILD_CLIENTS_BENCH
if( BUILD_CLIENTS_RIDER )
  set( BUILD_CLIENTS_BENCH ${BUILD_CLIENTS_RIDER} )
endif()

if( BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_BENCH )
  if( NOT CLIENTS_OS )
    rocm_set_os_id( CLIENTS_OS )
  endif()

  # Package names of the OpenMP and FFTW runtime dependencies of the test
  # client; the RPM names differ on SLES.
  if(BUILD_CLIENTS_TESTS AND (NOT DEFINED BUILD_CLIENTS_TESTS_OPENMP OR BUILD_CLIENTS_TESTS_OPENMP))
    set(OPENMP_DEB "libgomp1")
    set(FFTW_DEB "libfftw3-bin")
    if(CLIENTS_OS STREQUAL "sles")
      set(OPENMP_RPM "libgomp1")
      set(FFTW_RPM "libfftw3-3")
    else()
      set(OPENMP_RPM "libgomp")
      set(FFTW_RPM "fftw-libs")
    endif()
  endif()

  rocm_package_setup_component(clients)
  if( USE_HIPRAND )
    set( HIPRAND_DEP hiprand )
  endif()
  if(BUILD_CLIENTS_TESTS)
    rocm_package_setup_client_component(
      tests
      DEPENDS
        DEB ${OPENMP_DEB} ${FFTW_DEB} ${HIPRAND_DEP}
        RPM ${OPENMP_RPM} ${FFTW_RPM} ${HIPRAND_DEP}
    )
  endif()
  if(BUILD_CLIENTS_BENCH)
    rocm_package_setup_client_component(
      benchmarks
      DEPENDS
        DEB
${HIPRAND_DEP} RPM ${HIPRAND_DEP} ) endif() add_subdirectory( clients ) endif( ) if(WIN32) set(CPACK_SOURCE_GENERATOR "ZIP") set(CPACK_GENERATOR "ZIP") if(CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT) set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE) endif() set(INSTALL_PREFIX "C:/hipSDK") set(CPACK_SET_DESTDIR OFF) set(CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK") set(CPACK_PACKAGING_INSTALL_PREFIX "") set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) endif() # Package specific CPACK vars string( TOLOWER "${HIP_RUNTIME}" HIP_RUNTIME_LOWER ) if( HIP_RUNTIME_LOWER STREQUAL "rocclr" ) if(BUILD_ADDRESS_SANITIZER) set(DEPENDS_HIP_RUNTIME "hip-runtime-amd-asan" ) else() set(DEPENDS_HIP_RUNTIME "hip-runtime-amd" ) endif() rocm_package_add_dependencies("${DEPENDS_HIP_RUNTIME} >= 4.5.0") endif( ) set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" ) set( CPACK_RPM_PACKAGE_LICENSE "MIT" ) set( CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" ) set( ROCFFT_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file" ) set( package_name rocfft ) rocm_create_package( NAME ${package_name} DESCRIPTION "ROCm FFT library" MAINTAINER "rocfft-maintainer@amd.com" LDCONFIG LDCONFIG_DIR ${ROCFFT_CONFIG_DIR} ) option(BUILD_CODE_COVERAGE "Build with code coverage flags (clang only)" OFF) rocFFT-rocm-7.1.0/CppCheckSuppressions.txt000066400000000000000000000003421506652163400204660ustar00rootroot00000000000000// generator uses implicit constructors for convenience noExplicitConstructor:library/src/device/generator/generator.h // has some false positives and isn't hard to run manually for periodic // dead code sweeps unusedFunction rocFFT-rocm-7.1.0/LICENSE.md000066400000000000000000000053601506652163400152200ustar00rootroot00000000000000MIT License Copyright (C) Advanced Micro Devices, Inc. 
Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. --- This product includes software from copyright holders as shown below, and distributed under their license terms as specified. CLI11 2.2 Copyright (c) 2017-2024 University of Cincinnati, developed by Henry Schreiner under NSF AWARD 1414736. All rights reserved. Redistribution and use in source and binary forms of CLI11, with or without modification, are permitted provided that the following conditions are met: 1. Redistributions of source code must retain the above copyright notice, this list of conditions and the following disclaimer. 2. Redistributions in binary form must reproduce the above copyright notice, this list of conditions and the following disclaimer in the documentation and/or other materials provided with the distribution. 3. Neither the name of the copyright holder nor the names of its contributors may be used to endorse or promote products derived from this software without specific prior written permission. 
THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. rocFFT-rocm-7.1.0/README.md000066400000000000000000000107561506652163400151000ustar00rootroot00000000000000# rocFFT rocFFT is a software library for computing fast Fourier transforms (FFTs) written in the HIP programming language. It's part of the AMD software ecosystem based on [ROCm](https://github.com/ROCm/ROCm). The rocFFT library can be used with AMD GPUs. ## Documentation > [!NOTE] > The published rocFFT documentation is available at [rocFFT](https://rocm.docs.amd.com/projects/rocFFT/en/latest/index.html) in an organized, easy-to-read format, with search and a table of contents. The documentation source files reside in the projects/rocfft/docs folder of the rocm-libraries repository. As with all ROCm projects, the documentation is open source. For more information, see [Contribute to ROCm documentation](https://rocm.docs.amd.com/en/latest/contribute/contributing.html). To build our documentation locally, use the following code: ```Bash cd projects/rocfft/docs pip3 install -r sphinx/requirements.txt python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html ``` ## Build and install You can install rocFFT using pre-built packages or building from source. * Installing pre-built packages: 1. 
Download the pre-built packages from the [ROCm package servers](https://rocm.docs.amd.com/en/latest/deploy/linux/index.html) or use the GitHub releases tab to download the source (this may give you a more recent version than the pre-built packages). 2. Run: `sudo apt update && sudo apt install rocfft` * Building from source: rocFFT is compiled with AMD's clang++ and uses CMake. You can specify several options to customize your build. The following commands build a shared library for supported AMD GPUs. Run these commands from the `rocm-libraries/projects/rocfft` directory: ```bash mkdir build && cd build cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_C_COMPILER=amdclang .. make -j ``` You can compile a static library using the `-DBUILD_SHARED_LIBS=off` option. With rocFFT, you can use indirect function calls by default; this requires ROCm 4.3 or higher. You can use `-DROCFFT_CALLBACKS_ENABLED=off` with CMake to prevent these calls on older ROCm compilers. Note that with this configuration, callbacks won't work correctly. rocFFT includes the following clients: * `rocfft-bench`: Runs general transforms and is useful for performance analysis * `rocfft-test`: Runs various regression tests * Various small samples | Client | CMake option | Dependencies | |:------|:-----------------|:-----------------| | `rocfft-bench` | `-DBUILD_CLIENTS_BENCH=on` | hipRAND | | `rocfft-test` | `-DBUILD_CLIENTS_TESTS=on` | hipRAND, FFTW, GoogleTest | | samples | `-DBUILD_CLIENTS_SAMPLES=on` | None | | coverage | `-DBUILD_CODE_COVERAGE=ON` | clang, llvm-cov | Clients are not built by default. To build them, use `-DBUILD_CLIENTS=on`. The build process downloads and builds GoogleTest and FFTW if they are not already installed. Clients can be built separately from the main library. 
For example, you can build all the clients with an existing rocFFT library by invoking CMake from within the `rocFFT-src/clients` folder: ```bash mkdir build && cd build cmake -DCMAKE_CXX_COMPILER=amdclang++ -DCMAKE_PREFIX_PATH=/path/to/rocFFT-lib .. make -j ``` To install client dependencies on Ubuntu, run: ```bash sudo apt install libgtest-dev libfftw3-dev libboost-dev ``` rocFFT uses version 1.11 of GoogleTest. You can generate a test coverage report with the following: ```bash cmake -DCMAKE_CXX_COMPILER=amdclang++ -DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CODE_COVERAGE=ON .. make -j coverage ``` The above will output the coverage report to the terminal and also save an html coverage report to `$PWD/coverage-report`. ## Examples A summary of the latest functionality and workflow to compute an FFT with rocFFT is available on the [rocFFT documentation portal](https://rocm.docs.amd.com/projects/rocFFT/en/latest/). You can find additional examples in the `clients/samples` subdirectory. ## Support You can report bugs and feature requests through the rocm-libraries GitHub [issue tracker](https://github.com/ROCm/rocm-libraries/issues). ## Contribute If you want to contribute to rocFFT, you must follow the [contribution guidelines](https://github.com/ROCm/rocm-libraries/blob/develop/projects/rocfft/.github/CONTRIBUTING.md). rocFFT-rocm-7.1.0/ValgrindSuppressions.txt000066400000000000000000000002671506652163400205620ustar00rootroot00000000000000{ Memcheck:Param sched_setaffinity(mask) ... fun:hipMalloc } { Memcheck:Param sched_setaffinity(mask) ... fun:hipMemGetInfo }rocFFT-rocm-7.1.0/clients/000077500000000000000000000000001506652163400152515ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/CMakeLists.txt000066400000000000000000000106731506652163400200200ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
#
# PROJECT_BINARY_DIR is only defined once a project() command has run.  When
# the clients are configured on their own, no project() has run at this point,
# so fall back to CMAKE_CURRENT_BINARY_DIR rather than expanding to
# "/package".  When an enclosing project already defined PROJECT_BINARY_DIR,
# that value is used unchanged.
if( PROJECT_BINARY_DIR )
  set( _rocfft_clients_prefix_root "${PROJECT_BINARY_DIR}" )
else( )
  set( _rocfft_clients_prefix_root "${CMAKE_CURRENT_BINARY_DIR}" )
endif( )

if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${_rocfft_clients_prefix_root}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "${_rocfft_clients_prefix_root}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
  set( CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

set( ROCFFT_CLIENTS_BUILD_SCOPE ON )

# This project may compile dependencies for clients
project( rocfft-clients LANGUAGES CXX C )

set(CMAKE_CXX_STANDARD 17)

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

if( NOT ROCmCMakeBuildTools_FOUND )
  find_package( ROCmCMakeBuildTools REQUIRED )
endif()

include( ROCMInstallTargets )

# Adding Version File to rocfft-client, this avoids empty rocfft-client package
file ( WRITE "${PROJECT_BINARY_DIR}/package/client-version" "${rocfft_VERSION_MAJOR}.${rocfft_VERSION_MINOR}.${rocfft_VERSION_PATCH}-${BUILD_ID}\n" )

rocm_install ( FILES ${PROJECT_BINARY_DIR}/package/client-version DESTINATION .info COMPONENT clients)

# This option only works for make/nmake and the ninja generators, but
# no reason it shouldn't be on all the time.
# This tells cmake to create a compile_commands.json file that can be
# used with clang tooling or vim.
set( CMAKE_EXPORT_COMPILE_COMMANDS ON )

# When configured outside the main rocFFT build (ROCFFT_BUILD_SCOPE unset)
# and no client was explicitly requested, build all of them.
if(NOT ROCFFT_BUILD_SCOPE AND
    NOT BUILD_CLIENTS_SAMPLES AND
    NOT BUILD_CLIENTS_TESTS AND
    NOT BUILD_CLIENTS_BENCH)
  set( BUILD_CLIENTS_SAMPLES ON )
  set( BUILD_CLIENTS_TESTS ON )
  set( BUILD_CLIENTS_BENCH ON )
endif()

# each backend requires different libraries for host and device code
if( USE_CUDA )
  if( NOT DEFINED CUDA_PREFIX )
    message( FATAL_ERROR "CUDA_PREFIX variable is required." )
  endif()
  if( NOT DEFINED CUDA_ARCH )
    message( FATAL_ERROR "CUDA_ARCH variable is required."
) endif() add_compile_options(-I${HIP_ROOT_DIR}/include -I${CUDA_PREFIX}/include -D__HIP_PLATFORM_NVIDIA__) add_link_options(-L${CUDA_PREFIX}/lib64 -pthread) add_compile_options(--cuda-path=${CUDA_PREFIX} --cuda-gpu-arch=${CUDA_ARCH} -xcuda) set( ROCFFT_CLIENTS_HOST_LINK_LIBS -lcudart -ldl -lrt ) else() set( ROCFFT_CLIENTS_HOST_LINK_LIBS hip::host ) set( ROCFFT_CLIENTS_DEVICE_LINK_LIBS hip::device ) endif() if( ROCFFT_MPI_ENABLE ) find_package( MPI REQUIRED ) endif() if( BUILD_CLIENTS_SAMPLES ) add_subdirectory( samples ) endif( ) if( BUILD_CLIENTS_TESTS ) add_subdirectory( tests ) endif( ) if( BUILD_CLIENTS_BENCH ) add_subdirectory( bench ) endif( ) rocFFT-rocm-7.1.0/clients/bench/000077500000000000000000000000001506652163400163305ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/bench/CMakeLists.txt000066400000000000000000000122011506652163400210640ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
#
# PROJECT_BINARY_DIR is only defined once a project() command has run.  When
# this directory is configured on its own, no project() has run at this point,
# so fall back to CMAKE_CURRENT_BINARY_DIR rather than expanding to
# "/package".  When an enclosing project already defined PROJECT_BINARY_DIR,
# that value is used unchanged.
if( PROJECT_BINARY_DIR )
  set( _rocfft_bench_prefix_root "${PROJECT_BINARY_DIR}" )
else( )
  set( _rocfft_bench_prefix_root "${CMAKE_CURRENT_BINARY_DIR}" )
endif( )

if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${_rocfft_bench_prefix_root}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D.  MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-bench LANGUAGES CXX )

set(CMAKE_CXX_STANDARD 17)

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Dependencies are only looked up when an enclosing build has not already
# provided them.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif( )

if( NOT HIP_FOUND )
  find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()

if( NOT ROCmCMakeBuildTools_FOUND )
  find_package( ROCmCMakeBuildTools REQUIRED )
endif()

if( USE_HIPRAND AND NOT hiprand_FOUND )
  find_package( hiprand REQUIRED )
endif()

include( ROCMInstallTargets )

set( bench_list rocfft-bench dyna-rocfft-bench )

foreach( bench ${bench_list})

  # The two executables differ only in their main source file.  The loop
  # variable is quoted so that its expanded value is compared as a string and
  # is never re-evaluated as a variable name.
  if( "${bench}" STREQUAL "rocfft-bench" )
    add_executable( ${bench} ../../shared/array_validator.cpp bench.cpp bench.h )
  else()
    add_executable( ${bench} ../../shared/array_validator.cpp dyna-bench.cpp bench.h )
  endif()

  target_compile_options( ${bench} PRIVATE ${WARNING_FLAGS} -Wno-cpp )

  # NB: hip-clang includes omp.h, so we need to specify the location
  # of ROCM_CLANG_ROOT at cmake config time if we are using clang++.
target_include_directories( ${bench} PRIVATE $ ${HIP_CLANG_ROOT}/include ${ROCM_CLANG_ROOT}/include ) if(${bench} STREQUAL "rocfft-bench") target_link_libraries( ${bench} PRIVATE hip::device roc::rocfft ) else() target_link_libraries( ${bench} PRIVATE ${CMAKE_DL_LIBS} hip::device ) endif() if( USE_HIPRAND ) target_link_libraries( ${bench} PRIVATE hip::hiprand ) target_compile_definitions( ${bench} PRIVATE USE_HIPRAND ) endif() # We need to include both rocfft.h and rocfft-export.h target_include_directories( ${bench} PRIVATE ${CMAKE_BINARY_DIR}/include ${CMAKE_CURRENT_SOURCE_DIR}/../../library/include/ ${HIP_CLANG_ROOT}/include ) target_link_libraries( ${bench} PUBLIC ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ) if( ROCFFT_MPI_ENABLE ) target_link_libraries( ${bench} PRIVATE MPI::MPI_CXX ) if ( ROCFFT_CRAY_MPI_ENABLE) target_link_libraries( ${bench} PRIVATE "mpi_gtl_hsa" ) get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY ) target_link_directories( ${bench} PRIVATE ${MPI_LIBDIR}/../../../../gtl/lib ) endif() endif() set_target_properties( ${bench} PROPERTIES CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( BENCH_OUT_DIR "/../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( BENCH_OUT_DIR "/../bin" ) else() set( BENCH_OUT_DIR "/bin") endif() string( CONCAT BENCH_OUT_DIR "${PROJECT_BINARY_DIR}" ${BENCH_OUT_DIR} ) set_target_properties(${bench} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${BENCH_OUT_DIR} ) rocm_install(TARGETS ${bench} COMPONENT benchmarks) endforeach() # Link dyna-rocfft-bench to the experimental filesystem library if # it's not available in the standard library. include( ../../cmake/std-filesystem.cmake ) target_link_std_experimental_filesystem( dyna-rocfft-bench ) rocFFT-rocm-7.1.0/clients/bench/bench.cpp000066400000000000000000000362101506652163400201150ustar00rootroot00000000000000// Copyright (C) 2016 - 2024 Advanced Micro Devices, Inc. All rights reserved. 
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include
#include
#include
#include

#include "../../shared/CLI11.hpp"
#include "../../shared/arithmetic.h"
#include "../../shared/gpubuf.h"
#include "../../shared/hip_object_wrapper.h"
#include "../../shared/rocfft_params.h"
#include "bench.h"
#include "rocfft/rocfft.h"

// Entry point of rocfft-bench.
//
// Flow, as implemented below:
//   1. declare and parse the command line (CLI11);
//   2. fill `params` either from --token or from the individual options
//      (optionally distributing the problem over several GPUs);
//   3. validate the parameters and print the resulting token;
//   4. skip the run (returning EXIT_SUCCESS) if the estimated VRAM footprint
//      does not fit in the free device memory;
//   5. create the plan, allocate and initialise buffers, do one warm-up
//      execution, then time `ntrial` executions with HIP events;
//   6. print the per-trial times and the derived gflops figures.
//
// Returns the CLI11 exit code on a parse error, EXIT_FAILURE when the token
// cannot be parsed, and EXIT_SUCCESS / EXIT_FAILURE (depending on
// --ignore_runtime_failures) when hipMemGetInfo fails.
// NOTE(review): the std::runtime_error / LIB_V_THROW / HIP_V_THROW exceptions
// raised further down are not caught anywhere in this function.
int main(int argc, char* argv[])
{
    // This helps with mixing output of both wide and narrow characters to the screen
    std::ios::sync_with_stdio(false);

    // Control output verbosity:
    int verbose{};

    // number of GPUs to use:
    int ngpus{};

    // hip Device number for running tests:
    int deviceId{};

    // Ignore runtime failures.
    // eg: hipMalloc failing when there isn't enough free vram.
    bool ignore_hip_runtime_failures{true};

    // Number of performance trial samples
    int ntrial{};

    // FFT parameters:
    rocfft_params params;

    // input/output FFT grids
    std::vector ingrid;
    std::vector outgrid;

    // Token string to fully specify fft params.
    std::string token;

    CLI::App app{"rocfft-bench command line options"};

    // Declare the supported options. Some option pointers are declared to track passed opts.
    // NOTE(review): the --version callback only prints the version string; nothing
    // visible here ends the program afterwards, and --length is still marked
    // required below -- confirm the intended behaviour of a bare `--version`.
    app.add_flag("--version", "Print queryable version information from the rocfft library")
        ->each([](const std::string&) {
            char v[256];
            rocfft_get_version_string(v, 256);
            std::cout << "version " << v << std::endl;
            return EXIT_SUCCESS;
        });

    CLI::Option* opt_token
        = app.add_option("--token", token, "Token to read FFT params from")->default_val("");
    // Group together options that conflict with --token
    auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token");
    non_token
        ->add_flag("--double", "Double precision transform (deprecated: use --precision double)")
        ->each([&](const std::string&) { params.precision = fft_precision_double; });
    non_token->excludes(opt_token);
    non_token
        ->add_option("-t, --transformType",
                     params.transform_type,
                     "Type of transform:\n0) complex forward\n1) complex inverse\n2) real "
                     "forward\n3) real inverse")
        ->default_val(fft_transform_type_complex_forward);
    non_token
        ->add_option("--auto_allocation",
                     params.auto_allocate,
                     "rocFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"")
        ->default_val("default");
    non_token
        ->add_option(
            "--precision", params.precision, "Transform precision: single (default), double, half")
        ->excludes("--double");
    // The option pointers kept below are tested after parsing to decide which
    // settings get echoed to stdout.
    CLI::Option* opt_not_in_place
        = non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)")
              ->each([&](const std::string&) { params.placement = fft_placement_notinplace; });
    non_token
        ->add_option("--itype",
                     params.itype,
                     "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) "
                     "hermitian interleaved\n4) hermitian planar")
        ->default_val(fft_array_type_unset);
    non_token
        ->add_option("--otype",
                     params.otype,
                     "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) "
                     "hermitian interleaved\n4) hermitian planar")
        ->default_val(fft_array_type_unset);
    CLI::Option* opt_length
        = non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3);
    non_token->add_option("--ngpus", ngpus, "Number of GPUs to use")
        ->default_val(1)
        ->check(CLI::NonNegativeNumber);

    // define multi-GPU grids for FFT computation,
    CLI::Option* opt_ingrid
        = non_token->add_option("--ingrid", ingrid, "Single-process grid of GPUs at input")
              ->expected(1, 3)
              ->needs("--ngpus");

    CLI::Option* opt_outgrid
        = non_token->add_option("--outgrid", outgrid, "Single-process grid of GPUs at output")
              ->expected(1, 3)
              ->needs("--ngpus");

    non_token
        ->add_option("-b, --batchSize",
                     params.nbatch,
                     "If this value is greater than one, arrays will be used")
        ->default_val(1);
    CLI::Option* opt_istride = non_token->add_option("--istride", params.istride, "Input strides");
    CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides");
    non_token->add_option("--idist", params.idist, "Logical distance between input batches")
        ->default_val(0)
        ->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; });
    non_token->add_option("--odist", params.odist, "Logical distance between output batches")
        ->default_val(0)
        ->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; });
    CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset");
    CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset");

    // Options below live on `app` itself, so they are not part of the group
    // that excludes --token.
    app.add_flag("--ignore_runtime_failures,!--no-ignore_runtime_failures",
                 ignore_hip_runtime_failures,
                 "Ignore hip runtime failures");
    app.add_option("--device", deviceId, "Select a specific device id")->default_val(0);
    app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0);
    // NOTE(review): ntrial is a signed int with no range check here, and it is
    // later used as the size of the timing vector -- confirm that negative
    // values are rejected somewhere.
    app.add_option("-N, --ntrial", ntrial, "Trial size for the problem")
        ->default_val(1)
        ->each([&](const std::string& val) {
            std::cout << "Running profile with " << val << " samples\n";
        });
    // Default value is set in fft_params.h based on if device-side PRNG was enabled.
    app.add_option("-g, --inputGen",
                   params.igen,
                   "Input data generation:\n0) PRNG sequence (device)\n"
                   "1) PRNG sequence (host)\n"
                   "2) linearly-spaced sequence (device)\n"
                   "3) linearly-spaced sequence (host)");
    app.add_option("--isize", params.isize, "Logical size of input buffer");
    app.add_option("--osize", params.osize, "Logical size of output buffer");
    app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output");

    // Parse args and catch any errors here
    try
    {
        app.parse(argc, argv);
    }
    catch(const CLI::ParseError& e)
    {
        return app.exit(e);
    }

    if(!token.empty())
    {
        // A token fully specifies the problem; the individual options are not used.
        std::cout << "Reading fft params from token:\n" << token << std::endl;

        try
        {
            params.from_token(token);
        }
        catch(...)
        {
            std::cout << "Unable to parse token." << std::endl;
            return EXIT_FAILURE;
        }
        std::cout << std::flush;
    }
    else // generate token
    {
        if(ngpus > 1)
        {
            // set default GPU grids in case none were given
            params.set_default_grid(ngpus, ingrid, outgrid);

            // split the problem among ngpus
            params.mp_lib = fft_params::fft_mp_lib_none;

            int localDeviceCount = 0;
            if(hipGetDeviceCount(&localDeviceCount) != hipSuccess)
            {
                throw std::runtime_error("hipGetDeviceCount failed");
            }

            // start with all-ones in grids
            // (one leading entry, followed by one entry per FFT dimension)
            std::vector input_grid(params.length.size() + 1, 1);
            std::vector output_grid(params.length.size() + 1, 1);

            // create input and output grids and distribute it according to user requirements
            // NOTE(review): these copies assume ingrid/outgrid hold at most
            // params.length.size() entries; both --length and the grid options
            // accept 1-3 values independently -- confirm a longer grid is
            // rejected before this point.
            std::copy(ingrid.begin(), ingrid.end(), input_grid.begin() + 1);
            std::copy(outgrid.begin(), outgrid.end(), output_grid.begin() + 1);

            params.distribute_input(localDeviceCount, input_grid);
            params.distribute_output(localDeviceCount, output_grid);
        }

        // Echo the explicitly supplied settings.
        if(*opt_not_in_place)
        {
            std::cout << "out-of-place\n";
        }
        else
        {
            std::cout << "in-place\n";
        }

        if(*opt_length)
        {
            std::cout << "length:";
            for(auto& i : params.length)
                std::cout << " " << i;
            std::cout << "\n";
        }

        if(*opt_istride)
        {
            std::cout << "istride:";
            for(auto& i : params.istride)
                std::cout << " " << i;
            std::cout << "\n";
        }
        if(*opt_ostride)
        {
            std::cout << "ostride:";
            for(auto& i : params.ostride)
                std::cout << " " << i;
            std::cout << "\n";
        }

        if(*opt_ioffset)
        {
            std::cout << "ioffset:";
            for(auto& i : params.ioffset)
                std::cout << " " << i;
            std::cout << "\n";
        }
        if(*opt_ooffset)
        {
            std::cout << "ooffset:";
            for(auto& i : params.ooffset)
                std::cout << " " << i;
            std::cout << "\n";
        }

        // Grids are printed when given on the command line or when they are
        // non-empty after the multi-GPU setup above.
        if(*opt_ingrid || !ingrid.empty())
        {
            std::cout << "input grid:";
            for(auto& i : ingrid)
                std::cout << " " << i;
            std::cout << "\n";
        }
        if(*opt_outgrid || !outgrid.empty())
        {
            std::cout << "output grid:";
            for(auto& i : outgrid)
                std::cout << " " << i;
            std::cout << "\n";
        }
        std::cout << "\n";
    }

    std::cout << std::flush;

    // NOTE(review): rocfft_cleanup() is only reached at the very end of main;
    // the early returns and throws below leave without calling it.
    rocfft_setup();

    // Set GPU for single-device FFT computation
    rocfft_scoped_device dev(deviceId);

    params.validate();

    if(!params.valid(verbose))
    {
        throw std::runtime_error("Invalid parameters, add --verbose=1 for detail");
    }

    std::cout << "Token: " << params.token() << std::endl;
    if(verbose)
    {
        std::cout << params.str(" ") << std::endl;
    }

    // Check free and total available memory:
    size_t free  = 0;
    size_t total = 0;
    try
    {
        HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed");
    }
    catch(rocfft_hip_runtime_error)
    {
        // A failing memory query counts as success unless
        // --no-ignore_runtime_failures was given.
        return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE;
    }

    // First check: FFT data plus twiddle tables.
    const auto raw_vram_footprint
        = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params);

    if(!vram_fits_problem(raw_vram_footprint, free))
    {
        std::cout << "SKIPPED: Problem size (" << raw_vram_footprint
                  << ") raw data too large for device.\n";
        return EXIT_SUCCESS;
    }

    // Second check: the footprint reported by params.vram_footprint().
    // NOTE(review): the message still says "raw data" although a different
    // figure is tested here -- looks like a copy of the message above.
    const auto vram_footprint = params.vram_footprint();
    if(!vram_fits_problem(vram_footprint, free))
    {
        std::cout << "SKIPPED: Problem size (" << vram_footprint
                  << ") raw data too large for device.\n";
        return EXIT_SUCCESS;
    }

    auto ret = params.create_plan();
    if(ret != fft_status_success)
        LIB_V_THROW(rocfft_status_failure, "Plan creation failed");

    // GPU input buffer:
    std::vector ibuffer;
    // raw data pointers of ibuffer, in the form passed to params.execute()
    std::vector pibuffer;

    // CPU-side input buffer
    std::vector ibuffer_cpu;

    auto is_host_gen = (params.igen == fft_input_generator_host
                        || params.igen == fft_input_random_generator_host);

    auto ibricks = get_input_bricks(params);
    auto obricks = get_output_bricks(params);

    // obuffer is set by alloc_bench_bricks to point at ibuffer (in-place) or
    // at obuffer_data (out-of-place).
    std::vector  obuffer_data;
    std::vector* obuffer = nullptr;

    alloc_bench_bricks(
        params, ibricks, obricks, ibuffer, obuffer_data, obuffer, ibuffer_cpu, is_host_gen);

    pibuffer.resize(ibuffer.size());
    for(unsigned int i = 0; i < ibuffer.size(); ++i)
    {
        pibuffer[i] = ibuffer[i].data();
    }

    // print input if requested
    // NOTE(review): this runs before init_bench_input() below, so the buffers
    // printed here have been allocated but not yet filled -- confirm intent.
    if(verbose > 1)
    {
        if(is_host_gen)
        {
            // data is already on host
            params.print_ibuffer(ibuffer_cpu);
        }
        else
        {
            print_device_buffer(params, ibuffer, true);
        }
    }

    std::vector pobuffer(obuffer->size());
    for(unsigned int i = 0; i < obuffer->size(); ++i)
    {
        pobuffer[i] = obuffer->at(i).data();
    }

    init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen);

    // Execute a warm-up call
    params.execute(pibuffer.data(), pobuffer.data());

    // Run the transform several times and record the execution time:
    std::vector gpu_time(ntrial);

    hipEvent_wrapper_t start, stop;
    start.alloc();
    stop.alloc();
    for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial)
    {
        // Create input at every iteration to avoid overflow
        if(is_host_gen)
        {
            // host-generated data is kept in ibuffer_cpu, so a copy is enough
            copy_host_input_to_dev(ibuffer_cpu, ibuffer);
        }
        else
        {
            init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen);
        }

        // Only the execute call sits between the two recorded events.
        HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed");

        params.execute(pibuffer.data(), pobuffer.data());

        HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed");
        HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed");

        float time;
        HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed");
        gpu_time[itrial] = time;

        // Print result after FFT transform
        if(verbose > 2)
        {
            print_device_buffer(params, *obuffer, false);
        }
    }

    std::cout << "\nExecution gpu time:";
    for(const auto& i : gpu_time)
    {
        std::cout << " " << i;
    }
    std::cout << " ms" << std::endl;

    // Operation-count model used for the gflops figure:
    //   nbatch * k * N * log2(N), with N the product of all lengths and
    //   k = 2.5 when either side is real, 5.0 otherwise.
    std::cout << "Execution gflops:  ";
    const double totsize = product(params.length.begin(), params.length.end());
    const double k
        = ((params.itype == fft_array_type_real) || (params.otype == fft_array_type_real)) ? 2.5
                                                                                           : 5.0;
    const double opscount = (double)params.nbatch * k * totsize * log(totsize) / log(2.0);
    for(const auto& i : gpu_time)
    {
        // times are the values printed above with the " ms" label
        std::cout << " " << opscount / (1e6 * i);
    }
    std::cout << std::endl;

    rocfft_cleanup();
}
rocFFT-rocm-7.1.0/clients/bench/bench.h000066400000000000000000000250631506652163400175660ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#ifndef ROCFFT_BENCH_H
#define ROCFFT_BENCH_H

#include "../../shared/fft_params.h"
#include "../../shared/rocfft_hip.h"
#include "rocfft/rocfft.h"
#include
#include

// Exception thrown by hip_V_Throw() when a HIP call reports an error.
// A distinct type lets callers catch HIP failures separately from other
// std::runtime_error exceptions.
class rocfft_hip_runtime_error : public std::runtime_error
{
public:
    rocfft_hip_runtime_error(const std::string& msg = "")
        : runtime_error(msg)
    {
    }
};

// This is used to either wrap a HIP function call, or to explicitly check a variable
// for an error condition.  If an error occurs, we throw.
// Note: std::runtime_error does not take unicode strings as input, so only
// narrow strings are supported.
//
// res      - status to check; anything other than hipSuccess is an error
// msg      - caller-supplied description appended to the error text
// lineno   - source line of the check (supplied by HIP_V_THROW)
// fileName - source file of the check (supplied by HIP_V_THROW)
//
// On error the composed message is printed to std::cout and then thrown as
// rocfft_hip_runtime_error.
inline void
    hip_V_Throw(hipError_t res, const std::string& msg, size_t lineno, const std::string& fileName)
{
    if(res != hipSuccess)
    {
        std::stringstream tmp;
        tmp << "HIP_V_THROWERROR< ";
        tmp << res;
        tmp << " > (";
        tmp << fileName;
        tmp << " Line: ";
        tmp << lineno;
        tmp << "): ";
        tmp << msg;
        std::string errorm(tmp.str());
        std::cout << errorm << std::endl;
        throw rocfft_hip_runtime_error(errorm);
    }
}

// Exception thrown by lib_V_Throw() when a rocFFT call reports an error.
class rocfft_runtime_error : public std::runtime_error
{
public:
    rocfft_runtime_error(const std::string& msg = "")
        : runtime_error(msg)
    {
    }
};

// rocFFT counterpart of hip_V_Throw(): if res is not rocfft_status_success,
// print the composed message (status, file, line, msg) to std::cout and throw
// it as rocfft_runtime_error.
inline void lib_V_Throw(rocfft_status      res,
                        const std::string& msg,
                        size_t             lineno,
                        const std::string& fileName)
{
    if(res != rocfft_status_success)
    {
        std::stringstream tmp;
        tmp << "LIB_V_THROWERROR< ";
        tmp << res;
        tmp << " > (";
        tmp << fileName;
        tmp << " Line: ";
        tmp << lineno;
        tmp << "): ";
        tmp << msg;
        std::string errorm(tmp.str());
        std::cout << errorm << std::endl;
        throw rocfft_runtime_error(errorm);
    }
}

// Convenience wrappers that add the call site's line and file to the check.
#define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__)
#define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__)

// return input bricks for params, or one big brick covering the
// input field if no bricks are specified
//
// The synthesised brick has one more index than the FFT has dimensions: the
// first index is the batch (upper bound params.nbatch, stride params.idist),
// the remaining ones are the input lengths and strides.  `lower` is only
// resized here and never assigned.
template
std::vector get_input_bricks(const Tparams& params)
{
    std::vector bricks;
    if(!params.ifields.empty())
        // only the first input field is used
        bricks = params.ifields[0].bricks;
    else
    {
        auto len = params.ilength();

        // just make one big brick covering the whole input field
        bricks.resize(1);
        bricks.front().lower.resize(len.size() + 1);
        bricks.front().upper.resize(len.size() + 1);
        bricks.front().stride.resize(len.size() + 1);

        bricks.front().upper.front() = params.nbatch;
        std::copy(len.begin(), len.end(), bricks.front().upper.begin() + 1);
        bricks.front().stride.front() = params.idist;
        std::copy(params.istride.begin(), params.istride.end(), bricks.front().stride.begin() + 1);
    }
    return bricks;
}

// return output bricks for params, or one big brick covering the
// output field if no bricks are specified
//
// Mirrors get_input_bricks(), using ofields / olength() / odist / ostride.
template
std::vector get_output_bricks(const Tparams& params)
{
    std::vector bricks;
    if(!params.ofields.empty())
        // only the first output field is used
        bricks = params.ofields[0].bricks;
    else
    {
        auto len = params.olength();

        // just make one big brick covering the whole output field
        bricks.resize(1);
        bricks.front().lower.resize(len.size() + 1);
        bricks.front().upper.resize(len.size() + 1);
        bricks.front().stride.resize(len.size() + 1);

        bricks.front().upper.front() = params.nbatch;
        std::copy(len.begin(), len.end(), bricks.front().upper.begin() + 1);
        bricks.front().stride.front() = params.odist;
        std::copy(params.ostride.begin(), params.ostride.end(), bricks.front().stride.begin() + 1);
    }
    return bricks;
}

// Allocate input/output buffers for a bench run.
//
// params       - problem description (precision, placement, itype/otype are read)
// ibricks      - input bricks, one device allocation each (two if planar)
// obricks      - output bricks, likewise
// ibuffers     - receives the input device buffers
// obuffer_data - receives the output device buffers (out-of-place only)
// obuffers     - set to &ibuffers for in-place, &obuffer_data otherwise
// host_buffers - receives matching host buffers when is_host_gen is true
// is_host_gen  - whether input is generated on the host
//
// Throws std::runtime_error if a device allocation fails or if an in-place
// transform is requested with an unsupported brick layout.
template
void alloc_bench_bricks(const Tparams& params,
                        const std::vector& ibricks,
                        const std::vector& obricks,
                        std::vector& ibuffers,
                        std::vector& obuffer_data,
                        std::vector*& obuffers,
                        std::vector& host_buffers,
                        bool is_host_gen)
{
    // Allocates one device buffer per brick (two for planar types) into
    // `output`, plus same-sized host buffers when is_host_gen is set.
    auto alloc_buffers = [¶ms, &host_buffers](const std::vector& bricks,
                                                 fft_array_type type,
                                                 std::vector& output,
                                                 bool is_host_gen) {
        auto elem_size = var_size(params.precision, type);
        const bool is_planar
            = type == fft_array_type_complex_planar || type == fft_array_type_hermitian_planar;
        // alloc 2x buffers, each half size for planar
        if(is_planar)
            elem_size /= 2;

        for(const auto& b : bricks)
        {
            // allocate on the device this brick is assigned to
            rocfft_scoped_device dev(b.device);
            size_t brick_size_bytes = compute_ptrdiff(b.length(), b.stride, 0, 0) * elem_size;
            output.emplace_back();
            if(output.back().alloc(brick_size_bytes) != hipSuccess)
                throw std::runtime_error("hipMalloc failed");
            if(is_planar)
            {
                output.emplace_back();
                if(output.back().alloc(brick_size_bytes) != hipSuccess)
                    throw std::runtime_error("hipMalloc failed");
            }
            if(is_host_gen)
            {
                // NOTE(review): the result of the host-side alloc() is not
                // checked, unlike the device-side calls above.
                host_buffers.emplace_back();
                host_buffers.back().alloc(brick_size_bytes);
                if(is_planar)
                {
                    host_buffers.emplace_back();
                    host_buffers.back().alloc(brick_size_bytes);
                }
            }
        }
    };

    // If brick shape differs, inplace is only allowed for single
    // bricks.  e.g. in-place real-complex
    if(params.placement == fft_placement_inplace)
    {
        // NOTE(review): as written this throws only when NEITHER side is a
        // single brick and the brick lists differ.  A single brick on one side
        // with several on the other passes the check -- confirm whether `||`
        // between the two size tests was intended.
        if(ibricks.size() != 1 && obricks.size() != 1 && ibricks != obricks)
            throw std::runtime_error(
                "in-place transform to different brick shapes only allowed for single bricks");

        // allocate the larger of the two bricks
        // (only the first brick of each list is compared)
        auto isize_bytes = compute_ptrdiff(ibricks.front().length(), ibricks.front().stride, 0, 0)
                           * var_size(params.precision, params.itype);
        auto osize_bytes = compute_ptrdiff(obricks.front().length(), obricks.front().stride, 0, 0)
                           * var_size(params.precision, params.otype);

        alloc_buffers(isize_bytes > osize_bytes ? ibricks : obricks,
                      isize_bytes > osize_bytes ? params.itype : params.otype,
                      ibuffers,
                      is_host_gen);
        // in-place: output aliases the input buffers
        obuffers = &ibuffers;
    }
    else
    {
        alloc_buffers(ibricks, params.itype, ibuffers, is_host_gen);
        // host buffers are only needed for the input side
        alloc_buffers(obricks, params.otype, obuffer_data, false);
        obuffers = &obuffer_data;
    }
}

// Copy every host buffer to the device buffer with the same index.
// Throws std::runtime_error if a hipMemcpy fails.
// The loop runs over buffers.size() and copies host_buffers[i].size() bytes,
// so it assumes host_buffers has at least as many entries as buffers and that
// each device buffer is at least as large as its host counterpart.
// NOTE(review): this is a non-template, non-inline function defined in a
// header; the other free functions in this header are `inline` or templates.
// Confirm this header is never included from two translation units of the
// same binary.
void copy_host_input_to_dev(std::vector& host_buffers, std::vector& buffers)
{
    for(size_t i = 0; i < buffers.size(); ++i)
    {
        if(hipMemcpy(buffers[i].data(),
                     host_buffers[i].data(),
                     host_buffers[i].size(),
                     hipMemcpyHostToDevice)
           != hipSuccess)
            throw std::runtime_error("hipMemcpy failure");
    }
}

// Fill the input buffers for a bench run.
//
// Host generation: init_local_input() writes into host_buffers, which are then
// copied to the device buffers.
// Device generation: init_local_input() is given the device buffer pointers
// directly, but only when built with USE_HIPRAND; without it this branch does
// nothing and the device buffers are left as allocated.
template
void init_bench_input(const Tparams& params,
                      const std::vector& bricks,
                      std::vector& buffers,
                      std::vector& host_buffers,
                      bool is_host_gen)
{
    auto elem_size = var_size(params.precision, params.itype);
    if(is_host_gen)
    {
        std::vector ptrs;
        ptrs.reserve(host_buffers.size());
        for(auto& buf : host_buffers)
            ptrs.push_back(buf.data());

        init_local_input(0, params, bricks, elem_size, ptrs);

        copy_host_input_to_dev(host_buffers, buffers);
    }
    else
    {
#ifdef USE_HIPRAND
        std::vector ptrs;
        ptrs.reserve(buffers.size());
        for(auto& buf : buffers)
            ptrs.push_back(buf.data());

        init_local_input(0, params, bricks, elem_size, ptrs);
#endif
    }
}

// Copy each device buffer into a freshly allocated host buffer of the same
// size and print the result through params.print_ibuffer() (input == true) or
// params.print_obuffer() (input == false).
// Throws std::runtime_error if a hipMemcpy fails.
template
void print_device_buffer(const Tparams& params, std::vector& buffer, bool input)
{
    // copy data back to host
    std::vector print_buffer;
    for(auto& buf : buffer)
    {
        print_buffer.emplace_back();
        print_buffer.back().alloc(buf.size());
        if(hipMemcpy(print_buffer.back().data(), buf.data(), buf.size(), hipMemcpyDeviceToHost)
           != hipSuccess)
            throw std::runtime_error("hipMemcpy failed");
    }
    if(input)
        params.print_ibuffer(print_buffer);
    else
        params.print_obuffer(print_buffer);
}

#endif // ROCFFT_BENCH_H
rocFFT-rocm-7.1.0/clients/bench/dyna-bench.cpp000066400000000000000000000731441506652163400210550ustar00rootroot00000000000000// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// This file allows one to run tests against multiple different rocFFT libraries at the
// same time. This allows one to randomize the execution order for a better experimental
// setup, which produces fewer type 1 errors (where one incorrectly rejects the null
// hypothesis).
#include
#if __has_include()
#include
#else
#include
namespace std
{
    namespace filesystem = experimental::filesystem;
}
#endif
#include
#include
#include
#include

#ifdef WIN32
#include
// psapi.h requires windows.h to be included first
#include
#else
#include
#include
#endif

#include "../../shared/CLI11.hpp"
#include "../../shared/gpubuf.h"
#include "../../shared/hip_object_wrapper.h"
#include "../../shared/rocfft_params.h"

#include "bench.h"
#include "rocfft/rocfft.h"

// Platform-specific handle type for a dynamically loaded rocFFT library.
#ifdef WIN32
typedef HMODULE ROCFFT_LIB;
#else
typedef void* ROCFFT_LIB;
#endif

// Load the rocfft library
// path - library file to load.
// Returns the handle from LoadLibraryA (Windows) or dlopen with RTLD_LAZY
// (elsewhere); the result is returned unchecked, so callers must test it
// for NULL.
ROCFFT_LIB rocfft_lib_load(const std::string& path)
{
#ifdef WIN32
    return LoadLibraryA(path.c_str());
#else
    return dlopen(path.c_str(), RTLD_LAZY);
#endif
}

// Return a string describing the error loading rocfft
// On Windows the returned pointer refers to a function-local static string
// that is overwritten by the next call; elsewhere it is whatever dlerror()
// returns.
const char* rocfft_lib_load_error()
{
#ifdef WIN32
    // just return the error number
    static std::string error_str;
    error_str = std::to_string(GetLastError());
    return error_str.c_str();
#else
    return dlerror();
#endif
}

// Get symbol from rocfft lib
// Returns the result of GetProcAddress / dlsym unchanged; callers must check
// the returned pointer before using it.
void* rocfft_lib_symbol(ROCFFT_LIB libhandle, const char* sym)
{
#ifdef WIN32
    return reinterpret_cast(GetProcAddress(libhandle, sym));
#else
    return dlsym(libhandle, sym);
#endif
}

// Unload a library previously returned by rocfft_lib_load().
// The result of FreeLibrary / dlclose is ignored.
void rocfft_lib_close(ROCFFT_LIB libhandle)
{
#ifdef WIN32
    FreeLibrary(libhandle);
#else
    dlclose(libhandle);
#endif
}

// Given a libhandle from rocfft_lib_load, return a rocFFT plan created with the given
// parameters.
//
// libhandle - handle of the rocFFT library to create the plan with
// params    - transform description (types, layout, lengths, batch, precision)
//
// Calls rocfft_setup() in the loaded library, builds a plan description with
// the data layout from `params`, creates the plan and releases the
// description again.
//
// Throws rocfft_runtime_error if a required symbol cannot be resolved or if
// any rocFFT call reports a status other than rocfft_status_success.
rocfft_plan make_plan(ROCFFT_LIB libhandle, const fft_params& params)
{
    // Resolve a symbol or throw: calling through a null function pointer
    // would crash instead of producing a diagnosable error.
    auto require_symbol = [libhandle](const char* name) {
        void* sym = rocfft_lib_symbol(libhandle, name);
        if(sym == NULL)
            throw rocfft_runtime_error(std::string("failed to resolve symbol: ") + name);
        return sym;
    };

    // Resolve everything before calling into the library, so a library that
    // lacks an entry point is rejected before rocfft_setup() has run.
    auto procfft_setup = (decltype(&rocfft_setup))require_symbol("rocfft_setup");
    auto procfft_plan_description_create
        = (decltype(&rocfft_plan_description_create))require_symbol(
            "rocfft_plan_description_create");
    auto procfft_plan_description_destroy
        = (decltype(&rocfft_plan_description_destroy))require_symbol(
            "rocfft_plan_description_destroy");
    auto procfft_plan_description_set_data_layout
        = (decltype(&rocfft_plan_description_set_data_layout))require_symbol(
            "rocfft_plan_description_set_data_layout");
    auto procfft_plan_create = (decltype(&rocfft_plan_create))require_symbol("rocfft_plan_create");

    // rocfft_setup() reports a status like every other rocFFT call; check it.
    LIB_V_THROW(procfft_setup(), "rocfft_setup failed");

    rocfft_plan_description desc = NULL;
    LIB_V_THROW(procfft_plan_description_create(&desc), "rocfft_plan_description_create failed");

    const rocfft_status layout_status = procfft_plan_description_set_data_layout(
        desc,
        rocfft_array_type_from_fftparams(params.itype),
        rocfft_array_type_from_fftparams(params.otype),
        params.ioffset.data(),
        params.ooffset.data(),
        params.istride.size(),
        params.istride.data(),
        params.idist,
        params.ostride.size(),
        params.ostride.data(),
        params.odist);
    if(layout_status != rocfft_status_success)
    {
        // Release the description before reporting the failure so it does
        // not leak; the layout error is the one worth reporting.
        (void)procfft_plan_description_destroy(desc);
        LIB_V_THROW(layout_status, "rocfft_plan_description_data_layout failed");
    }

    rocfft_plan         plan = NULL;
    const rocfft_status create_status
        = procfft_plan_create(&plan,
                              rocfft_result_placement_from_fftparams(params.placement),
                              rocfft_transform_type_from_fftparams(params.transform_type),
                              rocfft_precision_from_fftparams(params.precision),
                              params.length.size(),
                              params.length.data(),
                              params.nbatch,
                              desc);

    // The description is released whether or not plan creation succeeded;
    // the statuses are checked afterwards, creation first.
    const rocfft_status destroy_status = procfft_plan_description_destroy(desc);

    LIB_V_THROW(create_status, "rocfft_plan_create failed");
    LIB_V_THROW(destroy_status, "rocfft_plan_description_destroy failed");

    return plan;
}

// Given a libhandle from dload and a rocFFT plan, destroy the plan.
void destroy_plan(ROCFFT_LIB libhandle, rocfft_plan& plan) { auto procfft_plan_destroy = (decltype(&rocfft_plan_destroy))rocfft_lib_symbol(libhandle, "rocfft_plan_destroy"); LIB_V_THROW(procfft_plan_destroy(plan), "rocfft_plan_destroy failed"); auto procfft_cleanup = (decltype(&rocfft_cleanup))rocfft_lib_symbol(libhandle, "rocfft_cleanup"); if(procfft_cleanup) LIB_V_THROW(procfft_cleanup(), "rocfft_cleanup failed"); } // Given a libhandle from dload and a rocFFT execution info structure, destroy the info. void destroy_info(ROCFFT_LIB libhandle, rocfft_execution_info& info) { auto procfft_execution_info_destroy = (decltype(&rocfft_execution_info_destroy))rocfft_lib_symbol( libhandle, "rocfft_execution_info_destroy"); LIB_V_THROW(procfft_execution_info_destroy(info), "rocfft_execution_info_destroy failed"); } // Given a libhandle from dload, and a corresponding rocFFT plan, return how much work // buffer is required. size_t get_wbuffersize(ROCFFT_LIB libhandle, const rocfft_plan& plan) { auto procfft_plan_get_work_buffer_size = (decltype(&rocfft_plan_get_work_buffer_size))rocfft_lib_symbol( libhandle, "rocfft_plan_get_work_buffer_size"); // Get the buffersize size_t workBufferSize = 0; LIB_V_THROW(procfft_plan_get_work_buffer_size(plan, &workBufferSize), "rocfft_plan_get_work_buffer_size failed"); return workBufferSize; } // Given a libhandle from dload and a corresponding rocFFT plan, print the plan information. 
void show_plan(ROCFFT_LIB libhandle, const rocfft_plan& plan) { auto procfft_plan_get_print = (decltype(&rocfft_plan_get_print))rocfft_lib_symbol(libhandle, "rocfft_plan_get_print"); LIB_V_THROW(procfft_plan_get_print(plan), "rocfft_plan_get_print failed"); } // FIXME: doc rocfft_execution_info make_execinfo(ROCFFT_LIB libhandle) { auto procfft_execution_info_create = (decltype(&rocfft_execution_info_create))rocfft_lib_symbol( libhandle, "rocfft_execution_info_create"); rocfft_execution_info info = NULL; LIB_V_THROW(procfft_execution_info_create(&info), "rocfft_execution_info_create failed"); return info; } // FIXME: doc void set_work_buffer(const ROCFFT_LIB& libhandle, rocfft_execution_info& info, const size_t wbuffersize, void* wbuffer) { if(wbuffersize > 0 && wbuffer != NULL) { auto procfft_execution_info_set_work_buffer = (decltype(&rocfft_execution_info_set_work_buffer))rocfft_lib_symbol( libhandle, "rocfft_execution_info_set_work_buffer"); LIB_V_THROW(procfft_execution_info_set_work_buffer(info, wbuffer, wbuffersize), "rocfft_execution_info_set_work_buffer failed"); } } // Given a libhandle from dload and a corresponding rocFFT plan and execution info, // execute a transform on the given input and output buffers and return the kernel // execution time. 
//
// The returned value is what hipEventElapsedTime reports between an event
// recorded immediately before the rocfft_execute call and one recorded (and
// synchronized on) immediately after it.
// Throws rocfft_hip_runtime_error (via HIP_V_THROW) if an event call fails
// and std::runtime_error if rocfft_execute does not return
// rocfft_status_success.
// NOTE(review): the rocfft_execute symbol is looked up on every call and the
// resulting pointer is used without a NULL check.
float run_plan(
    ROCFFT_LIB libhandle, rocfft_plan plan, rocfft_execution_info info, void** in, void** out)
{
    auto procfft_execute
        = (decltype(&rocfft_execute))rocfft_lib_symbol(libhandle, "rocfft_execute");

    hipEvent_wrapper_t start, stop;
    start.alloc();
    stop.alloc();

    HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed");
    auto rcfft = procfft_execute(plan, in, out, info);
    HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed");
    HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed");

    // The execute status is only examined after the stop event has been
    // recorded and synchronized.
    if(rcfft != rocfft_status_success)
    {
        throw std::runtime_error("execution failed");
    }

    float time;
    HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed");
    return time;
}

// Load the library at `libstring` and create a plan for `params` with it.
// Returns the (library handle, plan) pair.
// Throws std::runtime_error, including the loader's error text, if the
// library cannot be opened; errors from make_plan() propagate unchanged.
// NOTE(review): if make_plan() throws, the library handle opened here is not
// closed.
std::pair create_handleplan(const std::string& libstring, const fft_params& params)
{
    auto libhandle = rocfft_lib_load(libstring);
    if(libhandle == NULL)
    {
        std::stringstream ss;
        ss << "Failed to open " << libstring << ", error: " << rocfft_lib_load_error();
        throw std::runtime_error(ss.str());
    }
    auto plan = make_plan(libhandle, params);
    return std::make_pair(libhandle, plan);
}

int main(int argc, char* argv[])
{
    // Control output verbosity:
    int verbose{};

    // number of GPUs to use:
    int ngpus{};

    // hip Device number for running tests:
    int deviceId{};

    // Ignore runtime failures.
    // eg: hipMalloc failing when there isn't enough free vram.
    bool ignore_hip_runtime_failures{true};

    // Number of performance trial samples:
    int ntrial{};

    // Bool to specify whether the libs are loaded in forward or forward+reverse order.
    int reverse{};

    // Test sequence choice:
    int test_sequence{};

    // Vector of test target libraries
    std::vector lib_strings;

    // FFT parameters:
    fft_params params;

    // input/output FFT grids
    std::vector ingrid;
    std::vector outgrid;

    // Token string to fully specify fft params.
    std::string token;

    CLI::App app{"dyna-rocfft-bench command line options"};

    // Declare the supported options. Some option pointers are declared to track passed opts.
// FIXME: version needs to be implemented app.add_flag("--version", "Print queryable version information from the rocfft library and exit"); app.add_flag("--reverse", reverse, "Load libs in forward and reverse order")->default_val(1); app.add_option( "--sequence", test_sequence, "Test sequence:\n0) random\n1) alternating\n2) sequential") ->default_val(0); app.add_option("--lib", lib_strings, "Set test target library full path (appendable)"); CLI::Option* opt_token = app.add_option("--token", token, "Token to read FFT params from")->default_val(""); // Group together options that conflict with --token auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)") ->each([&](const std::string&) { params.precision = fft_precision_double; }); non_token->excludes(opt_token); non_token ->add_option("-t, --transformType", params.transform_type, "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ->default_val(fft_transform_type_complex_forward); non_token ->add_option( "--precision", params.precision, "Transform precision: single (default), double, half") ->excludes("--double"); CLI::Option* opt_not_in_place = non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)") ->each([&](const std::string&) { params.placement = fft_placement_notinplace; }); non_token ->add_option("--itype", params.itype, "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token ->add_option("--otype", params.otype, "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); CLI::Option* opt_length = non_token->add_option("--length", params.length, "Lengths")->required()->expected(1, 3); 
non_token->add_option("--ngpus", ngpus, "Number of GPUs to use") ->default_val(1) ->check(CLI::NonNegativeNumber); // define multi-GPU grids for FFT computation, CLI::Option* opt_ingrid = non_token->add_option("--ingrid", ingrid, "Single-process grid of GPUs at input") ->expected(1, 3) ->needs("--ngpus"); CLI::Option* opt_outgrid = non_token->add_option("--outgrid", outgrid, "Single-process grid of GPUs at output") ->expected(1, 3) ->needs("--ngpus"); non_token ->add_option("-b, --batchSize", params.nbatch, "If this value is greater than one, arrays will be used") ->default_val(1); CLI::Option* opt_istride = non_token->add_option("--istride", params.istride, "Input strides"); CLI::Option* opt_ostride = non_token->add_option("--ostride", params.ostride, "Output strides"); non_token->add_option("--idist", params.idist, "Logical distance between input batches") ->default_val(0) ->each([&](const std::string& val) { std::cout << "idist: " << val << "\n"; }); non_token->add_option("--odist", params.odist, "Logical distance between output batches") ->default_val(0) ->each([&](const std::string& val) { std::cout << "odist: " << val << "\n"; }); CLI::Option* opt_ioffset = non_token->add_option("--ioffset", params.ioffset, "Input offset"); CLI::Option* opt_ooffset = non_token->add_option("--ooffset", params.ooffset, "Output offset"); app.add_flag("--ignore_runtime_failures,!--no-ignore_runtime_failures", ignore_hip_runtime_failures, "Ignore hip runtime failures"); app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); app.add_option("--verbose", verbose, "Control output verbosity")->default_val(0); app.add_option("-N, --ntrial", ntrial, "Trial size for the problem") ->default_val(1) ->each([&](const std::string& val) { std::cout << "Running profile with " << val << " samples\n"; }); // Default value is set in fft_params.h based on if device-side PRNG was enabled. 
app.add_option("-g, --inputGen", params.igen, "Input data generation:\n0) PRNG sequence (device)\n" "1) PRNG sequence (host)\n" "2) linearly-spaced sequence (device)\n" "3) linearly-spaced sequence (host)"); app.add_option("--isize", params.isize, "Logical size of input buffer"); app.add_option("--osize", params.osize, "Logical size of output buffer"); app.add_option("--scalefactor", params.scale_factor, "Scale factor to apply to output"); // Parse args and catch any errors here try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } // Check if all the provided libraries are actually there: for(const auto& lib_string : lib_strings) { if(!std::filesystem::exists(lib_string)) { std::cerr << "Error: lib " << lib_string << " does not exist\n"; return EXIT_FAILURE; } } if(!token.empty()) { std::cout << "Reading fft params from token:\n" << token << std::endl; try { params.from_token(token); } catch(...) { std::cout << "Unable to parse token." << std::endl; return EXIT_FAILURE; } } else { if(ngpus > 1) { // set default GPU grids in case none were given params.set_default_grid(ngpus, ingrid, outgrid); // split the problem among ngpus params.mp_lib = fft_params::fft_mp_lib_none; int localDeviceCount = 0; if(hipGetDeviceCount(&localDeviceCount) != hipSuccess) { throw std::runtime_error("hipGetDeviceCount failed"); } // start with all-ones in grids std::vector input_grid(params.length.size() + 1, 1); std::vector output_grid(params.length.size() + 1, 1); // create input and output grids and distribute it according to user requirements std::copy(ingrid.begin(), ingrid.end(), input_grid.begin() + 1); std::copy(outgrid.begin(), outgrid.end(), output_grid.begin() + 1); params.distribute_input(localDeviceCount, input_grid); params.distribute_output(localDeviceCount, output_grid); } if(*opt_not_in_place) { std::cout << "out-of-place\n"; } else { std::cout << "in-place\n"; } if(*opt_length) { std::cout << "length:"; for(auto& i : params.length) 
std::cout << " " << i; std::cout << "\n"; } if(*opt_istride) { std::cout << "istride:"; for(auto& i : params.istride) std::cout << " " << i; std::cout << "\n"; } if(*opt_ostride) { std::cout << "ostride:"; for(auto& i : params.ostride) std::cout << " " << i; std::cout << "\n"; } if(*opt_ioffset) { std::cout << "ioffset:"; for(auto& i : params.ioffset) std::cout << " " << i; std::cout << "\n"; } if(*opt_ooffset) { std::cout << "ooffset:"; for(auto& i : params.ooffset) std::cout << " " << i; std::cout << "\n"; } if(*opt_ingrid || !ingrid.empty()) { std::cout << "input grid:"; for(auto& i : ingrid) std::cout << " " << i; std::cout << "\n"; } if(*opt_outgrid || !outgrid.empty()) { std::cout << "output grid:"; for(auto& i : outgrid) std::cout << " " << i; std::cout << "\n"; } } std::cout << std::flush; // Set GPU for single-device FFT computation rocfft_scoped_device dev(deviceId); params.validate(); if(!params.valid(verbose)) { throw rocfft_runtime_error("Invalid parameters, add --verbose=1 for detail"); } std::cout << "Token: " << params.token() << std::endl; if(verbose) { std::cout << params.str() << std::endl; } // Check free and total available memory: size_t free = 0; size_t total = 0; try { HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? 
EXIT_SUCCESS : EXIT_FAILURE; } const auto raw_vram_footprint = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); if(!vram_fits_problem(raw_vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } // GPU input buffer: std::vector ibuffer; std::vector pibuffer; // CPU-side input buffer std::vector ibuffer_cpu; auto is_host_gen = (params.igen == fft_input_generator_host || params.igen == fft_input_random_generator_host); auto ibricks = get_input_bricks(params); auto obricks = get_output_bricks(params); std::vector obuffer_data; std::vector* obuffer = nullptr; alloc_bench_bricks( params, ibricks, obricks, ibuffer, obuffer_data, obuffer, ibuffer_cpu, is_host_gen); init_bench_input(params, ibricks, ibuffer, ibuffer_cpu, is_host_gen); for(unsigned int i = 0; i < ibuffer.size(); ++i) { pibuffer.push_back(ibuffer[i].data()); } // print input if requested if(verbose > 1) { if(is_host_gen) { // data is already on host params.print_ibuffer(ibuffer_cpu); } else { print_device_buffer(params, ibuffer, true); } } std::vector pobuffer(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } // Execution times for loaded libraries: std::vector> time(lib_strings.size()); // If we are doing a reverse-run, then we need two ntrials; otherwise, just one. std::vector ntrial_runs; if(reverse == 0) { ntrial_runs.push_back(ntrial); } else { ntrial_runs.push_back((ntrial + 1) / 2); ntrial_runs.push_back(ntrial / 2); } for(size_t ridx = 0; ridx < ntrial_runs.size(); ++ridx) { std::vector> index_lib_string; for(size_t i = 0; i < lib_strings.size(); ++i) { index_lib_string.push_back(std::make_pair(i, lib_strings[i])); } if(ridx == 1) { std::reverse(index_lib_string.begin(), index_lib_string.end()); } // Create the handles to the libs and the associated fft plans. 
std::vector handle; std::vector plan; // Allocate the work buffer: just one, big enough for any dloaded library. std::vector info; size_t wbuffer_size = 0; for(unsigned int idx = 0; idx < lib_strings.size(); ++idx) { std::cout << idx << ": " << lib_strings[idx] << "\n"; auto libhandle = rocfft_lib_load(lib_strings[idx]); if(libhandle == NULL) { std::cout << "Failed to open " << lib_strings[idx] << ", error: " << rocfft_lib_load_error() << "\n"; return 1; } handle.push_back(libhandle); plan.push_back(make_plan(handle[idx], params)); show_plan(handle[idx], plan[idx]); wbuffer_size = std::max(wbuffer_size, get_wbuffersize(handle[idx], plan[idx])); info.push_back(make_execinfo(handle[idx])); } std::cout << "Work buffer size: " << wbuffer_size << std::endl; if(!vram_fits_problem(raw_vram_footprint + wbuffer_size, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << " + " << +wbuffer_size << " = " << raw_vram_footprint + wbuffer_size << " ) data too large for device.\n"; return EXIT_SUCCESS; } gpubuf wbuffer; if(wbuffer_size) { try { HIP_V_THROW(wbuffer.alloc(wbuffer_size), "Creating intermediate Buffer failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } // Associate the work buffer to the individual libraries: for(unsigned int idx = 0; idx < lib_strings.size(); ++idx) { set_work_buffer(handle[idx], info[idx], wbuffer_size, wbuffer.data()); } // Run the plan using its associated rocFFT library: for(unsigned int idx = 0; idx < handle.size(); ++idx) { try { run_plan(handle[idx], plan[idx], info[idx], pibuffer.data(), pobuffer.data()); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? 
EXIT_SUCCESS : EXIT_FAILURE; } } std::vector testcase(ntrial_runs[ridx] * index_lib_string.size()); switch(test_sequence) { case 0: { // Random order: for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial) { for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib) { testcase[index_lib_string.size() * itrial + ilib] = ilib; } } std::random_device rd; std::mt19937 g(rd()); std::shuffle(testcase.begin(), testcase.end(), g); break; } case 1: // Alternating order: for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial) { for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib) { testcase[index_lib_string.size() * itrial + ilib] = ilib; } } break; case 2: // Sequential order: for(int itrial = 0; itrial < ntrial_runs[ridx]; ++itrial) { for(size_t ilib = 0; ilib < index_lib_string.size(); ++ilib) { testcase[ilib * ntrial + itrial] = ilib; } } break; default: throw std::runtime_error("Invalid test sequence choice."); } if(verbose > 3) { std::cout << "Test case order:"; for(const auto val : testcase) std::cout << " " << val; std::cout << "\n"; } std::cout << "Running the tests...\n"; for(size_t itest = 0; itest < testcase.size(); ++itest) { const int tidx = testcase[itest]; if(verbose > 3) { std::cout << "running test case " << tidx << " with lib " << index_lib_string[tidx].second << "\n"; } #ifdef USE_HIPRAND if(!is_host_gen) params.compute_input(ibuffer); #endif if(is_host_gen) { for(unsigned int bidx = 0; bidx < ibuffer_cpu.size(); ++bidx) { try { HIP_V_THROW(hipMemcpy(pibuffer[bidx], ibuffer_cpu[bidx].data(), ibuffer_cpu[bidx].size(), hipMemcpyHostToDevice), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } } // Run the plan using its associated rocFFT library: try { time[tidx].push_back(run_plan( handle[tidx], plan[tidx], info[tidx], pibuffer.data(), pobuffer.data())); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? 
EXIT_SUCCESS : EXIT_FAILURE; } if(verbose > 2) { auto output = allocate_host_buffer(params.precision, params.otype, params.osize); for(unsigned int iout = 0; iout < output.size(); ++iout) { try { HIP_V_THROW(hipMemcpy(output[iout].data(), pobuffer[iout], output[iout].size(), hipMemcpyDeviceToHost), "hipMemcpy failed"); } catch(rocfft_hip_runtime_error) { return ignore_hip_runtime_failures ? EXIT_SUCCESS : EXIT_FAILURE; } } std::cout << "GPU output:\n"; params.print_obuffer(output); } } // Clean up: for(unsigned int hidx = 0; hidx < handle.size(); ++hidx) { destroy_info(handle[hidx], info[hidx]); destroy_plan(handle[hidx], plan[hidx]); rocfft_lib_close(handle[hidx]); } } std::cout << "Execution times in ms:\n"; for(unsigned int idx = 0; idx < time.size(); ++idx) { std::cout << "\nExecution gpu time:"; for(auto& i : time[idx]) { std::cout << " " << i; } std::cout << " ms" << std::endl; } return EXIT_SUCCESS; } rocFFT-rocm-7.1.0/clients/cmake/000077500000000000000000000000001506652163400163315ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/cmake/build-gtest.cmake000066400000000000000000000046041506652163400215620ustar00rootroot00000000000000# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

# Provides ExternalProject_Add and ExternalProject_Get_Property used below.
include( ExternalProject )

option( BUILD_GTEST "Download and build GoogleTest" OFF )

# Look for an installed GoogleTest unless a source build was requested.
if( NOT BUILD_GTEST )
  find_package( GTest 1.11.0 )
endif()

# Build GoogleTest as an external project when requested or when none was found,
# unless a gtest target already exists.
if( (BUILD_GTEST OR NOT GTEST_FOUND) AND (NOT TARGET gtest) )
  # Paths inside the external project's tree under the current binary directory.
  set(GTEST_INCLUDE_DIRS
      ${CMAKE_CURRENT_BINARY_DIR}/src/gtest/googletest/include)
  set(GTEST_LIBRARIES
      ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX}
      ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX})

  # Cache entries so the download location and its hash can be overridden.
  set(GTEST_SRC_URL https://github.com/google/googletest/archive/release-1.11.0.tar.gz CACHE STRING "Location of GTest source code")
  set(GTEST_SRC_SHA256 b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5 CACHE STRING "SHA256 hash of GTest source code")

  # No install step; the libraries are declared as byproducts of the build.
  ExternalProject_Add(gtest
                      URL ${GTEST_SRC_URL}
                      URL_HASH SHA256=${GTEST_SRC_SHA256}
                      PREFIX ${CMAKE_CURRENT_BINARY_DIR}
                      CMAKE_ARGS -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER}
                      INSTALL_COMMAND ""
                      BUILD_BYPRODUCTS ${GTEST_LIBRARIES})

  # Sets source_dir and binary_dir in the current scope.
  ExternalProject_Get_Property( gtest source_dir binary_dir )
endif()
rocFFT-rocm-7.1.0/clients/cmake/build-options.cmake000066400000000000000000000036001506652163400221220ustar00rootroot00000000000000# Copyright(C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # This file is intended to be used in two ways; independently in a stand alone PROJECT # and as part of a superbuild. If the file is included in a stand alone project, the # variables are not expected to be preset, and this will produce options() in the GUI # for the user to examine. If this file is included in a superbuild, the options will be # presented in the superbuild GUI, but then passed into the ExternalProject as -D # parameters, which would already define them. 
# Each option is declared only when its variable is not already set to a true value,
# and defaults to OFF.
if( NOT BUILD_CLIENTS_TESTS )
  option( BUILD_CLIENTS_TESTS "Build rocFFT unit tests" OFF )
endif( )

if( NOT BUILD_CLIENTS_BENCH )
  option( BUILD_CLIENTS_BENCH "Build rocFFT benchmarks" OFF )
endif( )

if( NOT BUILD_CLIENTS_SAMPLES )
  option( BUILD_CLIENTS_SAMPLES "Build rocFFT samples" OFF )
endif( )
rocFFT-rocm-7.1.0/clients/samples/000077500000000000000000000000001506652163400167155ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/samples/CMakeLists.txt000066400000000000000000000050531506652163400214600ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE.
# #############################################################################

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# Set before the sample subdirectories are added below.
# NOTE(review): presumably read by the per-sample CMakeLists to choose their output
# directory -- confirm against each subdirectory.
set( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ON )

project( rocfft-clients-samples LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Sample subdirectories; the MPI sample is added only when ROCFFT_MPI_ENABLE is set.
list( APPEND samples_subdirs "fixed-16" )
list( APPEND samples_subdirs "fixed-large" )
list( APPEND samples_subdirs "rocfft" )
list( APPEND samples_subdirs "multi_gpu" )
if( ROCFFT_MPI_ENABLE )
  list( APPEND samples_subdirs "mpi" )
endif()

foreach( client ${samples_subdirs} )
  add_subdirectory( ${client} )
endforeach( )
rocFFT-rocm-7.1.0/clients/samples/fixed-16/000077500000000000000000000000001506652163400202405ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/samples/fixed-16/CMakeLists.txt000066400000000000000000000073271506652163400230110ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-samples-fixed-16 LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Only search for rocfft/hip when the enclosing build has not already provided them.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif( )

if( NOT HIP_FOUND )
  find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()

set( sample_list fixed-16-float fixed-16-double fixed-16-half )

# Output directory relative to this project's binary dir; it depends on which
# enclosing build scope is set. It is the same for every sample, so it is
# computed once before the loop.
if( ROCFFT_BUILD_SCOPE )
  set( FIXED_16_OUT_DIR "/../../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
  set( FIXED_16_OUT_DIR "/../../bin" )
elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
  set( FIXED_16_OUT_DIR "/../bin" )
else()
  set( FIXED_16_OUT_DIR "/bin" )
endif()
string( CONCAT FIXED_16_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_16_OUT_DIR} )

foreach( sample ${sample_list} )

  add_executable( ${sample} ${sample}.cpp )

  # NOTE(review): the include directories were bare "$" tokens, which are not
  # usable paths; they are restored as build-interface generator expressions.
  # Confirm the variable names against upstream.
  target_include_directories(
    ${sample}
    PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}>
    )

  target_link_libraries(
    ${sample}
    PRIVATE roc::rocfft hip::device
    )

  target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} )

  set_target_properties( ${sample} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON )

  set_target_properties(${sample}
    PROPERTIES
    RUNTIME_OUTPUT_DIRECTORY
    ${FIXED_16_OUT_DIR})

  if( CUDA_FOUND )
    target_include_directories( ${sample}
      PRIVATE
        $<BUILD_INTERFACE:${CUDA_INCLUDE_DIRS}>
        $<BUILD_INTERFACE:${hip_INCLUDE_DIRS}>
      )
    target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
  endif( )
  target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} )

endforeach( )
rocFFT-rocm-7.1.0/clients/samples/fixed-16/fixed-16-double.cpp000066400000000000000000000114341506652163400235420ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include "rocfft/rocfft.h" #include #include #include #include int main() { const size_t N = 16; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. 
double2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, NULL) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != 
hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-7.1.0/clients/samples/fixed-16/fixed-16-float.cpp000066400000000000000000000114301506652163400233710ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*******************************************************************************/ #include "rocfft/rocfft.h" #include #include #include #include int main() { const size_t N = 16; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(float2); // Create HIP device object. float2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &length, 1, NULL) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work 
buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-7.1.0/clients/samples/fixed-16/fixed-16-half.cpp000066400000000000000000000116601506652163400232030ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include "rocfft/rocfft.h" #include #include #include #include int main() { const size_t N = 16; std::vector<_Float16_2> cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = static_cast<_Float16>(i + (i % 3) - (i % 7)); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(_Float16_2); // Create HIP device object. 
_Float16_2* x = nullptr; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_half, 1, &length, 1, NULL) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector<_Float16_2> y(N); if(hipMemcpy(&y[0], x, Nbytes, 
hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << static_cast(cx[i].x) << "," << static_cast(cx[i].y) << ")" << " output: (" << static_cast(y[i].x) << "," << static_cast(y[i].y) << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-7.1.0/clients/samples/fixed-large/000077500000000000000000000000001506652163400211045ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/samples/fixed-large/CMakeLists.txt000066400000000000000000000073331506652163400236520ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# #############################################################################

# Build script for the "fixed-large" rocFFT samples: one executable per entry of
# sample_list, each built from <name>.cpp and linked against roc::rocfft.

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
# NOTE(review): PROJECT_BINARY_DIR is read here before project() is called in this
# file; it presumably only has a value when an enclosing project already set it -
# confirm the intended install prefix for a standalone Windows configure.
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-samples-fixed-large LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Only look for an installed rocfft package when no rocfft target exists yet.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif( )

if( NOT HIP_FOUND )
  find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()

set( sample_list fixed-large-float fixed-large-double )

foreach( sample ${sample_list} )

  add_executable( ${sample} ${sample}.cpp )

  # NOTE(review): the include-directory argument below is a bare "$" in this copy;
  # it looks like a generator expression whose <...> part was lost - restore it
  # from upstream.
  target_include_directories( ${sample} PRIVATE $ )

  target_link_libraries( ${sample} PRIVATE roc::rocfft )

  # WARNING_FLAGS is not set in this file; presumably provided by an enclosing
  # scope (expands to nothing otherwise).
  target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} )

  set_target_properties( ${sample} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON )

  # Pick the runtime output directory relative to PROJECT_BINARY_DIR according to
  # which *_BUILD_SCOPE variable is set; the first one that is set wins.
  if( ROCFFT_BUILD_SCOPE )
    set( FIXED_LARGE_OUT_DIR "/../../staging" )
  elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
    set( FIXED_LARGE_OUT_DIR "/../../bin" )
  elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
    set( FIXED_LARGE_OUT_DIR "/../bin" )
  else()
    set( FIXED_LARGE_OUT_DIR "/bin" )
  endif()
  string( CONCAT FIXED_LARGE_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_LARGE_OUT_DIR} )

  set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${FIXED_LARGE_OUT_DIR})

  if( CUDA_FOUND )
    # NOTE(review): both arguments below are bare "$" in this copy (see the note on
    # the include directories above).
    target_include_directories( ${sample} PRIVATE $ $ )
    target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
  endif( )

  # ROCFFT_CLIENTS_HOST_LINK_LIBS is not set in this file; presumably provided by
  # an enclosing scope.
  target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} )

endforeach( )
rocFFT-rocm-7.1.0/clients/samples/fixed-large/fixed-large-double.cpp000066400000000000000000000116541506652163400252560ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE.
*******************************************************************************/ #include #include #include #include "rocfft/rocfft.h" #include #include int main() { // For size N >= 8192, temporary buffer is required to allocated const size_t N = 64 * 2048; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. double2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = nullptr; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, nullptr) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Setup work buffer void* workBuffer = nullptr; size_t workBufferSize = 0; if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); // Setup exec info to pass work buffer to the library rocfft_execution_info info = nullptr; if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(workBufferSize > 0) { printf("size of workbuffer=%d\n", (int)workBufferSize); if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, nullptr, info) != 
rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; if(workBuffer) if(hipFree(workBuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; // Copy result back to host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-7.1.0/clients/samples/fixed-large/fixed-large-float.cpp000066400000000000000000000116501506652163400251050ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include #include #include #include "rocfft/rocfft.h" #include #include int main() { // For size N >= 8192, temporary buffer is required to allocated const size_t N = 64 * 2048; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(float2); // Create HIP device object. 
float2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = nullptr; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &length, 1, nullptr) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Setup work buffer void* workBuffer = nullptr; size_t workBufferSize = 0; if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); // Setup exec info to pass work buffer to the library rocfft_execution_info info = nullptr; if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(workBufferSize > 0) { printf("size of workbuffer=%d\n", (int)workBufferSize); if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; if(workBuffer) if(hipFree(workBuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; // Copy result back to 
host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-7.1.0/clients/samples/mpi/000077500000000000000000000000001506652163400175025ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/samples/mpi/CMakeLists.txt000066400000000000000000000107211506652163400222430ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# #############################################################################

# Build script for the rocFFT MPI sample: one executable per entry of sample_list,
# each built from <name>.cpp and linked against roc::rocfft and MPI::MPI_CXX.

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
# NOTE(review): PROJECT_BINARY_DIR is read here before project() is called in this
# file; it presumably only has a value when an enclosing project already set it -
# confirm the intended install prefix for a standalone Windows configure.
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

# NOTE(review): the project name ends in "-rocfft" although this file's archive
# path is clients/samples/mpi - possibly carried over from another sample
# directory; confirm it does not clash with a project of the same name.
project( rocfft-clients-samples-rocfft LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Only look for an installed rocfft package when no rocfft target exists yet.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif( )

if( NOT HIP_FOUND )
  find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()

if( NOT MPI_FOUND )
  find_package( MPI REQUIRED )
endif()

# hiprand is only required when the USE_HIPRAND option is on.
if( USE_HIPRAND AND NOT hiprand_FOUND )
  find_package( hiprand REQUIRED )
endif()

set( sample_list rocfft_mpi_example )

foreach( sample ${sample_list} )

  add_executable( ${sample} ${sample}.cpp )

  # NOTE(review): the first include-directory argument below is a bare "$" in this
  # copy; it looks like a generator expression whose <...> part was lost - restore
  # it from upstream.
  target_include_directories( ${sample} PRIVATE $ ${MPI_CXX_INCLUDE_PATH} )

  target_link_libraries( ${sample} PRIVATE roc::rocfft MPI::MPI_CXX )

  # Printed once per sample because it sits inside the foreach loop.
  message( "MPI_CXX_LIB_NAMES: ${MPI_CXX_LIB_NAMES}")

  # With ROCFFT_CRAY_MPI_ENABLE set, additionally link mpi_gtl_hsa and search for it
  # in a gtl/lib directory located relative to the directory of MPI_LIBRARY.
  if ( ROCFFT_CRAY_MPI_ENABLE )
    target_link_libraries( ${sample} PRIVATE "mpi_gtl_hsa" )
    get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
    target_link_directories( ${sample} PRIVATE ${MPI_LIBDIR}/../../../../gtl/lib )
  endif()

  # USE_HIPRAND links hip::hiprand and defines the USE_HIPRAND macro for the sample.
  if ( USE_HIPRAND )
    target_link_libraries( ${sample} PRIVATE hip::hiprand )
    target_compile_definitions( ${sample} PRIVATE USE_HIPRAND )
  endif()

  # WARNING_FLAGS is not set in this file; presumably provided by an enclosing
  # scope (expands to nothing otherwise).
  target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp )

  set_target_properties( ${sample} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON )

  # Pick the runtime output directory relative to PROJECT_BINARY_DIR according to
  # which *_BUILD_SCOPE variable is set; the first one that is set wins.
  if( ROCFFT_BUILD_SCOPE )
    set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" )
  elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
    set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" )
  elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
    set( SAMPLES_ROCFFT_OUT_DIR "/../bin" )
  else()
    set( SAMPLES_ROCFFT_OUT_DIR "/bin" )
  endif()
  string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} )

  set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${SAMPLES_ROCFFT_OUT_DIR})

  if( CUDA_FOUND )
    # NOTE(review): both arguments below are bare "$" in this copy (see the note on
    # the include directories above).
    target_include_directories( ${sample} PRIVATE $ $ )
    target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
  endif( )

  # Neither link-library variable is set in this file; presumably provided by an
  # enclosing scope.
  target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )

endforeach( )
rocFFT-rocm-7.1.0/clients/samples/mpi/rocfft_mpi_example.cpp000066400000000000000000000403411506652163400240530ustar00rootroot00000000000000 /****************************************************************************** * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include #include #include #include #include #include #include #include #include "rocfft.h" // Check all ranks for an rocFFT non-success status. auto rocfft_status_sync(const rocfft_status fftrc, const MPI_Comm comm) { // Since hipSuccess is the lowest enum value, we can find if there are any errors // by getting the maximum value of the return code over all procs. // Guarantee that the enum is an unsigned int so that we can send this via MPI: static_assert(std::is_same_v, unsigned int>); auto global_fftrc = rocfft_status_success; const auto mpirc = MPI_Allreduce(&fftrc, &global_fftrc, 1, MPI_UNSIGNED, MPI_MAX, comm); if(mpirc != MPI_SUCCESS) { return rocfft_status_failure; } return global_fftrc; } // Check all ranks for an hip runtime non-success status. auto hip_status_sync(const hipError_t hiprc, const MPI_Comm comm) { // Since rocfft_status_success is the lowest enum value, we can find if there are any errors // by getting the maximum value of the return code over all procs. 
// Guarantee that the enum is an unsigned int so that we can send this via MPI: static_assert(std::is_same_v, unsigned int>); auto global_hiprc = hipSuccess; const auto mpirc = MPI_Allreduce(&hiprc, &global_hiprc, 1, MPI_UNSIGNED, MPI_MAX, comm); if(mpirc != MPI_SUCCESS) { return hipErrorUnknown; } return global_hiprc; } int main(int argc, char** argv) { MPI_Init(&argc, &argv); MPI_Comm mpi_comm = MPI_COMM_WORLD; int mpi_size = 0; MPI_Comm_size(mpi_comm, &mpi_size); int mpi_rank = 0; MPI_Comm_rank(mpi_comm, &mpi_rank); if(mpi_rank == 0) { std::cout << "rocFFT MPI example\n"; std::cout << "MPI size: " << mpi_size << "\n"; } // General FFT parameters: std::vector length = {8, 8}; const rocfft_transform_type direction = rocfft_transform_type_complex_forward; const rocfft_result_placement place = rocfft_placement_notinplace; auto fftrc = rocfft_status_success; auto hiprc = hipSuccess; fftrc = rocfft_setup(); if(fftrc != rocfft_status_success) throw std::runtime_error("failed to set up rocFFT"); rocfft_plan_description description = nullptr; rocfft_plan_description_create(&description); fftrc = rocfft_plan_description_set_comm(description, rocfft_comm_mpi, &mpi_comm); if(fftrc != rocfft_status_success) throw std::runtime_error("failed add communicator to description"); // Do not set stride information via the descriptor, they are to be defined during field // creation below fftrc = rocfft_plan_description_set_data_layout(description, rocfft_array_type_complex_interleaved, rocfft_array_type_complex_interleaved, nullptr, nullptr, 0, nullptr, 0, 0, nullptr, 0); if(fftrc != rocfft_status_success) throw std::runtime_error("failed to create description"); // This example is unbatched, so the batch stride is not used // for anything. For batched examples, this would be // distance in elements between consecutive batches. 
const size_t batch_stride = 0; if(mpi_rank == 0) { std::cout << "input data decomposition:\n"; } std::vector gpu_in = {nullptr}; { rocfft_field infield = nullptr; rocfft_field_create(&infield); std::vector inbrick_stride = {1, length[1], batch_stride}; const size_t inbrick_length1 = length[1] / (size_t)mpi_size + ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0); const size_t inbrick_lower1 = mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size); const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1; std::vector inbrick_lower = {0, inbrick_lower1, 0}; std::vector inbrick_upper = {length[0], inbrick_upper1, 1}; rocfft_brick inbrick = nullptr; rocfft_brick_create(&inbrick, inbrick_lower.data(), inbrick_upper.data(), inbrick_stride.data(), inbrick_lower.size(), 0); rocfft_field_add_brick(infield, inbrick); rocfft_brick_destroy(inbrick); inbrick = nullptr; const size_t memSize = length[0] * inbrick_length1 * sizeof(std::complex); std::vector> host_in(length[0] * inbrick_length1); for(auto idx0 = inbrick_lower[0]; idx0 < inbrick_upper[0]; ++idx0) { for(auto idx1 = inbrick_lower[1]; idx1 < inbrick_upper[1]; ++idx1) { const auto pos = (idx0 - inbrick_lower[0]) * inbrick_stride[0] + (idx1 - inbrick_lower[1]) * inbrick_stride[1]; host_in[pos] = std::complex(idx0, idx1); } } // Serialize output: for(int irank = 0; irank < mpi_size; ++irank) { if(mpi_rank == irank) { std::cout << "in-brick rank " << irank; std::cout << "\n\tlower indices:"; for(const auto val : inbrick_lower) std::cout << " " << val; std::cout << "\n\tupper indices:"; for(const auto val : inbrick_upper) std::cout << " " << val; std::cout << "\n\tstrides:"; for(const auto val : inbrick_stride) std::cout << " " << val; std::cout << "\n"; std::cout << "\tbuffer size: " << memSize << "\n"; for(auto idx0 = inbrick_lower[0]; idx0 < inbrick_upper[0]; ++idx0) { for(auto idx1 = inbrick_lower[1]; idx1 < inbrick_upper[1]; ++idx1) { const auto pos = (idx0 - 
inbrick_lower[0]) * inbrick_stride[0] + (idx1 - inbrick_lower[1]) * inbrick_stride[1]; std::cout << host_in[pos] << " "; } std::cout << "\n"; } } MPI_Barrier(mpi_comm); } hiprc = hipMalloc(&gpu_in[0], memSize); if(hiprc != hipSuccess) throw std::runtime_error("inbrick hipMalloc failed"); hiprc = hipMemcpy(gpu_in[0], host_in.data(), memSize, hipMemcpyHostToDevice); if(hiprc != hipSuccess) throw std::runtime_error("inbrick hipMemcpy failed"); rocfft_plan_description_add_infield(description, infield); fftrc = rocfft_field_destroy(infield); if(fftrc != rocfft_status_success) throw std::runtime_error("failed destroy infield"); } if(mpi_rank == 0) { std::cout << "output data decomposition:\n"; } std::vector gpu_out = {nullptr}; std::vector outbrick_lower; std::vector outbrick_upper; std::vector outbrick_stride = {1, length[1], batch_stride}; { const size_t outbrick_length1 = length[1] / (size_t)mpi_size + ((size_t)mpi_rank < length[1] % (size_t)mpi_size ? 1 : 0); const size_t outbrick_lower1 = mpi_rank * (length[1] / mpi_size) + std::min((size_t)mpi_rank, length[1] % mpi_size); const size_t outbrick_upper1 = outbrick_lower1 + outbrick_length1; outbrick_lower = {0, outbrick_lower1, 0}; outbrick_upper = {length[0], outbrick_upper1, 1}; const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex); for(int irank = 0; irank < mpi_size; ++irank) { if(mpi_rank == irank) { std::cout << "out-brick rank " << irank; std::cout << "\n\tlower indices:"; for(const auto val : outbrick_lower) std::cout << " " << val; std::cout << "\n\tupper indices:"; for(const auto val : outbrick_upper) std::cout << " " << val; std::cout << "\n\tstrides:"; for(const auto val : outbrick_stride) std::cout << " " << val; std::cout << "\n"; std::cout << "\tbuffer size: " << memSize << "\n"; } MPI_Barrier(mpi_comm); } rocfft_field outfield = nullptr; rocfft_field_create(&outfield); rocfft_brick outbrick = nullptr; outbrick_lower = {0, outbrick_lower1, 0}; outbrick_upper = {length[0], 
outbrick_lower1 + outbrick_length1, 1}; rocfft_brick_create(&outbrick, outbrick_lower.data(), outbrick_upper.data(), outbrick_stride.data(), outbrick_lower.size(), 0); rocfft_field_add_brick(outfield, outbrick); rocfft_brick_destroy(outbrick); outbrick = nullptr; hiprc = hipMalloc(&gpu_out[0], memSize); if(hiprc != hipSuccess) throw std::runtime_error("outbrick hipMalloc failed"); rocfft_plan_description_add_outfield(description, outfield); fftrc = rocfft_field_destroy(outfield); if(fftrc != rocfft_status_success) throw std::runtime_error("failed destroy outfield"); } // In order still handle non-success return codes without killing all of the MPI processes, we // put object creation in a try/catch block and destroy non-nullptr objects. // Serialize output: for(int irank = 0; irank < mpi_size; ++irank) { if(mpi_rank == irank) { std::cout << "rank " << irank << "\n"; std::cout << "input "; for(const auto& b : gpu_in) std::cout << " " << b; std::cout << "\n"; std::cout << "output "; for(const auto& b : gpu_out) std::cout << " " << b; std::cout << "\n"; } MPI_Barrier(mpi_comm); } fftrc = rocfft_status_sync(fftrc, mpi_comm); hiprc = hip_status_sync(hiprc, mpi_comm); if(mpi_rank == 0) { if(fftrc == rocfft_status_success && hiprc == hipSuccess) { std::cout << "so far so good, trying to make a plan....\n"; } else { std::cout << "failure: will not make a plan....\n"; } } // Create a multi-process plan: rocfft_plan gpu_plan = nullptr; if(fftrc == rocfft_status_success && hiprc == hipSuccess) { fftrc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms description); // Description } fftrc = rocfft_status_sync(fftrc, mpi_comm); if(mpi_rank == 0) { if(fftrc == rocfft_status_success) { std::cout << "so far so good, we have a plan....\n"; } else { std::cout << "failure: we do not have a plan....\n"; } } // Execute plan: if(fftrc == rocfft_status_success) { fftrc = 
rocfft_execute(gpu_plan, (void**)gpu_in.data(), (void**)gpu_out.data(), nullptr); } fftrc = rocfft_status_sync(fftrc, mpi_comm); if(mpi_rank == 0) { if(fftrc == rocfft_status_success) { std::cout << "The FFT was succesful....\n"; } else { std::cout << "The FFT execution failed....\n"; } } // Output the data: for(int irank = 0; irank < mpi_size; ++irank) { if(mpi_rank == irank) { std::cout << "out brick rank " << irank << "\n"; const size_t outcount = (outbrick_upper[0] - outbrick_lower[0]) * (outbrick_upper[1] - outbrick_lower[1]); std::vector> host_out(outcount); hiprc = hipMemcpy(host_out.data(), gpu_out[0], outcount * sizeof(std::complex), hipMemcpyDeviceToHost); if(hiprc != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(auto idx0 = outbrick_lower[0]; idx0 < outbrick_upper[0]; ++idx0) { for(auto idx1 = outbrick_lower[1]; idx1 < outbrick_upper[1]; ++idx1) { const auto pos = (idx0 - outbrick_lower[0]) * outbrick_stride[0] + (idx1 - outbrick_lower[1]) * outbrick_stride[1]; std::cout << host_out[pos] << " "; } std::cout << "\n"; } } MPI_Barrier(mpi_comm); } // Cleanup anything plan-generation structs (that aren't null pointers): if(description != nullptr) { if(rocfft_plan_description_destroy(description) != rocfft_status_success) { std::cerr << "description descruction failed\n"; } else { description = nullptr; } } // Clean up the plan and rocfft: try { if(gpu_plan != nullptr) { if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; } } catch(const std::exception&) { std::cerr << "rank " << mpi_rank << " plan destroy failed\n"; } for(auto& buf : gpu_in) { if(buf != nullptr) { hiprc = hipFree(buf); if(hiprc != hipSuccess) std::cerr << "hipFree failed\n"; buf = nullptr; } } for(auto& buf : gpu_out) { if(buf != nullptr) { hiprc = hipFree(buf); if(hiprc != hipSuccess) std::cerr << "hipFree failed\n"; buf = nullptr; } } fftrc = rocfft_cleanup(); MPI_Finalize(); return 0; } 
rocFFT-rocm-7.1.0/clients/samples/multi_gpu/000077500000000000000000000000001506652163400207225ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/samples/multi_gpu/CMakeLists.txt000066400000000000000000000100231506652163400234560ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() project( rocfft-clients-samples-multi_gpu LANGUAGES CXX ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) if( NOT TARGET rocfft ) find_package( rocfft REQUIRED CONFIG PATHS ) endif( ) if( NOT HIP_FOUND ) find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ ) endif() if( USE_HIPRAND AND NOT hiprand_FOUND ) find_package( hiprand REQUIRED ) endif() set( sample_list mgpu_complex) foreach( sample ${sample_list} ) add_executable( ${sample} ${sample}.cpp ) target_include_directories( ${sample} PRIVATE $ ) target_link_libraries( ${sample} PRIVATE roc::rocfft ) if( USE_HIPRAND ) target_link_libraries( ${sample} PRIVATE hip::hiprand ) target_compile_definitions( ${sample} PRIVATE USE_HIPRAND ) endif() target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp ) set_target_properties( ${sample} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" ) elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../bin" ) 
else() set( SAMPLES_ROCFFT_OUT_DIR "/bin" ) endif() string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} ) set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${SAMPLES_ROCFFT_OUT_DIR}) if( CUDA_FOUND ) target_include_directories( ${sample} PRIVATE $ $ ) target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ ) endif( ) target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} ) endforeach( ) rocFFT-rocm-7.1.0/clients/samples/multi_gpu/mgpu_complex.cpp000066400000000000000000000323451506652163400241340ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include #include "../../../shared/CLI11.hpp" #include "rocfft/rocfft.h" #include #include #include int main(int argc, char* argv[]) { std::cout << "rocfft single-node multi-gpu complex-to-complex 3D FFT example\n"; // Length of transform, first dimension must be greather than number of GPU devices std::vector length = {8, 8}; // Gpu device ids: std::vector devices = {0, 1}; // Command-line options: CLI::App app{"rocfft sample command line options"}; app.add_option("--length", length, "2-D FFT size (eg: --length 256 256)"); app.add_option( "--devices", devices, "List of devices to use separated by spaces (eg: --devices 1 3)"); try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } int deviceCount = devices.size(); std::cout << "Using " << deviceCount << " device(s)\n"; int nDevices; (void)hipGetDeviceCount(&nDevices); std::cout << "Number of available GPUs: " << nDevices << " \n"; if(nDevices <= static_cast(*std::max_element(devices.begin(), devices.end()))) throw std::runtime_error("device ID greater than number of available devices"); // Placeness for the transform auto fftrc = rocfft_status_success; fftrc = rocfft_setup(); if(fftrc != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = rocfft_placement_notinplace; // Direction of transform const rocfft_transform_type direction = rocfft_transform_type_complex_forward; rocfft_plan_description description = nullptr; rocfft_plan_description_create(&description); // Do not set stride information via the descriptor, they are to be defined during field // creation below rocfft_plan_description_set_data_layout(description, rocfft_array_type_complex_interleaved, rocfft_array_type_complex_interleaved, nullptr, nullptr, 0, nullptr, 0, 0, nullptr, 0); auto hiprc = hipSuccess; std::cout << "input data decomposition:\n"; std::vector gpu_in(devices.size()); { // Row-major stride for brick data layout 
in memory std::vector inbrick_stride = {1, length[1]}; rocfft_field infield = nullptr; rocfft_field_create(&infield); std::vector> inbrick_lower(gpu_in.size()); std::vector> inbrick_upper(gpu_in.size()); for(size_t idx = 0; idx < gpu_in.size(); ++idx) { const size_t inbrick_length1 = length[1] / gpu_in.size() + (idx < length[1] % gpu_in.size() ? 1 : 0); const size_t inbrick_lower1 = idx * (length[1] / gpu_in.size()) + std::min(idx, length[1] % gpu_in.size()); const size_t inbrick_upper1 = inbrick_lower1 + inbrick_length1; inbrick_lower[idx] = {0, inbrick_lower1}; inbrick_upper[idx] = {length[0], inbrick_upper1}; rocfft_brick inbrick = nullptr; rocfft_brick_create(&inbrick, inbrick_lower[idx].data(), inbrick_upper[idx].data(), inbrick_stride.data(), inbrick_lower[idx].size(), devices[idx]); rocfft_field_add_brick(infield, inbrick); rocfft_brick_destroy(inbrick); inbrick = nullptr; const size_t memSize = length[0] * inbrick_length1 * sizeof(std::complex); std::cout << "in-brick " << idx; std::cout << "\n\tlower indices:"; for(const auto val : inbrick_lower[idx]) std::cout << " " << val; std::cout << "\n\tupper indices:"; for(const auto val : inbrick_upper[idx]) std::cout << " " << val; std::cout << "\n\tstrides:"; for(const auto val : inbrick_stride) std::cout << " " << val; std::cout << "\n"; std::cout << "\tbuffer size: " << memSize << "\n"; hiprc = hipSetDevice(devices[idx]); if(hiprc != hipSuccess) throw std::runtime_error("hipSetDevice failed"); hiprc = hipMalloc(&gpu_in[idx], memSize); if(hiprc != hipSuccess) throw std::runtime_error("hipMalloc failed"); std::vector> host_in(length[0] * inbrick_length1); for(auto idx0 = inbrick_lower[idx][0]; idx0 < inbrick_upper[idx][0]; ++idx0) { for(auto idx1 = inbrick_lower[idx][1]; idx1 < inbrick_upper[idx][1]; ++idx1) { const auto pos = (idx0 - inbrick_lower[idx][0]) * inbrick_stride[0] + (idx1 - inbrick_lower[idx][1]) * inbrick_stride[1]; host_in[pos] = std::complex(idx0, idx1); std::cout << host_in[pos] << " "; } 
std::cout << "\n"; } hiprc = hipMemcpy(gpu_in[idx], host_in.data(), memSize, hipMemcpyHostToDevice); if(hiprc != hipSuccess) throw std::runtime_error("hipMemcpy failed"); } rocfft_plan_description_add_infield(description, infield); fftrc = rocfft_field_destroy(infield); if(fftrc != rocfft_status_success) throw std::runtime_error("failed destroy infield"); } std::cout << "output data decomposition:\n"; std::vector gpu_out(devices.size()); std::vector> outbrick_lower(gpu_out.size()); std::vector> outbrick_upper(gpu_out.size()); std::vector outbrick_stride = {1, length[1]}; { rocfft_field outfield = nullptr; rocfft_field_create(&outfield); for(size_t idx = 0; idx < gpu_out.size(); ++idx) { const size_t outbrick_length1 = length[1] / gpu_out.size() + (idx < length[1] % gpu_in.size() ? 1 : 0); const size_t outbrick_lower1 = idx * (length[1] / gpu_out.size()) + std::min(idx, length[1] % gpu_out.size()); rocfft_brick outbrick = nullptr; outbrick_lower[idx] = {0, outbrick_lower1}; outbrick_upper[idx] = {length[0], outbrick_lower1 + outbrick_length1}; rocfft_brick_create(&outbrick, outbrick_lower[idx].data(), outbrick_upper[idx].data(), outbrick_stride.data(), outbrick_lower[idx].size(), devices[idx]); rocfft_field_add_brick(outfield, outbrick); rocfft_brick_destroy(outbrick); outbrick = nullptr; const size_t memSize = length[0] * outbrick_length1 * sizeof(std::complex); std::cout << "out-brick " << idx; std::cout << "\n\tlower indices:"; for(const auto val : outbrick_lower[idx]) std::cout << " " << val; std::cout << "\n\tupper indices:"; for(const auto val : outbrick_upper[idx]) std::cout << " " << val; std::cout << "\n\tstrides:"; for(const auto val : outbrick_stride) std::cout << " " << val; std::cout << "\n"; std::cout << "\tbuffer size: " << memSize << "\n"; (void)hipSetDevice(devices[idx]); if(hipMalloc(&gpu_out[idx], memSize) != hipSuccess) throw std::runtime_error("hipMalloc failed"); } rocfft_plan_description_add_outfield(description, outfield); fftrc = 
rocfft_field_destroy(outfield); if(fftrc != rocfft_status_success) throw std::runtime_error("failed destroy outfield"); } // Create a multi-gpu plan: (void)hipSetDevice(devices[0]); rocfft_plan gpu_plan = nullptr; fftrc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms description); // Description if(fftrc != rocfft_status_success) throw std::runtime_error("failed to create plan"); // Get execution information and allocate work buffer rocfft_execution_info planinfo = nullptr; size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(gpu_plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&planinfo) != rocfft_status_success) throw std::runtime_error("failed to create execution info"); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed"); if(rocfft_execution_info_set_work_buffer(planinfo, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan: fftrc = rocfft_execute(gpu_plan, (void**)gpu_in.data(), (void**)gpu_out.data(), planinfo); if(fftrc != rocfft_status_success) throw std::runtime_error("failed to execute."); // Output the data. 
for(size_t idx = 0; idx < gpu_out.size(); ++idx) { std::cout << "out brick " << idx << "\n"; const auto nbrick = (outbrick_upper[idx][0] - outbrick_lower[idx][0]) * (outbrick_upper[idx][1] - outbrick_lower[idx][1]); std::vector> host_out(nbrick); hiprc = hipMemcpy(host_out.data(), gpu_out[idx], nbrick * sizeof(std::complex), hipMemcpyDeviceToHost); if(hiprc != hipSuccess) throw std::runtime_error("hipMemcpy failed"); for(auto idx0 = outbrick_lower[idx][0]; idx0 < outbrick_upper[idx][0]; ++idx0) { for(auto idx1 = outbrick_lower[idx][1]; idx1 < outbrick_upper[idx][1]; ++idx1) { const auto pos = (idx0 - outbrick_lower[idx][0]) * outbrick_stride[0] + (idx1 - outbrick_lower[idx][1]) * outbrick_stride[1]; std::cout << host_out[pos] << " "; } std::cout << "\n"; } } // Destroy plan if(planinfo != nullptr) { if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); planinfo = nullptr; } if(rocfft_plan_description_destroy(description) != rocfft_status_success) throw std::runtime_error("rocfft_plan_description_destroy failed."); description = nullptr; if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); for(size_t idx = 0; idx < gpu_in.size(); ++idx) { (void)hipFree(gpu_in[idx]); } for(size_t idx = 0; idx < gpu_out.size(); ++idx) { (void)hipFree(gpu_out[idx]); } return 0; } rocFFT-rocm-7.1.0/clients/samples/rocfft/000077500000000000000000000000001506652163400202005ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/samples/rocfft/CMakeLists.txt000066400000000000000000000106131506652163400227410ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
) endif() project( rocfft-clients-samples-rocfft LANGUAGES CXX ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) if( NOT TARGET rocfft ) find_package( rocfft REQUIRED CONFIG PATHS ) endif( ) if( NOT HIP_FOUND ) find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ ) endif() if( USE_HIPRAND AND NOT hiprand_FOUND ) find_package( hiprand REQUIRED ) endif() set( sample_list rocfft_example_complexcomplex rocfft_example_realcomplex rocfft_example_set_stream rocfft_example_callback ) foreach( sample ${sample_list} ) add_executable( ${sample} ${sample}.cpp ) target_include_directories( ${sample} PRIVATE $ ) target_link_libraries( ${sample} PRIVATE roc::rocfft ) if( USE_HIPRAND ) target_link_libraries( ${sample} PRIVATE hip::hiprand ) target_compile_definitions( ${sample} PRIVATE USE_HIPRAND ) endif() target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp ) set_target_properties( ${sample} PROPERTIES CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" ) elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../bin" ) else() set( SAMPLES_ROCFFT_OUT_DIR "/bin" ) endif() string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} ) set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${SAMPLES_ROCFFT_OUT_DIR}) if( CUDA_FOUND ) target_include_directories( ${sample} PRIVATE $ $ ) target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ ) endif( ) target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} ) endforeach( ) # callback functions need to be built as relocatable device code # (causes failure at link stage on Windows) if (NOT WIN32) target_compile_options( rocfft_example_callback PRIVATE -fgpu-rdc ) target_link_options( rocfft_example_callback PRIVATE -fgpu-rdc ) endif() 
rocFFT-rocm-7.1.0/clients/samples/rocfft/examplekernels.h000066400000000000000000000361771506652163400234060ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef EXAMPLEKERNELS_H #define EXAMPLEKERNELS_H #include "../../../shared/data_gen_device.h" #include #include #include // Kernel for initializing 1D real input data on the GPU. __global__ void initrdata1(double* x, const size_t Nx, const size_t xstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < Nx) { const auto pos = idx * xstride; x[pos] = idx + 1; } } // Kernel for initializing 2D real input data on the GPU. 
__global__ void initrdata2( double* x, const size_t Nx, const size_t Ny, const size_t xstride, const size_t ystride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; const size_t idy = blockIdx.y * blockDim.y + threadIdx.y; if(idx < Nx && idy < Ny) { const auto pos = idx * xstride + idy * ystride; x[pos] = idx + idy; } } // Kernel for initializing 3D real input data on the GPU. __global__ void initrdata3(double* x, const size_t Nx, const size_t Ny, const size_t Nz, const size_t xstride, const size_t ystride, const size_t zstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; const size_t idy = blockIdx.y * blockDim.y + threadIdx.y; const size_t idz = blockIdx.z * blockDim.z + threadIdx.z; if(idx < Nx && idy < Ny && idz < Nz) { const auto pos = idx * xstride + idy * ystride + idz * zstride; x[pos] = cos(cos(idx + 2)) * sin(idy * idy + 1) / (idz + 1); } } // Kernel for initializing 1D complex data on the GPU. __global__ void initcdata1(hipDoubleComplex* x, const size_t Nx, const size_t xstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < Nx) { const auto pos = idx * xstride; x[pos].x = 1 + idx; x[pos].y = 1 + idx; } } // Kernel for initializing 2D complex input data on the GPU. __global__ void initcdata2(hipDoubleComplex* x, const size_t Nx, const size_t Ny, const size_t xstride, const size_t ystride) { const auto idx = blockIdx.x * blockDim.x + threadIdx.x; const auto idy = blockIdx.y * blockDim.y + threadIdx.y; if(idx < Nx && idy < Ny) { const auto pos = idx * xstride + idy * ystride; x[pos].x = idx + 1; x[pos].y = idy + 1; } } // Kernel for initializing 3D complex input data on the GPU. 
__global__ void initcdata3(hipDoubleComplex* x, const size_t Nx, const size_t Ny, const size_t Nz, const size_t xstride, const size_t ystride, const size_t zstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; const size_t idy = blockIdx.y * blockDim.y + threadIdx.y; const size_t idz = blockIdx.z * blockDim.z + threadIdx.z; if(idx < Nx && idy < Ny && idz < Nz) { const auto pos = idx * xstride + idy * ystride + idz * zstride; x[pos].x = idx + 10.0 * idz + 1; x[pos].y = idy + 10; } } // Helper function for determining grid dimensions template Tint1 ceildiv(const Tint1 nominator, const Tint2 denominator) { return (nominator + denominator - 1) / denominator; } // The following functions call the above kernels to initalize the input data for the transform. void initcomplex_cm(const std::vector& length_cm, const std::vector& stride_cm, void* gpu_in) { size_t blockSize = DATA_GEN_THREADS; const dim3 blockdim(blockSize); switch(length_cm.size()) { case 1: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x)); hipLaunchKernelGGL(initcdata1, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, length_cm[0], stride_cm[0]); break; } case 2: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y)); hipLaunchKernelGGL(initcdata2, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, length_cm[0], length_cm[1], stride_cm[0], stride_cm[1]); break; } case 3: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y), ceildiv(length_cm[2], blockdim.z)); hipLaunchKernelGGL(initcdata3, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, length_cm[0], length_cm[1], length_cm[2], stride_cm[0], stride_cm[1], stride_cm[2]); break; } default: std::cout << "invalid dimension!\n"; exit(1); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("init_complex_data kernel launch failure: " + std::string(hipGetErrorName(err))); } // Initialize the real input buffer where the data has lengths 
given in length and stride given in // stride. The device buffer is assumed to have been allocated. void initreal_cm(const std::vector& length_cm, const std::vector& stride_cm, void* gpu_in) { size_t blockSize = DATA_GEN_THREADS; const dim3 blockdim(blockSize); switch(length_cm.size()) { case 1: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x)); hipLaunchKernelGGL( initrdata1, griddim, blockdim, 0, 0, (double*)gpu_in, length_cm[0], stride_cm[0]); break; } case 2: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y)); hipLaunchKernelGGL(initrdata2, griddim, blockdim, 0, 0, (double*)gpu_in, length_cm[0], length_cm[1], stride_cm[0], stride_cm[1]); break; } case 3: { const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y), ceildiv(length_cm[2], blockdim.z)); hipLaunchKernelGGL(initrdata3, griddim, blockdim, 0, 0, (double*)gpu_in, length_cm[0], length_cm[1], length_cm[2], stride_cm[0], stride_cm[1], stride_cm[2]); break; } default: std::cout << "invalid dimension!\n"; exit(1); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("init_real_data kernel launch failure: " + std::string(hipGetErrorName(err))); } // Imposes Hermitian symmetry for the input device buffer. // Note: input parameters are in column-major ordering. void impose_hermitian_symmetry_cm(const std::vector& length, const std::vector& ilength, const std::vector& stride, void* gpu_in) { size_t batch = 1; size_t dist = 1; size_t blockSize = DATA_GEN_THREADS; auto inputDim = length.size(); // Launch impose_hermitian_symmetry kernels. // NOTE: input parameters must be in row-major // ordering for these kernels. 
switch(inputDim) { case 1: { const auto gridDim = dim3(DivRoundingUp(batch, blockSize)); const auto blockDim = dim3(blockSize); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1D_kernel, gridDim, blockDim, 0, 0, (hipDoubleComplex*)gpu_in, length[0], stride[0], dist, batch, length[0] % 2 == 0); break; } case 2: { const auto gridDim = dim3(DivRoundingUp(batch, blockSize), DivRoundingUp((length[1] + 1) / 2 - 1, blockSize)); const auto blockDim = dim3(blockSize, blockSize); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2D_kernel, gridDim, blockDim, 0, 0, (hipDoubleComplex*)gpu_in, length[1], length[0], stride[1], stride[0], dist, batch, (ilength[1] + 1) / 2 - 1, length[1] % 2 == 0, length[0] % 2 == 0); break; } case 3: { const auto gridDim = dim3(DivRoundingUp(batch, blockSize), DivRoundingUp((length[2] + 1) / 2 - 1, blockSize), DivRoundingUp(length[1] - 1, blockSize)); const auto blockDim = dim3(blockSize, blockSize, blockSize); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3D_kernel, gridDim, blockDim, 0, 0, (hipDoubleComplex*)gpu_in, length[2], length[1], length[0], stride[2], stride[1], stride[0], dist, batch, (ilength[2] + 1) / 2 - 1, ilength[1] - 1, (ilength[1] + 1) / 2 - 1, length[2] % 2 == 0, length[1] % 2 == 0, length[0] % 2 == 0); break; } default: throw std::runtime_error("Invalid dimension"); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("impose_hermitian_symmetry_interleaved kernel launch failure: " + std::string(hipGetErrorName(err))); } // Initialize the Hermitian complex input buffer where the data has lengths given in length, the // transform has lengths given in length and stride given in stride. The device buffer is assumed // to have been allocated. 
void init_hermitiancomplex_cm(const std::vector& length, const std::vector& ilength, const std::vector& stride, void* gpu_in) { size_t blockSize = 256; const dim3 blockdim(blockSize); switch(length.size()) { case 1: { const dim3 griddim(ceildiv(ilength[0], blockSize)); hipLaunchKernelGGL( initcdata1, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], stride[0]); break; } case 2: { const dim3 griddim(ceildiv(ilength[0], blockdim.x), ceildiv(ilength[1], blockdim.y)); hipLaunchKernelGGL(initcdata2, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], ilength[1], stride[0], stride[1]); break; } case 3: { const dim3 griddim(ceildiv(ilength[0], blockdim.x), ceildiv(ilength[1], blockdim.y), ceildiv(ilength[2], blockdim.z)); hipLaunchKernelGGL(initcdata3, griddim, blockdim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], ilength[1], ilength[2], stride[0], stride[1], stride[2]); break; } default: throw std::runtime_error("Invalid dimension"); } auto err = hipGetLastError(); if(err != hipSuccess) throw std::runtime_error("init_complex_data kernel launch failure: " + std::string(hipGetErrorName(err))); impose_hermitian_symmetry_cm(length, ilength, stride, gpu_in); } #endif /* EXAMPLEKERNELS_H */ rocFFT-rocm-7.1.0/clients/samples/rocfft/exampleutils.h000066400000000000000000000136441506652163400230750ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef EXAMPLEUTILS_H #define EXAMPLEUTILS_H std::ostream& operator<<(std::ostream& stream, hipDoubleComplex c) { stream << "(" << c.x << "," << c.y << ")"; return stream; } // Increment the index (column-major) for looping over arbitrary dimensional loops with // dimensions length. template bool increment_cm(std::vector& index, const std::vector& length) { for(unsigned int idim = 0; idim < length.size(); ++idim) { if(index[idim] < length[idim]) { if(++index[idim] == length[idim]) { index[idim] = 0; continue; } break; } } // End the loop when we get back to the start: return !std::all_of(index.begin(), index.end(), [](int i) { return i == 0; }); } // Output a formatted general-dimensional array with given length and stride in batches // separated by dist, in column-major order. 
template void printbuffer_cm(const std::vector& data, const std::vector& length, const std::vector& stride, const size_t nbatch, const size_t dist) { for(size_t b = 0; b < nbatch; b++) { std::vector index(length.size()); std::fill(index.begin(), index.end(), 0); do { const auto i = std::inner_product(index.begin(), index.end(), stride.begin(), b * dist); assert(i >= 0); assert(i < data.size()); std::cout << data[i] << " "; for(size_t idx = 0; idx < index.size(); ++idx) { if(index[idx] == (length[idx] - 1)) { std::cout << "\n"; } else { break; } } } while(increment_cm(index, length)); std::cout << std::endl; } } // Check that an multi-dimensional array of complex values with dimensions length // and straide stride, with nbatch copies separated by dist is Hermitian-symmetric. // Column-major version. template bool check_symmetry_cm(const std::vector& data, const std::vector& length_cm, const std::vector& stride_cm, const size_t nbatch, const size_t dist, const bool verbose = true) { bool issymmetric = true; for(size_t b = 0; b < nbatch; b++) { std::vector index(length_cm.size()); std::fill(index.begin(), index.end(), 0); do { bool skip = false; std::vector negindex(index.size()); for(size_t idx = 0; idx < index.size(); ++idx) { if(index[0] > length_cm[0] / 2) { skip = true; break; } negindex[idx] = (length_cm[idx] - index[idx]) % length_cm[idx]; } if(negindex[0] > length_cm[0] / 2) { skip = true; } if(!skip) { const auto i = std::inner_product(index.begin(), index.end(), stride_cm.begin(), b * dist); const auto j = std::inner_product( negindex.begin(), negindex.end(), stride_cm.begin(), b * dist); if((data[i].x != data[j].x) or (data[i].y != -data[j].y)) { if(verbose) { std::cout << "("; std::string separator; for(auto val : index) { std::cout << separator << val; separator = ","; } std::cout << ")->"; std::cout << i << "\t"; std::cout << "("; separator = ""; for(auto val : negindex) { std::cout << separator << val; separator = ","; } std::cout << ")->"; std::cout << 
j << ":\t"; std::cout << data[i] << " " << data[j]; std::cout << "\tnot conjugate!" << std::endl; } issymmetric = false; } } } while(increment_cm(index, length_cm)); } return issymmetric; } #endif /* EXAMPLEUTILS_H */ rocFFT-rocm-7.1.0/clients/samples/rocfft/rocfft_example_callback.cpp000066400000000000000000000161201506652163400255160ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*******************************************************************************/ #include #ifndef WIN32 #include "rocfft/rocfft.h" #include #include #include #include #include #include // example of using load/store callbacks with rocfft struct load_cbdata { double2* filter; double scale; }; __device__ double2 load_callback(double2* input, size_t offset, void* cbdata, void* sharedMem) { auto data = static_cast(cbdata); // multiply each element by filter element and scale return hipCmul(hipCmul(input[offset], data->filter[offset]), make_hipDoubleComplex(data->scale, data->scale)); } __device__ auto load_callback_dev = load_callback; #endif int main() { #ifdef WIN32 std::cout << "This sample is temporarily disabled on Windows" << std::endl; return EXIT_SUCCESS; #else const size_t N = 8; std::vector cx(N), filter(N); // initialize data and filter for(size_t i = 0; i < N; i++) { cx[i].x = i; cx[i].y = i; filter[i].x = rand() / static_cast(RAND_MAX); filter[i].y = 0; } // rocfft gpu compute // ================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. 
double2 *x, *filter_dev; // create buffers if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(hipMalloc(&filter_dev, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device hipError_t hip_status = hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); hip_status = hipMemcpy(filter_dev, filter.data(), Nbytes, hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = nullptr; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, nullptr) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(work_buf_size) { if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Prepare callback load_cbdata cbdata_host; cbdata_host.filter = filter_dev; cbdata_host.scale = 1.0 / static_cast(N); void* cbdata_dev; if(hipMalloc(&cbdata_dev, sizeof(load_cbdata)) != hipSuccess) throw std::runtime_error("hipMalloc failed."); hip_status = hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Get a 
properly-typed host pointer to the device function, as // rocfft_execution_info_set_load_callback expects void*. void* cbptr_host = nullptr; hip_status = hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*)); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpyFromSymbol failed."); // set callback if(rocfft_execution_info_set_load_callback(info, &cbptr_host, &cbdata_dev, 0) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_load_callback failed."); // Execute plan if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); // Clean up work buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector y(N); hip_status = hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(cbdata_dev) != hipSuccess) throw std::runtime_error("hipFree failed."); if(hipFree(filter_dev) != hipSuccess) throw std::runtime_error("hipFree failed."); if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; #endif } rocFFT-rocm-7.1.0/clients/samples/rocfft/rocfft_example_complexcomplex.cpp000066400000000000000000000245411506652163400270270ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced 
Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include #include #include #include #include "../../../shared/CLI11.hpp" #include "examplekernels.h" #include "exampleutils.h" #include int main(int argc, char* argv[]) { std::cout << "rocfft double-precision complex-to-complex transform\n" << std::endl; // Length of transform: std::vector length = {8}; // Gpu device id: size_t deviceId = 0; // Command-line options: CLI::App app{"rocfft sample command line options"}; app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); CLI::Option* opt_outofplace = app.add_flag("-o, --outofplace", "Perform an out-of-place transform"); CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform"); app.add_option( "--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)"); try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } // Placeness for the transform if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace; const bool inplace = place == rocfft_placement_inplace; // Direction of transform const rocfft_transform_type direction = *opt_inverse ? 
rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse; // Set up the strides and buffer size for the input: std::vector istride = {1}; for(unsigned int i = 1; i < length.size(); ++i) { istride.push_back(length[i - 1] * istride[i - 1]); } const size_t isize = length[length.size() - 1] * istride[istride.size() - 1]; // Set up the strides and buffer size for the output: std::vector ostride = {1}; for(unsigned int i = 1; i < length.size(); ++i) { ostride.push_back(length[i - 1] * ostride[i - 1]); } const size_t osize = length[length.size() - 1] * ostride[ostride.size() - 1]; // Print information about the transform: std::cout << "direction: "; if(direction == rocfft_transform_type_complex_forward) std::cout << "forward\n"; else std::cout << "inverse\n"; std::cout << "length:"; for(const auto i : length) std::cout << " " << i; std::cout << "\n"; if(inplace) std::cout << "in-place transform\n"; else std::cout << "out-of-place transform\n"; std::cout << "deviceID: " << deviceId << "\n"; std::cout << "input strides:"; for(auto i : istride) std::cout << " " << i; std::cout << "\n"; std::cout << "output strides:"; for(auto i : ostride) std::cout << " " << i; std::cout << "\n"; std::cout << "input size: " << isize << "\n"; std::cout << "output size: " << isize << "\n"; std::cout << std::endl; // Set the device: if(hipSetDevice(deviceId) != hipSuccess) throw std::runtime_error("hipSetDevice failed."); // Create HIP device object and allocate data hipDoubleComplex* gpu_in = nullptr; if(hipMalloc(&gpu_in, isize * sizeof(hipDoubleComplex)) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Inititalize the data on the device initcomplex_cm(length, istride, gpu_in); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); hipError_t hip_status = hipGetLastError(); if(hip_status != hipSuccess) throw std::runtime_error("device error"); std::cout << "input:\n"; std::vector idata(isize); hip_status = 
hipMemcpy(idata.data(), gpu_in, isize * sizeof(hipDoubleComplex), hipMemcpyDefault); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(idata, length, istride, 1, isize); // Create the a descrition struct to set data layout: rocfft_plan_description gpu_description = nullptr; // rocfft_status can be used to capture API status info rocfft_status rc = rocfft_plan_description_create(&gpu_description); if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan description"); rc = rocfft_plan_description_set_data_layout(gpu_description, rocfft_array_type_complex_interleaved, rocfft_array_type_complex_interleaved, nullptr, nullptr, istride.size(), // input stride length istride.data(), // input stride data 0, // input batch distance ostride.size(), // output stride length ostride.data(), // output stride data 0); // ouptut batch distance if(rc != rocfft_status_success) throw std::runtime_error("failed to set data layout"); // We can also pass "nullptr" instead of a description; rocFFT will use reasonable // default parameters. If the data isn't contiguous, we need to set strides, etc, // using the description. 
// Create the plan rocfft_plan gpu_plan = nullptr; rc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms gpu_description); // Description if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan"); // Get the execution info for the fft plan (in particular, work memory requirements): rocfft_execution_info planinfo = nullptr; rc = rocfft_execution_info_create(&planinfo); if(rc != rocfft_status_success) throw std::runtime_error("failed to create execution info"); size_t workbuffersize = 0; rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to get work buffer size"); // If the transform requires work memory, allocate a work buffer: void* wbuffer = nullptr; if(workbuffersize > 0) { hip_status = hipMalloc(&wbuffer, workbuffersize); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed."); rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to set work buffer."); } // If the transform is out-of-place, allocate the output buffer as well: double2* gpu_out = inplace ? 
gpu_in : nullptr; if(!inplace) { hip_status = hipMalloc(&gpu_out, osize * sizeof(hipDoubleComplex)); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed."); } // Execute the GPU transform: rc = rocfft_execute(gpu_plan, // plan (void**)&gpu_in, // in_buffer (void**)&gpu_out, // out_buffer planinfo); // execution info if(rc != rocfft_status_success) throw std::runtime_error("failed to execute."); // Get the output from the device and print to cout: std::cout << "output:\n"; std::vector odata(osize); hip_status = hipMemcpy(odata.data(), gpu_out, osize * sizeof(hipDoubleComplex), hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(odata, length, istride, 1, isize); // Clean up: free GPU memory: if(hipFree(gpu_in) != hipSuccess) throw std::runtime_error("hipFree failed."); if(!inplace) { if(hipFree(gpu_out) != hipSuccess) throw std::runtime_error("hipFree failed."); } if(wbuffer != nullptr) { if(hipFree(wbuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); } // Clean up: destroy plans: if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); planinfo = nullptr; if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) throw std::runtime_error("rocfft_plan_description_destroy failed."); gpu_description = nullptr; if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-7.1.0/clients/samples/rocfft/rocfft_example_realcomplex.cpp000066400000000000000000000277311506652163400263070ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include #include #include #include #include "../../../shared/CLI11.hpp" #include "examplekernels.h" #include "exampleutils.h" #include int main(int argc, char* argv[]) { std::cout << "rocfft double-precision real/complex transform\n" << std::endl; // Length of transform: std::vector length = {8}; // Gpu device id: size_t deviceId = 0; // Command-line options: CLI::App app{"rocfft sample command line options"}; app.add_option("--device", deviceId, "Select a specific device id")->default_val(0); CLI::Option* opt_outofplace = app.add_flag("-o, --outofplace", "Perform an out-of-place transform"); CLI::Option* opt_inverse = app.add_flag("-i, --inverse", "Perform an inverse transform"); app.add_option( "--length", length, "Lengths of the transform separated by spaces (eg: --length 4 4)"); try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } // Placeness for the transform if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = *opt_outofplace ? rocfft_placement_notinplace : rocfft_placement_inplace; const bool inplace = place == rocfft_placement_inplace; // Direction of transform const rocfft_transform_type direction = *opt_inverse ? rocfft_transform_type_real_inverse : rocfft_transform_type_real_forward; const bool forward = direction == rocfft_transform_type_real_forward; // Set up the strides and buffer size for the real values: std::vector rstride = {1}; for(unsigned int i = 1; i < length.size(); ++i) { // In-place transforms need space for two extra real values in the contiguous // direction. auto val = (length[i - 1] + ((inplace && i == 1) ? 
2 : 0)) * rstride[i - 1]; rstride.push_back(val); } // NB: not tight, but hey const size_t real_size = length[length.size() - 1] * rstride[rstride.size() - 1]; std::vector rdata(real_size); // host storage // The complex data length is half + 1 of the real data length in the contiguous // dimensions. Since rocFFT is column-major, this is the first index. std::vector clength = length; clength[0] = clength[0] / 2 + 1; std::vector cstride = {1}; for(unsigned int i = 1; i < clength.size(); ++i) { cstride.push_back(clength[i - 1] * cstride[i - 1]); } const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1]; std::vector cdata(complex_size); // host storage // Based on the direction, we set the input and output parameters appropriately. const size_t isize = forward ? real_size : complex_size; const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex)); const std::vector ilength = forward ? length : clength; const std::vector istride = forward ? rstride : cstride; const size_t osize = forward ? complex_size : real_size; const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double)); const std::vector olength = forward ? clength : length; const std::vector ostride = forward ? 
cstride : rstride; // Print information about the transform: std::cout << "direction: "; if(forward) std::cout << "forward\n"; else std::cout << "inverse\n"; std::cout << "length:"; for(const auto i : length) std::cout << " " << i; std::cout << "\n"; if(inplace) std::cout << "in-place transform\n"; else std::cout << "out-of-place transform\n"; std::cout << "deviceID: " << deviceId << "\n"; std::cout << "input length:"; for(auto i : ilength) std::cout << " " << i; std::cout << "\n"; std::cout << "input buffer stride:"; for(auto i : istride) std::cout << " " << i; std::cout << "\n"; std::cout << "input buffer size: " << ibytes << "\n"; std::cout << "output length:"; for(auto i : olength) std::cout << " " << i; std::cout << "\n"; std::cout << "output buffer stride:"; for(auto i : ostride) std::cout << " " << i; std::cout << "\n"; std::cout << "output buffer size: " << obytes << "\n"; std::cout << std::endl; // Set the device: if(hipSetDevice(deviceId) != hipSuccess) throw std::runtime_error("hipSetDevice failed."); // Create HIP device object and initialize data // Kernels are provided in examplekernels.h void* gpu_in = nullptr; hipError_t hip_status = hipMalloc(&gpu_in, inplace ? 
std::max(ibytes, obytes) : ibytes); if(hip_status != hipSuccess) throw std::runtime_error("device error"); if(forward) { initreal_cm(length, istride, gpu_in); } else { init_hermitiancomplex_cm(length, ilength, istride, gpu_in); } // Print the input: std::cout << "input:\n"; if(forward) { hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(rdata, ilength, istride, 1, isize); } else { hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(cdata, ilength, istride, 1, isize); // Check that the buffer is Hermitian symmetric: check_symmetry_cm(cdata, length, istride, 1, isize); } // rocfft_status can be used to capture API status info rocfft_status rc = rocfft_status_success; // Create the a descrition struct to set data layout: rocfft_plan_description gpu_description = nullptr; rc = rocfft_plan_description_create(&gpu_description); if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan description"); rc = rocfft_plan_description_set_data_layout( gpu_description, // input data format: forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved, // output data format: forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real, nullptr, nullptr, istride.size(), // input stride length istride.data(), // input stride data 0, // input batch distance ostride.size(), // output stride length ostride.data(), // output stride data 0); // ouptut batch distance if(rc != rocfft_status_success) throw std::runtime_error("failed to set data layout"); // We can also pass "nullptr" instead of a description; rocFFT will use reasonable // default parameters. If the data isn't contiguous, we need to set strides, etc, // using the description. 
// Create the FFT plan: rocfft_plan gpu_plan = nullptr; rc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms gpu_description); // Description if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan"); // Get the execution info for the fft plan (in particular, work memory requirements): rocfft_execution_info planinfo = nullptr; rc = rocfft_execution_info_create(&planinfo); if(rc != rocfft_status_success) throw std::runtime_error("failed to create execution info"); size_t workbuffersize = 0; rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to get work buffer size"); // If the transform requires work memory, allocate a work buffer: void* wbuffer = nullptr; if(workbuffersize > 0) { hip_status = hipMalloc(&wbuffer, workbuffersize); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed"); rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to set work buffer"); } // If the transform is out-of-place, allocate the output buffer as well: void* gpu_out = inplace ? 
gpu_in : nullptr; if(!inplace) { hip_status = hipMalloc(&gpu_out, obytes); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed"); } // Execute the GPU transform: rc = rocfft_execute(gpu_plan, // plan (void**)&gpu_in, // in_buffer (void**)&gpu_out, // out_buffer planinfo); // execution info if(rc != rocfft_status_success) throw std::runtime_error("failed to execute"); // Get the output from the device and print to cout: std::cout << "output:\n"; if(forward) { hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(cdata, olength, ostride, 1, osize); } else { hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(rdata, olength, ostride, 1, osize); } // Clean up: free GPU memory: if(hipFree(gpu_in) != hipSuccess) throw std::runtime_error("hipFree failed."); if(!inplace) { if(hipFree(gpu_out) != hipSuccess) throw std::runtime_error("hipFree failed."); } if(wbuffer != nullptr) { if(hipFree(wbuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); } // Clean up: destroy plans: if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); planinfo = nullptr; if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) throw std::runtime_error("rocfft_plan_description_destroy failed."); gpu_description = nullptr; if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; rocfft_cleanup(); return 0; } rocFFT-rocm-7.1.0/clients/samples/rocfft/rocfft_example_set_stream.cpp000066400000000000000000000126471506652163400261420ustar00rootroot00000000000000// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "rocfft/rocfft.h" #include #include #include #include #include struct fft_fixture_t { std::vector cpu_buf; double2* gpu_buf = nullptr; hipStream_t stream = nullptr; rocfft_execution_info info = nullptr; rocfft_plan plan = nullptr; }; int main(int argc, char* argv[]) { std::cout << "rocfft example of 2 inplace transforms with 2 streams.\n" << std::endl; size_t length = 8; size_t total_bytes = length * sizeof(double2); hipError_t hip_status; rocfft_status fft_status; fft_fixture_t ffts[2]; /// preparation if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); for(auto& it : ffts) { // create cpu buffer it.cpu_buf.resize(length); // init cpu buffer... 
// create gpu buffer if(hipMalloc(&(it.gpu_buf), total_bytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // copy host to device if(hipMemcpy(it.gpu_buf, it.cpu_buf.data(), total_bytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // create stream if(hipStreamCreate(&(it.stream)) != hipSuccess) throw std::runtime_error("hipStreamCreate failed."); // create execution info fft_status = rocfft_execution_info_create(&(it.info)); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); // set stream // NOTE: The stream must be of type hipStream_t. // It is an error to pass the address of a hipStream_t object. fft_status = rocfft_execution_info_set_stream(it.info, it.stream); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_stream failed."); // create plan fft_status = rocfft_plan_create(&it.plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, nullptr); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); size_t work_buf_size = 0; fft_status = rocfft_plan_get_work_buffer_size(it.plan, &work_buf_size); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); assert(work_buf_size == 0); // simple 1D inplace fft doesn't need extra working buffer } /// execution for(auto& it : ffts) { fft_status = rocfft_execute(it.plan, (void**)&(it.gpu_buf), (void**)&(it.gpu_buf), nullptr); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); } /// wait and copy back for(auto& it : ffts) { if(hipStreamSynchronize(it.stream) != hipSuccess) throw std::runtime_error("hipStreamSynchronize failed."); hip_status = hipMemcpy(it.cpu_buf.data(), it.gpu_buf, total_bytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy 
failed."); } /// clean up for(auto& it : ffts) { fft_status = rocfft_plan_destroy(it.plan); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); fft_status = rocfft_execution_info_destroy(it.info); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); if(hipStreamDestroy(it.stream) != hipSuccess) throw std::runtime_error("hipStreamDestroy failed."); if(hipFree(it.gpu_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); } if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-7.1.0/clients/tests/000077500000000000000000000000001506652163400164135ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/tests/CMakeLists.txt000066400000000000000000000364111506652163400211600ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
# IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-tests LANGUAGES CXX )

set(CMAKE_CXX_STANDARD 17)

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Dependencies: only searched for when the enclosing build has not already
# provided them.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif( )

if( NOT HIP_FOUND )
  find_package( hip REQUIRED PATHS /opt/rocm/lib/cmake/hip/ )
endif()

if( NOT ROCmCMakeBuildTools_FOUND )
  find_package( ROCmCMakeBuildTools REQUIRED )
endif()

if( USE_HIPRAND AND NOT hiprand_FOUND )
  find_package( hiprand REQUIRED )
endif()

include( ROCMInstallTargets )

set( rocfft-test_source
  gtest_main.cpp
  rocfft_accuracy_test.cpp
  bitwise_repro/bitwise_repro_test.cpp
  accuracy_test.cpp
  accuracy_test_1D.cpp
  accuracy_test_2D.cpp
  accuracy_test_3D.cpp
  accuracy_test_adhoc.cpp
  accuracy_test_emulation.cpp
  accuracy_test_callback.cpp
  accuracy_test_checkstride.cpp
  multithread_test.cpp
  multi_device_test.cpp
  hermitian_test.cpp
  hipGraph_test.cpp
  callback_change_type.cpp
  default_callbacks_test.cpp
  unit_test.cpp
  buffer_hash_test.cpp
  validate_length_stride.cpp
  random.cpp
  ../../shared/array_validator.cpp
  )

add_executable( rocfft-test ${rocfft-test_source} ${rocfft-test_includes} )
add_executable( rtc_helper_crash rtc_helper_crash.cpp )

# rocFFT device code builds with -O3 by default. rocfft-test
# contains device code for callback functions, so ensure the device
# code is built with the same optimization level to minimize chance
# of a mismatch
target_compile_options( rocfft-test PRIVATE -Xarch_device -O3 )

# callback functions need to be built as relocatable device code
# (causes failure at link stage on Windows)
if (NOT WIN32)
  target_compile_options( rocfft-test PRIVATE -fgpu-rdc )
  target_link_options( rocfft-test PRIVATE -fgpu-rdc )
endif()

find_package( Boost REQUIRED )
set( Boost_DEBUG ON )
set( Boost_DETAILED_FAILURE_MSG ON )

option( BUILD_FFTW "Download and build FFTW" OFF )

# look for installed FFTW if we weren't asked to build it
if( NOT BUILD_FFTW )
  find_package( FFTW 3.0 MODULE COMPONENTS FLOAT DOUBLE )
endif()

include( ExternalProject )

if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24)
  # use extract timestamp for fetched files instead of timestamps in the archive
  cmake_policy(SET CMP0135 NEW)
endif()

# also try to build FFTW if FFTW isn't present
if( BUILD_FFTW OR NOT FFTW_FOUND )
  set(FFTW_LIBRARIES_DOUBLE
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_double-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3_threads${CMAKE_SHARED_LIBRARY_SUFFIX}
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_double-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3${CMAKE_SHARED_LIBRARY_SUFFIX})
  set(FFTW_LIBRARIES_SINGLE
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_single-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3f_threads${CMAKE_SHARED_LIBRARY_SUFFIX}
    ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_single-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3f${CMAKE_SHARED_LIBRARY_SUFFIX})

  set(FFTW_CMAKE_ARGS_COMMON
      -DDISABLE_FORTRAN=ON
      -DENABLE_AVX2=ON
      -DENABLE_THREADS=ON
      -DBUILD_SHARED_LIBS=ON
      -DBUILD_TESTS=OFF
      -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER})

  set(FFTW_SRC_URL http://www.fftw.org/fftw-3.3.9.tar.gz CACHE STRING "Location of FFTW source code")
  set(FFTW_SRC_SHA256 bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d CACHE STRING "SHA256 hash of FFTW source code")

  # build double-precision FFTW
  ExternalProject_Add(fftw_double
                      URL ${FFTW_SRC_URL}
                      URL_HASH SHA256=${FFTW_SRC_SHA256}
                      SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/fftw
                      PREFIX ${CMAKE_CURRENT_BINARY_DIR}
                      CMAKE_ARGS ${FFTW_CMAKE_ARGS_COMMON}
                      INSTALL_COMMAND ""
                      BUILD_BYPRODUCTS ${FFTW_LIBRARIES_DOUBLE})
  ExternalProject_Get_Property( fftw_double source_dir binary_dir )

  # also build single-precision fftw from the same source dir
  ExternalProject_Add(fftw_single
                      DOWNLOAD_COMMAND ""
                      SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/fftw
                      PREFIX ${CMAKE_CURRENT_BINARY_DIR}
                      CMAKE_ARGS ${FFTW_CMAKE_ARGS_COMMON} -DENABLE_FLOAT=ON
                      INSTALL_COMMAND ""
                      BUILD_BYPRODUCTS ${FFTW_LIBRARIES_SINGLE}
                      DEPENDS fftw_double)
  ExternalProject_Get_Property( fftw_single source_dir binary_dir )

  set(FFTW_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/src/fftw/api)
  set(FFTW_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} ${FFTW_LIBRARIES_SINGLE})

  # FFTW we build is always threaded
  set( FFTW_MULTITHREAD TRUE )

  add_dependencies( rocfft-test fftw_double fftw_single )
  rocm_install(
    FILES ${FFTW_LIBRARIES}
    DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw
    COMPONENT clients-common
  )
else()
  include_directories(${FFTW_INCLUDE_DIRS})
endif()

# NOTE(review): the bare "$" entries in this file look like generator
# expressions whose "<...>" part is missing from this copy - restore them
# from the upstream source.
set( rocfft-test_include_dirs
  $
  $
  $
  ${ROCM_CLANG_ROOT}/include
  )

set( rocfft-test_link_libs
  ${FFTW_LIBRARIES}
  )

option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON )

if( BUILD_CLIENTS_TESTS_OPENMP )
  # Attempt to find a config version, which provides openmp_LIB_DIR.
  #find_package( OpenMP CONFIG PATHS "${HIP_CLANG_ROOT}/lib/cmake" )
  # DEFINED takes a variable *name*; expanding the variable here would test
  # whether a variable named after its value exists.
  if( NOT OPENMP_FOUND OR NOT DEFINED openmp_LIB_DIR )
    # Fall-back to module mode.
    find_package( OpenMP REQUIRED )
    set( BUILD_RPATH "${HIP_CLANG_ROOT}/lib" )
    set( INSTALL_RPATH "$ORIGIN/../llvm/lib" )
  else()
    set( BUILD_RPATH "${HIP_CLANG_ROOT}/${openmp_LIB_DIR}" )
    set( INSTALL_RPATH "$ORIGIN/../llvm/${openmp_LIB_DIR}" )
  endif()
  list( APPEND rocfft-test_link_libs OpenMP::OpenMP_CXX )
  # "${BUILD_RPATH}", not "{$BUILD_RPATH}": the latter is not a variable
  # reference and would put the literal text into the rpath.
  set_target_properties( rocfft-test PROPERTIES
    BUILD_RPATH "${BUILD_RPATH}"
  )
  set_target_properties( rocfft-test PROPERTIES
    INSTALL_RPATH "${INSTALL_RPATH}"
  )
endif()

# Prefer an installed GoogleTest; otherwise build it.
find_package( GTest QUIET )
if( GTest_FOUND )
  target_link_libraries( rocfft-test PRIVATE GTest::gtest )
else()
  include( ../cmake/build-gtest.cmake )
  add_dependencies( rocfft-test gtest )
  list( APPEND rocfft-test_include_dirs ${GTEST_INCLUDE_DIRS} )
  list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} )
endif()

target_compile_options( rocfft-test PRIVATE ${WARNING_FLAGS} -Wno-cpp )

target_include_directories( rocfft-test
  PRIVATE
  ${rocfft-test_include_dirs}
  )

if( NOT BUILD_SHARED_LIBS )
  list(APPEND rocfft-test_link_libs ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS})
endif()

if( NOT ROCFFT_BUILD_SCOPE )
  find_package(SQLite3 REQUIRED)
  set( ROCFFT_SQLITE_LIB SQLite::SQLite3)
endif()

target_link_libraries( rocfft-test
  PRIVATE
  hip::device
  roc::rocfft
  ${ROCFFT_SQLITE_LIB}
  ${rocfft-test_link_libs}
  )

if ( USE_HIPRAND )
  target_link_libraries( rocfft-test
    PRIVATE
    hip::hiprand
  )
  target_compile_definitions( rocfft-test PRIVATE USE_HIPRAND )
endif()

if( ROCFFT_MPI_ENABLE )
  target_link_libraries( rocfft-test
    PRIVATE
    MPI::MPI_CXX
  )
  add_compile_definitions( ROCFFT_MPI_ENABLE )
  if ( ROCFFT_CRAY_MPI_ENABLE )
    target_link_libraries( rocfft-test
      PRIVATE
      "mpi_gtl_hsa"
    )
    get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
    target_link_directories( rocfft-test
      PRIVATE
      ${MPI_LIBDIR}/../../../../gtl/lib )
  endif()
endif()

include( ../../cmake/std-filesystem.cmake )
target_link_std_experimental_filesystem( rocfft-test )

if( USE_CUDA )
  target_include_directories( rocfft-test
    PRIVATE
    $
    $
    )
  target_compile_definitions( rocfft-test PRIVATE __HIP_PLATFORM_NVCC__ )
endif( )
target_link_libraries( rocfft-test PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} )

include( ../../cmake/sqlite.cmake )
target_link_libraries( rocfft-test PUBLIC ${ROCFFT_SQLITE_LIB} )
target_include_directories( rocfft-test PRIVATE ${sqlite_local_SOURCE_DIR} )
set_property( TARGET rocfft-test APPEND PROPERTY LINK_LIBRARIES ${ROCFFT_SQLITE_LIB} )

if(FFTW_MULTITHREAD)
  target_compile_options( rocfft-test PRIVATE -DFFTW_MULTITHREAD )
endif( )

set_target_properties( rocfft-test PROPERTIES
  CXX_STANDARD_REQUIRED ON
)

# Output directory for the test binaries depends on which scope drives the build.
if( ROCFFT_BUILD_SCOPE )
  set( TESTS_OUT_DIR "/../staging" )
elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
  set( TESTS_OUT_DIR "/../bin" )
else()
  set( TESTS_OUT_DIR "/bin" )
endif()
string( CONCAT TESTS_OUT_DIR "${PROJECT_BINARY_DIR}" ${TESTS_OUT_DIR} )

set_target_properties(rocfft-test
                      PROPERTIES
                      RUNTIME_OUTPUT_DIRECTORY
                      ${TESTS_OUT_DIR})
set_target_properties(rtc_helper_crash
                      PROPERTIES
                      RUNTIME_OUTPUT_DIRECTORY
                      ${TESTS_OUT_DIR})

rocm_install(TARGETS rocfft-test rtc_helper_crash COMPONENT tests)

if (WIN32)
  # Ensure tests run with HIP DLLs and not anything the driver owns
  # in system32. Libraries like amdhip64.dll are also in the HIP
  # runtime, and we need run with those. But the only way to make a
  # same-named DLL override something in system32 is to have it next
  # to the executable. So copy them in.
  file( GLOB third_party_dlls
    LIST_DIRECTORIES OFF
    CONFIGURE_DEPENDS
    ${HIP_DIR}/bin/*.dll
    C:/Windows/System32/libomp140*.dll
  )
  foreach( file_i ${third_party_dlls})
    add_custom_command( TARGET rocfft-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} $ )
  endforeach( file_i )
endif()

if( ROCFFT_MPI_ENABLE )
  # normal and dynamic-loading MPI worker processes
  foreach(worker rocfft_mpi_worker dyna_rocfft_mpi_worker)
    add_executable( ${worker} rocfft_mpi_worker.cpp )
    if( BUILD_CLIENTS_TESTS_OPENMP )
      set_target_properties( ${worker} PROPERTIES
        BUILD_RPATH "${BUILD_RPATH}"
      )
      set_target_properties( ${worker} PROPERTIES
        INSTALL_RPATH "${INSTALL_RPATH}"
      )
    endif()
    if( BUILD_FFTW OR NOT FFTW_FOUND )
      add_dependencies( ${worker} fftw_double fftw_single )
    endif()
    target_include_directories( ${worker}
      PRIVATE
      ${CMAKE_BINARY_DIR}/include
      ${CMAKE_CURRENT_SOURCE_DIR}/../../library/include/
      ${MPI_C_INCLUDE_PATH}
      $
    )
    target_compile_options( ${worker} PRIVATE ${WARNING_FLAGS} )

    if ( ROCFFT_CRAY_MPI_ENABLE )
      target_link_libraries( ${worker}
        OpenMP::OpenMP_CXX
        hip::hiprand
        hip::device
        MPI::MPI_CXX
        ${FFTW_LIBRARIES}
        "mpi_gtl_hsa"
      )
      get_filename_component( MPI_LIBDIR ${MPI_LIBRARY} DIRECTORY )
      target_link_directories( ${worker}
        PRIVATE
        ${MPI_LIBDIR}/../../../../gtl/lib )
    else()
      target_link_libraries( ${worker}
        OpenMP::OpenMP_CXX
        hip::hiprand
        hip::device
        MPI::MPI_CXX
        ${FFTW_LIBRARIES}
      )
    endif()

    set_target_properties(${worker}
      PROPERTIES
      RUNTIME_OUTPUT_DIRECTORY
      ${TESTS_OUT_DIR})
    rocm_install(TARGETS ${worker} COMPONENT tests)
  endforeach()

  # link normal MPI worker against rocFFT
  target_link_libraries( rocfft_mpi_worker
    roc::rocfft
  )
  # dyna worker only needs to dynamically load libraries
  target_compile_definitions( dyna_rocfft_mpi_worker PRIVATE ROCFFT_DYNA_MPI_WORKER )
  target_link_libraries( dyna_rocfft_mpi_worker
    ${CMAKE_DL_LIBS}
  )
endif()

set(COVERAGE_TEST_OPTIONS "--smoketest;--gtest_filter=-*call*" CACHE STRING "Command line arguments for rocfft-test when generating a code coverage report")

if(BUILD_CODE_COVERAGE)
  # Coverage won't work in a standalone build of the tests, as we can't
  # guarantee the library was built with coverage enabled
  if( NOT TARGET rocfft )
    message( FATAL_ERROR "BUILD_CODE_COVERAGE requires building from the root of rocFFT" )
  endif()
  add_custom_target(
    code_cov_tests
    DEPENDS rocfft-test rocfft_rtc_helper
    COMMAND ${CMAKE_COMMAND} -E rm -rf ./coverage-report
    COMMAND ${CMAKE_COMMAND} -E make_directory ./coverage-report/profraw
    COMMAND ${CMAKE_COMMAND} -E env LLVM_PROFILE_FILE="./coverage-report/profraw/rocfft-coverage_%p.profraw" GTEST_LISTENER=NO_PASS_LINE_IN_LOG $ --precompile=rocfft-test-precompile.db ${COVERAGE_TEST_OPTIONS}
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
  )

  find_program(
    LLVM_PROFDATA
    llvm-profdata
    REQUIRED
    HINTS ${ROCM_PATH}/llvm/bin
    PATHS /opt/rocm/llvm/bin
  )

  find_program(
    LLVM_COV
    llvm-cov
    REQUIRED
    HINTS ${ROCM_PATH}/llvm/bin
    PATHS /opt/rocm/llvm/bin
  )

  add_custom_target(
    coverage
    DEPENDS code_cov_tests
    COMMAND ${LLVM_PROFDATA} merge -sparse ./coverage-report/profraw/rocfft-coverage_*.profraw -o ./coverage-report/rocfft.profdata
    COMMAND ${LLVM_COV} report -object ./library/src/librocfft.so -instr-profile=./coverage-report/rocfft.profdata
    COMMAND ${LLVM_COV} show -object ./library/src/librocfft.so -instr-profile=./coverage-report/rocfft.profdata -format=html -output-dir=coverage-report
    COMMAND ${LLVM_COV} export -object ./library/src/librocfft.so -instr-profile=./coverage-report/rocfft.profdata -format=lcov > ./coverage-report/coverage.info
    WORKING_DIRECTORY ${CMAKE_BINARY_DIR}
  )
endif()
rocFFT-rocm-7.1.0/clients/tests/accuracy_test.cpp000066400000000000000000000624771506652163400217700ustar00rootroot00000000000000// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h"
#include "../../shared/rocfft_complex.h"
#include

// NOTE(review): template parameter/argument lists and one include name are
// missing from this copy of the file; the code tokens below are left exactly
// as found.

// Load callback used by the tests. cbdata points at a test-data object that
// carries the expected buffer base address (`base`) and a scalar (`scalar`).
// Returns input[offset] multiplied by the scalar when `input` is the expected
// base address, otherwise returns input[0].
template __host__ __device__ Tdata load_callback(Tdata* input, size_t offset, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // multiply each element by scalar
    if(input == testdata->base)
        return input[offset] * testdata->scalar;
    else
    {
        // wrong base address passed, return something obviously wrong
        return input[0];
    }
}

// Device-side function pointers to the load callback, one per element type;
// their values are fetched by get_load_callback_host().
__device__ auto load_callback_dev_half = load_callback;
__device__ auto load_callback_dev_complex_half = load_callback>;

__device__ auto load_callback_dev_float = load_callback;
__device__ auto load_callback_dev_complex_float = load_callback>;

__device__ auto load_callback_dev_double = load_callback;
__device__ auto load_callback_dev_complex_double = load_callback>;

// Load callback for the round-trip inverse transform. Same cbdata layout as
// load_callback, but subtracts the scalar from input[offset] instead of
// multiplying. Returns input[0] when the base address does not match.
template __host__ __device__ Tdata load_callback_round_trip_inverse(Tdata* input, size_t offset, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // subtract scalar from each element
    if(input == testdata->base)
        return input[offset] - testdata->scalar;
    else
    {
        // wrong base address passed, return something obviously wrong
        return input[0];
    }
}

// Device-side function pointers to the round-trip-inverse load callback.
__device__ auto load_callback_round_trip_inverse_dev_half = load_callback_round_trip_inverse;
__device__ auto load_callback_round_trip_inverse_dev_complex_half = load_callback_round_trip_inverse>;

__device__ auto load_callback_round_trip_inverse_dev_float = load_callback_round_trip_inverse;
__device__ auto load_callback_round_trip_inverse_dev_complex_float = load_callback_round_trip_inverse>;

__device__ auto load_callback_round_trip_inverse_dev_double = load_callback_round_trip_inverse;
__device__ auto load_callback_round_trip_inverse_dev_complex_double = load_callback_round_trip_inverse>;

// Copies the device function pointer of the load callback matching
// itype/precision (and, when round_trip_inverse is set, the inverse variant)
// to the host and returns it. A failed copy is reported through EXPECT_EQ.
// Returns nullptr for array types not listed (planar).
void* get_load_callback_host(fft_array_type itype, fft_precision precision, bool round_trip_inverse = false)
{
    void* load_callback_host = nullptr;
    switch(itype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        switch(precision)
        {
        case fft_precision_half:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol( &load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_half), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_complex_half), sizeof(void*)), hipSuccess);
            }
            return load_callback_host;
        case fft_precision_single:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol( &load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_float), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_complex_float), sizeof(void*)), hipSuccess);
            }
            return load_callback_host;
        case fft_precision_double:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol( &load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_double), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_complex_double), sizeof(void*)), hipSuccess);
            }
            return load_callback_host;
        }
    }
    // NOTE(review): no break above - a precision value not listed in the
    // inner switch falls through into the real-type case below.
    case fft_array_type_real:
    {
        switch(precision)
        {
        case fft_precision_half:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_half), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_half), sizeof(void*)), hipSuccess);
            }
            return load_callback_host;
        case fft_precision_single:
            if(round_trip_inverse)
            {
                EXPECT_EQ( hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_float), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_float), sizeof(void*)), hipSuccess);
            }
            return load_callback_host;
        case fft_precision_double:
            if(round_trip_inverse)
            {
                EXPECT_EQ( hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_double), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_double), sizeof(void*)), hipSuccess);
            }
            return load_callback_host;
        }
    }
    default:
        // planar is unsupported for now
        return load_callback_host;
    }
}

// Store callback used by the tests. Writes element + scalar to output[offset]
// when `output` is the expected base address stored in cbdata; otherwise
// writes nothing.
template __host__ __device__ static void store_callback(Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // add scalar to each element
    if(output == testdata->base)
    {
        output[offset] = element + testdata->scalar;
    }
    // otherwise, wrong base address passed, just don't write
}

// Device-side function pointers to the store callback, one per element type.
__device__ auto store_callback_dev_half = store_callback;
__device__ auto store_callback_dev_complex_half = store_callback>;

__device__ auto store_callback_dev_float = store_callback;
__device__ auto store_callback_dev_complex_float = store_callback>;

__device__ auto store_callback_dev_double = store_callback;
__device__ auto store_callback_dev_complex_double = store_callback>;

// Store callback for the round-trip inverse transform. Writes
// element / scalar to output[offset] when the base address matches; otherwise
// writes nothing.
template __host__ __device__ static void store_callback_round_trip_inverse( Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem)
{
    auto testdata = static_cast(cbdata);
    // divide each element by scalar
    if(output == testdata->base)
    {
        output[offset] = element / testdata->scalar;
    }
    // otherwise, wrong base address passed, just don't write
}

// Device-side function pointers to the round-trip-inverse store callback.
__device__ auto store_callback_round_trip_inverse_dev_half = store_callback_round_trip_inverse;
__device__ auto store_callback_round_trip_inverse_dev_complex_half = store_callback_round_trip_inverse>;

__device__ auto store_callback_round_trip_inverse_dev_float = store_callback_round_trip_inverse;
__device__ auto store_callback_round_trip_inverse_dev_complex_float = store_callback_round_trip_inverse>;

__device__ auto store_callback_round_trip_inverse_dev_double = store_callback_round_trip_inverse;
__device__ auto store_callback_round_trip_inverse_dev_complex_double = store_callback_round_trip_inverse>;

// Copies the device function pointer of the store callback matching
// otype/precision (and, when round_trip_inverse is set, the inverse variant)
// to the host and returns it. A failed copy is reported through EXPECT_EQ.
// Returns nullptr for array types not listed (planar).
void* get_store_callback_host(fft_array_type otype, fft_precision precision, bool round_trip_inverse = false)
{
    void* store_callback_host = nullptr;
    switch(otype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        switch(precision)
        {
        case fft_precision_half:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol( &store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_half), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_complex_half), sizeof(void*)), hipSuccess);
            }
            return store_callback_host;
        case fft_precision_single:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol( &store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_float), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_complex_float), sizeof(void*)), hipSuccess);
            }
            return store_callback_host;
        case fft_precision_double:
            if(round_trip_inverse)
            {
                EXPECT_EQ(hipMemcpyFromSymbol( &store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_double), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_complex_double), sizeof(void*)), hipSuccess);
            }
            return store_callback_host;
        }
    }
    // NOTE(review): no break above - a precision value not listed in the
    // inner switch falls through into the real-type case below.
    case fft_array_type_real:
    {
        switch(precision)
        {
        case fft_precision_half:
            if(round_trip_inverse)
            {
                EXPECT_EQ( hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_half), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_half), sizeof(void*)), hipSuccess);
            }
            return store_callback_host;
        case fft_precision_single:
            if(round_trip_inverse)
            {
                EXPECT_EQ( hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_float), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_float), sizeof(void*)), hipSuccess);
            }
            return store_callback_host;
        case fft_precision_double:
            if(round_trip_inverse)
            {
                EXPECT_EQ( hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_double), sizeof(void*)), hipSuccess);
            }
            else
            {
                EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_double), sizeof(void*)), hipSuccess);
            }
            return store_callback_host;
        }
    }
    default:
        // planar is unsupported for now
        return store_callback_host;
    }
}

// Apply store callback if necessary.
// Host-side emulation on the buffers in `output`: each element is multiplied
// by params.scale_factor (when it is not 1.0) and, when params.run_callbacks
// is set, passed through store_callback with params.store_cb_scalar. Does
// nothing when neither applies. Interleaved and real types touch only
// output.front(); planar types scale every buffer and never run the callback.
// Aborts on any other array type.
void apply_store_callback(const fft_params& params, std::vector& output)
{
    if(!params.run_callbacks && params.scale_factor == 1.0)
        return;

    callback_test_data cbdata;
    cbdata.scalar = params.store_cb_scalar;
    cbdata.base = output.front().data();

    switch(params.otype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        switch(params.precision)
        {
        case fft_precision_half:
        {
            // element count is derived from the buffer's byte size
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast*>(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                // element is a reference: scaling updates the buffer in place
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast*>(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast*>(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        }
    }
    break;
    case fft_array_type_complex_planar:
    case fft_array_type_hermitian_planar:
    {
        // planar wouldn't run callbacks, but we could still want scaling
        switch(params.precision)
        {
        case fft_precision_half:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            for(auto& buf : output)
            {
                const size_t num_elems = buf.size() / elem_size;

                auto output_begin = reinterpret_cast*>(buf.data());
                for(size_t i = 0; i < num_elems; ++i)
                {
                    auto& element = output_begin[i];
                    if(params.scale_factor != 1.0)
                        element = element * params.scale_factor;
                }
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            for(auto& buf : output)
            {
                const size_t num_elems = buf.size() / elem_size;

                auto output_begin = reinterpret_cast*>(buf.data());
                for(size_t i = 0; i < num_elems; ++i)
                {
                    auto& element = output_begin[i];
                    if(params.scale_factor != 1.0)
                        element = element * params.scale_factor;
                }
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            for(auto& buf : output)
            {
                const size_t num_elems = buf.size() / elem_size;

                auto output_begin = reinterpret_cast*>(buf.data());
                for(size_t i = 0; i < num_elems; ++i)
                {
                    auto& element = output_begin[i];
                    if(params.scale_factor != 1.0)
                        element = element * params.scale_factor;
                }
            }
            break;
        }
        }
    }
    break;
    case fft_array_type_real:
    {
        switch(params.precision)
        {
        case fft_precision_half:
        {
            const size_t elem_size = sizeof(rocfft_fp16);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(float);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(double);
            const size_t num_elems = output.front().size() / elem_size;

            auto output_begin = reinterpret_cast(output.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                auto& element = output_begin[i];
                if(params.scale_factor != 1.0)
                    element = element * params.scale_factor;
                if(params.run_callbacks)
                    store_callback(output_begin, i, element, &cbdata, nullptr);
            }
            break;
        }
        }
    }
    break;
    default:
        // this is FFTW data which should always be interleaved (if complex)
        abort();
    }
}

// Apply load callback if necessary.
// Host-side emulation: when params.run_callbacks is set, every element of
// input.front() is replaced in place by the result of load_callback with
// params.load_cb_scalar. Aborts on array types other than interleaved
// complex/hermitian and real.
void apply_load_callback(const fft_params& params, std::vector& input)
{
    if(!params.run_callbacks)
        return;
    // we're applying callbacks to FFTW input/output which we can
    // assume is contiguous and non-planar

    callback_test_data cbdata;
    cbdata.scalar = params.load_cb_scalar;
    cbdata.base = input.front().data();

    switch(params.itype)
    {
    case fft_array_type_complex_interleaved:
    case fft_array_type_hermitian_interleaved:
    {
        switch(params.precision)
        {
        case fft_precision_half:
        {
            // element count is derived from the buffer's byte size
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast*>(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast*>(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(rocfft_complex);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast*>(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        }
    }
    break;
    case fft_array_type_real:
    {
        switch(params.precision)
        {
        case fft_precision_half:
        {
            const size_t elem_size = sizeof(rocfft_fp16);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_single:
        {
            const size_t elem_size = sizeof(float);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        case fft_precision_double:
        {
            const size_t elem_size = sizeof(double);
            const size_t num_elems = input.front().size() / elem_size;

            auto input_begin = reinterpret_cast(input.front().data());
            for(size_t i = 0; i < num_elems; ++i)
            {
                input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr);
            }
            break;
        }
        }
    }
    break;
    default:
        // this is FFTW data which should always be interleaved (if complex)
        abort();
    }
}
rocFFT-rocm-7.1.0/clients/tests/accuracy_test_1D.cpp000066400000000000000000000610701506652163400223000ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" #include "accuracy_tests_range.h" using ::testing::ValuesIn; INSTANTIATE_TEST_SUITE_P(pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_1D}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_1D}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow3_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), 
accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow5_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({radX_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({radX_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({prime_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, 
place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({mix_range_1D}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); // small 1D sizes just need to make sure our factorization isn't // completely broken, so we just check simple C2C outplace interleaved INSTANTIATE_TEST_SUITE_P( small_1D, accuracy_test, ::testing::ValuesIn(param_generator_base( test_prob, {fft_transform_type_complex_forward, fft_transform_type_real_forward}, generate_lengths({small_1D_sizes()}), {fft_precision_single}, {1}, [](fft_transform_type t, const std::vector& place_range, const bool planar) { if(t == fft_transform_type_complex_forward) return std::vector{ std::make_tuple(t, place_range[0], fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)}; else return std::vector{std::make_tuple( t, place_range[0], fft_array_type_real, fft_array_type_hermitian_interleaved)}; }, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, {fft_placement_inplace}, true)), accuracy_test::TestName); // NB: // We have known non-unit strides issues for 1D: // - C2C middle size(for instance, single precision, 8192) // - C2C large size(for instance, single precision, 524288) // We need to fix non-unit strides first, and then address non-unit strides + batch tests. // Then check these problems of R2C and C2R. After that, we could open arbitrary permutations in the // main tests. // // The below test covers non-unit strides, pow of 2, middle sizes, which has SBCC/SBRC kernels // invloved. 
INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex, accuracy_test, ::testing::ValuesIn(param_generator_complex(test_prob, generate_lengths({pow2_range_for_stride_1D}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex_half, accuracy_test, ::testing::ValuesIn(param_generator_complex(test_prob, generate_lengths({pow2_range_for_stride_half_1D}), {fft_precision_half}, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real, accuracy_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride_1D}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real_half, accuracy_test, ::testing::ValuesIn(param_generator_real(test_prob, generate_lengths({pow2_range_for_stride_half_1D}), {fft_precision_half}, batch_range_1D, stride_range_for_pow2_1D, stride_range_for_pow2_1D, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); // Create an array parameters for strided 2D batched transforms. 
inline auto param_generator_complex_1d_batched_2d(const double base_prob, const std::vector>& v_lengths, const std::vector& precision_range, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range) { std::vector params; for(auto& transform_type : trans_type_range_complex) { for(const auto& lengths : v_lengths) { // try to ensure that we are given literal lengths, not // something to be passed to generate_lengths if(lengths.empty() || lengths.size() > 3) { assert(false); continue; } for(const auto precision : precision_range) { for(const auto& types : generate_types(transform_type, place_range, true)) { for(const auto& ioffset : ioffset_range) { for(const auto& ooffset : ooffset_range) { fft_params param; param.length = lengths; param.istride = lengths; param.ostride = lengths; param.nbatch = lengths[0]; param.precision = precision; param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.idist = 1; param.odist = 1; param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.ioffset = ioffset; param.ooffset = ooffset; param.validate(); const double roll = hash_prob(random_seed, param.token()); const double run_prob = base_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? complex_interleaved_prob_factor : 1.0) * (param.is_real() ? 
real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } if(param.valid(0)) { params.push_back(param); } } } } } } } return params; } INSTANTIATE_TEST_SUITE_P( pow2_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({pow2_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow3_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({pow3_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow5_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({pow5_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( prime_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(test_prob, generate_lengths({prime_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); rocFFT-rocm-7.1.0/clients/tests/accuracy_test_2D.cpp000066400000000000000000000332341506652163400223020ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
// NOTE(review): the names of the four system headers below are missing from
// this text (angle-bracket content was lost); restore them from upstream.
#include
#include
#include
#include

#include "../../shared/accuracy_test.h"
#include "../../shared/fftw_transform.h"
#include "../../shared/params_gen.h"
#include "../../shared/rocfft_against_fftw.h"
#include "accuracy_tests_range.h"

using ::testing::ValuesIn;

// 2D accuracy-test suites.
//
// Each suite passes two per-dimension length ranges to generate_lengths().
// Suites named DISABLED_offset_* pass ioffset_range / ooffset_range instead
// of the zero offset ranges; GoogleTest does not run DISABLED_ suites unless
// explicitly asked to.

INSTANTIATE_TEST_SUITE_P(pow2_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow2_range_2D, pow2_range_2D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         accuracy_test::TestName);

// Half precision: the second dimension uses an explicit literal length list.
INSTANTIATE_TEST_SUITE_P(pow2_2D_half,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow2_range_half_2D, {2, 4, 8, 16, 32}}),
                             {fft_precision_half},
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow2_range_2D, pow2_range_2D}),
                             precision_range_full,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range,
                             ooffset_range,
                             place_range,
                             true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow3_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow3_range_2D, pow3_range_2D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow3_range_2D, pow3_range_2D}),
                             precision_range_full,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range,
                             ooffset_range,
                             place_range,
                             true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow5_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow5_range_2D, pow5_range_2D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow5_range_2D, pow5_range_2D}),
                             precision_range_full,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range,
                             ooffset_range,
                             place_range,
                             true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(prime_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({prime_range_2D, prime_range_2D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({prime_range_2D, prime_range_2D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range,
                             ooffset_range,
                             place_range,
                             true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(mix_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({mix_range_2D, mix_range_2D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({mix_range_2D, mix_range_2D}),
                             precision_range_full,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range,
                             ooffset_range,
                             place_range,
                             true)),
                         accuracy_test::TestName);

// test length-1 on one dimension against a variety of non-1 lengths
INSTANTIATE_TEST_SUITE_P(len1_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({{1}, {4, 8, 8192, 3, 27, 7, 11, 5000, 8000}}),
                             precision_range_full,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         accuracy_test::TestName);

// length-1 on the other dimension
INSTANTIATE_TEST_SUITE_P(len1_swap_2D,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({{4, 8, 8192, 3, 27, 7, 11, 5000, 8000}, {1}}),
                             precision_range_full,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         accuracy_test::TestName);
rocFFT-rocm-7.1.0/clients/tests/accuracy_test_3D.cpp000066400000000000000000000317161506652163400223060ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include #include #include #include #include "../../shared/accuracy_test.h" #include "../../shared/fftw_transform.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" #include "accuracy_tests_range.h" using ::testing::ValuesIn; INSTANTIATE_TEST_SUITE_P(pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow2_range_3D, pow2_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_3D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({pow2_range_half_3D, pow2_range_half_3D, pow2_range_half_3D}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow2_range_3D, pow2_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow3_range_3D, pow3_range_3D, pow3_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow3_range_3D, pow3_range_3D, pow3_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow5_range_3D, pow5_range_3D, 
pow5_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow5_range_3D, pow5_range_3D, pow5_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({prime_range_3D, prime_range_3D, prime_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({prime_range_3D, prime_range_3D, prime_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow3_range_3D, prime_range_3D}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({pow2_range_3D, pow3_range_3D, prime_range_3D}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(sbrc_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({sbrc_range_3D, sbrc_range_3D, sbrc_range_3D}), precision_range_sp_dp, sbrc_batch_range_3D, stride_range, stride_range, 
ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( inner_batch_3D, accuracy_test, ::testing::ValuesIn(param_generator( test_prob, generate_lengths({inner_batch_3D_range, inner_batch_3D_range, inner_batch_3D_range}), precision_range_sp_dp, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( inner_batch_3D_half, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, generate_lengths({inner_batch_3D_range_half, inner_batch_3D_range_half, inner_batch_3D_range_half}), {fft_precision_half}, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(partial_pass_3D, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, partial_pass_adhoc_3D, precision_range_sp_dp, partial_pass_batch_range_3D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( #ifdef WIN32 DISABLED_partial_pass_3D_callback, #else partial_pass_3D_callback, #endif accuracy_test, ::testing::ValuesIn(param_generator(test_prob, partial_pass_adhoc_3D, precision_range_sp_dp, partial_pass_batch_range_3D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, false, true)), accuracy_test::TestName); rocFFT-rocm-7.1.0/clients/tests/accuracy_test_adhoc.cpp000066400000000000000000000371361506652163400231200ustar00rootroot00000000000000// Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/accuracy_test.h" #include "../../shared/params_gen.h" std::vector> adhoc_sizes = { // sizes that exercise L1D_TRTRT subplan of 2D_RTRT or 3D_TRTRTR {1, 220}, {1, 330}, {81, 220, 36}, // L1D_CC subplan of 3D_TRTRTR {4, 4, 8192}, // SBRC 192 with special param {192, 192, 192}, {192, 84, 84}, // Failure with build_CS_3D_BLOCK_RC {680, 128, 128}, // Large 1D primes that fall above the block threshold (length 262144). // Bluestein requires two forwards and one inverse FFTs, and the plan // for these sizes breakdown these FFTs either as: // L1D_TRTRT (T + STOCKHAM + T + STOCKHAM + T) for lengthBlue <= 4096^2 // or // L1D_TRTRT (T + L1D_CC + STOCKHAM_BL_CC + STOCHMAM_BL_RC + T + STOCKHAM + T) // for lengthBlue > 4096^2. 
{196597}, {25165813}, // 2D single-kernel bluestein size combined with multi-kernel bluestein {19, 2053}, // TILE_UNALIGNED type of SBRC 3D ERC {98, 98, 98}, // 3D_BLOCK_CR {336, 336, 56}, }; const static std::vector> stride_range = {{1}}; static std::vector> ioffset_range_zero = {{0, 0}}; static std::vector> ooffset_range_zero = {{0, 0}}; static std::vector> ioffset_range = {{0, 0}, {1, 1}}; static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P(adhoc, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, adhoc_sizes, precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_adhoc, accuracy_test, ::testing::ValuesIn(param_generator(test_prob, adhoc_sizes, precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); // Test that dist is ignored for batch-1 transforms. Normally, // in-place transforms require same dist, but for batch-1 dist isn't // used for anything and differing dist should be allowed. inline auto param_permissive_iodist() { std::vector> lengths = adhoc_sizes; lengths.push_back({4}); std::vector params; for(const auto precision : precision_range_sp_dp) { for(const auto trans_type : trans_type_range) { for(const auto& types : generate_types(trans_type, place_range, true)) { if(std::get<1>(types) != fft_placement_inplace) continue; for(const auto& len : lengths) { fft_params param; param.length = len; param.precision = precision; param.idist = 2; param.odist = 3; param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.validate(); const double roll = hash_prob(random_seed, param.token()); const double run_prob = test_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? 
complex_interleaved_prob_factor : 1.0) * (param.is_real() ? real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } if(param.valid(0)) { params.push_back(param); } } } } } return params; } INSTANTIATE_TEST_SUITE_P(adhoc_dist, accuracy_test, ::testing::ValuesIn(param_permissive_iodist()), accuracy_test::TestName); inline auto param_adhoc_colmajor() { // generate basic FFTs of adhoc sizes auto params = param_generator(test_prob, adhoc_sizes, {fft_precision_single}, {2}, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, {fft_placement_notinplace}, false); // remove any params that are: // - 1D (not enough dims to swap) // - real-complex 2D (we only get to play with higher dims, so // again not enough dims to swap) params.erase(std::remove_if(params.begin(), params.end(), [](const fft_params& param) { if(param.length.size() == 1) return true; if(param.length.size() == 2) { if(param.transform_type == fft_transform_type_real_forward || param.transform_type == fft_transform_type_real_inverse) return true; } return false; }), params.end()); // reverse length/stride order on remaining params to make them // col-major std::for_each(params.begin(), params.end(), [](fft_params& param) { size_t start_dim = 0; // for real-complex we can't touch the fastest dim if(param.transform_type == fft_transform_type_real_forward || param.transform_type == fft_transform_type_real_inverse) ++start_dim; std::reverse(param.length.rbegin() + start_dim, param.length.rend()); std::reverse(param.istride.rbegin() + start_dim, param.istride.rend()); std::reverse(param.ostride.rbegin() + start_dim, param.ostride.rend()); }); return params; } INSTANTIATE_TEST_SUITE_P(adhoc_colmajor, accuracy_test, ::testing::ValuesIn(param_adhoc_colmajor()), accuracy_test::TestName); inline auto param_adhoc_stride() { std::vector params; for(const auto precision : precision_range_full) { for(const 
auto& types : generate_types(fft_transform_type_complex_forward, {fft_placement_inplace, fft_placement_notinplace}, true)) { // 2D with non-contiguous strides and dist fft_params param; param.length = {2, 35}; param.precision = precision; param.idist = 200; param.odist = 200; param.transform_type = fft_transform_type_complex_forward; param.nbatch = 2; param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.istride = {90, 2}; param.ostride = {90, 2}; params.push_back(param); } // test C2R/R2C with non-contiguous higher strides and dist - we // want unit stride for length0 so we do the even-length optimization for(const auto& types : generate_types(fft_transform_type_real_forward, {fft_placement_notinplace}, true)) { fft_params param; param.length = {4, 4, 4}; param.precision = precision; param.idist = 0; param.odist = 0; param.transform_type = fft_transform_type_real_forward; param.nbatch = 2; param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.istride = {16, 4, 1}; param.ostride = {16, 4, 1}; param.validate(); { const double roll = hash_prob(random_seed, param.token()); const double run_prob = test_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? complex_interleaved_prob_factor : 1.0) * (param.is_real() ? 
real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } else { if(param.valid(0)) { params.push_back(param); } } } param.length = {2, 2, 2}; param.precision = precision; param.idist = 0; param.odist = 0; param.transform_type = fft_transform_type_real_forward; param.nbatch = 2; param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.istride = {20, 6, 1}; param.ostride = {20, 6, 1}; param.validate(); { const double roll = hash_prob(random_seed, param.token()); const double run_prob = test_prob * (param.is_planar() ? complex_planar_prob_factor : 1.0) * (param.is_interleaved() ? complex_interleaved_prob_factor : 1.0) * (param.is_real() ? real_prob_factor : 1.0); if(roll > run_prob) { if(verbose > 4) { std::cout << "Test skipped (probability " << run_prob << " > " << roll << ")\n"; } continue; } else { if(param.valid(0)) { params.push_back(param); } } } } } return params; } INSTANTIATE_TEST_SUITE_P(adhoc_stride, accuracy_test, ::testing::ValuesIn(param_adhoc_stride()), accuracy_test::TestName); const auto adhoc_tokens = { // clang-format off "complex_forward_len_4_4_4_single_op_batch_2_istride_16_4_1_CI_ostride_4_16_1_CI_idist_64_odist_64_ioffset_0_0_ooffset_0_0", "complex_forward_len_512_64_single_ip_batch_3_istride_192_3_CI_ostride_192_3_CI_idist_1_odist_1_ioffset_0_0_ooffset_0_0", "real_forward_len_1024_1024_1024_single_op_batch_1_istride_1048576_1024_1_R_ostride_525312_513_1_HI_idist_1073741824_odist_537919488_ioffset_0_0_ooffset_0_0", "complex_forward_len_6144_single_ip_batch_34_istride_35_CI_ostride_35_CI_idist_1_odist_1_ioffset_0_0_ooffset_0_0", "real_forward_len_8192_single_ip_batch_65537_istride_1_R_ostride_1_HI_idist_8194_odist_4097_ioffset_0_0_ooffset_0_0", "real_forward_len_520_single_op_batch_270400_istride_1_R_ostride_1_HI_idist_520_odist_261_ioffset_0_0_ooffset_0_0", 
"real_forward_len_630_single_op_batch_396900_istride_1_R_ostride_1_HI_idist_630_odist_316_ioffset_0_0_ooffset_0_0", "real_forward_len_660_single_op_batch_435600_istride_1_R_ostride_1_HI_idist_660_odist_331_ioffset_0_0_ooffset_0_0", "real_forward_len_700_single_op_batch_490000_istride_1_R_ostride_1_HI_idist_700_odist_351_ioffset_0_0_ooffset_0_0", "real_forward_len_728_single_op_batch_529984_istride_1_R_ostride_1_HI_idist_728_odist_365_ioffset_0_0_ooffset_0_0", "real_forward_len_968_single_op_batch_937024_istride_1_R_ostride_1_HI_idist_968_odist_485_ioffset_0_0_ooffset_0_0", "real_forward_len_1020_single_op_batch_1040400_istride_1_R_ostride_1_HI_idist_1020_odist_511_ioffset_0_0_ooffset_0_0", "real_forward_len_378_42_single_ip_batch_66000_istride_44_1_R_ostride_22_1_HI_idist_16632_odist_8316_ioffset_0_0_ooffset_0_0", "real_forward_len_527_25_single_ip_batch_67500_istride_26_1_R_ostride_13_1_HI_idist_13702_odist_6851_ioffset_0_0_ooffset_0_0", "real_forward_len_630_38_single_ip_batch_65540_istride_40_1_R_ostride_20_1_HI_idist_25200_odist_12600_ioffset_0_0_ooffset_0_0", // clang-format on }; INSTANTIATE_TEST_SUITE_P(adhoc_token, accuracy_test, ::testing::ValuesIn(param_generator_token(test_prob, adhoc_tokens)), accuracy_test::TestName); rocFFT-rocm-7.1.0/clients/tests/accuracy_test_callback.cpp000066400000000000000000000135061506652163400235710ustar00rootroot00000000000000// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h"
#include "../../shared/params_gen.h"

// Problem lengths shared by the callback suites and the scaling suite.
// Each entry is one problem: 1, 2 or 3 lengths.
std::vector<std::vector<size_t>> callback_sizes = {
    // some single kernel sizes
    {4},
    {16},
    {81},
    {100},

    // L1D_TRTRT sizes
    {220},
    {330},
    {1344},

    // L1D_CC sizes
    {8192},
    {10000},

    // prime
    {23},
    {29},

    // 2D_SINGLE sizes, small and big
    {16, 8},
    {32, 32},
    {9, 81},
    {27, 81},
    {81, 27},
    {256, 9},
    {9, 256},
    {125, 32},
    {32, 125},

    // 2D_RTRT
    {20, 40},
    {81, 81},

    // 2D_RC
    {128, 64},
    {128, 256},

    // more complicated children of 2D_RTRT (L1D_TRTRT, L1D_CC, prime)
    {4, 220},
    {220, 4},
    {4, 8192},
    {8192, 4},
    {4, 23},
    {23, 4},

    // 3D_TRTRTR, with complicated children
    {63, 5, 6},
    {6, 5, 63},
    {23, 5, 6},
    {6, 5, 23},
    {70, 5, 6},
    {6, 5, 70},
    {8192, 5, 6},
    {6, 5, 8192},

    // 3D_RTRT, with complicated children
    {23, 4, 4},
    {4, 4, 23},
    {70, 4, 4},
    {4, 4, 70},
    {8192, 4, 4},
    {4, 4, 8192},

    // 3D odd lengths
    {27, 27, 27},

    // 3D_BLOCK_RC
    {64, 64, 64},
};

// Unit stride only.
const static std::vector<std::vector<size_t>> stride_range = {{1}};

// Offset ranges: the "_zero" variants contain only the {0, 0} offset pair,
// the others additionally contain {1, 1}.
const static std::vector<std::vector<size_t>> ioffset_range_zero = {{0, 0}};
const static std::vector<std::vector<size_t>> ooffset_range_zero = {{0, 0}};

const static std::vector<std::vector<size_t>> ioffset_range = {{0, 0}, {1, 1}};
const static std::vector<std::vector<size_t>> ooffset_range = {{0, 0}, {1, 1}};

// Transform types passed to param_generator_base for the callback suites.
auto forward_transform_types
    = {fft_transform_type_complex_forward, fft_transform_type_real_forward};

// Callback suite with zero offsets.  On WIN32 the suite name carries the
// DISABLED_ prefix.
INSTANTIATE_TEST_SUITE_P(
#ifdef WIN32
    DISABLED_callback_no_offset,
#else
    callback,
#endif
    accuracy_test,
    ::testing::ValuesIn(param_generator_base(test_prob,
                                             forward_transform_types,
                                             callback_sizes,
                                             precision_range_sp_dp,
                                             batch_range,
                                             generate_types,
                                             stride_range,
                                             stride_range,
                                             ioffset_range_zero,
                                             ooffset_range_zero,
                                             place_range,
                                             false,
                                             true)),
    accuracy_test::TestName);

// Same suite with the non-zero offset ranges; always carries the DISABLED_
// prefix.
INSTANTIATE_TEST_SUITE_P(DISABLED_callback,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_base(test_prob,
                                                                  forward_transform_types,
                                                                  callback_sizes,
                                                                  precision_range_sp_dp,
                                                                  batch_range,
                                                                  generate_types,
                                                                  stride_range,
                                                                  stride_range,
                                                                  ioffset_range,
                                                                  ooffset_range,
                                                                  place_range,
                                                                  false,
                                                                  true)),
                         accuracy_test::TestName);

// One of the obvious use cases for callbacks is to implement result
// scaling manually, so use the same sizes to test rocFFT's own
// result scaling feature.
//
// v_lengths: problem lengths to generate parameters for.
// Returns the parameters produced by param_generator for those lengths,
// each with scale_factor set to 7.23.
inline auto param_generator_scaling(const std::vector<std::vector<size_t>>& v_lengths)
{
    // Use the lengths supplied by the caller rather than reaching for the
    // global callback_sizes, so the argument is actually honoured.
    auto params = param_generator(test_prob,
                                  v_lengths,
                                  precision_range_sp_dp,
                                  batch_range,
                                  stride_range,
                                  stride_range,
                                  ioffset_range_zero,
                                  ooffset_range_zero,
                                  place_range,
                                  true);

    for(auto& param : params)
        param.scale_factor = 7.23;

    return params;
}

INSTANTIATE_TEST_SUITE_P(scaling,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_scaling(callback_sizes)),
                         accuracy_test::TestName);
#include "../../shared/accuracy_test.h"
#include "../../shared/params_gen.h"

extern bool fftw_compare;

// Build the parameter list for the "checkstride" suite: out-of-place
// problems with non-contiguous layouts, each with check_output_strides
// enabled.  Returns an empty list when FFTW comparison is disabled.
inline auto param_checkstride()
{
    std::vector<fft_params> selected;

    // checkstride requires us to copy data back to the host for
    // checking, which we only do when comparing against FFTW.
    if(!fftw_compare)
        return selected;

    // (length, stride, nbatch, dist) for each problem.  Strides are arranged
    // so there's space either between elements on the fastest dim, or
    // between dims, or both.
    using layout_t = std::tuple<std::vector<size_t>, std::vector<size_t>, size_t, size_t>;
    std::vector<layout_t> layouts = {
        // 1D single kernel non-unit stride
        {{64}, {2}, 2, 140},
        // 1D single kernel unit stride but non-contiguous batch
        {{64}, {1}, 2, 80},
        // 1D odd length (to test odd-length R2C/C2R)
        {{15}, {2}, 2, 40},
        // 1D SBCC+SBRC
        {{8192}, {2}, 2, 17000},
        // 1D TRTRT
        {{24000}, {2}, 2, 50000},
        // 2D_RTRT
        {{20, 30}, {80, 2}, 2, 1700},
        {{40, 30}, {80, 2}, 2, 3600},
        // 2D_RTRT unit stride along fast dim
        {{20, 30}, {40, 1}, 2, 1000},
        {{40, 30}, {40, 1}, 2, 2000},
        // 2D_RC
        {{64, 64}, {130, 2}, 2, 8400},
        // 3D_RC
        {{64, 64, 64}, {8400, 130, 2}, 2, 540000},
        // 3D_RTRTRT
        {{2, 3, 4}, {40, 10, 2}, 2, 100},
        // bigger 3D_RTRTRT
        {{30, 40, 50}, {3000, 60, 1}, 2, 100000},
    };

    // Callback settings to iterate over, in this order.  WIN32 builds only
    // generate the no-callback variant.
#ifdef WIN32
    const bool callback_modes[] = {false};
#else
    const bool callback_modes[] = {true, false};
#endif

    // True for the planar complex / planar hermitian array types.
    const auto is_planar_type = [](const auto array_type) {
        return array_type == fft_array_type_complex_planar
               || array_type == fft_array_type_hermitian_planar;
    };

    for(const auto trans_type : trans_type_range)
    {
        for(const auto& layout : layouts)
        {
            for(const auto precision : precision_range_sp_dp)
            {
                for(const auto& types :
                    generate_types(trans_type, {fft_placement_notinplace}, true))
                {
                    const bool any_planar = is_planar_type(std::get<2>(types))
                                            || is_planar_type(std::get<3>(types));

                    for(const bool use_callback : callback_modes)
                    {
                        // callbacks don't work for planar
                        if(use_callback && any_planar)
                            continue;

                        fft_params param;

                        // problem description from the type tuple
                        param.transform_type = std::get<0>(types);
                        param.placement      = std::get<1>(types);
                        param.itype          = std::get<2>(types);
                        param.otype          = std::get<3>(types);
                        param.precision      = precision;

                        // identical layout on input and output
                        param.length  = std::get<0>(layout);
                        param.istride = std::get<1>(layout);
                        param.ostride = std::get<1>(layout);
                        param.nbatch  = std::get<2>(layout);
                        param.idist   = std::get<3>(layout);
                        param.odist   = std::get<3>(layout);

                        param.run_callbacks        = use_callback;
                        param.check_output_strides = true;

                        param.validate();

                        // Token-keyed roll against a run probability that is
                        // scaled per array/transform kind.
                        const double roll = hash_prob(random_seed, param.token());

                        double run_prob = test_prob;
                        run_prob *= param.is_planar() ? complex_planar_prob_factor : 1.0;
                        run_prob *= param.is_interleaved() ? complex_interleaved_prob_factor : 1.0;
                        run_prob *= param.is_real() ? real_prob_factor : 1.0;

                        if(roll > run_prob)
                        {
                            if(verbose > 4)
                            {
                                std::cout << "Test skipped (probability " << run_prob << " > "
                                          << roll << ")\n";
                            }
                            continue;
                        }

                        if(param.valid(0))
                        {
                            selected.push_back(param);
                        }
                    }
                }
            }
        }
    }

    return selected;
}

INSTANTIATE_TEST_SUITE_P(checkstride,
                         accuracy_test,
                         ::testing::ValuesIn(param_checkstride()),
                         accuracy_test::TestName);
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/accuracy_test.h" #include "../../shared/params_gen.h" #include "accuracy_tests_range.h" const auto emulation_tokens = { // clang-format off "complex_forward_len_4_double_ip_batch_1_istride_1_CI_ostride_1_CI_idist_4_odist_4_ioffset_0_0_ooffset_0_0", "complex_forward_len_4_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_4_odist_4_ioffset_0_0_ooffset_0_0", "complex_forward_len_8_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_8_odist_8_ioffset_0_0_ooffset_0_0", "complex_forward_len_16_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_16_odist_16_ioffset_0_0_ooffset_0_0", "complex_forward_len_32_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_32_odist_32_ioffset_0_0_ooffset_0_0", "complex_forward_len_64_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_64_odist_64_ioffset_0_0_ooffset_0_0", "complex_forward_len_128_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_128_odist_128_ioffset_0_0_ooffset_0_0", "complex_forward_len_27_double_ip_batch_1_istride_1_CI_ostride_1_CI_idist_27_odist_27_ioffset_0_0_ooffset_0_0", "complex_forward_len_27_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_27_odist_27_ioffset_0_0_ooffset_0_0", "complex_forward_len_27_27_double_ip_batch_1_istride_27_1_CI_ostride_27_1_CI_idist_729_odist_729_ioffset_0_0_ooffset_0_0", "complex_forward_len_27_27_single_ip_batch_1_istride_27_1_CI_ostride_27_1_CI_idist_729_odist_729_ioffset_0_0_ooffset_0_0", "complex_forward_len_125_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_125_odist_125_ioffset_0_0_ooffset_0_0", "complex_forward_len_125_125_single_ip_batch_1_istride_125_1_CI_ostride_125_1_CI_idist_15625_odist_15625_ioffset_0_0_ooffset_0_0", 
"complex_forward_len_121_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_121_odist_121_ioffset_0_0_ooffset_0_0", "complex_forward_len_121_121_single_ip_batch_1_istride_121_1_CI_ostride_121_1_CI_idist_14641_odist_14641_ioffset_0_0_ooffset_0_0", "complex_forward_len_216_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_216_odist_216_ioffset_0_0_ooffset_0_0", "complex_forward_len_10000_double_ip_batch_1_istride_1_CI_ostride_1_CI_idist_10000_odist_10000_ioffset_0_0_ooffset_0_0", "complex_forward_len_128_50_128_single_ip_batch_1_istride_6400_128_1_CI_ostride_6400_128_1_CI_idist_819200_odist_819200_ioffset_0_0_ooffset_0_0", "real_forward_len_16_256_256_single_op_batch_2_istride_65536_256_1_R_ostride_33024_129_1_HI_idist", "real_forward_len_256_128_256_single_op_batch_1_istride_32768_256_1_R_ostride_16512_129_1_HI_idist" // clang-format on }; INSTANTIATE_TEST_SUITE_P(emulation_token, accuracy_test, ::testing::ValuesIn(param_generator_token(emulation_prob, emulation_tokens)), accuracy_test::TestName); const static std::vector emulation_range_1D = {2, 3, 5, 16, 17, 29, 32, 64, 75, 128, 200, 256, 288, 298}; const static std::vector emulation_range_2D = {2, 3, 5, 16, 29, 17, 64, 76, 96, 112, 128, 150, 315}; const static std::vector emulation_range_3D = {2, 3, 5, 16, 29, 17, 32, 64, 128, 256}; INSTANTIATE_TEST_SUITE_P(emulation_1D, accuracy_test, ::testing::ValuesIn(param_generator(emulation_prob, generate_lengths({emulation_range_1D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(emulation_2D, accuracy_test, ::testing::ValuesIn(param_generator(emulation_prob, generate_lengths({emulation_range_2D, emulation_range_2D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(emulation_3D, accuracy_test, 
::testing::ValuesIn(param_generator(emulation_prob, generate_lengths({emulation_range_3D, emulation_range_3D, emulation_range_3D}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); rocFFT-rocm-7.1.0/clients/tests/accuracy_tests_range.h000066400000000000000000000215211506652163400227550ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ACCURACY_TESTS_RANGE_H
#define ACCURACY_TESTS_RANGE_H

// Everything this header uses from the standard library is included here so
// the header does not depend on what its includer happened to pull in.
#include <algorithm>
#include <cstddef>
#include <iterator>
#include <vector>

const static std::vector<std::vector<size_t>> stride_range = {{1}};

const static std::vector<std::vector<size_t>> ioffset_range_zero = {{0, 0}};
const static std::vector<std::vector<size_t>> ooffset_range_zero = {{0, 0}};

const static std::vector<std::vector<size_t>> ioffset_range = {{0, 0}, {1, 1}};
const static std::vector<std::vector<size_t>> ooffset_range = {{0, 0}, {1, 1}};

//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// 1D test problems
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------

// TODO: handle special case where length=2 for real/complex transforms.
const static std::vector<size_t> pow2_range_1D
    = {2,        4,        8,        16,       32,        64,        128,       256,
       512,      1024,     2048,     4096,     8192,      16384,     32768,     65536,
       131072,   262144,   524288,   1048576,  2097152,   4194304,   8388608,   16777216,
       33554432, 67108864, 134217728, 268435456, 536870912, 1073741824};

const static std::vector<size_t> pow2_range_half_1D
    = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536};

const static std::vector<size_t> pow3_range_1D = {3,
                                                  9,
                                                  27,
                                                  81,
                                                  243,
                                                  729,
                                                  2187,
                                                  6561,
                                                  19683,
                                                  59049,
                                                  177147,
                                                  531441,
                                                  1594323,
                                                  4782969,
                                                  14348907,
                                                  43046721,
                                                  129140163,
                                                  387420489};

const static std::vector<size_t> pow5_range_1D
    = {5, 25, 125, 625, 3125, 15625, 78125, 390625, 1953125, 9765625, 48828125, 244140625};

// radix 7, 11, 13 sizes that are either pure powers or sizes people have wanted in the wild
const static std::vector<size_t> radX_range_1D
    = {7, 49, 84, 112, 11, 13, 52, 104, 208, 343, 2401, 16807};

const static std::vector<size_t> mix_range_1D
    = {6,   10,   12,   15,   20,   30,   56,   120,  150,  225,  240,   300,   336,   486,
       600, 900,  1250, 1500, 1875, 2160, 2187, 2250, 2500, 3000, 4000, 12000, 24000, 72000};

const static std::vector<size_t> prime_range_1D
    = {17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97};

// Returns, in ascending order, every length in [2, SMALL_1D_MAX) that does
// not already appear in one of the 1D range tables above.
static std::vector<size_t> small_1D_sizes()
{
    static const size_t SMALL_1D_MAX = 8192;

    // Gather every size the other 1D tables already cover, then sort so
    // membership can be tested with a binary search.
    std::vector<size_t> covered_sizes;
    for(const auto* range : {&pow2_range_1D,
                             &pow3_range_1D,
                             &pow5_range_1D,
                             &radX_range_1D,
                             &mix_range_1D,
                             &prime_range_1D})
    {
        std::copy(range->begin(), range->end(), std::back_inserter(covered_sizes));
    }
    std::sort(covered_sizes.begin(), covered_sizes.end());

    // generate a list of sizes from 2 and up, skipping any sizes that are already covered
    std::vector<size_t> output;
    for(size_t i = 2; i < SMALL_1D_MAX; ++i)
    {
        if(!std::binary_search(covered_sizes.begin(), covered_sizes.end(), i))
        {
            output.push_back(i);
        }
    }
    return output;
}

const static std::vector<size_t> batch_range_1D = {4, 2, 1};

const static std::vector<std::vector<size_t>> stride_range_for_prime_1D
    = {{1}, {2}, {3}, {64}, {65}}; //TODO: this will be merged back to stride_range

const static std::vector<size_t> pow2_range_for_stride_1D      = {4096, 8192, 524288};
const static std::vector<size_t> pow2_range_for_stride_half_1D = {4096, 8192};
const static std::vector<std::vector<size_t>> stride_range_for_pow2_1D = {{2}, {3}};
const static std::vector<size_t> batch_range_for_stride_1D             = {2, 1};

//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// 2D test problems
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------

const static std::vector<size_t> pow2_range_2D
    = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192};

// For the current configuration, half-precision has an FFT size limit of 65536
const static std::vector<size_t> pow2_range_half_2D
    = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048};

const static std::vector<size_t> pow3_range_2D = {3, 9, 27, 81, 243, 729, 2187, 6561};

const static std::vector<size_t> pow5_range_2D = {5, 25, 125, 625, 3125, 15625};

const static std::vector<size_t> prime_range_2D
    = {7, 11, 13, 17, 19, 23, 29, 263, 269, 271, 277};

const static std::vector<size_t> mix_range_2D = {56, 120, 336, 2160, 5000, 6000, 8000};

//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// 3D test problems
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------

const static std::vector<size_t> pow2_range_3D = {4, 8, 16, 32, 128, 256};

// For the current configuration, half-precision has an FFT size limit of 65536
const static std::vector<size_t> pow2_range_half_3D = {4, 8, 16, 32};

const static std::vector<size_t> pow3_range_3D = {3, 9, 27, 81, 243};

const static std::vector<size_t> pow5_range_3D = {5, 25, 125};

const static std::vector<size_t> prime_range_3D = {7, 11, 13, 17, 19, 23, 29};

// SBCC+SBRC as a sub-node of a 3D TRTRTR
const static std::vector<std::vector<size_t>> pow2_adhoc_3D = {{4, 4, 8192}};

// Test combinations of SBRC sizes, plus a non-SBRC size (10) to
// exercise fused SBRC+transpose kernels.
const static std::vector<size_t> sbrc_range_3D       = {50, 64, 81, 100, 200, 10, 128, 256};
const static std::vector<size_t> sbrc_batch_range_3D = {2, 1};

// pick small sizes that will exercise 2D_SINGLE and a couple of sizes that won't
const static std::vector<size_t> inner_batch_3D_range       = {4, 8, 16, 32, 20, 24, 64};
const static std::vector<size_t> inner_batch_3D_range_half  = {4, 8, 16, 32, 20, 24};
const static std::vector<size_t> inner_batch_3D_batch_range = {3, 2, 1};

//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// partial pass test problems
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------

const static std::vector<std::vector<size_t>> partial_pass_adhoc_3D = {
    {64, 64, 128},
    {64, 64, 64},
    {64, 64, 52},
    {60, 60, 60},
    {32, 32, 128},
    {32, 32, 64},
    {64, 32, 128},
};

const static std::vector<size_t> partial_pass_batch_range_3D = {1, 5, 10, 20, 50};

#endif // ACCURACY_TESTS_RANGE_H
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef BITWISE_REPRO_DB_H #define BITWISE_REPRO_DB_H #include "../../../shared/fft_hash.h" #include "sqlite3.h" #include #include #include #include #if __has_include() #include #else #include namespace std { namespace filesystem = experimental::filesystem; } #endif typedef size_t default_hash_type; template struct rocfft_test_run { rocfft_test_run(Tint ibuffer_hash_real_, Tint ibuffer_hash_imag_, Tint obuffer_hash_real_, Tint obuffer_hash_imag_, std::string token_, std::string runtime_version_, std::string lib_version_, std::string gpu_architecure_) : ibuffer_hash_real(ibuffer_hash_real_) , ibuffer_hash_imag(ibuffer_hash_imag_) , obuffer_hash_real(obuffer_hash_real_) , obuffer_hash_imag(obuffer_hash_imag_) , token(token_) , runtime_version(runtime_version_) , lib_version(lib_version_) , gpu_architecture(gpu_architecure_) { } static std::string get_create_rocfft_test_run_sql() { return "CREATE TABLE IF NOT EXISTS rocfft_test_run(ibuffer_hash_real TEXT NOT NULL, " "ibuffer_hash_imag TEXT NOT NULL, obuffer_hash_real TEXT NOT NULL, " "obuffer_hash_imag TEXT NOT NULL, token TEXT NOT NULL, runtime_version TEXT NOT " "NULL, lib_version TEXT NOT NULL, gpu_architecture TEXT NOT NULL); CREATE UNIQUE " "INDEX IF NOT EXISTS idx_unique_run ON rocfft_test_run(token, runtime_version, " "lib_version, gpu_architecture);"; } static std::string get_match_sql() { return "SELECT ibuffer_hash_real, ibuffer_hash_imag, obuffer_hash_real, obuffer_hash_imag, " "token, runtime_version, lib_version, gpu_architecture 
FROM rocfft_test_run WHERE " "token = ? AND runtime_version = ? AND lib_version = ? AND gpu_architecture = ? "; } static std::string get_insert_sql() { return "INSERT INTO rocfft_test_run(ibuffer_hash_real, ibuffer_hash_imag, " "obuffer_hash_real, obuffer_hash_imag, token, runtime_version, lib_version, " "gpu_architecture) VALUES (?,?,?,?,?,?,?,?)"; } void bind_insert_statement(sqlite3_stmt* stmt) { bind_ibuffer_hash_real(stmt, 1); bind_ibuffer_hash_imag(stmt, 2); bind_obuffer_hash_real(stmt, 3); bind_obuffer_hash_imag(stmt, 4); bind_token(stmt, 5); bind_runtime_version(stmt, 6); bind_lib_version(stmt, 7); bind_gpu_architecture(stmt, 8); } void bind_match_statement(sqlite3_stmt* stmt) { bind_token(stmt, 1); bind_runtime_version(stmt, 2); bind_lib_version(stmt, 3); bind_gpu_architecture(stmt, 4); } void update(sqlite3_stmt* stmt) { for(int col = 0; col < sqlite3_column_count(stmt); ++col) { auto col_name = sqlite3_column_name(stmt, col); auto col_value = std::string(reinterpret_cast(sqlite3_column_text(stmt, col))); if(strcmp(col_name, "ibuffer_hash_real") == 0) update_ibuffer_hash_real(col_value); if(strcmp(col_name, "ibuffer_hash_imag") == 0) update_ibuffer_hash_imag(col_value); if(strcmp(col_name, "obuffer_hash_real") == 0) update_obuffer_hash_real(col_value); if(strcmp(col_name, "obuffer_hash_imag") == 0) update_obuffer_hash_imag(col_value); if(strcmp(col_name, "token") == 0) update_token(col_value); if(strcmp(col_name, "runtime_version") == 0) update_runtime_version(col_value); if(strcmp(col_name, "lib_version") == 0) update_lib_version(col_value); if(strcmp(col_name, "gpu_architecture") == 0) update_gpu_architecture(col_value); } } Tint ibuffer_hash_real; Tint ibuffer_hash_imag; Tint obuffer_hash_real; Tint obuffer_hash_imag; private: std::string token; std::string runtime_version; std::string lib_version; std::string gpu_architecture; std::string get_ibuffer_hash_real() const { return std::to_string(ibuffer_hash_real); } std::string get_ibuffer_hash_imag() 
const { return std::to_string(ibuffer_hash_imag); } std::string get_obuffer_hash_real() const { return std::to_string(obuffer_hash_real); } std::string get_obuffer_hash_imag() const { return std::to_string(obuffer_hash_imag); } std::string get_token() const { return token; } std::string get_runtime_version() const { return runtime_version; } std::string get_lib_version() const { return lib_version; } std::string get_gpu_architecture() const { return gpu_architecture; } void bind_ibuffer_hash_real(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_ibuffer_hash_real().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding ibuffer_hash_real field in insert statement")); } void bind_ibuffer_hash_imag(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_ibuffer_hash_imag().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding ibuffer_hash_imag field in insert statement")); } void bind_obuffer_hash_real(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_obuffer_hash_real().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding obuffer_hash_real field in insert statement")); } void bind_obuffer_hash_imag(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_obuffer_hash_imag().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding obuffer_hash_imag field in insert statement")); } void bind_token(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_token().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error(std::string("Error binding token field in insert statement")); } void bind_runtime_version(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_runtime_version().c_str(), -1, 
SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding runtime_version field in insert statement")); } void bind_lib_version(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_lib_version().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding lib_version field in insert statement")); } void bind_gpu_architecture(sqlite3_stmt* stmt, int index) { auto ret = sqlite3_bind_text(stmt, index, get_gpu_architecture().c_str(), -1, SQLITE_TRANSIENT); if(ret != SQLITE_OK) throw std::runtime_error( std::string("Error binding gpu_architecture field in insert statement")); } void update_ibuffer_hash_real(const std::string& value) { std::stringstream stream(value); stream >> ibuffer_hash_real; } void update_ibuffer_hash_imag(const std::string& value) { std::stringstream stream(value); stream >> ibuffer_hash_imag; } void update_obuffer_hash_real(const std::string& value) { std::stringstream stream(value); stream >> obuffer_hash_real; } void update_obuffer_hash_imag(const std::string& value) { std::stringstream stream(value); stream >> obuffer_hash_imag; } void update_token(const std::string& value) { token = value; } void update_runtime_version(const std::string& value) { runtime_version = value; } void update_lib_version(const std::string& value) { lib_version = value; } void update_gpu_architecture(const std::string& value) { gpu_architecture = value; } }; template inline rocfft_test_run get_rocfft_test_run(const hash_output& ibuffer_hash, const hash_output& obuffer_hash, const std::string& token) { hipDeviceProp_t device_prop; if(hipGetDeviceProperties(&device_prop, 0) != hipSuccess) throw std::runtime_error("hipGetDeviceProperties failure"); auto gpu_arch = std::string(device_prop.gcnArchName); auto ver_sep = std::string("."); auto runtime_ver = std::to_string(HIP_VERSION_MAJOR) + ver_sep + std::to_string(HIP_VERSION_MINOR); const size_t ver_size = 256; char 
lib_version[ver_size]; rocfft_get_version_string(lib_version, ver_size); auto lib_ver_full = std::string(lib_version); auto idx_maj = lib_ver_full.find(ver_sep); auto idx_min = lib_ver_full.find(ver_sep, idx_maj + 1); auto idx_rev = lib_ver_full.find(ver_sep, idx_min + 1); auto ver_maj = lib_ver_full.substr(0, idx_maj); auto ver_min = lib_ver_full.substr(idx_maj + 1, idx_min - idx_maj - 1); auto ver_rev = lib_ver_full.substr(idx_min + 1, idx_rev - idx_min - 1); auto lib_ver = ver_maj + ver_sep + ver_min + ver_sep + ver_rev; return rocfft_test_run(ibuffer_hash.buffer_real, ibuffer_hash.buffer_imag, obuffer_hash.buffer_real, obuffer_hash.buffer_imag, token, runtime_ver, lib_ver, gpu_arch); } class fft_hash_db { public: fft_hash_db(std::string db_path) : ret(SQLITE_OK) , db_connection(nullptr) , begin_stmt(nullptr) , end_stmt(nullptr) , insert_stmt(nullptr) , match_stmt(nullptr) { ret = sqlite3_open(db_path.c_str(), &db_connection); if(ret != SQLITE_OK) throw std::runtime_error(std::string("Cannot open repro-db: ") + db_path); // Access to a database file may occur in parallel. // Increase default sqlite timeout, so diferent process // can wait for one another. sqlite3_busy_timeout(db_connection, 30000); // Set sqlite3 engine to WAL mode to avoid potential deadlocks with multiple // concurrent processes (if a deadlock occurs, the busy timeout is not honored). 
ret = sqlite3_exec(db_connection, "PRAGMA journal_mode = WAL", nullptr, nullptr, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Error setting WAL mode: " + std::string(sqlite3_errmsg(db_connection))); ret = sqlite3_exec(db_connection, rocfft_test_run<>::get_create_rocfft_test_run_sql().c_str(), nullptr, nullptr, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Error creating table: " + std::string(sqlite3_errmsg(db_connection))); prepare_begin_end_stmts(); prepare_match_stmt(); prepare_insert_stmt(); } ~fft_hash_db() { sqlite3_finalize(begin_stmt); sqlite3_finalize(end_stmt); sqlite3_finalize(match_stmt); sqlite3_finalize(insert_stmt); sqlite3_close(db_connection); } template void check_hash_valid(const hash_output& ibuffer_hash, const hash_output& obuffer_hash, const std::string& token, bool& hash_entry_found, bool& hash_valid) { hash_valid = true; auto test_run = get_rocfft_test_run(ibuffer_hash, obuffer_hash, token); begin_transaction(); hash_entry_found = check_match(&test_run); if(hash_entry_found) hash_valid = (test_run.ibuffer_hash_real == ibuffer_hash.buffer_real && test_run.ibuffer_hash_imag == ibuffer_hash.buffer_imag && test_run.obuffer_hash_real == obuffer_hash.buffer_real && test_run.obuffer_hash_imag == obuffer_hash.buffer_imag) ? 
true : false; else insert(&test_run); end_transaction(); } private: void prepare_begin_end_stmts() { auto begin_sql = std::string("BEGIN TRANSACTION;"); ret = sqlite3_prepare_v2(db_connection, begin_sql.c_str(), -1, &begin_stmt, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Cannot prepare begin statement: " + std::string(sqlite3_errmsg(db_connection))); auto end_sql = std::string("END TRANSACTION;"); ret = sqlite3_prepare_v2(db_connection, end_sql.c_str(), -1, &end_stmt, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Cannot prepare end statement: " + std::string(sqlite3_errmsg(db_connection))); } void prepare_match_stmt() { auto match_sql = rocfft_test_run<>::get_match_sql(); ret = sqlite3_prepare_v2(db_connection, match_sql.c_str(), -1, &match_stmt, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Cannot prepare match statement: " + std::string(sqlite3_errmsg(db_connection))); } void prepare_insert_stmt() { auto insert_sql = rocfft_test_run<>::get_insert_sql(); ret = sqlite3_prepare_v2(db_connection, insert_sql.c_str(), -1, &insert_stmt, nullptr); if(ret != SQLITE_OK) throw std::runtime_error("Cannot prepare insert statement: " + std::string(sqlite3_errmsg(db_connection))); } void begin_transaction() { ret = sqlite3_step(begin_stmt); if(ret != SQLITE_DONE) throw std::runtime_error(std::string("Error executing begin statement: ") + std::string(sqlite3_errmsg(db_connection))); } void end_transaction() { ret = sqlite3_step(end_stmt); if(ret != SQLITE_DONE) throw std::runtime_error(std::string("Error executing end statement: ") + std::string(sqlite3_errmsg(db_connection))); } template bool check_match(rocfft_test_run* entry) { sqlite3_reset(match_stmt); entry->bind_match_statement(match_stmt); size_t match_count = 0; while((ret = sqlite3_step(match_stmt)) == SQLITE_ROW) { entry->update(match_stmt); match_count++; } // There can only be one result in this query if(match_count > 1) throw std::runtime_error("Corrupted database"); if(ret != 
SQLITE_DONE) throw std::runtime_error(std::string("Error executing select statement: ") + std::string(sqlite3_errmsg(db_connection))); return match_count; } template void insert(rocfft_test_run* entry) { sqlite3_reset(insert_stmt); entry->bind_insert_statement(insert_stmt); ret = sqlite3_step(insert_stmt); if(ret != SQLITE_DONE) throw std::runtime_error(std::string("Error executing insert statement: ") + std::string(sqlite3_errmsg(db_connection))); } int ret; sqlite3* db_connection; sqlite3_stmt* begin_stmt; sqlite3_stmt* end_stmt; sqlite3_stmt* insert_stmt; sqlite3_stmt* match_stmt; }; #endif // BITWISE_REPRO_DB_HrocFFT-rocm-7.1.0/clients/tests/bitwise_repro/bitwise_repro_test.cpp000066400000000000000000000767551506652163400257340ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
// NOTE(review): the header names of the next four includes are missing from
// this copy of the file; restore them from the upstream source.
#include
#include
#include
#include

#include "../../../shared/params_gen.h"
#include "../../../shared/rocfft_params.h"
#include "../accuracy_tests_range.h"
#include "bitwise_repro_test.h"

using ::testing::ValuesIn;

// Runs the two-problem bitwise_repro() on a length-192 in-place complex
// forward transform in single and in double precision. Skips when either
// parameter set is invalid or when a skip-type exception is raised.
TEST(bitwise_repro_test, compare_precisions)
{
    rocfft_params params_1;
    // clang-format off
    params_1.from_token(std::string("complex_forward_len_192_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_192_odist_192_ioffset_0_0_ooffset_0_0"));
    // clang-format on
    params_1.validate();

    rocfft_params params_2;
    // clang-format off
    params_2.from_token(std::string("complex_forward_len_192_double_ip_batch_1_istride_1_CI_ostride_1_CI_idist_192_odist_192_ioffset_0_0_ooffset_0_0"));
    // clang-format on
    params_2.validate();

    if(!params_1.valid(verbose) || !params_2.valid(verbose))
    {
        if(verbose)
            std::cout << "Invalid parameters, skip this test." << std::endl;
        GTEST_SKIP();
    }

    // Resource/runtime problems are reported as skips; only ROCFFT_FAIL fails the test.
    try
    {
        bitwise_repro(params_1, params_2);
    }
    catch(std::bad_alloc&)
    {
        GTEST_SKIP() << "host memory allocation failure";
    }
    catch(HOSTBUF_MEM_USAGE& e)
    {
        GTEST_SKIP() << e.msg;
    }
    catch(ROCFFT_SKIP& e)
    {
        GTEST_SKIP() << e.msg;
    }
    catch(ROCFFT_FAIL& e)
    {
        GTEST_FAIL() << e.msg;
    }
    SUCCEED();
}

// Same as compare_precisions, but the two single-precision problems differ in
// length (64 versus 32).
TEST(bitwise_repro_test, compare_lengths)
{
    rocfft_params params_1;
    // clang-format off
    params_1.from_token(std::string("complex_forward_len_64_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_64_odist_64_ioffset_0_0_ooffset_0_0"));
    // clang-format on
    params_1.validate();

    rocfft_params params_2;
    // clang-format off
    params_2.from_token(std::string("complex_forward_len_32_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_32_odist_32_ioffset_0_0_ooffset_0_0"));
    // clang-format on
    params_2.validate();

    if(!params_1.valid(verbose) || !params_2.valid(verbose))
    {
        if(verbose)
            std::cout << "Invalid parameters, skip this test." << std::endl;
        GTEST_SKIP();
    }

    try
    {
        bitwise_repro(params_1, params_2);
    }
    catch(std::bad_alloc&)
    {
        GTEST_SKIP() << "host memory allocation failure";
    }
    catch(HOSTBUF_MEM_USAGE& e)
    {
        GTEST_SKIP() << e.msg;
    }
    catch(ROCFFT_SKIP& e)
    {
        GTEST_SKIP() << e.msg;
    }
    catch(ROCFFT_FAIL& e)
    {
        GTEST_FAIL() << e.msg;
    }
    SUCCEED();
}

// Same as compare_precisions, but the two length-256 single-precision problems
// differ in direction (complex forward versus complex inverse).
TEST(bitwise_repro_test, compare_transform_types)
{
    rocfft_params params_1;
    // clang-format off
    params_1.from_token(std::string("complex_forward_len_256_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_256_odist_256_ioffset_0_0_ooffset_0_0"));
    // clang-format on
    params_1.validate();

    rocfft_params params_2;
    // clang-format off
    params_2.from_token(std::string("complex_inverse_len_256_single_ip_batch_1_istride_1_CI_ostride_1_CI_idist_256_odist_256_ioffset_0_0_ooffset_0_0"));
    // clang-format on
    params_2.validate();

    if(!params_1.valid(verbose) || !params_2.valid(verbose))
    {
        if(verbose)
            std::cout << "Invalid parameters, skip this test." << std::endl;
        GTEST_SKIP();
    }

    try
    {
        bitwise_repro(params_1, params_2);
    }
    catch(std::bad_alloc&)
    {
        GTEST_SKIP() << "host memory allocation failure";
    }
    catch(HOSTBUF_MEM_USAGE& e)
    {
        GTEST_SKIP() << e.msg;
    }
    catch(ROCFFT_SKIP& e)
    {
        GTEST_SKIP() << e.msg;
    }
    catch(ROCFFT_FAIL& e)
    {
        GTEST_FAIL() << e.msg;
    }
    SUCCEED();
}

// Parameterized test: runs the single-problem bitwise_repro() for the problem
// given by GetParam(). Skipped when no repro database is available
// (repro_db == nullptr) or when the parameters are invalid.
TEST_P(bitwise_repro_test, compare_to_reference)
{
    if(repro_db == nullptr)
        GTEST_SKIP() << "A database file is required for this test." << std::endl;

    rocfft_params params(GetParam());

    params.validate();

    // Test that the tokenization works as expected.
    auto       token = params.token();
    fft_params tokentest;
    tokentest.from_token(token);
    auto token1 = tokentest.token();
    EXPECT_EQ(token, token1);

    if(!params.valid(verbose))
    {
        if(verbose)
        {
            std::cout << "Invalid parameters, skip this test." << std::endl;
        }
        GTEST_SKIP();
    }

    try
    {
        bitwise_repro(params);
    }
    catch(std::bad_alloc&)
    {
        GTEST_SKIP() << "host memory allocation failure";
    }
    catch(HOSTBUF_MEM_USAGE& e)
    {
        GTEST_SKIP() << e.msg;
    }
    catch(ROCFFT_SKIP& e)
    {
        GTEST_SKIP() << e.msg;
    }
    catch(ROCFFT_FAIL& e)
    {
        GTEST_FAIL() << e.msg;
    }
    SUCCEED();
}

// Each INSTANTIATE_TEST_SUITE_P below registers the parameterized test above
// for the problems produced by a parameter generator; test names are produced
// by bitwise_repro_test::TestName.

//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// 1D test problems
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------

INSTANTIATE_TEST_SUITE_P(pow2_1D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({pow2_range_1D}),
                                                             precision_range_sp_dp,
                                                             batch_range_1D,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow2_1D_half,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({pow2_range_half_1D}),
                                                             {fft_precision_half},
                                                             batch_range_1D,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow3_1D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({pow3_range_1D}),
                                                             precision_range_sp_dp,
                                                             batch_range_1D,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow5_1D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({pow5_range_1D}),
                                                             precision_range_sp_dp,
                                                             batch_range_1D,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(radX_1D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({radX_range_1D}),
                                                             precision_range_full,
                                                             batch_range_1D,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(prime_1D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({prime_range_1D}),
                                                             precision_range_sp_dp,
                                                             batch_range_1D,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(mix_1D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({mix_range_1D}),
                                                             precision_range_full,
                                                             batch_range_1D,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

// small 1D sizes just need to make sure our factorization isn't
// completely broken, so we just check simple C2C outplace interleaved
INSTANTIATE_TEST_SUITE_P(small_1D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator_base(
                             test_prob,
                             {fft_transform_type_complex_forward},
                             generate_lengths({small_1D_sizes()}),
                             {fft_precision_single},
                             {1},
                             [](fft_transform_type t, const std::vector& place_range, const bool planar) {
                                 return std::vector{
                                     std::make_tuple(t,
                                                     place_range[0],
                                                     fft_array_type_complex_interleaved,
                                                     fft_array_type_complex_interleaved)};
                             },
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             {fft_placement_notinplace},
                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(
    pow2_1D_stride_complex,
    bitwise_repro_test,
    ::testing::ValuesIn(param_generator_complex(test_prob,
                                                generate_lengths({pow2_range_for_stride_1D}),
                                                precision_range_sp_dp,
                                                batch_range_1D,
                                                stride_range_for_pow2_1D,
                                                stride_range_for_pow2_1D,
                                                ioffset_range_zero,
                                                ooffset_range_zero,
                                                place_range,
                                                true)),
    bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(
    pow2_1D_stride_complex_half,
    bitwise_repro_test,
    ::testing::ValuesIn(param_generator_complex(test_prob,
                                                generate_lengths({pow2_range_for_stride_half_1D}),
                                                {fft_precision_half},
                                                batch_range_1D,
                                                stride_range_for_pow2_1D,
                                                stride_range_for_pow2_1D,
                                                ioffset_range_zero,
                                                ooffset_range_zero,
                                                place_range,
                                                true)),
    bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(
    pow2_1D_stride_real,
    bitwise_repro_test,
    ::testing::ValuesIn(param_generator_real(test_prob,
                                             generate_lengths({pow2_range_for_stride_1D}),
                                             precision_range_sp_dp,
                                             batch_range_1D,
                                             stride_range_for_pow2_1D,
                                             stride_range_for_pow2_1D,
                                             ioffset_range_zero,
                                             ooffset_range_zero,
                                             place_range,
                                             true)),
    bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(
    pow2_1D_stride_real_half,
    bitwise_repro_test,
    ::testing::ValuesIn(param_generator_real(test_prob,
                                             generate_lengths({pow2_range_for_stride_half_1D}),
                                             {fft_precision_half},
                                             batch_range_1D,
                                             stride_range_for_pow2_1D,
                                             stride_range_for_pow2_1D,
                                             ioffset_range_zero,
                                             ooffset_range_zero,
                                             place_range,
                                             true)),
    bitwise_repro_test::TestName);

//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// 2D test problems
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------

INSTANTIATE_TEST_SUITE_P(pow2_2D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({pow2_range_2D, pow2_range_2D}),
                                                             precision_range_sp_dp,
                                                             batch_range,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow2_2D_half,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({pow2_range_half_2D, {2, 4, 8, 16, 32}}),
                                                             {fft_precision_half},
                                                             batch_range,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow3_2D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({pow3_range_2D, pow3_range_2D}),
                                                             precision_range_sp_dp,
                                                             batch_range,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow5_2D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({pow5_range_2D, pow5_range_2D}),
                                                             precision_range_sp_dp,
                                                             batch_range,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(prime_2D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({prime_range_2D, prime_range_2D}),
                                                             precision_range_sp_dp,
                                                             batch_range,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(mix_2D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({mix_range_2D, mix_range_2D}),
                                                             precision_range_sp_dp,
                                                             batch_range,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

// test length-1 on one dimension against a variety of non-1 lengths
INSTANTIATE_TEST_SUITE_P(len1_2D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({{1}, {4, 8, 8192, 3, 27, 7, 11, 5000, 8000}}),
                             precision_range_full,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         bitwise_repro_test::TestName);

// length-1 on the other dimension
INSTANTIATE_TEST_SUITE_P(len1_swap_2D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({{4, 8, 8192, 3, 27, 7, 11, 5000, 8000}, {1}}),
                             precision_range_full,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         bitwise_repro_test::TestName);

//-----------------------------------------------------------------------
//-----------------------------------------------------------------------
// 3D test problems
//-----------------------------------------------------------------------
//-----------------------------------------------------------------------

// Each instantiation below feeds the problems produced by param_generator to
// the parameterized bitwise_repro_test; names come from
// bitwise_repro_test::TestName.

INSTANTIATE_TEST_SUITE_P(pow2_3D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow2_range_3D, pow2_range_3D, pow2_range_3D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow2_3D_half,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(test_prob,
                                                             generate_lengths({pow2_range_half_3D,
                                                                               pow2_range_half_3D,
                                                                               pow2_range_half_3D}),
                                                             {fft_precision_half},
                                                             batch_range,
                                                             stride_range,
                                                             stride_range,
                                                             ioffset_range_zero,
                                                             ooffset_range_zero,
                                                             place_range,
                                                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow3_3D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow3_range_3D, pow3_range_3D, pow3_range_3D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(pow5_3D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow5_range_3D, pow5_range_3D, pow5_range_3D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(prime_3D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({prime_range_3D, prime_range_3D, prime_range_3D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         bitwise_repro_test::TestName);

// One range of each kind per dimension: power of 2, power of 3, prime.
INSTANTIATE_TEST_SUITE_P(mix_3D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({pow2_range_3D, pow3_range_3D, prime_range_3D}),
                             precision_range_sp_dp,
                             batch_range,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(sbrc_3D,
                         bitwise_repro_test,
                         ::testing::ValuesIn(param_generator(
                             test_prob,
                             generate_lengths({sbrc_range_3D, sbrc_range_3D, sbrc_range_3D}),
                             precision_range_sp_dp,
                             sbrc_batch_range_3D,
                             stride_range,
                             stride_range,
                             ioffset_range_zero,
                             ooffset_range_zero,
                             place_range,
                             true)),
                         bitwise_repro_test::TestName);

// Strides for the inner-batch cases are derived from stride_range by
// stride_generator_3D_inner_batch.
INSTANTIATE_TEST_SUITE_P(
    inner_batch_3D,
    bitwise_repro_test,
    ::testing::ValuesIn(param_generator(
        test_prob,
        generate_lengths({inner_batch_3D_range, inner_batch_3D_range, inner_batch_3D_range}),
        precision_range_sp_dp,
        inner_batch_3D_batch_range,
        stride_generator_3D_inner_batch(stride_range),
        stride_generator_3D_inner_batch(stride_range),
        ioffset_range_zero,
        ooffset_range_zero,
        place_range,
        true)),
    bitwise_repro_test::TestName);

INSTANTIATE_TEST_SUITE_P(
    inner_batch_3D_half,
    bitwise_repro_test,
    ::testing::ValuesIn(param_generator(test_prob,
                                        generate_lengths({inner_batch_3D_range_half,
                                                          inner_batch_3D_range_half,
                                                          inner_batch_3D_range_half}),
                                        {fft_precision_half},
                                        inner_batch_3D_batch_range,
                                        stride_generator_3D_inner_batch(stride_range),
                                        stride_generator_3D_inner_batch(stride_range),
                                        ioffset_range_zero,
                                        ooffset_range_zero,
                                        place_range,
                                        true)),
    bitwise_repro_test::TestName);
rocFFT-rocm-7.1.0/clients/tests/bitwise_repro/bitwise_repro_test.h000066400000000000000000000337521506652163400253670ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef BITWISE_REPRO_TEST_H #define BITWISE_REPRO_TEST_H #include #include #include #include #include #include "../../../shared/accuracy_test.h" #include "../../../shared/enum_to_string.h" #include "../../../shared/fft_params.h" #include "../../../shared/gpubuf.h" #include "../../../shared/rocfft_params.h" #include "../../../shared/test_params.h" #include "bitwise_repro_db.h" extern int verbose; extern std::unique_ptr repro_db; // Base gtest class for bitwise reproduction tests class bitwise_repro_test : public ::testing::TestWithParam { protected: void SetUp() override {} void TearDown() override {} public: static std::string TestName(const testing::TestParamInfo& info) { return info.param.token(); } }; // execute the GPU transform template inline void execute_fft(Tparams& params, std::vector& pibuffer, std::vector& pobuffer, std::vector& obuffer, std::vector& gpu_output) { // Execute the transform: auto fft_status = params.execute(pibuffer.data(), pobuffer.data()); if(fft_status != fft_status_success) throw std::runtime_error("rocFFT plan execution failure"); ASSERT_TRUE(!gpu_output.empty()) << "no output buffers"; for(unsigned int idx = 0; idx < gpu_output.size(); ++idx) { ASSERT_TRUE(gpu_output[idx].data() != nullptr) << "output buffer index " << idx << " is empty"; auto hip_status = hipMemcpy(gpu_output[idx].data(), pobuffer.at(idx), gpu_output[idx].size(), hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { ++n_hip_failures; std::stringstream msg; msg << "hipMemcpy failure"; if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } } if(verbose > 2) { std::cout << "GPU output:\n"; params.print_obuffer(gpu_output); } if(verbose > 5) { std::cout << "flat GPU output:\n"; params.print_obuffer_flat(gpu_output); } } template void compute_fft_data(Tparams& params, std::vector& fft_input, std::vector& fft_output) { // Call hipGetLastError to reset any errors // returned by previous HIP runtime API calls. 
hipError_t hip_status = hipGetLastError(); // Make sure that the parameters make sense: ASSERT_TRUE(params.valid(verbose)); // Make sure FFT buffers fit in device memory check_problem_fits_device_memory(params, verbose); auto ibuffer_sizes = params.ibuffer_sizes(); auto obuffer_sizes = params.obuffer_sizes(); // Create FFT plan - this will also allocate work buffer, but // will throw a specific exception if that step fails auto plan_status = fft_status_success; try { plan_status = params.create_plan(); } catch(fft_params::work_buffer_alloc_failure& e) { ++n_hip_failures; std::stringstream msg; msg << "Work buffer allocation failed with size: " << params.workbuffersize; if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed"; std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { hip_status = ibuffer[i].alloc(ibuffer_sizes[i]); if(hip_status != hipSuccess) { std::stringstream msg; msg << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "(" << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)" << " with code " << hipError_to_string(hip_status); ++n_hip_failures; if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } pibuffer[i] = ibuffer[i].data(); } // allocation counts in elements, ibuffer_sizes is in bytes auto ibuffer_sizes_elems = ibuffer_sizes; for(auto& buf : ibuffer_sizes_elems) buf /= var_size(params.precision, params.itype); fft_input = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); // generate input based on either cpu/gpu #ifdef USE_HIPRAND // generate the input directly on the gpu params.compute_input(ibuffer); // Copy input to CPU for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { hip_status = hipMemcpy(fft_input.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], 
hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { std::stringstream msg; msg << "hipMemcpy failure with error " << hip_status; ++n_hip_failures; if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } } #else // generate the input on the cpu params.compute_input(fft_input); // Copy input to GPU for(unsigned int idx = 0; idx < fft_input.size(); ++idx) { hip_status = hipMemcpy(ibuffer[idx].data(), fft_input.at(idx).data(), ibuffer_sizes[idx], hipMemcpyHostToDevice); if(hip_status != hipSuccess) { ++n_hip_failures; std::stringstream ss; ss << "hipMemcpy failure with error " << hip_status; if(skip_runtime_fails) { throw ROCFFT_SKIP{ss.str()}; } else { throw ROCFFT_FAIL{ss.str()}; } } } #endif std::vector obuffer_data; std::vector* obuffer = &obuffer_data; std::vector pobuffer; // allocate the output buffer if(params.placement == fft_placement_inplace) { obuffer = &ibuffer; } else { auto obuffer_sizes = params.obuffer_sizes(); obuffer_data.resize(obuffer_sizes.size()); for(unsigned int i = 0; i < obuffer_data.size(); ++i) { hip_status = obuffer_data[i].alloc(obuffer_sizes[i]); if(hip_status != hipSuccess) { ++n_hip_failures; std::stringstream msg; msg << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i] << "(" << bytes_to_GiB(obuffer_sizes[i]) << " GiB)" << " with code " << hipError_to_string(hip_status); if(skip_runtime_fails) throw ROCFFT_SKIP{msg.str()}; else throw ROCFFT_FAIL{msg.str()}; } } } pobuffer.resize(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } // execute GPU transform fft_output = allocate_host_buffer(params.precision, params.otype, params.osize); execute_fft(params, pibuffer, pobuffer, *obuffer, fft_output); } template inline void bitwise_repro_impl(Tparams& params, Tparams& params_comp) { std::vector fft_input, fft_output; compute_fft_data(params, fft_input, fft_output); auto ibuffer_hash_in = 
hash_input(rocfft_precision_from_fftparams(params.precision), params.ilength(), params.istride, params.idist, rocfft_array_type_from_fftparams(params.itype), params.nbatch); auto ibuffer_hash_out = hash_output(); compute_hash(fft_input, ibuffer_hash_in, ibuffer_hash_out); auto obuffer_hash_in = hash_input(rocfft_precision_from_fftparams(params.precision), params.olength(), params.ostride, params.odist, rocfft_array_type_from_fftparams(params.otype), params.nbatch); auto obuffer_hash_out = hash_output(); compute_hash(fft_output, obuffer_hash_in, obuffer_hash_out); if(params_comp.token().compare(params.token()) == 0) { std::stringstream msg; msg << "FFT input tokens are identical"; throw ROCFFT_SKIP{msg.str()}; } std::vector fft_input_comp, fft_output_comp; compute_fft_data(params_comp, fft_input_comp, fft_output_comp); auto obuffer_hash_in_comp = hash_input(rocfft_precision_from_fftparams(params_comp.precision), params_comp.olength(), params_comp.ostride, params_comp.odist, rocfft_array_type_from_fftparams(params_comp.otype), params_comp.nbatch); auto obuffer_hash_out_comp = hash_output(); compute_hash(fft_output_comp, obuffer_hash_in_comp, obuffer_hash_out_comp); params.free(); params_comp.free(); // FFT params are not identical and, therefore, // must also have different fft outputs. 
ASSERT_FALSE(obuffer_hash_out_comp == obuffer_hash_out) << "Different FFT params have the same output hash."; } template inline void bitwise_repro_impl(Tparams& params) { std::vector fft_input, fft_output; compute_fft_data(params, fft_input, fft_output); auto ibuffer_hash_in = hash_input(rocfft_precision_from_fftparams(params.precision), params.ilength(), params.istride, params.idist, rocfft_array_type_from_fftparams(params.itype), params.nbatch); auto ibuffer_hash_out = hash_output(); compute_hash(fft_input, ibuffer_hash_in, ibuffer_hash_out); auto obuffer_hash_in = hash_input(rocfft_precision_from_fftparams(params.precision), params.olength(), params.ostride, params.odist, rocfft_array_type_from_fftparams(params.otype), params.nbatch); auto obuffer_hash_out = hash_output(); compute_hash(fft_output, obuffer_hash_in, obuffer_hash_out); bool hash_entry_found, hash_valid; if(verbose) { std::cout << "input buffer hash: (" << ibuffer_hash_out.buffer_real << "," << ibuffer_hash_out.buffer_imag << ")" << std::endl; std::cout << "output buffer hash: (" << obuffer_hash_out.buffer_real << "," << obuffer_hash_out.buffer_imag << ")" << std::endl; } repro_db->check_hash_valid( ibuffer_hash_out, obuffer_hash_out, params.token(), hash_entry_found, hash_valid); params.free(); if(hash_entry_found) ASSERT_TRUE(hash_valid) << "FFT result is not bitwise reproducible."; else { std::stringstream msg; msg << "FFT result entry added to the repro-db file. Previously stored reference entry not " "found. 
\n"; throw ROCFFT_SKIP{msg.str()}; } } inline void bitwise_repro(rocfft_params& params) { switch(params.precision) { case fft_precision_half: bitwise_repro_impl(params); break; case fft_precision_single: bitwise_repro_impl(params); break; case fft_precision_double: bitwise_repro_impl(params); break; } } inline void bitwise_repro(rocfft_params& params, rocfft_params& params_comp) { switch(params.precision) { case fft_precision_half: bitwise_repro_impl(params, params_comp); break; case fft_precision_single: bitwise_repro_impl(params, params_comp); break; case fft_precision_double: bitwise_repro_impl(params, params_comp); break; } } #endif // BITWISE_REPRO_TEST_H rocFFT-rocm-7.1.0/clients/tests/buffer_hash_test.cpp000066400000000000000000000303371506652163400224400ustar00rootroot00000000000000// Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/fft_hash.h"
#include "../../shared/params_gen.h"
#include "../../shared/rocfft_params.h"
#include
#include
#include
#include
#include
#include

// Fills param with the description of one contiguous, in-place,
// complex-interleaved 1D buffer of 131072 elements (batch of one, unit
// strides, zero offsets) in the requested precision.
static void set_params(const fft_precision precision, fft_params& param)
{
    const auto  layout      = fft_array_type::fft_array_type_complex_interleaved;
    std::vector problem_len = {131072};
    std::vector stride_one  = {1};
    std::vector offset_zero = {0};
    size_t      batch_count = 1;

    // problem shape
    param.precision = precision;
    param.length    = problem_len;
    param.nbatch    = batch_count;
    param.placement = fft_placement_inplace;

    // input side
    param.itype   = layout;
    param.istride = stride_one;
    param.idist   = problem_len[0];
    param.isize   = {problem_len[0]};
    param.ioffset = offset_zero;

    // output side mirrors the input side
    param.otype   = layout;
    param.ostride = stride_one;
    param.odist   = problem_len[0];
    param.osize   = {problem_len[0]};
    param.ooffset = offset_zero;
}

// Create an fft params struct for a contiguous input/output buffer.
// Purpose of the unit tests here is only to test the hashing strategy,
// i.e., to reduce multiple floating point values to a single 64 bit
// identifier. The strategy for hashing a non-contiguous buffer is
// essentially the same, only the data access pattern is changed.
static void validate_buffer_params(const fft_params& param) { ASSERT_EQ(param.length.size() == 1, true); ASSERT_EQ(param.istride.size() == 1, true); ASSERT_EQ(param.istride[0] == 1, true); ASSERT_EQ(param.ostride.size() == 1, true); ASSERT_EQ(param.ostride[0] == 1, true); ASSERT_EQ(param.ioffset.size() == 1, true); ASSERT_EQ(param.ioffset[0] == 0, true); ASSERT_EQ(param.ooffset.size() == 1, true); ASSERT_EQ(param.ooffset[0] == 0, true); ASSERT_EQ(param.isize.size() == 1, true); ASSERT_EQ(param.isize[0] == param.length[0], true); ASSERT_EQ(param.osize.size() == 1, true); ASSERT_EQ(param.osize[0] == param.length[0], true); ASSERT_EQ(param.nbatch == 1, true); ASSERT_EQ(param.itype == fft_array_type_complex_interleaved, true); ASSERT_EQ(param.otype == fft_array_type_complex_interleaved, true); ASSERT_EQ(param.placement == fft_placement_inplace, true); } static unsigned int gen_seed() { auto seed = static_cast(time(NULL)); return seed; } template static void shuffle_buffer(const size_t N, const size_t seed, std::vector& buffer) { auto idata = (rocfft_complex*)buffer[0].data(); std::random_device rd; std::mt19937 g(rd()); std::shuffle(idata, idata + N, g); } static void shuffle_buffer(const fft_params& param, const size_t seed, std::vector& buffer) { validate_buffer_params(param); auto N = param.length[0]; switch(param.precision) { case fft_precision_half: shuffle_buffer(N, seed, buffer); break; case fft_precision_double: shuffle_buffer(N, seed, buffer); break; case fft_precision_single: shuffle_buffer(N, seed, buffer); break; default: abort(); } } template static void corrupt_buffer_single(const size_t N, const size_t seed, std::vector& buffer) { auto idata = (rocfft_complex*)buffer[0].data(); std::minstd_rand gen(seed); std::uniform_real_distribution dist1(0.0f, 1.0f); std::uniform_real_distribution dist2(-1.0f, 1.0f); auto random_id = static_cast(dist1(gen) * static_cast(N - 1)); auto real = idata[random_id].real(); auto imag = idata[random_id].imag(); 
idata[random_id].real(real + dist2(gen)); idata[random_id].imag(imag + dist2(gen)); } static void corrupt_buffer_single(const fft_params& param, const size_t seed, std::vector& buffer) { validate_buffer_params(param); auto N = param.length[0]; switch(param.precision) { case fft_precision_half: corrupt_buffer_single(N, seed, buffer); break; case fft_precision_double: corrupt_buffer_single(N, seed, buffer); break; case fft_precision_single: corrupt_buffer_single(N, seed, buffer); break; default: abort(); } } template static void corrupt_buffer_full(const size_t N, const size_t seed, std::vector& buffer) { auto idata = (rocfft_complex*)buffer[0].data(); std::minstd_rand gen(seed); std::uniform_real_distribution dist(-1.0f, 1.0f); for(size_t i = 0; i < N; i++) { auto real = idata[i].real(); auto imag = idata[i].imag(); idata[i].real(real + dist(gen)); idata[i].imag(imag + dist(gen)); } } static void corrupt_buffer_full(const fft_params& param, const size_t seed, std::vector& buffer) { validate_buffer_params(param); auto N = param.length[0]; switch(param.precision) { case fft_precision_half: corrupt_buffer_full(N, seed, buffer); break; case fft_precision_double: corrupt_buffer_full(N, seed, buffer); break; case fft_precision_single: corrupt_buffer_full(N, seed, buffer); break; default: abort(); } } template static void init_buffer(const size_t N, const size_t seed, std::vector& buffer) { auto idata = (rocfft_complex*)buffer[0].data(); std::minstd_rand gen(seed); std::uniform_real_distribution dist(-1.0f, 1.0f); for(size_t i = 0; i < N; i++) { idata[i].real(dist(gen)); idata[i].imag(dist(gen)); } } static void init_buffer(const fft_params& params, const size_t seed, std::vector& buffer) { validate_buffer_params(params); auto N = params.length[0]; switch(params.precision) { case fft_precision_half: init_buffer(N, seed, buffer); break; case fft_precision_double: init_buffer(N, seed, buffer); break; case fft_precision_single: init_buffer(N, seed, buffer); break; default: 
abort(); } } static void run_test(const rocfft_params& params) { auto hash_in = hash_input(rocfft_precision_from_fftparams(params.precision), params.ilength(), params.istride, params.idist, rocfft_array_type_from_fftparams(params.itype), params.nbatch); auto hash_out_1 = hash_output(); auto hash_out_2 = hash_output(); auto seed = gen_seed(); std::vector buffer1, buffer2; buffer1 = allocate_host_buffer(params.precision, params.itype, params.ibuffer_sizes()); buffer2 = allocate_host_buffer(params.precision, params.itype, params.ibuffer_sizes()); init_buffer(params, seed, buffer1); compute_hash(buffer1, hash_in, hash_out_1); copy_buffers(buffer1, buffer2, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, params.itype, params.istride, params.idist, params.ioffset, params.ioffset); compute_hash(buffer2, hash_in, hash_out_2); ASSERT_EQ(hash_out_1.buffer_real == hash_out_2.buffer_real, true) << "random seed: " << seed << std::endl; ASSERT_EQ(hash_out_1.buffer_imag == hash_out_2.buffer_imag, true) << "random seed: " << seed << std::endl; copy_buffers(buffer1, buffer2, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, params.itype, params.istride, params.idist, params.ioffset, params.ioffset); corrupt_buffer_full(params, seed, buffer2); compute_hash(buffer2, hash_in, hash_out_2); ASSERT_EQ(hash_out_1.buffer_real != hash_out_2.buffer_real, true) << "random seed: " << seed << std::endl; ASSERT_EQ(hash_out_1.buffer_imag != hash_out_2.buffer_imag, true) << "random seed: " << seed << std::endl; copy_buffers(buffer1, buffer2, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, params.itype, params.istride, params.idist, params.ioffset, params.ioffset); corrupt_buffer_single(params, seed, buffer2); compute_hash(buffer2, hash_in, hash_out_2); ASSERT_EQ(hash_out_1.buffer_real != hash_out_2.buffer_real, true) << "random seed: " << seed << std::endl; 
ASSERT_EQ(hash_out_1.buffer_imag != hash_out_2.buffer_imag, true) << "random seed: " << seed << std::endl; copy_buffers(buffer1, buffer2, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, params.itype, params.istride, params.idist, params.ioffset, params.ioffset); shuffle_buffer(params, seed, buffer2); compute_hash(buffer2, hash_in, hash_out_2); ASSERT_EQ(hash_out_1.buffer_real != hash_out_2.buffer_real, true) << "random seed: " << seed << std::endl; ASSERT_EQ(hash_out_1.buffer_imag != hash_out_2.buffer_imag, true) << "random seed: " << seed << std::endl; } TEST(rocfft_UnitTest, buffer_hashing_half) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } rocfft_params params; set_params(fft_precision_half, params); try { run_test(params); } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } } TEST(rocfft_UnitTest, buffer_hashing_single) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } rocfft_params params; set_params(fft_precision_single, params); try { run_test(params); } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } } TEST(rocfft_UnitTest, buffer_hashing_double) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } rocfft_params params; set_params(fft_precision_double, params); try { run_test(params); } catch(HOSTBUF_MEM_USAGE& e) { GTEST_SKIP() << e.msg; } } rocFFT-rocm-7.1.0/clients/tests/callback_change_type.cpp000066400000000000000000000232761506652163400232330ustar00rootroot00000000000000// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/hostbuf.h"
#include "../../shared/params_gen.h"
#include "../../shared/rocfft_complex.h"
#include "../../shared/rocfft_params.h"

#include "../../shared/accuracy_test.h"
#include "../../shared/fftw_transform.h"
#include "../../shared/rocfft_against_fftw.h"
#include

GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(change_type);

// callback functions to cast data from short to float

// Load callback for real input: reads the short at input[offset] and returns
// it converted to float. cbdata and sharedMem are unused.
__host__ __device__ float
    load_callback_short(short* input, size_t offset, void* cbdata, void* sharedMem)
{
    return static_cast(input[offset]);
}

// Load callback for complex input: reads the short2 at input[offset] and
// returns both components converted to float. cbdata and sharedMem are unused.
__host__ __device__ float2
    load_callback_short2(short2* input, size_t offset, void* cbdata, void* sharedMem)
{
    return float2{static_cast(input[offset].x), static_cast(input[offset].y)};
}

// Device-side variables holding the callback addresses; the test reads them
// back with hipMemcpyFromSymbol to obtain pointers it can pass to rocFFT.
__device__ auto load_callback_short_dev  = load_callback_short;
__device__ auto load_callback_short2_dev = load_callback_short2;

// Parameterized fixture for the type-changing callback tests. No per-test
// setup or teardown is required.
class change_type : public ::testing::TestWithParam
{
protected:
    void SetUp() override {}
    void TearDown() override {}

public:
    // Name each test instance after its parameter token.
    static std::string TestName(const testing::TestParamInfo& info)
    {
        return info.param.token();
    }
};

// aim for 1D lengths that might need ordinary Stockham, transpose,
// Bluestein kernels to treat real data as complex
std::vector> callback_type_sizes = {{4}, {60}, {122}, {220}, {8192}, {4500000}};

// test complex + real forward transforms.  real inverse is not a valid
// test case here, because we're allowed to overwrite input on those.
// the input can't be any smaller than what rocFFT thinks it is,
// because the overwrite will fail.
// Only unit stride is exercised by these tests.
const static std::vector> stride_range = {{1}};

// Instantiate the suite for single-precision, out-of-place, complex and real
// forward transforms over callback_type_sizes. The suite is registered under
// a DISABLED_ prefix when WIN32 is defined.
// NOTE(review): the name generator passed here is accuracy_test::TestName,
// not change_type::TestName defined on the fixture - confirm this is intended.
INSTANTIATE_TEST_SUITE_P(
#ifdef WIN32
    DISABLED_callback,
#else
    callback,
#endif
    change_type,
    ::testing::ValuesIn(param_generator_base(test_prob,
                                             {fft_transform_type_complex_forward,
                                              fft_transform_type_real_forward},
                                             callback_type_sizes,
                                             {fft_precision_single},
                                             {1},
                                             generate_types,
                                             stride_range,
                                             stride_range,
                                             {{0, 0}},
                                             {{0, 0}},
                                             {fft_placement_notinplace},
                                             false,
                                             false)),
    accuracy_test::TestName);

// run an out-of-place transform that casts input from short to float
//
// The GPU input buffer holds shorts and is converted to float by the load
// callback; the same values are converted to float on the host and run
// through FFTW, and the two outputs are compared with an L-inf cutoff.
TEST_P(change_type, short_to_float)
{
    rocfft_params params(GetParam());
    params.run_callbacks = true;

    ASSERT_EQ(params.create_plan(), fft_status_success);

    // input has 2 shorts/floats for complex data, 1 otherwise.
    // output is always complex for these tests.
    const size_t input_complex = params.transform_type != fft_transform_type_real_forward ? 2 : 1;

    // allocate
    gpubuf      gpu_input;
    gpubuf      gpu_output;
    std::vector cpu_input(1);
    std::vector cpu_output(1);

    try
    {
        // gpu input is actually shorts, everything else is float
        ASSERT_EQ(gpu_input.alloc(params.isize[0] * sizeof(short) * input_complex), hipSuccess);
        ASSERT_EQ(gpu_output.alloc(params.osize[0] * sizeof(float) * 2), hipSuccess);
        cpu_input[0].alloc(params.isize[0] * sizeof(float) * input_complex);
        cpu_output[0].alloc(params.osize[0] * sizeof(float) * 2);

        // generate short (16-bit) and float (32-bit) input
        // (the generator is default-constructed, so the data is the same on
        // every run)
        std::mt19937                  gen;
        std::uniform_int_distribution dis(-3, 3);
        std::vector                   cpu_input_short(params.isize[0] * input_complex);
        for(auto& i : cpu_input_short)
            i = dis(gen);

        // copy short input to gpubuf
        ASSERT_EQ(hipMemcpy(gpu_input.data(),
                            cpu_input_short.data(),
                            sizeof(short) * cpu_input_short.size(),
                            hipMemcpyHostToDevice),
                  hipSuccess);

        // convert shorts to floats for FFTW input
        std::copy(cpu_input_short.begin(),
                  cpu_input_short.end(),
                  static_cast(cpu_input[0].data()));

        // get callback function so we can pass it to rocfft
        // (real input uses the scalar callback, complex input the short2 one)
        void* callback_host;
        if(input_complex == 1)
        {
            ASSERT_EQ(hipMemcpyFromSymbol(
                          &callback_host, HIP_SYMBOL(load_callback_short_dev), sizeof(void*)),
                      hipSuccess);
        }
        else
        {
            ASSERT_EQ(hipMemcpyFromSymbol(
                          &callback_host, HIP_SYMBOL(load_callback_short2_dev), sizeof(void*)),
                      hipSuccess);
        }
        ASSERT_EQ(params.set_callbacks(callback_host, nullptr, nullptr, nullptr),
                  fft_status_success);

        // run rocFFT
        void* gpu_input_ptr  = gpu_input.data();
        void* gpu_output_ptr = gpu_output.data();
        ASSERT_EQ(params.execute(&gpu_input_ptr, &gpu_output_ptr), fft_status_success);

        // construct + run FFTW plan
        auto cpu_plan = fftw_plan_via_rocfft(params.length,
                                             params.istride,
                                             params.ostride,
                                             params.nbatch,
                                             params.idist,
                                             params.odist,
                                             params.transform_type,
                                             cpu_input,
                                             cpu_output);
        fftw_run(params.transform_type, cpu_plan, cpu_input, cpu_output);

        // copy rocFFT output back to CPU
        std::vector gpu_output_copy(1);
        gpu_output_copy[0].alloc(gpu_output.size());
        ASSERT_EQ(hipMemcpy(gpu_output_copy[0].data(),
                            gpu_output.data(),
                            gpu_output.size(),
                            hipMemcpyDeviceToHost),
                  hipSuccess);

        // Both outputs must be finite before they are compared.
        auto cpu_output_norm = norm(cpu_output,
                                    params.olength(),
                                    params.nbatch,
                                    params.precision,
                                    params.otype,
                                    params.ostride,
                                    params.odist,
                                    params.ooffset);
        ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2));
        ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf));

        auto gpu_output_norm = norm(gpu_output_copy,
                                    params.olength(),
                                    params.nbatch,
                                    params.precision,
                                    params.otype,
                                    params.ostride,
                                    params.odist,
                                    params.ooffset);
        ASSERT_TRUE(std::isfinite(gpu_output_norm.l_2));
        ASSERT_TRUE(std::isfinite(gpu_output_norm.l_inf));

        // Tolerance: epsilon * reference L-inf norm * log(first length).
        double linf_cutoff = type_epsilon(params.precision) * cpu_output_norm.l_inf
                             * log(params.length.front());
        // NOTE(review): cpu_output is described here with the output
        // type/stride/dist but is paired with params.ioffset, whereas norm()
        // above uses params.ooffset for the same buffer. The generated cases
        // only use zero offsets - confirm which offset is intended.
        auto   diff        = distance(cpu_output,
                             gpu_output_copy,
                             params.olength(),
                             params.nbatch,
                             params.precision,
                             params.otype,
                             params.ostride,
                             params.odist,
                             params.otype,
                             params.ostride,
                             params.odist,
                             nullptr,
                             linf_cutoff,
                             params.ioffset,
                             params.ooffset);

        ASSERT_TRUE(diff.l_inf <= linf_cutoff);
    }
    catch(std::bad_alloc&)
    {
        GTEST_SKIP() << "host memory allocation failure";
    }
    catch(HOSTBUF_MEM_USAGE& e)
    {
        GTEST_SKIP() << e.msg;
    }
}
rocFFT-rocm-7.1.0/clients/tests/cmake/000077500000000000000000000000001506652163400174735ustar00rootroot00000000000000rocFFT-rocm-7.1.0/clients/tests/cmake/FindFFTW.cmake000066400000000000000000000114341506652163400220470ustar00rootroot00000000000000# #############################################################################
# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

# FindFFTW: locate the FFTW3 header and the single/double precision
# libraries requested through the FLOAT, SINGLE and DOUBLE components.
#
# Hints:   FFTW_ROOT (CMake variable or environment variable)
# Results: FFTW_FOUND, FFTW_INCLUDE_DIRS, FFTW_LIBRARIES,
#          FFTW_MULTITHREAD (set when an omp/threads library is found, or on
#          Windows), FFTW_HAVE_SPRINT_PLAN (result of the symbol check)

#if( FFTW_FIND_VERSION VERSION_LESS "3" )
#    message( FFTW_FIND_VERSION is ${FFTW_FIND_VERSION})
#    message( FATAL_ERROR "FindFFTW can not configure versions less than FFTW 3.0.0" )
#endif( )

find_path(FFTW_INCLUDE_DIRS
    NAMES fftw3.h
    HINTS
        ${FFTW_ROOT}/include
        $ENV{FFTW_ROOT}/include
    PATHS
        /usr/include
        /usr/local/include
)
mark_as_advanced( FFTW_INCLUDE_DIRS )

# message( STATUS "FFTW_FIND_COMPONENTS: ${FFTW_FIND_COMPONENTS}" )
# message( STATUS "FFTW_FIND_REQUIRED_FLOAT: ${FFTW_FIND_REQUIRED_FLOAT}" )
# message( STATUS "FFTW_FIND_REQUIRED_DOUBLE: ${FFTW_FIND_REQUIRED_DOUBLE}" )

include( CheckSymbolExists )

set( FFTW_LIBRARIES "" )

# Single precision: requested through either the FLOAT or SINGLE component.
if( FFTW_FIND_REQUIRED_FLOAT OR FFTW_FIND_REQUIRED_SINGLE )
  find_library( FFTW_LIBRARIES_SINGLE
      NAMES fftw3f fftw3f-3 fftw3 fftw3-3
      HINTS
          ${FFTW_ROOT}/lib
          $ENV{FFTW_ROOT}/lib
      PATHS
          /usr/lib
          /usr/local/lib
      PATH_SUFFIXES
          x86_64-linux-gnu
      DOC "FFTW dynamic library single"
  )
  mark_as_advanced( FFTW_LIBRARIES_SINGLE )
  list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_SINGLE} )

  # Look for omp (preferred) or thread libraries.  These are not a
  # hard requirement, but are nice to have to make FFTW run faster.
  find_library( FFTWF_OMP_LIBRARY fftw3f_omp )
  find_library( FFTWF_THREADS_LIBRARY fftw3f_threads )
  if( FFTWF_OMP_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTWF_OMP_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  elseif( FFTWF_THREADS_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTWF_THREADS_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  endif()

  # Probe for fftwf_sprint_plan by linking against the library found above.
  # NOTE(review): CMAKE_REQUIRED_LIBRARIES is appended to and not restored
  # afterwards, so later checks in the including scope also see it.
  list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_SINGLE} )
  check_symbol_exists( fftwf_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN )
endif( )

# Double precision.
if( FFTW_FIND_REQUIRED_DOUBLE )
  find_library( FFTW_LIBRARIES_DOUBLE
      NAMES fftw3
      HINTS
          ${FFTW_ROOT}/lib
          $ENV{FFTW_ROOT}/lib
      PATHS
          /usr/lib
          /usr/local/lib
      PATH_SUFFIXES
          x86_64-linux-gnu
      DOC "FFTW dynamic library double"
  )
  mark_as_advanced( FFTW_LIBRARIES_DOUBLE )
  list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} )

  # Look for omp (preferred) or thread libraries.  These are not a
  # hard requirement, but are nice to have to make FFTW run faster.
  find_library( FFTW_OMP_LIBRARY fftw3_omp )
  find_library( FFTW_THREADS_LIBRARY fftw3_threads )
  if( FFTW_OMP_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTW_OMP_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  elseif( FFTW_THREADS_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTW_THREADS_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  endif()

  # Same probe for the double precision symbol; it writes the same result
  # variable as the single precision check above.
  list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} )
  check_symbol_exists( fftw_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN )
endif( )

# The definition is applied directly to the rocfft-test target, so that
# target must already exist when this module is processed.
if( BUILD_FFTW OR FFTW_HAVE_SPRINT_PLAN )
  target_compile_definitions( rocfft-test PUBLIC FFTW_HAVE_SPRINT_PLAN )
endif()

include( FindPackageHandleStandardArgs )
FIND_PACKAGE_HANDLE_STANDARD_ARGS( FFTW
    REQUIRED_VARS FFTW_INCLUDE_DIRS FFTW_LIBRARIES )

# assume the threads feature is always enabled on Windows, since it's
# not a separate library there
if( FFTW_FOUND AND WIN32 )
  set( FFTW_MULTITHREAD TRUE )
endif()

if( NOT FFTW_FOUND )
    message( STATUS "FindFFTW could not find all of the following fftw libraries" )
    message( STATUS "${FFTW_FIND_COMPONENTS}" )
else( )
    message(STATUS "FindFFTW configured variables:" )
    message(STATUS "FFTW_INCLUDE_DIRS: ${FFTW_INCLUDE_DIRS}" )
    message(STATUS "FFTW_LIBRARIES: ${FFTW_LIBRARIES}" )
endif()
rocFFT-rocm-7.1.0/clients/tests/default_callbacks_test.cpp000066400000000000000000000430431506652163400236050ustar00rootroot00000000000000// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include #include #include #include #include #include "../../shared/fftw_transform.h" #include "../../shared/hip_object_wrapper.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_params.h" #include "rocfft/rocfft.h" // ------------------------------------- // default load callback definitions // ------------------------------------- template __device__ T load_cb(T* data, size_t offset, void* cbdata, void* sharedMem) { return data[offset]; } __device__ auto load_cb_complex_double = load_cb>; __device__ auto load_cb_double = load_cb; __device__ auto load_cb_complex_float = load_cb>; __device__ auto load_cb_float = load_cb; // ------------------------------------- // default store callback definitions // ------------------------------------- template __device__ void store_cb(T* data, size_t offset, T element, void* cbdata, void* sharedMem) { data[offset] = element; } __device__ auto store_cb_complex_double = store_cb>; __device__ auto store_cb_double = store_cb; __device__ auto store_cb_complex_float = store_cb>; __device__ auto store_cb_float = store_cb; // ------------------------------------- // type traits definitions // ------------------------------------- template struct is_hip_complex { static const bool value = false; }; template <> struct is_hip_complex> { static const bool value = true; }; template <> struct is_hip_complex> { static const bool value = true; }; // ASAN introduces some problems with mixing library and client // callbacks, so skip these tests if it's enabled #ifdef ADDRESS_SANITIZER #define TEST_CALLBACK_CHECK_ASAN \ GTEST_SKIP() << "mixed library/client callbacks not supported for ASAN"; #else #define TEST_CALLBACK_CHECK_ASAN #endif // ------------------------------------- // test callbacks struct // ------------------------------------- enum struct DefaultCallbackType { LOAD, STORE, }; struct Test_Callback { Test_Callback(size_t _N, size_t _dim, rocfft_transform_type_e _frwd_transf_type, rocfft_precision_e 
_frwd_transf_precision, DefaultCallbackType _cb_type, uint32_t _seed) : N(_N) , dim(_dim) , fwrd_transf_type(_frwd_transf_type) , frwd_transf_precision(_frwd_transf_precision) , cb_type(_cb_type) , seed(_seed) { float low_bound_f = -1.0f, up_bound_f = 1.0f; double low_bound_d = -1.0, up_bound_d = 1.0; std::vector> h_mem_out_f2, h_mem_out_no_cb_f2; std::vector> h_mem_out_d2, h_mem_out_no_cb_d2; switch(fwrd_transf_type) { case rocfft_transform_type_complex_forward: { std::vector> h_mem_in_f2; std::vector> h_mem_in_d2; (frwd_transf_precision == rocfft_precision_single) ? run(low_bound_f, up_bound_f, h_mem_in_f2, h_mem_out_f2, h_mem_out_no_cb_f2) : run(low_bound_d, up_bound_d, h_mem_in_d2, h_mem_out_d2, h_mem_out_no_cb_d2); break; } case rocfft_transform_type_real_forward: { std::vector h_mem_in_f; std::vector h_mem_in_d; (frwd_transf_precision == rocfft_precision_single) ? run(low_bound_f, up_bound_f, h_mem_in_f, h_mem_out_f2, h_mem_out_no_cb_f2) : run(low_bound_d, up_bound_d, h_mem_in_d, h_mem_out_d2, h_mem_out_no_cb_d2); break; } default: break; } } size_t get_data_size() { // compute total data size size_t data_size = 1; for(size_t i = 0; i < dim; ++i) { data_size *= N; } return data_size; } template void run(Tbound low_bound, Tbound up_bound, std::vector& host_mem_in, std::vector& host_mem_out, std::vector& host_mem_out_no_cb) { auto data_sz = get_data_size(); if(cb_type == DefaultCallbackType::LOAD) set_load_callback(); else if(cb_type == DefaultCallbackType::STORE) set_store_callback(); host_mem_in.resize(data_sz); if constexpr(!is_hip_complex::value) init_data(low_bound, up_bound, host_mem_in); else init_data_complex(low_bound, up_bound, host_mem_in); if constexpr(!is_hip_complex::value) data_sz = (data_sz / 2) + 1; host_mem_out.resize(data_sz); forward_transform(true, host_mem_in, host_mem_out); host_mem_out_no_cb.resize(data_sz); forward_transform(false, host_mem_in, host_mem_out_no_cb); validate_test(host_mem_out, host_mem_out_no_cb); } template void 
init_data(const Tbound low_bound, const Tbound up_bound, std::vector& host_mem) { std::minstd_rand gen(seed); std::uniform_real_distribution dist(low_bound, up_bound); for(size_t i = 0; i < host_mem.size(); i++) { host_mem[i] = dist(gen); } } template void init_data_complex(const Tbound low_bound, const Tbound up_bound, std::vector& host_mem) { std::minstd_rand gen(seed); std::uniform_real_distribution dist(low_bound, up_bound); for(size_t i = 0; i < host_mem.size(); i++) { host_mem[i].x = dist(gen); host_mem[i].y = dist(gen); } } template void forward_transform(bool apply_callback, const std::vector& host_mem_in, std::vector& host_mem_out) { rocfft_plan plan = nullptr; std::vector lengths(dim, N); ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_notinplace, fwrd_transf_type, frwd_transf_precision, dim, lengths.data(), 1, nullptr), rocfft_status_success); size_t work_buffer_size = 0; void* work_buffer = nullptr; ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &work_buffer_size), rocfft_status_success); if(work_buffer_size) { ASSERT_EQ(hipMalloc(&work_buffer, work_buffer_size), hipSuccess); } hipStream_wrapper_t stream; stream.alloc(); rocfft_execution_info info; ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); ASSERT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success); if(apply_callback) { if(cb_type == DefaultCallbackType::LOAD) { ASSERT_EQ(rocfft_execution_info_set_load_callback(info, &load_cb_host, nullptr, 0), rocfft_status_success); } else if(cb_type == DefaultCallbackType::STORE) { ASSERT_EQ( rocfft_execution_info_set_store_callback(info, &store_cb_host, nullptr, 0), rocfft_status_success); } } gpubuf device_mem_in; size_t NbytesIn = host_mem_in.size() * sizeof(Tin); ASSERT_EQ(device_mem_in.alloc(NbytesIn), hipSuccess); EXPECT_EQ( hipMemcpy(device_mem_in.data(), host_mem_in.data(), NbytesIn, hipMemcpyHostToDevice), hipSuccess); gpubuf device_mem_out; size_t NbytesOut = host_mem_out.size() * sizeof(Tout); 
ASSERT_EQ(device_mem_out.alloc(NbytesOut), hipSuccess); void* in_ptr = device_mem_in.data(); void* out_ptr = device_mem_out.data(); ASSERT_EQ(rocfft_execute(plan, &in_ptr, &out_ptr, info), rocfft_status_success); ASSERT_EQ(hipMemcpy(host_mem_out.data(), out_ptr, NbytesOut, hipMemcpyDeviceToHost), hipSuccess); ASSERT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success); ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success); ASSERT_EQ(hipFree(work_buffer), hipSuccess); } template void validate_test(const std::vector& host_mem_out, const std::vector& host_mem_out_no_cb) { auto diff = distance_1to1_complex( reinterpret_cast*>(host_mem_out.data()), reinterpret_cast*>(host_mem_out_no_cb.data()), host_mem_out.size(), 1, 1, host_mem_out.size(), 1, host_mem_out_no_cb.size(), nullptr, type_epsilon(), {0}, {0}); EXPECT_LT(diff.l_inf, type_epsilon()); } // ------------------------------------------------ // set_load_callback template specializations // ------------------------------------------------ template void set_load_callback(){}; template <> void set_load_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_double), sizeof(void*)), hipSuccess); }; template <> void set_load_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_double), sizeof(void*)), hipSuccess); }; template <> void set_load_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_float), sizeof(void*)), hipSuccess); }; template <> void set_load_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_float), sizeof(void*)), hipSuccess); }; // ------------------------------------------------ // set_store_callback template specializations // ------------------------------------------------ template void set_store_callback(){}; template <> void set_store_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_double), sizeof(void*)), hipSuccess); }; 
template <> void set_store_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_double), sizeof(void*)), hipSuccess); }; template <> void set_store_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_float), sizeof(void*)), hipSuccess); }; template <> void set_store_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_float), sizeof(void*)), hipSuccess); }; size_t N = 0; size_t dim = 0; rocfft_transform_type_e fwrd_transf_type; rocfft_precision_e frwd_transf_precision; DefaultCallbackType cb_type; uint32_t seed = 0; void* store_cb_host = nullptr; void* load_cb_host = nullptr; }; // ------------------------------------------------------------------- // Test forward transforms in single/double precision with real and // complex data inputs and having only a load callback set. // ------------------------------------------------------------------- #ifndef WIN32 TEST(rocfft_UnitTest, default_load_callback_complex_single) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } TEST_CALLBACK_CHECK_ASAN; Test_Callback test(256, 1, rocfft_transform_type_complex_forward, rocfft_precision_single, DefaultCallbackType::LOAD, 1); } TEST(rocfft_UnitTest, default_load_callback_complex_double) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } TEST_CALLBACK_CHECK_ASAN; Test_Callback test(512, 1, rocfft_transform_type_complex_forward, rocfft_precision_double, DefaultCallbackType::LOAD, 2); } TEST(rocfft_UnitTest, default_load_callback_real_single) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } TEST_CALLBACK_CHECK_ASAN; Test_Callback test(1024, 1, rocfft_transform_type_real_forward, rocfft_precision_single, DefaultCallbackType::LOAD, 3); } TEST(rocfft_UnitTest, 
default_load_callback_real_double) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } TEST_CALLBACK_CHECK_ASAN; Test_Callback test(2048, 1, rocfft_transform_type_real_forward, rocfft_precision_double, DefaultCallbackType::LOAD, 4); } // ------------------------------------------------------------------- // Test forward transforms in single/double precision with real and // complex data inputs and having only a store callback set. // ------------------------------------------------------------------- TEST(rocfft_UnitTest, default_store_callback_complex_single) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } TEST_CALLBACK_CHECK_ASAN; Test_Callback test(256, 1, rocfft_transform_type_complex_forward, rocfft_precision_single, DefaultCallbackType::STORE, 5); } TEST(rocfft_UnitTest, default_store_callback_complex_double) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } TEST_CALLBACK_CHECK_ASAN; Test_Callback test(512, 1, rocfft_transform_type_complex_forward, rocfft_precision_double, DefaultCallbackType::STORE, 6); } TEST(rocfft_UnitTest, default_store_callback_real_single) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } TEST_CALLBACK_CHECK_ASAN; Test_Callback test(1024, 1, rocfft_transform_type_real_forward, rocfft_precision_single, DefaultCallbackType::STORE, 7); } TEST(rocfft_UnitTest, default_store_callback_real_double) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } TEST_CALLBACK_CHECK_ASAN; Test_Callback test(2048, 1, rocfft_transform_type_real_forward, rocfft_precision_double, DefaultCallbackType::STORE, 8); } #endif 
rocFFT-rocm-7.1.0/clients/tests/gtest_main.cpp000066400000000000000000000765261506652163400212710ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
/// @file /// @brief googletest based unit tester for rocfft /// #include #include #include #include #include #include #include #include #include #include #include #include #include "../../shared/CLI11.hpp" #include "../../shared/concurrency.h" #include "../../shared/device_properties.h" #include "../../shared/environment.h" #include "../../shared/hostbuf.h" #include "../../shared/rocfft_accuracy_test.h" #include "../../shared/sys_mem.h" #include "../../shared/test_params.h" #include "../../shared/work_queue.h" #include "bitwise_repro/bitwise_repro_db.h" #include "bitwise_repro/bitwise_repro_test.h" #include "rocfft/rocfft.h" // Control output verbosity: int verbose; // User-defined random seed size_t random_seed; std::random_device default_seed_dev; // Overall probability of running conventional tests double test_prob; // Probability of running tests from the emulation suite double emulation_prob; // Probability of running unit tests double unittest_prob; // Modifier for probability of running tests with complex interleaved data double complex_interleaved_prob_factor; // Modifier for probability of running tests with real data double real_prob_factor; // Modifier for probability of running tests with complex planar data double complex_planar_prob_factor; // Modifier for probability of running tests with callbacks double callback_prob_factor; // Number of random tests per suite size_t n_random_tests = 0; GTEST_ALLOW_UNINSTANTIATED_PARAMETERIZED_TEST(bitwise_repro_test); // Transform parameters for manual test: fft_params manual_params; // Host memory limitation for tests (GiB): size_t ramgb; // Device memory limitation for tests (GiB): size_t vramgb; // Number of hip devices to use. 
int ngpus{}; // Allow skipping tests if there is a runtime error bool skip_runtime_fails; // But count the number of failures int n_hip_failures = 0; // Pointer to a bitwise repro-db file std::unique_ptr repro_db; // Manually specified precision cutoffs: double half_epsilon; double single_epsilon; double double_epsilon; // Measured precision cutoffs: double max_linf_eps_double = 0.0; double max_l2_eps_double = 0.0; double max_linf_eps_single = 0.0; double max_l2_eps_single = 0.0; double max_linf_eps_half = 0.0; double max_l2_eps_half = 0.0; // Control whether we use FFTW's wisdom (which we use to imply FFTW_MEASURE). bool use_fftw_wisdom = false; // Compare results against FFTW in accuracy tests bool fftw_compare = true; // Cache the last cpu fft that was requested last_cpu_fft_cache last_cpu_fft_data; // Number of devices to distribute the FFT to for manual tests int manual_devices = 1; // Multi-process library to use fft_params::fft_mp_lib mp_lib = fft_params::fft_mp_lib_none; // Number of multi-process ranks to launch int mp_ranks = 1; // Multi-process launch command (e.g. mpirun --np 4 /path/to/rocfft_mpi_worker) std::string mp_launch; void init_gtest_flags() { // HACK: gtest maintains a "should run" flag on each test case, // but only sets it during RUN_ALL_TESTS. Precompiling should // ideally only happen for the test cases that would actually // run. // // So call RUN_ALL_TESTS once with the "list tests" temporarily set // to true, to initialize all of that. // // gtest will then print all of the test cases to stdout. // Temporarily redirect stdout to /dev/null as well. 
bool temp_list_tests = true; std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests)); // move stdout to devnull #ifdef WIN32 int stdout_fd = _fileno(stdout); int devnull = _open("NUL", _O_WRONLY); int stdout_copy = _dup(stdout_fd); _dup2(devnull, stdout_fd); #else int stdout_fd = STDOUT_FILENO; int devnull = open("/dev/null", O_WRONLY); int stdout_copy = dup(stdout_fd); dup2(devnull, stdout_fd); #endif (void)RUN_ALL_TESTS(); // put stdout back #ifdef WIN32 _dup2(stdout_copy, stdout_fd); _close(stdout_copy); _close(devnull); #else dup2(stdout_copy, stdout_fd); close(stdout_copy); close(devnull); #endif std::swap(temp_list_tests, testing::GTEST_FLAG(list_tests)); } std::vector tokens_to_run() { init_gtest_flags(); std::vector tokens; auto ut = testing::UnitTest::GetInstance(); for(int ts_index = 0; ts_index < ut->total_test_suite_count(); ++ts_index) { const auto ts = ut->GetTestSuite(ts_index); for(int ti_index = 0; ti_index < ts->total_test_count(); ++ti_index) { const auto ti = ts->GetTestInfo(ti_index); std::string name = ti->name(); // only precompile test cases that will run if(!ti->should_run()) continue; // only care about accuracy tests if(name.find("vs_fftw/") != std::string::npos) { name.erase(0, 8); // Run any problem that uses brick decomposition // without touching batch. Bricks are specified with // batch indexes, so arbitrarily changing batch to 1 // can break those cases. 
if(name.find("_brick_") != std::string::npos) { tokens.emplace_back(std::move(name)); continue; } // change batch to 1, so we don't waste time creating // multiple plans that differ only by batch auto idx = name.find("_batch_"); if(idx == std::string::npos) continue; // advance idx to batch number idx += 7; auto end = name.find('_', idx); if(end == std::string::npos) continue; name.replace(idx, end - idx, "1"); tokens.emplace_back(std::move(name)); } } } return tokens; } void precompile_test_kernels(const std::string& precompile_file) { std::cout << "precompiling test kernels...\n"; WorkQueue tokenQueue; auto tokens = tokens_to_run(); std::random_device dev; std::mt19937 dist(dev()); std::shuffle(tokens.begin(), tokens.end(), dist); auto precompile_begin = std::chrono::steady_clock::now(); std::cout << "precompiling " << tokens.size() << " FFT plans...\n"; for(auto&& t : tokens) tokenQueue.push(std::move(t)); EnvironmentSetTemp env_compile_only{"ROCFFT_INTERNAL_COMPILE_ONLY", "1"}; const size_t NUM_THREADS = rocfft_concurrency(); std::vector threads; for(size_t i = 0; i < NUM_THREADS; ++i) { threads.emplace_back([&tokenQueue]() { for(;;) { std::string token{tokenQueue.pop()}; if(token.empty()) break; try { rocfft_params params_forward; params_forward.from_token(token); params_forward.validate(); params_forward.setup_structs(); params_forward.free(); rocfft_params params_inverse; params_inverse.inverse_from_forward(params_forward); params_inverse.validate(); params_inverse.setup_structs(); } catch(std::exception& e) { // failed to create a plan, abort // // we could continue on, but the test should just // fail later anyway in the same way. 
so report // which token failed early and get out throw std::runtime_error(token + " plan creation failure: " + e.what()); } } }); // insert empty tokens to tell threads to stop tokenQueue.push({}); } for(auto& t : threads) t.join(); auto precompile_end = std::chrono::steady_clock::now(); std::chrono::duration precompile_ms = precompile_end - precompile_begin; std::cout << "done precompiling FFT plans in " << static_cast(precompile_ms.count()) << " ms\n"; } int main(int argc, char* argv[]) { const auto test_begin = std::chrono::system_clock::now(); // We would like to parse a few arguments before initiating gtest. // Save argv[0] because CLI doesn't include this in the remaining args, and it's expected when // we re-parse the arguments with gtest and CLI. std::string argv0 = argv[0]; CLI::App app{ "\n" "rocFFT Runtime Test command line options\n" "NB: input parameters are row-major.\n" "\n" "FFTW accuracy test cases are named using these identifiers:\n" "\n" " len_: problem dimensions, row-major\n" " single,double: precision\n" " ip,op: in-place or out-of-place\n" " batch_: batch size\n" " istride__: input stride (ostride for output stride), format may be:\n" " CI - complex interleaved\n" " CP - complex planar\n" " R - real\n" " HI - hermitian interleaved\n" " HP - hermitian planar\n" "\n" "Usage"}; // Override CLI11 help to print it along gtest's help app.set_help_flag(""); const auto opt_help = app.add_flag("-h, --help", "Produces this help message"); app.add_option("-v, --verbose", verbose, "Print out detailed information for the tests") ->default_val(0); app.add_option("--nrand", n_random_tests, "Number of extra randomized tests")->default_val(0); app.add_option("--ngpus", ngpus, "Number of GPUs to use per rank") ->default_val(-1) ->check(CLI::NonNegativeNumber); app.add_option("--gpus", n_random_tests, "Number of extra randomized tests")->default_val(0); app.add_option("--test_prob", test_prob, "Probability of running individual tests") ->default_val(1.0) 
->check(CLI::Range(0.0, 1.0)); app.add_option("--unittest_prob", unittest_prob, "Probability of running individual unit tests") ->default_val(1.0) ->check(CLI::Range(0.0, 1.0)); app.add_option( "--emulation_prob", emulation_prob, "Probability of running individual emulation tests") ->default_val(1.0) ->check(CLI::Range(0.0, 1.0)); app.add_option("--real_prob", real_prob_factor, "Probability multiplier for running individual real/complex transforms") ->default_val(1.0) ->check(CLI::PositiveNumber); app.add_option("--planar_prob", complex_planar_prob_factor, "Probability multiplier for running individual planar transforms") ->default_val(0.1) ->check(CLI::PositiveNumber); app.add_option( "--complex_interleaved_prob_factor", complex_interleaved_prob_factor, "Probability multiplier for running individual transforms with complex interleaved data") ->default_val(1) ->check(CLI::PositiveNumber); app.add_option("--callback_prob", callback_prob_factor, "Probability multiplier for running individual callback transforms") ->default_val(0.1) ->check(CLI::PositiveNumber); constexpr std::array emulation_types = {"none", "smoke", "regression", "extended"}; app.add_option("--emulation", "Run emulation tests") ->check(CLI::IsMember(emulation_types)) ->each([&](const std::string& emulationtype) { constexpr auto nidx = [emulation_types](const auto name) { return std::find(emulation_types.begin(), emulation_types.end(), name) - emulation_types.begin(); }; // Emulation test suites focus on well-established software paths; we are looking for // information about the hardware, which means that we aren't trying to find out a lot // of information about the software. Thus, no randomly-generated tests. n_random_tests = 0; // Run all of the emulation tests: emulation_prob = 1.0; // Callbacks are not an emulation test target. callback_prob_factor = 0; // We can do a switch on nidx(emulationtype) when we have C++20 // switch(nidx(emulationtype)) // { // case nidx("smoke"): // etc. 
if(nidx(emulationtype) == nidx("smoke")) { // 2GB vram limit, approx 1 minute GPU time with short tests. vramgb = 2; test_prob = 0; emulation_prob = 0.005; } if(nidx(emulationtype) == nidx("regression")) { vramgb = 16; emulation_prob = 1; test_prob = 0.01; } if(nidx(emulationtype) == nidx("extended")) { emulation_prob = 1; test_prob = 0.02; } }); app.add_option("--fftw_compare", fftw_compare, "Compare to FFTW in accuracy tests") ->default_val(true); app.add_option("--mp_lib", mp_lib, "Multi-process library type: none (default), mpi") ->default_val("none"); app.add_option("--mp_ranks", mp_ranks, "Number of multi-process ranks to launch") ->default_val(1) ->check(CLI::NonNegativeNumber); app.add_option("--mp_launch", mp_launch, "Command line prefix to launch multi-process transforms, e.g. \"mpirun --np 4 " "/path/to/rocfft_mpi_worker\"") ->default_val("") ->each([&](const std::string&) { if(mp_lib == fft_params::fft_mp_lib_none) { std::cout << "--mp_launch requires an mp library (see mp_lib in --help).\n"; std::exit(EXIT_FAILURE); } }) ->needs("--mp_lib"); app.add_flag("--smoketest", "Run a short (approx 5 minute) randomized selection of tests") ->each([&](const std::string&) { // The objective is to have an test that takes about 5 minutes, so just set the // probability per test to a small value to achieve this result. test_prob = 0.0005; emulation_prob = 0.005; unittest_prob = 0.2; n_random_tests = 10; }); app.add_flag("--callback", "Inject load/store callbacks")->each([&](const std::string&) { manual_params.run_callbacks = true; }); app.add_option("--seed", random_seed, "Random seed; if unset, use an actual random seed") ->default_val(default_seed_dev()); // Filename for fftw and fftwf wisdom. std::string fftw_wisdom_filename; // Token string to fully specify fft params for the manual test. 
std::string test_token; // Filename for precompiled kernels to be written to std::string precompile_file; // Full path to bitwise repro database file std::string repro_db_path; // Bool option to just print tokens and exit bool printtokens{false}; // Declare the supported options. Some option pointers are declared to track passed opts. app.add_flag("--version", "Print queryable version information from the rocfft library") ->each([](const std::string&) { char v[256]; rocfft_get_version_string(v, 256); std::cout << "version " << v << std::endl; return EXIT_SUCCESS; }); app.add_flag("--checkstride", "Check that data is not written outside of output strides") ->each([&](const std::string&) { manual_params.check_output_strides = true; }); auto opt_token = app.add_option("--token", test_token, "Test token name for manual test")->default_val(""); // Group together options that conflict with --token auto* non_token = app.add_option_group("Token Conflict", "Options excluded by --token"); non_token ->add_flag("--double", "Double precision transform (deprecated: use --precision double)") ->each([&](const std::string&) { manual_params.precision = fft_precision_double; }); non_token->excludes(opt_token); non_token ->add_option("-t, --transformType", manual_params.transform_type, "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ->default_val(fft_transform_type_complex_forward); non_token ->add_option("--auto_allocation", manual_params.auto_allocate, "rocFFT's auto-allocation behavior: \"on\", \"off\", or \"default\"") ->default_val("default"); non_token ->add_option("--precision", manual_params.precision, "Transform precision: single (default), double, half") ->excludes("--double"); non_token->add_flag("-o, --notInPlace", "Not in-place FFT transform (default: in-place)") ->each([&](const std::string&) { manual_params.placement = fft_placement_notinplace; }); non_token ->add_option("--itype", manual_params.itype, "Array type of 
input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token ->add_option("--otype", manual_params.otype, "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ->default_val(fft_array_type_unset); non_token->add_option("--length", manual_params.length, "Lengths")->expected(1, 3); non_token ->add_option("-b, --batchSize", manual_params.nbatch, "If this value is greater than one, arrays will be used") ->default_val(1); non_token->add_option("--istride", manual_params.istride, "Input stride"); non_token->add_option("--ostride", manual_params.ostride, "Output stride"); non_token->add_option("--idist", manual_params.idist, "Logical distance between input batches") ->default_val(0); non_token->add_option("--odist", manual_params.odist, "Logical distance between output batches") ->default_val(0); non_token->add_option("--ioffset", manual_params.ioffset, "Input offset"); non_token->add_option("--ooffset", manual_params.ooffset, "Output offset"); app.add_option("--isize", manual_params.isize, "Logical size of input buffer"); app.add_option("--osize", manual_params.osize, "Logical size of output buffer"); app.add_option("--R", ramgb, "RAM limit in GiB for tests") ->default_val(host_memory::singleton().get_total_gbytes()); app.add_option("--V", vramgb, "VRAM limit in GiB for tests")->default_val(0); app.add_option("--half_epsilon", half_epsilon)->default_val(9.77e-4); app.add_option("--single_epsilon", single_epsilon)->default_val(3.75e-5); app.add_option("--double_epsilon", double_epsilon)->default_val(1e-15); app.add_option("--skip_runtime_fails", skip_runtime_fails, "Skip the test if there is a runtime failure") ->default_val(true); app.add_option("-w, --wise", use_fftw_wisdom, "Use FFTW wisdom"); app.add_option("-W, --wisdomfile", fftw_wisdom_filename, "FFTW3 wisdom filename") ->default_val("wisdom3.txt"); 
app.add_option("--manual_devices", manual_devices, "Distribute manual test case among this many devices") ->default_val(1) ->check(CLI::PositiveNumber); app.add_option("--scalefactor", manual_params.scale_factor, "Scale factor to apply to output"); app.add_option("--repro-db", repro_db_path, "Database file full path name for bitwise reproducibility tests"); app.add_option("--precompile", precompile_file, "Precompile kernels to a file for all test cases before running tests") ->default_val(""); app.add_flag("--printtokens", printtokens, "Print test tokens to scheduled to be run and exit"); // Default value is set in fft_params.h based on if device-side PRNG was enabled. app.add_option("-g, --inputGen", manual_params.igen, "Input data generation:\n0) PRNG sequence (device)\n" "1) PRNG sequence (host)\n" "2) linearly-spaced sequence (device)\n" "3) linearly-spaced sequence (host)"); // Try parsing initial args that will be used to configure tests // Allow extras to pass on gtest arguments without error app.allow_extras(); try { app.parse(argc, argv); } catch(const CLI::ParseError& e) { return app.exit(e); } // extract remaining arguments for subsequent gtest initialization std::vector remaining_args = app.remaining(); std::string gtest_help_opt = "--help"; // NB: If we initialize gtest first, then it removes all of its own command-line // arguments and sets argc and argv correctly; std::vector gtest_argv; gtest_argv.insert(gtest_argv.begin(), argv[0]); for(std::string& s : remaining_args) { gtest_argv.push_back(&s[0]); } if(*opt_help) { // make sure gtest prints its help as well gtest_argv.push_back(>est_help_opt[0]); } gtest_argv.push_back(NULL); decltype(argc) gtest_argc = gtest_argv.size() - 1; ::testing::InitGoogleTest(>est_argc, gtest_argv.data()); // gtest-relevant args are removed if(*opt_help) { std::cout << app.help() << "\n"; return EXIT_SUCCESS; } // no help was used, gtest_argc is expected to be 1 at this point. 
If not, some of the // used options were not recognized at all if(gtest_argc > 1) { std::cout << "Unrecognised option(s) found:\n "; for(auto i = 1; i < gtest_argc; i++) std::cout << gtest_argv[i] << " "; std::cout << "\nRun with --help for more information.\n"; return EXIT_FAILURE; } std::cout << "half epsilon: " << half_epsilon << "\tsingle epsilon: " << single_epsilon << "\tdouble epsilon: " << double_epsilon << std::endl; std::cout << "Random seed: " << random_seed << std::endl; // If precompiling, tell rocFFT to use the specified cache file // to write kernels to // // But if our environment already has a cache file for RTC, then // we should just use that std::unique_ptr env_precompile; if(!precompile_file.empty() && rocfft_getenv("ROCFFT_RTC_CACHE_PATH").empty()) { env_precompile = std::make_unique("ROCFFT_RTC_CACHE_PATH", precompile_file.c_str()); } rocfft_setup(); { char v[256]; rocfft_get_version_string(v, 256); std::cout << "rocFFT version: " << v << std::endl; } #ifdef FFTW_MULTITHREAD fftw_init_threads(); fftwf_init_threads(); fftw_plan_with_nthreads(rocfft_concurrency()); fftwf_plan_with_nthreads(rocfft_concurrency()); #endif // Set host memory limit from command-line options host_memory::singleton().set_limit_gbytes(ramgb); std::cout << "Host memory limit: " << ramgb << " GiB" << std::endl; if(use_fftw_wisdom) { if(verbose) { std::cout << "Using " << fftw_wisdom_filename << " wisdom file\n"; } std::ifstream fftw_wisdom_file(fftw_wisdom_filename); std::string allwisdom = std::string(std::istreambuf_iterator(fftw_wisdom_file), std::istreambuf_iterator()); std::string fftw_wisdom; std::string fftwf_wisdom; bool load_wisdom = false; bool load_fwisdom = false; std::istringstream input; input.str(allwisdom); // Separate the single-precision and double-precision wisdom: for(std::string line; std::getline(input, line);) { if(line.rfind("(fftw", 0) == 0 && line.find("fftw_wisdom") != std::string::npos) { load_wisdom = true; } if(line.rfind("(fftw", 0) == 0 && 
line.find("fftwf_wisdom") != std::string::npos) { load_fwisdom = true; } if(load_wisdom) { fftw_wisdom.append(line + "\n"); } if(load_fwisdom) { fftwf_wisdom.append(line + "\n"); } if(line.rfind(")", 0) == 0) { load_wisdom = false; load_fwisdom = false; } } fftw_import_wisdom_from_string(fftw_wisdom.c_str()); fftwf_import_wisdom_from_string(fftwf_wisdom.c_str()); } if(!repro_db_path.empty()) repro_db.reset(new fft_hash_db(repro_db_path)); if(!test_token.empty()) { std::cout << "Reading fft params from token:\n" << test_token << std::endl; try { manual_params.from_token(test_token); } catch(...) { std::cout << "Unable to parse token." << std::endl; return EXIT_FAILURE; } } else { if(manual_params.length.empty()) { manual_params.length.push_back(8); // TODO: add random size? } if(manual_params.istride.empty()) { manual_params.istride.push_back(1); // TODO: add random size? } if(manual_params.ostride.empty()) { manual_params.ostride.push_back(1); // TODO: add random size? } } if(!precompile_file.empty()) precompile_test_kernels(precompile_file); if(printtokens) { std::cout << "Tokens:" << std::endl; const auto tokens = tokens_to_run(); for(const auto& token : tokens) { std::cout << token << std::endl; } return EXIT_SUCCESS; } auto retval = RUN_ALL_TESTS(); if(use_fftw_wisdom) { std::string fftw_wisdom = std::string(fftw_export_wisdom_to_string()); std::string fftwf_wisdom = std::string(fftwf_export_wisdom_to_string()); fftw_wisdom.append(std::string(fftwf_export_wisdom_to_string())); std::ofstream fftw_wisdom_file(fftw_wisdom_filename); fftw_wisdom_file << fftw_wisdom; fftw_wisdom_file << fftwf_wisdom; fftw_wisdom_file.close(); } rocfft_cleanup(); const auto test_duration = std::chrono::system_clock::now() - test_begin; const auto test_hours = std::chrono::duration_cast(test_duration); const auto test_minutes = std::chrono::duration_cast(test_duration - test_hours); std::cout << "Test suite took " << test_hours.count() << " hours " << test_minutes.count() << " 
minutes\n" << std::endl; std::cout << "half precision max l-inf epsilon: " << max_linf_eps_half << "\n"; std::cout << "half precision max l2 epsilon: " << max_l2_eps_half << "\n"; std::cout << "single precision max l-inf epsilon: " << max_linf_eps_single << "\n"; std::cout << "single precision max l2 epsilon: " << max_l2_eps_single << "\n"; std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << "\n"; std::cout << "double precision max l2 epsilon: " << max_l2_eps_double << "\n"; std::cout << "Number of runtime issues: " << n_hip_failures << "\n"; std::cout << "Number of successful tests: " << ::testing::UnitTest::GetInstance()->successful_test_count() << "\n"; std::cout << "Number of skipped tests: " << ::testing::UnitTest::GetInstance()->skipped_test_count() << "\n"; std::cout << "\nRandom seed: " << random_seed << std::endl; return retval; } TEST(manual, vs_fftw) // MANUAL TESTS HERE { rocfft_params params(manual_params); if(manual_devices > 1) { // just distribute along the slowest FFT dimension std::vector deviceGrid(params.length.size() + 1, 1); deviceGrid[1] = manual_devices; params.distribute_input(manual_devices, deviceGrid); params.distribute_output(manual_devices, deviceGrid); } // Run an individual test using the provided command-line parameters. 
params.validate(); std::cout << "Manual test:" << "\n\t" << params.str("\n\t") << "\n"; std::cout << "Token: " << params.token() << "\n"; if(!params.valid(verbose + 2)) { std::cout << "manual params are not valid\n"; } try { fft_vs_reference(params); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory allocation failure"; } catch(HOSTBUF_MEM_USAGE& e) { // explicitly clear test cache last_cpu_fft_data = last_cpu_fft_cache(); GTEST_SKIP() << e.msg; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } } TEST(manual, bitwise_reproducibility) // MANUAL TESTS HERE { if(repro_db == nullptr) GTEST_SKIP() << "A database file is required for this test." << std::endl; rocfft_params params(manual_params); // Run an individual test using the provided command-line parameters. params.validate(); std::cout << "Manual test:" << "\n\t" << params.str("\n\t") << "\n"; std::cout << "Token: " << params.token() << "\n"; if(!params.valid(verbose + 2)) { std::cout << "manual params are not valid\n"; } try { bitwise_repro(params); } catch(std::bad_alloc&) { GTEST_SKIP() << "host memory allocation failure"; } catch(ROCFFT_SKIP& e) { GTEST_SKIP() << e.msg; } catch(ROCFFT_FAIL& e) { GTEST_FAIL() << e.msg; } SUCCEED(); } rocFFT-rocm-7.1.0/clients/tests/hermitian_test.cpp000066400000000000000000000273451506652163400221510ustar00rootroot00000000000000// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "../../shared/accuracy_test.h"
#include "../../shared/gpubuf.h"
#include "../../shared/params_gen.h"
#include "../../shared/rocfft_params.h"
#include "../samples/rocfft/examplekernels.h"
#include "../samples/rocfft/exampleutils.h"
#include "rocfft/rocfft.h"
#include
#include
#include
#include
#include
#include
#include

// Check that a 1D double-precision, out-of-place complex-to-real transform of
// the given length produces bit-identical output for
//   * random complex input, and
//   * the same input with the imaginary part of the first element (and of the
//     last element when the length is even) forced to zero.
//
// Uses gtest ASSERT_*/EXPECT_* macros, so it must be called from a test body;
// a failed ASSERT returns from this function only.
void run_1D_hermitian_test(size_t length)
{
    // Run two 1D C2R transforms, on:
    // * random input
    // * identical random input, but modified to be Hermitian-symmetric
    // We should tolerate the input having a non-zero imaginary part in the DC mode
    // and the Nyquist frequency (if the length is even).

    rocfft_params p;
    p.length         = {length};
    p.precision      = fft_precision_double;
    p.transform_type = fft_transform_type_real_inverse;
    p.placement      = fft_placement_notinplace;
    p.validate();

    if(verbose)
    {
        std::cout << p.str("\n\t") << std::endl;
    }

    ASSERT_TRUE(p.valid(verbose));

    // Host input: every element gets random real and imaginary parts in [0, 1).
    // NOTE(review): seeded from std::random_device, so the data differs per
    // run and is not controlled by the suite's random_seed -- confirm that
    // this is intended.
    std::vector h_input(p.isize[0]);

    std::random_device rd;
    std::mt19937       gen(rd());
    std::uniform_real_distribution dis(0.0, 1.0);

    for(auto& val : h_input)
    {
        val.x = dis(gen);
        val.y = dis(gen);
    }

    if(verbose > 2)
    {
        std::cout << "non-Hermitian input:";
        for(const auto& val : h_input)
        {
            std::cout << " "
                      << "(" << val.x << ", " << val.y << ")";
        }
        std::cout << std::endl;
    }

    // Device buffers and first execution (unmodified input).
    gpubuf ibuf;
    ASSERT_TRUE(ibuf.alloc(p.ibuffer_sizes()[0]) == hipSuccess);
    ASSERT_TRUE(hipMemcpy(ibuf.data(), h_input.data(), ibuf.size(), hipMemcpyHostToDevice)
                == hipSuccess);

    gpubuf obuf;
    ASSERT_TRUE(obuf.alloc(p.obuffer_sizes()[0]) == hipSuccess);

    ASSERT_TRUE(p.create_plan() == fft_status_success);

    std::vector pibuf = {ibuf.data()};
    std::vector pobuf = {obuf.data()};

    ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success);

    std::vector h_output(p.osize[0]);
    ASSERT_TRUE(hipMemcpy(h_output.data(), obuf.data(), obuf.size(), hipMemcpyDeviceToHost)
                == hipSuccess);
    ASSERT_TRUE(hipDeviceSynchronize() == hipSuccess);

    if(verbose > 2)
    {
        std::cout << "output:";
        for(const auto& val : h_output)
        {
            std::cout << " " << val;
        }
        std::cout << std::endl;
    }

    std::vector h_input1(p.isize[0]);
    std::copy(h_input.begin(), h_input.end(), h_input1.begin());

    // Impose Hermitian symmetry on the input:
    // zero the imaginary part of element 0, and of the last element for even lengths.
    h_input1[0].y = 0.0;
    if(p.length[0] % 2 == 0)
    {
        h_input1.back().y = 0.0;
    }

    if(verbose > 2)
    {
        std::cout << "Hermitian input:";
        for(const auto& val : h_input1)
        {
            std::cout << " "
                      << "(" << val.x << ", " << val.y << ")";
        }
        std::cout << std::endl;
    }

    // Sanity check: the two inputs must actually differ, otherwise the
    // comparison of the outputs below would be meaningless.
    double maxdiff = 0.0;
    for(unsigned int i = 0; i < h_input.size(); ++i)
    {
        auto val = std::abs(
            rocfft_complex(h_input[i].x - h_input1[i].x, h_input[i].y - h_input1[i].y));
        if(val > maxdiff)
            maxdiff = val;
    }
    ASSERT_TRUE(maxdiff > 0.0);

    // Second execution, with the symmetrized input, reusing plan and buffers.
    ASSERT_TRUE(hipMemcpy(ibuf.data(), h_input1.data(), ibuf.size(), hipMemcpyHostToDevice)
                == hipSuccess);
    ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success);
    std::vector h_output1(p.osize[0]);
    ASSERT_TRUE(hipMemcpy(h_output1.data(), obuf.data(), obuf.size(), hipMemcpyDeviceToHost)
                == hipSuccess);

    if(verbose > 2)
    {
        std::cout << "output:";
        for(const auto& val : h_output1)
        {
            std::cout << " " << val;
        }
        std::cout << std::endl;
    }

    // Largest element-wise difference between the two outputs.
    double maxerr = 0;
    for(unsigned int i = 0; i < h_output.size(); ++i)
    {
        auto val = std::abs(h_output[i] - h_output1[i]);
        if(val > maxerr)
            maxerr = val;
    }

    if(verbose)
        std::cout << maxerr << std::endl;

    // The outputs are required to match exactly, not merely within a tolerance.
    EXPECT_TRUE(maxerr == 0.0);
}

// test a case that's small enough that it only needs one kernel
// NOTE(review): the test name says "single", but run_1D_hermitian_test sets
// fft_precision_double.
TEST(rocfft_UnitTest, 1D_hermitian_single_small)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }
    run_1D_hermitian_test(8);
}

// test a case that's big enough that it needs multiple kernels
// NOTE(review): the test name says "single", but run_1D_hermitian_test sets
// fft_precision_double.
TEST(rocfft_UnitTest, 1D_hermitian_single_large)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }
    run_1D_hermitian_test(8192);
}

// Join the elements of [begin, end) into one string, separated by ", ".
// Returns an empty string for an empty range.
template
std::string str(T begin, T end)
{
    std::stringstream ss;
    bool              first = true;
    for(; begin != end; begin++)
    {
        if(!first)
            ss << ", ";
        ss << *begin;
        first = false;
    }
    return ss.str();
}

// Test that the GPU Hermitian symmetrizer code produces the correct results.
//
// Two passes over the same list of 1D/2D/3D lengths:
//   1. data produced by init_hermitiancomplex_cm must pass check_symmetry_cm,
//      while random data must fail it;
//   2. applying impose_hermitian_symmetry_cm to the output of a real forward
//      transform must change it by less than 1e-13.
TEST(rocfft_UnitTest, gpu_symmetrizer)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }

    // Mix of odd and even extents in every position, for ranks one to three.
    std::vector> lengths = {{4, 4, 3},
                            {5},
                            {8},
                            {5, 5},
                            {5, 8},
                            {8, 5},
                            {8, 8},
                            {5, 5, 5},
                            {8, 5, 5},
                            {5, 8, 5},
                            {5, 5, 8},
                            {5, 8, 8},
                            {8, 5, 8},
                            {8, 8, 5},
                            {8, 8, 8}};

    for(const auto& length : lengths)
    {
        // Symmetrize complex data and ensure that the checker sees that it's symmetric.

        // Use the params class to set up strides and lengths:
        rocfft_params p;
        p.length         = length;
        p.precision      = fft_precision_double;
        p.transform_type = fft_transform_type_real_inverse;
        p.placement      = fft_placement_notinplace;
        p.validate();

        if(verbose)
        {
            std::cout << "\t" << p.str("\n\t") << std::endl;
        }

        ASSERT_TRUE(p.valid(verbose));

        // Data buffers:
        gpubuf buf;
        ASSERT_TRUE(buf.alloc(sizeof(hipDoubleComplex) * p.isize[0]) == hipSuccess);
        std::vector hbuf(p.isize[0]);

        // Initialize a Hermitian-symmetric array; it should be symmetric.
        init_hermitiancomplex_cm(p.length_cm(), p.ilength_cm(), p.istride_cm(), buf.data());
        ASSERT_TRUE(hipMemcpy(hbuf.data(), buf.data(), buf.size(), hipMemcpyDeviceToHost)
                    == hipSuccess);
        if(verbose > 1)
        {
            printbuffer_cm(hbuf, p.ilength_cm(), p.istride_cm(), p.nbatch, p.idist);
        }
        EXPECT_TRUE(
            check_symmetry_cm(hbuf, p.length_cm(), p.istride_cm(), p.nbatch, p.idist, verbose > 0))
            << "length: " << str(length.begin(), length.end());

        // This should not be symmetric:
        // (fixed seed, so the same random data is used on every run)
        std::mt19937_64 rng;
        std::seed_seq   ss{uint32_t(10)};
        rng.seed(ss);
        std::uniform_real_distribution unif(0, 1);
        for(auto& v : hbuf)
        {
            v.x = unif(rng);
            v.y = unif(rng);
        }
        if(verbose > 2)
        {
            printbuffer_cm(hbuf, p.ilength_cm(), p.istride_cm(), p.nbatch, p.idist);
        }
        EXPECT_TRUE(
            !check_symmetry_cm(hbuf, p.length_cm(), p.istride_cm(), p.nbatch, p.idist, false))
            << "length: " << str(length.begin(), length.end());
    }

    for(const auto& length : lengths)
    {
        // Generate Hermitian-symmetric data and ensure that applying the symmetrizer has no effect.
        rocfft_params p;
        p.length         = length;
        p.precision      = fft_precision_double;
        p.transform_type = fft_transform_type_real_forward;
        p.placement      = fft_placement_notinplace;
        p.validate();

        if(verbose)
        {
            std::cout << "\t" << p.str("\n\t") << std::endl;
        }

        ASSERT_TRUE(p.valid(verbose));
        ASSERT_TRUE(p.create_plan() == fft_status_success);

        gpubuf ibuf, obuf;
        ASSERT_TRUE(ibuf.alloc(p.ibuffer_sizes()[0]) == hipSuccess);
        ASSERT_TRUE(obuf.alloc(p.obuffer_sizes()[0]) == hipSuccess);

        // Real input on the device; the forward transform produces the data
        // that the symmetrizer is then applied to.
        initreal_cm(p.length_cm(), p.istride_cm(), ibuf.data());

        std::vector pibuf = {ibuf.data()};
        std::vector pobuf = {obuf.data()};
        ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success);

        // Snapshot of the transform output before symmetrization.
        std::vector h_output(p.osize[0]);
        std::fill(h_output.begin(), h_output.end(), hipDoubleComplex{0.0, 0.0});
        ASSERT_TRUE(
            hipMemcpy(h_output.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost)
            == hipSuccess);

        // Symmetrize in place on the device, then take a second snapshot.
        impose_hermitian_symmetry_cm(p.length_cm(), p.olength_cm(), p.ostride_cm(), obuf.data());

        std::vector h_output_resym(p.osize[0]);
        std::fill(h_output_resym.begin(), h_output_resym.end(), hipDoubleComplex{0.0, 0.0});
        ASSERT_TRUE(
            hipMemcpy(
                h_output_resym.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost)
            == hipSuccess);

        // Largest component-wise change caused by the symmetrizer.
        double maxdiff = 0;
        for(unsigned int i = 0; i < h_output.size(); ++i)
        {
            auto rdiff = std::abs(h_output[i].x - h_output_resym[i].x);
            auto idiff = std::abs(h_output[i].y - h_output_resym[i].y);
            maxdiff    = std::max({maxdiff, rdiff, idiff});
        }

        if(verbose)
        {
            std::cout << "maxdiff: " << maxdiff << std::endl;
        }
        if(verbose > 2)
        {
            std::cout << "before symmetrization:\n";
            printbuffer_cm(h_output, p.olength_cm(), p.ostride_cm(), p.nbatch, p.odist);
            std::cout << "after symmetrization:\n";
            printbuffer_cm(h_output_resym, p.olength_cm(), p.ostride_cm(), p.nbatch, p.odist);
        }

        EXPECT_TRUE(maxdiff < 1e-13) << maxdiff << "\n" << p.str() << "\n";
    }
}
rocFFT-rocm-7.1.0/clients/tests/hipGraph_test.cpp000066400000000000000334631506652163400217310ustar00rootroot00000000000000//
Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h"
#include "../../shared/arithmetic.h"
#include "../../shared/gpubuf.h"
#include "../../shared/hip_object_wrapper.h"
#include "../../shared/params_gen.h"
#include "../../shared/rocfft_against_fftw.h"
#include "../../shared/rocfft_params.h"
#include "rocfft/rocfft.h"
// System headers for the names this file uses directly.
#include <algorithm>
#include <cmath>
#include <gtest/gtest.h>
#include <hip/hip_runtime.h>
#include <new>
#include <random>
#include <vector>

// Threads per block used by every helper kernel launch in this file.
static const unsigned int KERNEL_THREADS = 64;

// Multiply both components of the first `length` complex elements by `scale`.
__global__ void scale_data_kernel(rocfft_complex<float>* data, size_t length, float scale)
{
    const auto idx = blockIdx.x * blockDim.x + threadIdx.x;

    // the grid is rounded up to whole blocks, so guard the tail
    if(idx < length)
    {
        data[idx].x *= scale;
        data[idx].y *= scale;
    }
}

// Add `offset` component-wise to the first `length` elements of a
// complex-like array (T must expose .x and .y).
template <typename T>
__global__ void offset_data_kernel_complex(T* data, size_t length, T offset)
{
    const auto idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx < length)
    {
        data[idx].x += offset.x;
        data[idx].y += offset.y;
    }
}

// Add `offset` to the first `length` elements of a scalar array.
template <typename T>
__global__ void offset_data_kernel_real(T* data, size_t length, T offset)
{
    const auto idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx < length)
    {
        data[idx] += offset;
    }
}

// Fill host_data with N pseudo-random complex values in [-1, 1) generated
// from `seed`, allocate device_data and copy the values to the device.
// Throws std::bad_alloc if the device allocation fails.
static void init_input_data(size_t                              N,
                            size_t                              seed,
                            std::vector<rocfft_complex<float>>& host_data,
                            gpubuf_t<rocfft_complex<float>>&    device_data)
{
    std::minstd_rand                      gen(seed);
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);

    host_data.resize(N);
    for(size_t i = 0; i < N; i++)
    {
        host_data[i].x = dist(gen);
        host_data[i].y = dist(gen);
    }

    size_t Nbytes = N * sizeof(rocfft_complex<float>);

    if(device_data.alloc(Nbytes) != hipSuccess)
        throw std::bad_alloc();

    EXPECT_EQ(hipMemcpy(device_data.data(), host_data.data(), Nbytes, hipMemcpyHostToDevice),
              hipSuccess);
}

// Fill host_data with N copies of init_val, allocate device_data and copy
// the values to the device.  Throws std::bad_alloc if allocation fails.
template <typename T>
static void init_data(size_t N, T init_val, std::vector<T>& host_data, gpubuf_t<T>& device_data)
{
    host_data.resize(N);
    std::fill(host_data.begin(), host_data.end(), init_val);

    size_t Nbytes = N * sizeof(T);

    if(device_data.alloc(Nbytes) != hipSuccess)
        throw std::bad_alloc();

    EXPECT_EQ(hipMemcpy(device_data.data(), host_data.data(), Nbytes, hipMemcpyHostToDevice),
              hipSuccess);
}

// Create a 1D, length-N, single-precision, out-of-place complex forward plan.
// Uses ASSERT_*, so callers should check for fatal failure afterwards.
static void create_forward_fft_plan(size_t N, rocfft_plan& plan)
{
    auto                dim = 1;
    std::vector<size_t> lengths(dim, N);

    ASSERT_EQ(rocfft_plan_create(&plan,
                                 rocfft_placement_notinplace,
                                 rocfft_transform_type_complex_forward,
                                 rocfft_precision_single,
                                 dim,
                                 lengths.data(),
                                 1,
                                 nullptr),
              rocfft_status_success);
}

// Create a 1D, length-N, single-precision, in-place complex inverse plan.
// Uses ASSERT_*, so callers should check for fatal failure afterwards.
static void create_inverse_fft_plan(size_t N, rocfft_plan& plan_inv)
{
    auto                dim = 1;
    std::vector<size_t> lengths(dim, N);

    ASSERT_EQ(rocfft_plan_create(&plan_inv,
                                 rocfft_placement_inplace,
                                 rocfft_transform_type_complex_inverse,
                                 rocfft_precision_single,
                                 dim,
                                 lengths.data(),
                                 1,
                                 nullptr),
              rocfft_status_success);
}

// Create an execution info object and bind it to `stream`.
// The caller owns `info` and must destroy it.
static void set_fft_info(hipStream_t stream, rocfft_execution_info& info)
{
    EXPECT_EQ(rocfft_execution_info_create(&info), rocfft_status_success);
    EXPECT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success);
}

// Execute the (out-of-place) forward plan from in_ptr to out_ptr.
static void run_forward_fft(rocfft_execution_info info,
                            const rocfft_plan     plan,
                            void*                 in_ptr,
                            void*                 out_ptr)
{
    ASSERT_EQ(rocfft_execute(plan, &in_ptr, &out_ptr, info), rocfft_status_success);
}

// Execute the inverse plan in-place on in_ptr; the TEST below passes
// nullptr for out_ptr.
static void run_inverse_fft(rocfft_execution_info info,
                            const rocfft_plan     plan_inv,
                            void*                 in_ptr,
                            void*                 out_ptr)
{
    // Execute inverse plan in-place
    ASSERT_EQ(rocfft_execute(plan_inv, &in_ptr, &out_ptr, info), rocfft_status_success);
}

// Enqueue scale_data_kernel on `stream` over N complex elements.
static void scale_device_data(hipStream_t stream, float scale, size_t N, rocfft_complex<float>* data)
{
    auto blockSize = KERNEL_THREADS;
    auto numBlocks = DivRoundingUp<size_t>(N, blockSize);

    hipLaunchKernelGGL(scale_data_kernel,
                       dim3(numBlocks),
                       dim3(blockSize),
                       0, // sharedMemBytes
                       stream, // stream
                       data,
                       N,
                       scale);
}

// Enqueue offset_data_kernel_real on `stream` over N scalar elements.
template <typename T>
static void offset_device_data_real(hipStream_t stream, T offset, size_t N, T* data)
{
    auto blockSize = KERNEL_THREADS;
    auto numBlocks = DivRoundingUp<size_t>(N, blockSize);

    hipLaunchKernelGGL(offset_data_kernel_real<T>,
                       dim3(numBlocks),
                       dim3(blockSize),
                       0, // sharedMemBytes
                       stream, // stream
                       data,
                       N,
                       offset);
}

// Enqueue offset_data_kernel_complex on `stream` over N complex elements.
template <typename T>
static void offset_device_data_complex(hipStream_t stream, T offset, size_t N, T* data)
{
    auto blockSize = KERNEL_THREADS;
    auto numBlocks = DivRoundingUp<size_t>(N, blockSize);

    hipLaunchKernelGGL(offset_data_kernel_complex<T>,
                       dim3(numBlocks),
                       dim3(blockSize),
                       0, // sharedMemBytes
                       stream, // stream
                       data,
                       N,
                       offset);
}

// Copy device_data back to the host on other_stream and require that it
// matches host_data element-for-element.
template <typename T>
static void compare_data_exact_match(hipStream_t           other_stream,
                                     const std::vector<T>& host_data,
                                     const gpubuf_t<T>&    device_data)
{
    std::vector<T> host_data_compare(host_data.size());

    // Copy result back to host
    ASSERT_EQ(hipMemcpyAsync(host_data_compare.data(),
                             device_data.data(),
                             host_data_compare.size() * sizeof(T),
                             hipMemcpyDeviceToHost,
                             other_stream),
              hipSuccess);

    ASSERT_EQ(hipStreamSynchronize(other_stream), hipSuccess);

    ASSERT_EQ(host_data == host_data_compare, true);
}

// Copy modified_device_data back to the host and check that it matches
// original_host_data within the tolerance of a forward+inverse round trip.
static void compare_data(const std::vector<rocfft_complex<float>>& original_host_data,
                         const gpubuf_t<rocfft_complex<float>>&    modified_device_data)
{
    std::vector<rocfft_complex<float>> modified_host_data(original_host_data.size());

    // Copy result back to host
    ASSERT_EQ(hipMemcpy(modified_host_data.data(),
                        modified_device_data.data(),
                        modified_host_data.size() * sizeof(rocfft_complex<float>),
                        hipMemcpyDeviceToHost),
              hipSuccess);

    // Compare data we got to the original.
    // We're running 2 transforms (forward+inverse), so we
    // should tolerate 2x the error of a single transform.
    const double MAX_TRANSFORM_ERROR = 2 * type_epsilon<float>();

    auto input_norm
        = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(original_host_data.data()),
                       original_host_data.size(),
                       1,
                       1,
                       original_host_data.size(),
                       {0});
    auto diff = distance_1to1_complex(
        reinterpret_cast<const rocfft_complex<float>*>(original_host_data.data()),
        reinterpret_cast<const rocfft_complex<float>*>(modified_host_data.data()),
        // data is all contiguous, we can treat it as 1d
        original_host_data.size(),
        1,
        1,
        original_host_data.size(),
        1,
        modified_host_data.size(),
        nullptr,
        MAX_TRANSFORM_ERROR,
        {0},
        {0});

    EXPECT_LT(diff.l_2 / input_norm.l_2,
              sqrt(log2(original_host_data.size())) * MAX_TRANSFORM_ERROR);
    EXPECT_LT(diff.l_inf / input_norm.l_inf,
              log2(original_host_data.size()) * MAX_TRANSFORM_ERROR);
}

// Capture a sequence of helper kernels plus a forward and an inverse FFT
// into a HIP graph, launch the graph several times, and check both the
// round-tripped data and a per-launch counter.
TEST(rocfft_UnitTest, hipGraph_execution)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }

    hipGraph_t     graph      = nullptr;
    hipGraphExec_t graph_exec = nullptr;

    size_t N                   = 256;
    size_t seed                = 100;
    auto   offset_1            = rocfft_complex<float>{.1, .1};
    auto   offset_2            = rocfft_complex<float>{-.1, -.1};
    float  scale               = 2.2;
    float  inv_scale           = 1. / scale;
    auto   output_init_val     = rocfft_complex<float>(0., 0.);
    size_t num_kernel_launches = 100;
    size_t num_graph_launches  = 10;

    gpubuf_t<rocfft_complex<float>>    device_mem_in;
    std::vector<rocfft_complex<float>> host_mem_in;
    init_input_data(N, seed, host_mem_in, device_mem_in);
    rocfft_complex<float>* in_ptr = static_cast<rocfft_complex<float>*>(device_mem_in.data());

    gpubuf_t<rocfft_complex<float>>    device_mem_out;
    std::vector<rocfft_complex<float>> host_mem_out;
    init_data<rocfft_complex<float>>(N, output_init_val, host_mem_out, device_mem_out);
    rocfft_complex<float>* out_ptr = static_cast<rocfft_complex<float>*>(device_mem_out.data());

    gpubuf_t<size_t>    device_mem_counter;
    std::vector<size_t> host_mem_counter;
    init_data<size_t>(N, 0, host_mem_counter, device_mem_counter);
    size_t* counter_ptr = static_cast<size_t*>(device_mem_counter.data());

    // The plan helpers use ASSERT_* internally, which only returns from the
    // helper; stop here if plan creation failed instead of using a bad handle.
    rocfft_plan plan = nullptr;
    ASSERT_NO_FATAL_FAILURE(create_forward_fft_plan(N, plan));

    rocfft_plan plan_inv = nullptr;
    ASSERT_NO_FATAL_FAILURE(create_inverse_fft_plan(N, plan_inv));

    EXPECT_EQ(hipDeviceSynchronize(), hipSuccess);

    hipStream_wrapper_t stream;
    hipStream_wrapper_t other_stream;
    stream.alloc();
    other_stream.alloc();

    ASSERT_EQ(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal), hipSuccess);

    rocfft_execution_info info = nullptr;
    set_fft_info(stream, info);

    // add offset to device input data
    for(size_t i = 0; i < num_kernel_launches; ++i)
        offset_device_data_complex<rocfft_complex<float>>(stream, offset_1, N, in_ptr);

    // back out the offsets
    for(size_t i = 0; i < num_kernel_launches; ++i)
        offset_device_data_complex<rocfft_complex<float>>(stream, offset_2, N, in_ptr);

    // scale the device input data
    scale_device_data(stream, scale, N, in_ptr);

    // backout the scale
    scale_device_data(stream, inv_scale, N, in_ptr);

    // run forward transform on input data
    run_forward_fft(info, plan, in_ptr, out_ptr);

    // scale the device output data
    scale_device_data(stream, scale, N, out_ptr);

    // backout the scale
    scale_device_data(stream, inv_scale, N, out_ptr);

    // run (in-place) inverse transform on output data
    run_inverse_fft(info, plan_inv, out_ptr, nullptr);

    // normalize results of an inverse transform, so it can be directly
    // compared to the original data before the forward transform
    auto inv_scale_N = 1. / static_cast<float>(N);
    scale_device_data(stream, inv_scale_N, N, out_ptr);

    // add offset to device output data
    for(size_t i = 0; i < num_kernel_launches; ++i)
        offset_device_data_complex<rocfft_complex<float>>(stream, offset_1, N, out_ptr);

    // back out the offsets
    for(size_t i = 0; i < num_kernel_launches; ++i)
        offset_device_data_complex<rocfft_complex<float>>(stream, offset_2, N, out_ptr);

    // increment counter
    offset_device_data_real<size_t>(stream, 1, N, counter_ptr);

    ASSERT_EQ(hipStreamEndCapture(stream, &graph), hipSuccess);

    // make sure no actual work has been done for
    // the captured stream before graph execution
    compare_data_exact_match<rocfft_complex<float>>(other_stream, host_mem_out, device_mem_out);

    ASSERT_EQ(hipGraphInstantiate(&graph_exec, graph, NULL, NULL, 0), hipSuccess);
    ASSERT_EQ(hipGraphDestroy(graph), hipSuccess);

    for(size_t i = 0; i < num_graph_launches; ++i)
        ASSERT_EQ(hipGraphLaunch(graph_exec, stream), hipSuccess);

    ASSERT_EQ(hipStreamSynchronize(stream), hipSuccess);

    // All launches have completed: release the executable graph and the
    // rocFFT handles so the test does not leak them.
    EXPECT_EQ(hipGraphExecDestroy(graph_exec), hipSuccess);
    graph_exec = nullptr;
    EXPECT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success);
    info = nullptr;
    EXPECT_EQ(rocfft_plan_destroy(plan), rocfft_status_success);
    plan = nullptr;
    EXPECT_EQ(rocfft_plan_destroy(plan_inv), rocfft_status_success);
    plan_inv = nullptr;

    stream.free();

    // check for correctness of the output data
    compare_data(host_mem_in, device_mem_out);

    // check for correctness of the counter
    // incremented with multiple graph executions
    std::vector<size_t> host_mem_counter_modified(N);
    std::fill(host_mem_counter_modified.begin(), host_mem_counter_modified.end(), num_graph_launches);
    compare_data_exact_match<size_t>(other_stream, host_mem_counter_modified, device_mem_counter);

    other_stream.free();
}
rocFFT-rocm-7.1.0/clients/tests/multi_device_test.cpp000066400000000000000000000440331506652163400226330ustar00rootroot00000000000000// Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/accuracy_test.h"
#include "../../shared/params_gen.h"
#include "../../shared/rocfft_params.h"
// System headers for the names this file uses directly.
#include <algorithm>
#include <cmath>
#include <gtest/gtest.h>
#include <hip/hip_runtime_api.h>
#include <stdexcept>
#include <vector>

extern fft_params::fft_mp_lib mp_lib;
extern int                    mp_ranks;

// Problem sizes (1D, 2D, 3D) used for the generated multi-GPU cases.
static const std::vector<std::vector<size_t>> multi_gpu_sizes = {
    {256},
    {256, 256},
    {256, 256, 256},
};

enum SplitType
{
    // split both input and output on slow FFT dimension
    SLOW_INOUT,
    // split only input on slow FFT dimension, output is not split
    SLOW_IN,
    // split only output on slow FFT dimension, input is not split
    SLOW_OUT,
    // split input on slow FFT dimension, and output on fast FFT dimension
    SLOW_IN_FAST_OUT,
    // 3D pencil decomposition - one dimension is contiguous on input
    // and another dimension contiguous on output, remaining dims are
    // both split
    PENCIL_3D,
};

// Number of local devices to distribute bricks over.
//
// A positive requested_gpus (the command-line option) takes priority.
// Otherwise the HIP device count is queried and capped at 16.
// Throws std::runtime_error if the device count cannot be queried.
static int multi_gpu_local_device_count(const int requested_gpus)
{
    if(requested_gpus > 0)
        return requested_gpus;

    int localDeviceCount = 0;
    if(hipGetDeviceCount(&localDeviceCount) != hipSuccess)
    {
        throw std::runtime_error("hipGetDeviceCount failed");
    }

    // Limit local device testing to 16 GPUs, as we have some
    // bottlenecks with larger device counts that unreasonably slow
    // down plan creation
    return std::min(16, localDeviceCount);
}

// Generate the complex and real test cases for multi_gpu_sizes and
// distribute their input/output over bricks according to `type`.
// Returns an empty list when there is nothing to test (fewer than two
// local devices and no multi-process library).
std::vector<fft_params> param_generator_multi_gpu(const SplitType type, const int ngpus)
{
    const int localDeviceCount = multi_gpu_local_device_count(ngpus);

    // need multiple devices or multiprocessing to test anything
    if(localDeviceCount < 2 && mp_lib == fft_params::fft_mp_lib_none)
        return {};

    auto params_complex = param_generator_complex(test_prob,
                                                  multi_gpu_sizes,
                                                  precision_range_sp_dp,
                                                  {4, 1},
                                                  stride_generator({{1}}),
                                                  stride_generator({{1}}),
                                                  {{0, 0}},
                                                  {{0, 0}},
                                                  {fft_placement_inplace, fft_placement_notinplace},
                                                  false);

    auto params_real = param_generator_real(test_prob,
                                            multi_gpu_sizes,
                                            precision_range_sp_dp,
                                            {4, 1},
                                            stride_generator({{1}}),
                                            stride_generator({{1}}),
                                            {{0, 0}},
                                            {{0, 0}},
                                            {fft_placement_notinplace},
                                            false);

    std::vector<fft_params> all_params;

    auto distribute_params = [=, &all_params](const std::vector<fft_params>& params) {
        // one brick per local device, or one per rank when multi-process
        int brickCount = mp_lib == fft_params::fft_mp_lib_none ? localDeviceCount : mp_ranks;
        for(auto& p : params)
        {
            // start with all-ones in grids
            // NOTE(review): grid element type assumed to be unsigned int -
            // confirm against fft_params::distribute_input/distribute_output
            std::vector<unsigned int> input_grid(p.length.size() + 1, 1);
            std::vector<unsigned int> output_grid(p.length.size() + 1, 1);

            auto p_dist = p;
            switch(type)
            {
            case SLOW_INOUT:
                input_grid[1]  = brickCount;
                output_grid[1] = brickCount;
                break;
            case SLOW_IN:
                // this type only specifies input field and no output
                // field, but multi-process transforms require both
                // fields.
                if(mp_lib != fft_params::fft_mp_lib_none)
                    continue;
                input_grid[1] = brickCount;
                break;
            case SLOW_OUT:
                // this type only specifies output field and no input
                // field, but multi-process transforms require both
                // fields.
                if(mp_lib != fft_params::fft_mp_lib_none)
                    continue;
                output_grid[1] = brickCount;
                break;
            case SLOW_IN_FAST_OUT:
                // requires at least rank-2 FFT
                if(p.length.size() < 2)
                    continue;
                input_grid[1]      = brickCount;
                output_grid.back() = brickCount;
                break;
            case PENCIL_3D:
                // need at least 2 bricks per split dimension, or 4 devices.
                // also needs to be a 3D problem.
                if(brickCount < 4 || p.length.size() != 3)
                    continue;

                // make fast dimension contiguous on input
                input_grid[1] = static_cast<unsigned int>(sqrt(brickCount));
                input_grid[2] = brickCount / input_grid[1];
                // make middle dimension contiguous on output
                output_grid[1] = input_grid[1];
                output_grid[3] = input_grid[2];
                break;
            }

            p_dist.mp_lib = mp_lib;
            p_dist.distribute_input(localDeviceCount, input_grid);
            p_dist.distribute_output(localDeviceCount, output_grid);

            // "placement" flag is meaningless if exactly one of
            // input+output is a field.  So just add those cases if
            // the flag is "out-of-place", since "in-place" is
            // exactly the same test case.
            if(p_dist.placement == fft_placement_inplace
               && p_dist.ifields.empty() != p_dist.ofields.empty())
                continue;
            all_params.push_back(std::move(p_dist));
        }
    };

    distribute_params(params_complex);
    distribute_params(params_real);

    return all_params;
}

// split both input and output on slowest FFT dim
INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_dim,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(SLOW_INOUT, ngpus)),
                         accuracy_test::TestName);

// split slowest FFT dim only on input, or only on output
INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_input_dim,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN, ngpus)),
                         accuracy_test::TestName);
INSTANTIATE_TEST_SUITE_P(multi_gpu_slowest_output_dim,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(SLOW_OUT, ngpus)),
                         accuracy_test::TestName);

// split input on slowest FFT and output on fastest, to minimize data
// movement (only makes sense for rank-2 and higher FFTs)
INSTANTIATE_TEST_SUITE_P(multi_gpu_slowin_fastout,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(SLOW_IN_FAST_OUT, ngpus)),
                         accuracy_test::TestName);

// 3D pencil decompositions
INSTANTIATE_TEST_SUITE_P(multi_gpu_3d_pencils,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu(PENCIL_3D, ngpus)),
                         accuracy_test::TestName);

// Corrupt one brick of every generated multi-GPU case and check that
// setting up the rocFFT structures rejects the invalid layout.
TEST(multi_gpu_validate, catch_validation_errors)
{
    const auto all_split_types = {
        SLOW_INOUT,
        SLOW_IN,
        SLOW_OUT,
        SLOW_IN_FAST_OUT,
        PENCIL_3D,
    };

    for(auto type : all_split_types)
    {
        // gather all of the multi-GPU test cases
        auto params = param_generator_multi_gpu(type, ngpus);

        for(size_t i = 0; i < params.size(); ++i)
        {
            auto& param = params[i];

            // this validation runs in rocfft-test itself and
            // multi-process libs are not initialized.
            if(param.mp_lib != fft_params::fft_mp_lib_none)
                continue;

            std::vector<fft_params::fft_field*> available_fields;
            if(!param.ifields.empty())
                available_fields.push_back(&param.ifields.front());
            if(!param.ofields.empty())
                available_fields.push_back(&param.ofields.front());

            // nothing to corrupt; also avoids a modulo by zero below
            if(available_fields.empty())
                continue;

            // get iterator to the brick we will modify
            auto field = available_fields[i % available_fields.size()];
            if(field->bricks.empty())
                continue;
            auto brick_iter = field->bricks.begin() + i % field->bricks.size();

            // iterate through the 5 cases we want to test:
            switch(i % 5)
            {
            case 0:
            {
                // missing brick
                field->bricks.erase(brick_iter);
                break;
            }
            case 1:
            {
                // a brick's lower index too small by one
                size_t& index = brick_iter->lower[i % brick_iter->lower.size()];
                // don't worry about underflow since that should also
                // produce an invalid brick layout
                --index;
                break;
            }
            case 2:
            {
                // a brick's lower index too large by one
                size_t& index = brick_iter->lower[i % brick_iter->lower.size()];
                ++index;
                break;
            }
            case 3:
            {
                // a brick's upper index too small by one
                size_t& index = brick_iter->upper[i % brick_iter->upper.size()];
                // don't worry about underflow since that should also
                // produce an invalid brick layout
                --index;
                break;
            }
            case 4:
            {
                // a brick's upper index too large by one
                size_t& index = brick_iter->upper[i % brick_iter->upper.size()];
                ++index;
                break;
            }
            }

            rocfft_params rparam{param};
            // brick layout is invalid, so this should fail
            try
            {
                rparam.setup_structs();
            }
            catch(std::runtime_error&)
            {
                continue;
            }
            // didn't get an exception, fail the test
            GTEST_FAIL() << "invalid brick layout " << rparam.token()
                         << " should have failed, but plan was created successfully";
        }
    }
}

// Hand-written problem tokens for layouts the generator above does not produce.
static const auto multi_gpu_tokens = {
    // clang-format off
    // input bricks are not contiguous
    "real_forward_len_160_160_160_single_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_80_160_160_stride_0_25920_162_1_dev_0_brick_lower_0_80_0_0_upper_1_160_160_160_stride_0_25920_162_1_rank_1_dev_1_ofield_brick_lower_0_0_0_0_upper_1_160_80_81_stride_0_6480_81_1_dev_0_brick_lower_0_0_80_0_upper_1_160_160_81_stride_0_6480_81_1_rank_1_dev_1",
    // output bricks are not contiguous
    "real_forward_len_160_160_160_single_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_80_160_160_stride_0_25600_160_1_dev_0_brick_lower_0_80_0_0_upper_1_160_160_160_stride_0_25600_160_1_rank_1_dev_1_ofield_brick_lower_0_0_0_0_upper_1_160_80_81_stride_0_6560_82_1_dev_0_brick_lower_0_0_80_0_upper_1_160_160_81_stride_0_6560_82_1_rank_1_dev_1",
    // neither input nor output bricks are contiguous
    "real_forward_len_160_160_160_single_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_80_160_160_stride_0_25920_162_1_dev_0_brick_lower_0_80_0_0_upper_1_160_160_160_stride_0_25920_162_1_rank_1_dev_1_ofield_brick_lower_0_0_0_0_upper_1_160_80_81_stride_0_6560_82_1_dev_0_brick_lower_0_0_80_0_upper_1_160_160_81_stride_0_6560_82_1_rank_1_dev_1",
    // 1D multi-process batched in-place transform using 1 device per rank
    "complex_forward_len_256_double_ip_batch_4_ifield_brick_lower_0_0_upper_4_128_stride_128_1_dev_0_brick_lower_0_128_upper_4_256_stride_128_1_rank_1_dev_1_ofield_brick_lower_0_0_upper_4_128_stride_128_1_dev_0_brick_lower_0_128_upper_4_256_stride_128_1_rank_1_dev_1",
    // 2D multi-process out-of-place transform using 2 MPI ranks each with 2 GPUs
    "complex_forward_len_128_256_single_op_batch_1_ifield_brick_lower_0_0_0_upper_1_128_64_stride_8192_64_1_dev_0_brick_lower_0_0_64_upper_1_128_128_stride_8192_64_1_rank_1_dev_1_brick_lower_0_0_128_upper_1_128_192_stride_8192_64_1_rank_0_dev_2_brick_lower_0_0_192_upper_1_128_256_stride_8192_64_1_rank_1_dev_3_ofield_brick_lower_0_0_0_upper_1_128_64_stride_8192_64_1_dev_0_brick_lower_0_0_64_upper_1_128_128_stride_8192_64_1_rank_1_dev_1_brick_lower_0_0_128_upper_1_128_192_stride_8192_64_1_rank_0_dev_2_brick_lower_0_0_192_upper_1_128_256_stride_8192_64_1_rank_1_dev_3",
    // 3D multi-process out-of-place transform using 2 MPI ranks each with 2 GPUs
    "complex_forward_len_256_256_256_double_op_batch_1_ifield_brick_lower_0_0_0_0_upper_1_64_256_256_stride_4194304_65536_256_1_dev_0_brick_lower_0_64_0_0_upper_1_128_256_256_stride_4194304_65536_256_1_rank_0_dev_1_brick_lower_0_128_0_0_upper_1_192_256_256_stride_4194304_65536_256_1_rank_1_dev_2_brick_lower_0_192_0_0_upper_1_256_256_256_stride_4194304_65536_256_1_rank_1_dev_3_ofield_brick_lower_0_0_0_0_upper_1_256_256_64_stride_4194304_16384_64_1_dev_0_brick_lower_0_0_0_64_upper_1_256_256_128_stride_4194304_16384_64_1_rank_0_dev_1_brick_lower_0_0_0_128_upper_1_256_256_192_stride_4194304_16384_64_1_rank_1_dev_2_brick_lower_0_0_0_192_upper_1_256_256_256_stride_4194304_16384_64_1_rank_1_dev_3",
    // 3D multi-process batched in-place transform using 2 MPI ranks each with 2 GPUs
    // NOTE(review): the token below says "_op_" (out-of-place) while the
    // comment above says in-place - confirm which one is intended.
    "complex_forward_len_128_300_256_single_op_batch_4_ifield_brick_lower_0_0_0_0_upper_4_32_300_256_stride_2457600_76800_256_1_dev_0_brick_lower_0_32_0_0_upper_4_64_300_256_stride_2457600_76800_256_1_rank_1_dev_1_brick_lower_0_64_0_0_upper_4_96_300_256_stride_2457600_76800_256_1_rank_0_dev_2_brick_lower_0_96_0_0_upper_4_128_300_256_stride_2457600_76800_256_1_rank_1_dev_3_ofield_brick_lower_0_0_0_0_upper_4_128_300_64_stride_2457600_19200_64_1_dev_0_brick_lower_0_0_0_64_upper_4_128_300_128_stride_2457600_19200_64_1_rank_1_dev_1_brick_lower_0_0_0_128_upper_4_128_300_192_stride_2457600_19200_64_1_rank_0_dev_2_brick_lower_0_0_0_192_upper_4_128_300_256_stride_2457600_19200_64_1_rank_1_dev_3 ",
    // clang-format on
};

// Build test cases from multi_gpu_tokens, dropping those that need more
// bricks or devices than are available, and retarget the rest at the
// multi-process library currently in use.
std::vector<fft_params> param_generator_multi_gpu_adhoc()
{
    const int localDeviceCount = multi_gpu_local_device_count(ngpus);

    auto all_params = param_generator_token(test_prob, multi_gpu_tokens);

    // check if fields use more bricks than we can support
    auto too_many_bricks = [=](const std::vector<fft_params::fft_field>& fields, size_t maxBricks) {
        for(const auto& f : fields)
        {
            if(f.bricks.size() > maxBricks)
                return true;
            // also remove a test case if it uses a numbered device
            // that isn't available
            if(std::any_of(f.bricks.begin(), f.bricks.end(), [=](const fft_params::fft_brick& b) {
                   return b.device >= localDeviceCount;
               }))
                return true;
        }
        return false;
    };

    // remove test cases where we don't have enough ranks/devices for
    // the number of bricks
    all_params.erase(
        std::remove_if(all_params.begin(),
                       all_params.end(),
                       [=](const fft_params& params) {
                           size_t maxBricks = mp_lib == fft_params::fft_mp_lib_mpi
                                                  ? mp_ranks
                                                  : localDeviceCount;
                           return too_many_bricks(params.ifields, maxBricks)
                                  || too_many_bricks(params.ofields, maxBricks);
                       }),
        all_params.end());

    // set all bricks in a field to rank-0, to change an MPI test
    // case to single-proc
    auto set_rank_0 = [](std::vector<fft_params::fft_field>& fields) {
        for(auto& f : fields)
        {
            for(auto& b : f.bricks)
                b.rank = 0;
        }
    };

    // modify the remaining test cases to use the current multi-GPU lib
    for(auto& params : all_params)
    {
        params.mp_lib = mp_lib;
        if(mp_lib == fft_params::fft_mp_lib_none)
        {
            set_rank_0(params.ifields);
            set_rank_0(params.ofields);
        }
    }
    return all_params;
}

INSTANTIATE_TEST_SUITE_P(multi_gpu_adhoc_token,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_multi_gpu_adhoc()),
                         accuracy_test::TestName);
rocFFT-rocm-7.1.0/clients/tests/multithread_test.cpp000066400000000000000000000331061506652163400225030ustar00rootroot00000000000000// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/accuracy_test.h" #include "../../shared/gpubuf.h" #include "../../shared/hip_object_wrapper.h" #include "../../shared/params_gen.h" #include "../../shared/rocfft_against_fftw.h" #include "../../shared/rocfft_params.h" #include "rocfft/rocfft.h" #include #include #include #include #include #include // normalize results of an inverse transform, so it can be directly // compared to the original data before the forward transform __global__ void normalize_inverse_results(rocfft_complex* array, float N) { const int idx = blockIdx.x * blockDim.x + threadIdx.x; array[idx].x /= N; array[idx].y /= N; } // Run a transform of specified dimensions, size N on each dimension. // Data is randomly generated based on the seed value, and we do a // forward + inverse transform and compare against what we started // with. struct Test_Transform { // real constructor sets all the data up and creates the plans Test_Transform(size_t _N, size_t _dim, uint32_t _seed) : N(_N) , dim(_dim) , seed(_seed) { // compute total data size size_t datasize = 1; for(size_t i = 0; i < dim; ++i) { datasize *= N; } size_t Nbytes = datasize * sizeof(rocfft_complex); // Create HIP device buffers if(device_mem_in.alloc(Nbytes) != hipSuccess) throw std::bad_alloc(); if(device_mem_out.alloc(Nbytes) != hipSuccess) throw std::bad_alloc(); // Initialize data std::minstd_rand gen(seed); std::uniform_real_distribution dist(-1.0f, 1.0f); host_mem_in.resize(datasize); host_mem_out.resize(datasize); for(size_t i = 0; i < datasize; i++) { host_mem_in[i].x = dist(gen); host_mem_in[i].y = dist(gen); } // Copy data to device // NB: Cannot use ASSERT_EQ because constructor does not return void. 
EXPECT_EQ( hipMemcpy(device_mem_in.data(), host_mem_in.data(), Nbytes, hipMemcpyHostToDevice), hipSuccess); } Test_Transform(const Test_Transform&) = delete; void operator=(const Test_Transform&) = delete; Test_Transform(Test_Transform&& other) : stream(std::move(other.stream)) , work_buffer(other.work_buffer) , device_mem_in(std::move(other.device_mem_in)) , device_mem_out(std::move(other.device_mem_out)) { other.work_buffer = nullptr; host_mem_in.swap(other.host_mem_in); host_mem_out.swap(other.host_mem_out); } void run_transform() { // Create rocFFT plans (forward + inverse) std::vector lengths(dim, N); ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_notinplace, rocfft_transform_type_complex_forward, rocfft_precision_single, dim, lengths.data(), 1, nullptr), rocfft_status_success); ASSERT_EQ(rocfft_plan_create(&plan_inv, rocfft_placement_inplace, rocfft_transform_type_complex_inverse, rocfft_precision_single, dim, lengths.data(), 1, nullptr), rocfft_status_success); // allocate work buffer if necessary ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &work_buffer_size), rocfft_status_success); // NOTE: assuming that same-sized work buffer is ok for both // forward and inverse transforms if(work_buffer_size) { ASSERT_EQ(hipMalloc(&work_buffer, work_buffer_size), hipSuccess); } stream.alloc(); rocfft_execution_info info; ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); ASSERT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success); // NOTE: This multithread test is intended to test the cases having work_buffer_size // If the assert fails, this means we should change the problem. // But that rarely happens (maybe when the opt_strategy is minimal_buffer) // So we don't put this one inside the if(work_buffer_size){ ... 
} ASSERT_EQ(rocfft_execution_info_set_work_buffer(info, work_buffer, work_buffer_size), rocfft_status_success); // Execute forward plan out-of-place void* in_ptr = device_mem_in.data(); void* out_ptr = device_mem_out.data(); ASSERT_EQ(rocfft_execute(plan, &in_ptr, &out_ptr, info), rocfft_status_success); // Execute inverse plan in-place ASSERT_EQ(rocfft_execute(plan_inv, &out_ptr, nullptr, info), rocfft_status_success); ASSERT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success); // Apply normalization so the values really are comparable hipLaunchKernelGGL(normalize_inverse_results, host_mem_out.size(), 1, 0, // sharedMemBytes stream, // stream static_cast*>(device_mem_out.data()), static_cast(host_mem_out.size())); ran_transform = true; } void do_cleanup() { // complain loudly if we set up for a transform but did not // actually run it if(plan && !ran_transform) ADD_FAILURE(); // wait for execution to finish if(stream) { ASSERT_EQ(hipStreamSynchronize(stream), hipSuccess); stream.free(); } ASSERT_EQ(hipFree(work_buffer), hipSuccess); work_buffer = nullptr; ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success); plan = nullptr; ASSERT_EQ(rocfft_plan_destroy(plan_inv), rocfft_status_success); plan_inv = nullptr; // Copy result back to host if(device_mem_out.data() && !host_mem_out.empty()) { ASSERT_EQ(hipMemcpy(host_mem_out.data(), device_mem_out.data(), host_mem_out.size() * sizeof(rocfft_complex), hipMemcpyDeviceToHost), hipSuccess); // Compare data we got to the original. // We're running 2 transforms (forward+inverse), so we // should tolerate 2x the error of a single transform. 
const double MAX_TRANSFORM_ERROR = 2 * type_epsilon(); auto input_norm = norm_complex(reinterpret_cast*>(host_mem_in.data()), host_mem_in.size(), 1, 1, host_mem_in.size(), {0}); auto diff = distance_1to1_complex( reinterpret_cast*>(host_mem_in.data()), reinterpret_cast*>(host_mem_out.data()), // data is all contiguous, we can treat it as 1d host_mem_in.size(), 1, 1, host_mem_in.size(), 1, host_mem_out.size(), nullptr, MAX_TRANSFORM_ERROR, {0}, {0}); EXPECT_LT(diff.l_2 / input_norm.l_2, sqrt(log2(host_mem_in.size())) * MAX_TRANSFORM_ERROR); EXPECT_LT(diff.l_inf / input_norm.l_inf, log2(host_mem_in.size()) * MAX_TRANSFORM_ERROR); // Free buffers host_mem_in.clear(); host_mem_out.clear(); } } ~Test_Transform() { do_cleanup(); } size_t N = 0; size_t dim = 0; uint32_t seed = 0; hipStream_wrapper_t stream; rocfft_plan plan = nullptr; rocfft_plan plan_inv = nullptr; size_t work_buffer_size = 0; void* work_buffer = nullptr; gpubuf device_mem_in; gpubuf device_mem_out; std::vector> host_mem_in; std::vector> host_mem_out; // ensure that we don't forget to actually run the transform bool ran_transform = false; }; // run concurrent transforms, one per thread, size N on each dimension static void multithread_transform(size_t N, size_t dim, size_t num_threads) { std::vector threads; threads.reserve(num_threads); for(size_t j = 0; j < num_threads; ++j) { threads.emplace_back([=]() { try { Test_Transform t(N, dim, j); t.run_transform(); } catch(std::bad_alloc& e) { ADD_FAILURE() << "memory allocation failure"; } }); } for(auto& t : threads) t.join(); } // for multi-stream tests, set up a bunch of streams, then execute // all of those transforms from a single thread. afterwards, // wait/verify/cleanup in parallel to save wall time during the test. 
static void multistream_transform(size_t N, size_t dim, size_t num_streams) { std::vector> transforms; transforms.resize(num_streams); std::vector threads; threads.reserve(num_streams); // get all data ready in parallel for(size_t i = 0; i < num_streams; ++i) threads.emplace_back([=, &transforms]() { try { transforms[i] = std::make_unique(N, dim, i); } catch(std::bad_alloc&) { ADD_FAILURE() << "memory allocation failure"; } }); for(auto& t : threads) t.join(); threads.clear(); // now start the actual transforms serially, but in separate // streams for(auto& t : transforms) { if(!t) // must have failed to allocate memory, abort the test return; t->run_transform(); } // clean up for(size_t i = 0; i < transforms.size(); ++i) threads.emplace_back([=, &transforms]() { transforms[i]->do_cleanup(); }); for(auto& t : threads) t.join(); } // pick arbitrary sizes here to get some parallelism while still // fitting into e.g. 8 GB of GPU memory TEST(DISABLED_rocfft_UnitTest, simple_multithread_1D) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } multithread_transform(1048576, 1, 64); } TEST(DISABLED_rocfft_UnitTest, simple_multithread_2D) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } multithread_transform(1024, 2, 64); } TEST(DISABLED_rocfft_UnitTest, simple_multithread_3D) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } multithread_transform(128, 3, 40); } TEST(rocfft_UnitTest, simple_multistream_1D) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } multistream_transform(1048576, 1, 32); } TEST(rocfft_UnitTest, simple_multistream_2D) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } 
multistream_transform(1024, 2, 32); } TEST(rocfft_UnitTest, simple_multistream_3D) { if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name()) > unittest_prob) { GTEST_SKIP(); } multistream_transform(128, 3, 32); } rocFFT-rocm-7.1.0/clients/tests/random.cpp000066400000000000000000000115411506652163400204010ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
// NOTE(review): the three system header names below are blank in this copy of
// the file, as are the template argument lists further down; confirm them
// against the upstream source before building.
#include
#include
#include

#include "../../shared/accuracy_test.h"
#include "../../shared/params_gen.h"
#include "../../shared/rocfft_accuracy_test.h"
#include "../../shared/test_params.h"

// Parameterized fixture type; it declares no members of its own.
class random_params : public ::testing::TestWithParam< std::tuple>
{
};

// Build a list of randomly sized FFT problems for the accuracy tests.
//
// dimension      - number of dimensions (1, 2 or 3; anything else throws
//                  std::runtime_error)
// precisions     - candidate precisions, one is picked per problem
// place_range    - candidate placements, one is picked per problem
// transform_type - transform type assigned to every generated problem
//
// Returns n_random_tests parameter sets whose tokens are all distinct.  The
// generator is seeded with random_seed.
//
// TODO: Add batch and stride
auto random_param_generator(const int dimension,
                            const std::vector& precisions,
                            const std::vector& place_range,
                            const fft_transform_type transform_type)
{
    std::vector params;

    // per-dimension length scale; shrinks as the dimension count grows
    int maxlen = 0;
    switch(dimension)
    {
    case 1:
        maxlen = 1 << 15;
        break;
    case 2:
        maxlen = 1 << 10;
        break;
    case 3:
        maxlen = 1 << 6;
        break;
    default:
        throw std::runtime_error("invalid dimension for random tests");
    }

    std::mt19937 rgen(random_seed);
    // Mean value of the exponential distribution is maxlen:
    std::exponential_distribution distribution(1.0 / maxlen);

    // NOTE(review): size() - 1 wraps around if either range is empty --
    // presumably callers never pass an empty range; confirm.
    std::uniform_int_distribution precision_distr(0, precisions.size() - 1);
    std::uniform_int_distribution place_distr(0, place_range.size() - 1);

    // NOTE(review): this loop only ends once n_random_tests distinct, valid
    // parameter sets have been found; there is no cap on the number of
    // attempts.
    while(params.size() < n_random_tests)
    {
        const auto precision = precisions[precision_distr(rgen)];
        const auto placement = place_range[place_distr(rgen)];

        fft_params param;

        param.transform_type = transform_type;
        param.precision = precision;
        param.placement = placement;
        for(int idim = 0; idim < dimension; ++idim)
        {
            // NB: the distribution can return 0, so add 1 to avoid this issue.
            param.length.push_back(1 + (size_t)distribution(rgen));
        }
        param.validate();

        if(param.valid(0))
        {
            // keep the candidate only if no earlier entry has the same token
            bool found = false;
            for(size_t idx = 0; idx < params.size(); ++idx)
            {
                if(param.token() == params[idx].token())
                {
                    found = true;
                    break;
                }
            }
            if(!found)
            {
                params.push_back(param);
            }
        }
    }

    return params;
}

// Complex-forward random problems, 1D / 2D / 3D.
INSTANTIATE_TEST_SUITE_P(
    random_complex_1d,
    accuracy_test,
    ::testing::ValuesIn(random_param_generator(
        1, precision_range_sp_dp, place_range, fft_transform_type_complex_forward)),
    accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(
    random_complex_2d,
    accuracy_test,
    ::testing::ValuesIn(random_param_generator(
        2, precision_range_sp_dp, place_range, fft_transform_type_complex_forward)),
    accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(
    random_complex_3d,
    accuracy_test,
    ::testing::ValuesIn(random_param_generator(
        3, precision_range_sp_dp, place_range, fft_transform_type_complex_forward)),
    accuracy_test::TestName);

// Real-forward random problems, 1D / 2D / 3D.
INSTANTIATE_TEST_SUITE_P(
    random_real_1d,
    accuracy_test,
    ::testing::ValuesIn(random_param_generator(
        1, precision_range_sp_dp, place_range, fft_transform_type_real_forward)),
    accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(
    random_real_2d,
    accuracy_test,
    ::testing::ValuesIn(random_param_generator(
        2, precision_range_sp_dp, place_range, fft_transform_type_real_forward)),
    accuracy_test::TestName);

INSTANTIATE_TEST_SUITE_P(
    random_real_3d,
    accuracy_test,
    ::testing::ValuesIn(random_param_generator(
        3, precision_range_sp_dp, place_range, fft_transform_type_real_forward)),
    accuracy_test::TestName);
rocFFT-rocm-7.1.0/clients/tests/rocfft_accuracy_test.cpp000066400000000000000000000107571506652163400233250ustar00rootroot00000000000000
// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

// NOTE(review): the seven system header names below are blank in this copy of
// the file, as are the template argument lists further down; confirm them
// against the upstream source before building.
#include
#include
#include
#include
#include
#include
#include

#include "../../shared/rocfft_accuracy_test.h"

#include "../../shared/client_except.h"
#include "../../shared/fftw_transform.h"
#include "../../shared/gpubuf.h"
#include "../../shared/rocfft_against_fftw.h"
#include "../../shared/subprocess.h"
#include "rocfft/rocfft.h"

// launcher command line for multi-process tests; defined in another file
extern std::string mp_launch;

// cache that is reset below after a HOSTBUF_MEM_USAGE exception; defined in
// another file
extern last_cpu_fft_cache last_cpu_fft_data;

// Dispatch to fft_vs_reference_impl based on params.precision.
//
// params     - problem to run; its precision selects the case below
// round_trip - forwarded unchanged to fft_vs_reference_impl
//
// A precision value not listed in the switch does nothing.
void fft_vs_reference(rocfft_params& params, bool round_trip)
{
    switch(params.precision)
    {
    case fft_precision_half:
        fft_vs_reference_impl(params, round_trip);
        break;
    case fft_precision_single:
        fft_vs_reference_impl(params, round_trip);
        break;
    case fft_precision_double:
        fft_vs_reference_impl(params, round_trip);
        break;
    }
}

// Test for comparison between FFTW and rocFFT.
TEST_P(accuracy_test, vs_fftw)
{
    rocfft_params params(GetParam());

    params.validate();

    // Test that the tokenization works as expected.
    auto testcase_token = params.token();
    fft_params tokentest;
    tokentest.from_token(testcase_token);
    auto testcase_token1 = tokentest.token();
    EXPECT_EQ(testcase_token, testcase_token1);

    if(!params.valid(verbose))
    {
        GTEST_FAIL() << "Invalid parameters";
    }

    switch(params.mp_lib)
    {
    case fft_params::fft_mp_lib_none:
    {
        // Single-proc FFT.

        // Only do round trip for non-field FFTs
        bool round_trip = params.ifields.empty() && params.ofields.empty();

        // memory-pressure and skip exceptions become skipped tests;
        // ROCFFT_FAIL becomes a test failure
        try
        {
            fft_vs_reference(params, round_trip);
        }
        catch(std::bad_alloc&)
        {
            GTEST_SKIP() << "host memory allocation failure";
        }
        catch(HOSTBUF_MEM_USAGE& e)
        {
            // explicitly clear cache
            last_cpu_fft_data = last_cpu_fft_cache();
            GTEST_SKIP() << e.msg;
        }
        catch(ROCFFT_SKIP& e)
        {
            GTEST_SKIP() << e.msg;
        }
        catch(ROCFFT_FAIL& e)
        {
            GTEST_FAIL() << e.msg;
        }
        break;
    }
    case fft_params::fft_mp_lib_mpi:
    {
        // Multi-proc FFT.

        // Split launcher into tokens since the first one is the exe
        // and the remainder is the start of its argv
        boost::escaped_list_separator sep('\\', ' ', '\"');
        boost::tokenizer> tokenizer(mp_launch, sep);
        std::string exe;
        std::vector argv;
        for(auto t : tokenizer)
        {
            // skip empty tokens
            if(t.empty())
                continue;
            if(exe.empty())
                exe = t;
            else
                argv.push_back(t);
        }
        // append test token and ask for accuracy test
        argv.push_back("--token");
        argv.push_back(testcase_token);
        argv.push_back("--accuracy");

        // throws an exception if launch fails or if subprocess
        // returns nonzero exit code
        // NOTE(review): nothing in this test catches that exception --
        // presumably the test framework reports it as a failure; confirm.
        execute_subprocess(exe, argv, {});
        break;
    }
    default:
        GTEST_FAIL() << "Invalid communicator choice!";
        break;
    }
    SUCCEED();
}
rocFFT-rocm-7.1.0/clients/tests/rocfft_mpi_worker.cpp000066400000000000000000000040001506652163400226320ustar00rootroot00000000000000
/******************************************************************************
 * Copyright (C) 2024 Advanced Micro Devices, Inc. All rights reserved.
*
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
 * in the Software without restriction, including without limitation the rights
 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
 * copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in
 * all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 *******************************************************************************/

#include "../../shared/mpi_worker.h"
#include "../../shared/rocfft_params.h"

// Entry point of the MPI worker executable.
//
// Forwards argc/argv to mpi_worker_main together with a description string
// and a callback that produces the parameter object(s).  The value returned
// by mpi_worker_main becomes the process exit code.
//
// NOTE(review): the template argument lists in this function are incomplete
// in this copy of the file; confirm them against the upstream source.
int main(int argc, char* argv[])
{
#ifdef ROCFFT_DYNA_MPI_WORKER
    // dynamic build: construct one parameter object per library string
    return mpi_worker_main, true>(
        "dynamic rocFFT MPI worker process",
        argc,
        argv,
        [](const std::vector& lib_strings) {
            std::vector all_params;
            for(auto& lib : lib_strings)
                all_params.emplace_back(lib);
            return all_params;
        });
#else
    // regular build: the library strings are ignored
    return mpi_worker_main, false>(
        "rocFFT MPI worker process", argc, argv, [](const std::vector&) {
            return std::array();
        });
#endif
}
rocFFT-rocm-7.1.0/clients/tests/rtc_helper_crash.cpp000066400000000000000000000025471506652163400224360ustar00rootroot00000000000000
// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved.
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // just crash int main() { char* a = 0; // NOTE: this is supposed to crash, since it's used in a test // that checks crashing child processes. // // cppcheck-suppress nullPointer *a = 0; return 0; } rocFFT-rocm-7.1.0/clients/tests/unit_test.cpp000066400000000000000000000772451506652163400211540ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "rocfft/rocfft.h"

#include "../../shared/concurrency.h"
#include "../../shared/environment.h"
#include "../../shared/gpubuf.h"
#include "../../shared/params_gen.h"
#include "../../shared/precision_type.h"
#include "../../shared/rocfft_complex.h"
#include "hip/hip_runtime_api.h"
// NOTE(review): the system header names below are blank in this copy of the
// file, as are the template argument lists further down; confirm them
// against the upstream source before building.
#include
#include
#include
#include
#include
#include
#include
#include
#include
#include

#ifdef _OPENMP
#include
#endif

#if __has_include()
#include
#else
#include
namespace std
{
    namespace filesystem = experimental::filesystem;
}
#endif

namespace fs = std::filesystem;

#ifndef WIN32
// get program_invocation_name
#include
#endif

// Create a plan description with an explicit unit-stride layout, build a
// length-8 in-place single-precision complex plan from it, then destroy both.
TEST(rocfft_UnitTest, plan_description)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }
    rocfft_plan_description desc = nullptr;
    ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_create(&desc));

    rocfft_array_type in_array_type = rocfft_array_type_complex_interleaved;
    rocfft_array_type out_array_type = rocfft_array_type_complex_interleaved;

    size_t rank = 1;
    size_t i_strides[3] = {1, 1, 1};
    size_t o_strides[3] = {1, 1, 1};
    size_t idist = 0;
    size_t odist = 0;

    rocfft_plan plan = NULL;
    size_t length = 8;

    ASSERT_TRUE(rocfft_status_success
                == rocfft_plan_description_set_data_layout(desc,
                                                           in_array_type,
                                                           out_array_type,
                                                           0,
                                                           0,
                                                           rank,
                                                           i_strides,
                                                           idist,
                                                           rank,
                                                           o_strides,
                                                           odist));
    ASSERT_TRUE(rocfft_status_success
                == rocfft_plan_create(&plan,
                                      rocfft_placement_inplace,
                                      rocfft_transform_type_complex_forward,
                                      rocfft_precision_single,
                                      rank,
                                      &length,
                                      1,
                                      desc));

    ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_destroy(desc));
    ASSERT_TRUE(rocfft_status_success == rocfft_plan_destroy(plan));
}

TEST(rocfft_UnitTest, plan_description_reuse)
{
    // check that a plan description can be reused between different
    // plans, with different layout parameters for each.
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }

    // allocate plan description once
    rocfft_plan_description desc = nullptr;
    ASSERT_EQ(rocfft_plan_description_create(&desc), rocfft_status_success);

    std::vector> output;

    // do length-8 FFTs with different strides. first one is
    // stride-1 and we use that as our baseline to know what output
    // to expect for the rest
    const size_t length = 8;
    for(const size_t stride : {1, 2, 4})
    {
        // set layout for this stride
        ASSERT_EQ(rocfft_plan_description_set_data_layout(desc,
                                                          rocfft_array_type_complex_interleaved,
                                                          rocfft_array_type_complex_interleaved,
                                                          nullptr,
                                                          nullptr,
                                                          1,
                                                          &stride,
                                                          length * stride,
                                                          1,
                                                          &stride,
                                                          length * stride),
                  rocfft_status_success);

        static const rocfft_complex input[8]{{-0.100, 0.380},
                                             {0.0166, 0.439},
                                             {-0.475, 0.212},
                                             {0.440, -0.432},
                                             {0.445, 0.0589},
                                             {0.296, 0.164},
                                             {-0.084, 0.077},
                                             {0.320, 0.087}};

        // allocate host buffer. initialize the whole thing to zero
        // but set a known input along the strides we want
        std::vector> data_host(length * stride, {0.0, 0.0});
        for(size_t i = 0; i < length; ++i)
        {
            data_host[i * stride] = input[i];
        }

        // copy to device
        const size_t data_bytes = data_host.size() * sizeof(rocfft_complex);
        gpubuf_t> data_dev;
        ASSERT_EQ(data_dev.alloc(data_bytes), hipSuccess);
        void* data_dev_ptr = data_dev.data();
        ASSERT_EQ(hipMemcpy(data_dev_ptr, data_host.data(), data_bytes, hipMemcpyHostToDevice),
                  hipSuccess);

        // do the transform
        rocfft_plan plan = nullptr;
        ASSERT_EQ(rocfft_plan_create(&plan,
                                     rocfft_placement_inplace,
                                     rocfft_transform_type_complex_forward,
                                     rocfft_precision_single,
                                     1,
                                     &length,
                                     1,
                                     desc),
                  rocfft_status_success);
        ASSERT_EQ(rocfft_execute(plan, &data_dev_ptr, nullptr, nullptr), rocfft_status_success);

        ASSERT_EQ(hipMemcpy(data_host.data(), data_dev_ptr, data_bytes, hipMemcpyDeviceToHost),
                  hipSuccess);
        ASSERT_EQ(hipDeviceSynchronize(), hipSuccess);

        // save output for reference on first run
        if(output.empty())
        {
            output = data_host;
        }
        else
        {
            // check that the output matches output from the first
            // (stride-1) run.
            for(size_t i = 0; i < length; ++i)
                ASSERT_EQ(data_host[i * stride], output[i]);
        }

        ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success);
    }

    ASSERT_EQ(rocfft_plan_description_destroy(desc), rocfft_status_success);
}

// run a transform with all log levels enabled
TEST(rocfft_UnitTest, log_levels)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }

    // restore default logging when we exit
    BOOST_SCOPE_EXIT_ALL(=)
    {
        rocfft_cleanup();
        // re-init logs with default logging
        rocfft_setup();
    };

    rocfft_cleanup();

    // enumerate all known log levels and direct all of the logs to nowhere
    EnvironmentSetTemp layer("ROCFFT_LAYER", std::to_string(0xffffffff).c_str());
#ifdef WIN32
    static const char* log_output = "NUL";
#else
    static const char* log_output = "/dev/null";
#endif
    EnvironmentSetTemp log_trace_path("ROCFFT_LOG_TRACE_PATH", log_output);
    EnvironmentSetTemp log_bench_path("ROCFFT_LOG_BENCH_PATH", log_output);
    EnvironmentSetTemp log_profile_path("ROCFFT_LOG_PROFILE_PATH", log_output);
    EnvironmentSetTemp log_plan_path("ROCFFT_LOG_PLAN_PATH", log_output);
    EnvironmentSetTemp log_kernelio_path("ROCFFT_LOG_KERNELIO_PATH", log_output);
    EnvironmentSetTemp log_rtc_path("ROCFFT_LOG_RTC_PATH", log_output);
    EnvironmentSetTemp log_tuning_path("ROCFFT_LOG_TUNING_PATH", log_output);
    EnvironmentSetTemp log_graph_path("ROCFFT_LOG_GRAPH_PATH", log_output);

    rocfft_setup();

    // Test single-kernel Bluestein and a multi-kernel plan
    //
    // TODO: add fused L1D Bluestein case like 8191, as that does weird
    // things with buffers
    for(const size_t length : {
            37,
            64,
            32768,
        })
    {
        for(const auto type : {rocfft_transform_type_complex_forward,
                               rocfft_transform_type_real_forward,
                               rocfft_transform_type_real_inverse})
        {
            for(const auto precision :
                {rocfft_precision_single, rocfft_precision_double, rocfft_precision_half})
            {
                rocfft_plan plan = nullptr;
                ASSERT_EQ(
                    rocfft_plan_create(
                        &plan, rocfft_placement_inplace, type, precision, 1, &length, 1, nullptr),
                    rocfft_status_success);

                // assume transform uses complex, will overallocate for real
                // transforms but we only care about logging
                gpubuf data_dev;
                ASSERT_EQ(
                    data_dev.alloc(element_size(precision, rocfft_array_type_complex_interleaved)
                                   * length),
                    hipSuccess);
                void* data_dev_ptr = data_dev.data();

                ASSERT_EQ(rocfft_execute(plan, &data_dev_ptr, nullptr, nullptr),
                          rocfft_status_success);

                // NOTE(review): the destroy status is not checked here, and
                // the plan is not destroyed if an ASSERT above fails.
                rocfft_plan_destroy(plan);
            }
        }
    }
}

// Check whether logs can be emitted from multiple threads properly
TEST(rocfft_UnitTest, log_multithreading)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }

    static const int NUM_THREADS = 10;
    static const int NUM_ITERS_PER_THREAD = 50;
    static const char* TRACE_FILE = "trace.log";

    // clean up environment and temporary file when we exit
    BOOST_SCOPE_EXIT_ALL(=)
    {
        rocfft_cleanup();
        remove(TRACE_FILE);
        // re-init logs with default logging
        rocfft_setup();
    };

    // ask for trace logging, since that's the easiest to trigger
    rocfft_cleanup();
    EnvironmentSetTemp layer("ROCFFT_LAYER", "1");
    EnvironmentSetTemp tracepath("ROCFFT_LOG_TRACE_PATH", TRACE_FILE);

    rocfft_setup();

    // run a whole bunch of threads in parallel, each one doing
    // something small that will write to the trace log
    std::vector threads;
    threads.reserve(NUM_THREADS);
    for(int i = 0; i < NUM_THREADS; ++i)
    {
        threads.emplace_back([]() {
            for(int j = 0; j < NUM_ITERS_PER_THREAD; ++j)
            {
                // NOTE(review): desc is uninitialized and neither return
                // status is checked, so a failed create would hand an
                // indeterminate handle to destroy.
                rocfft_plan_description desc;
                rocfft_plan_description_create(&desc);
                rocfft_plan_description_destroy(desc);
            }
        });
    }

    for(auto& t : threads)
    {
        t.join();
    }

    rocfft_cleanup();

    // now verify that the trace log has one message per line, with nothing garbled
    std::ifstream trace_log(TRACE_FILE);
    std::string line;
    std::regex validator("^rocfft_(setup|cleanup|plan_description_(create|destroy),"
                         "description,[x0-9a-fA-F]+)$");
    while(std::getline(trace_log, line))
    {
        bool res = std::regex_match(line, validator);
        ASSERT_TRUE(res) << "line contains invalid content: " << line;
    }
}

// a function that accepts a plan's requested size on input, and
// returns the size to actually allocate for the test
typedef std::function workmem_sizer;

// Shared body of the workmem_* tests.
//
// sizer                - maps the plan's requested work buffer size to the
//                        size this test allocates; 0 means no work buffer is
//                        set on the execution info at all
// exec_status_expected - status rocfft_execute is expected to return
// give_null_work_buf   - if true (and the sizer returns nonzero), pass a null
//                        pointer to rocfft_execution_info_set_work_buffer and
//                        expect rocfft_status_invalid_work_buffer from it
//
// NOTE(review): info and plan are destroyed only at the very end, so they are
// not released when one of the ASSERTs returns early.
void workmem_test(workmem_sizer sizer,
                  rocfft_status exec_status_expected,
                  bool give_null_work_buf = false)
{
    // Prime size requires Bluestein, which guarantees work memory.
    size_t length = 8191;

    rocfft_plan plan = NULL;

    ASSERT_EQ(rocfft_plan_create(&plan,
                                 rocfft_placement_inplace,
                                 rocfft_transform_type_complex_forward,
                                 rocfft_precision_single,
                                 1,
                                 &length,
                                 1,
                                 nullptr),
              rocfft_status_success);

    size_t requested_work_size = 0;
    ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &requested_work_size),
              rocfft_status_success);
    ASSERT_GT(requested_work_size, 0U);

    rocfft_execution_info info;
    ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success);

    size_t alloc_work_size = sizer(requested_work_size);
    gpubuf work_buffer;
    if(alloc_work_size)
    {
        ASSERT_EQ(work_buffer.alloc(alloc_work_size), hipSuccess);

        void* work_buffer_ptr;
        rocfft_status set_work_expected_status;
        if(give_null_work_buf)
        {
            work_buffer_ptr = nullptr;
            set_work_expected_status = rocfft_status_invalid_work_buffer;
        }
        else
        {
            work_buffer_ptr = work_buffer.data();
            set_work_expected_status = rocfft_status_success;
        }
        ASSERT_EQ(rocfft_execution_info_set_work_buffer(info, work_buffer_ptr, alloc_work_size),
                  set_work_expected_status);
    }

    // allocate 2x length for complex
    std::vector data_host(length * 2, 1.0f);
    gpubuf data_device;
    auto data_size_bytes = data_host.size() * sizeof(float);

    ASSERT_EQ(data_device.alloc(data_size_bytes), hipSuccess);
    ASSERT_EQ(
        hipMemcpy(data_device.data(), data_host.data(), data_size_bytes, hipMemcpyHostToDevice),
        hipSuccess);
    std::vector ibuffers(1, static_cast(data_device.data()));

    ASSERT_EQ(rocfft_execute(plan, ibuffers.data(), nullptr, info), exec_status_expected);

    rocfft_execution_info_destroy(info);
    rocfft_plan_destroy(plan);
}

// check what happens if work memory is required but is not provided
// - library should allocate
TEST(rocfft_UnitTest, workmem_missing)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }
    workmem_test([](size_t) { return 0; }, rocfft_status_success);
}

// check what happens if work memory is required but not enough is provided
TEST(rocfft_UnitTest, workmem_small)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }
    workmem_test([](size_t requested) { return requested / 2; },
                 rocfft_status_invalid_work_buffer);
}

// hard to imagine this being a problem, but try giving too much as well
TEST(rocfft_UnitTest, workmem_big)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }
    workmem_test([](size_t requested) { return requested * 2; }, rocfft_status_success);
}

// check if a user explicitly gives a null pointer - set work buffer
// should fail, but transform should succeed because library
// allocates
TEST(rocfft_UnitTest, workmem_null)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }
    workmem_test([](size_t requested) { return requested; }, rocfft_status_success, true);
}

// problem length used by the RTC cache and RTC helper tests below
static const size_t RTC_PROBLEM_SIZE = 2304;

// runtime compilation cache tests main loop
void rtc_cache_main()
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }

    // PRECONDITIONS

    // - set cache location to custom path, requires uninitializing
    //   the lib and reinitializing with some env vars
    // - also enable RTC logging so we can tell when something was
    //   actually compiled
    // NOTE(review): std::tmpnam only produces a name; nothing here reserves
    // the path before it is used.
    const std::string rtc_cache_path = std::tmpnam(nullptr);
    const std::string rtc_log_path = std::tmpnam(nullptr);

    void* empty_cache = nullptr;
    size_t empty_cache_bytes = 0;
    void* onekernel_cache = nullptr;
    size_t onekernel_cache_bytes = 0;

    // cleanup
    // NOTE(review): the (=) capture copies empty_cache and onekernel_cache
    // at this point, while both are still nullptr, so the two frees at the
    // end of this block look like they can never run -- confirm, and
    // consider capturing these by reference.
    BOOST_SCOPE_EXIT_ALL(=)
    {
        // close log file handles
        rocfft_cleanup();
        remove(rtc_cache_path.c_str());
        remove(rtc_log_path.c_str());
        // re-init lib now that the env vars are gone
        rocfft_setup();
        if(empty_cache)
            rocfft_cache_buffer_free(empty_cache);
        if(onekernel_cache)
            rocfft_cache_buffer_free(onekernel_cache);
    };

    rocfft_cleanup();
    EnvironmentSetTemp cache_env("ROCFFT_RTC_CACHE_PATH", rtc_cache_path.c_str());
    EnvironmentSetTemp layer_env("ROCFFT_LAYER", "32");
    EnvironmentSetTemp log_env("ROCFFT_LOG_RTC_PATH", rtc_log_path.c_str());
    rocfft_setup();

    // - serialize empty cache as baseline
    ASSERT_EQ(rocfft_cache_serialize(&empty_cache, &empty_cache_bytes), rocfft_status_success);

    // END PRECONDITIONS

    // pick a length that's runtime compiled
    auto build_plan = [&]() {
        rocfft_plan plan = nullptr;

        ASSERT_TRUE(rocfft_status_success
                    == rocfft_plan_create(&plan,
                                          rocfft_placement_inplace,
                                          rocfft_transform_type_complex_forward,
                                          rocfft_precision_single,
                                          1,
                                          &RTC_PROBLEM_SIZE,
                                          1,
                                          nullptr));
        // we don't need to actually execute the plan, so we can
        // destroy it right away. this ensures that we don't hold on
        // to a plan after we cleanup the library
        rocfft_plan_destroy(plan);
        plan = nullptr;
    };
    // check the RTC log to see if an FFT kernel got compiled
    auto fft_kernel_was_compiled = [&]() {
        // HACK: logging is done in a worker thread, so sleep for a
        // bit to give it a chance to actually write. It at least
        // should flush after writing.
        std::this_thread::sleep_for(std::chrono::milliseconds(100));
        // look for a ROCFFT_RTC_BEGIN line that indicates RTC happened
        std::ifstream logfile(rtc_log_path);
        std::string line;
        while(std::getline(logfile, line))
        {
            if(line.find("ROCFFT_RTC_BEGIN") != std::string::npos
               && line.find("fft_") != std::string::npos)
                return true;
        }
        return false;
    };

    // build a plan that requires runtime compilation,
    // close logs and ensure a kernel was built
    build_plan();
    ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes),
              rocfft_status_success);
    rocfft_cleanup();
    ASSERT_TRUE(fft_kernel_was_compiled());
    // serialized cache should be bigger than empty cache
    ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes);

    // blow away the cache, reinit the library,
    // retry building the plan again and ensure the kernel was rebuilt
    remove(rtc_cache_path.c_str());
    rocfft_setup();
    build_plan();
    rocfft_cache_buffer_free(onekernel_cache);
    onekernel_cache = nullptr;
    ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes),
              rocfft_status_success);
    rocfft_cleanup();
    ASSERT_TRUE(fft_kernel_was_compiled());
    ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes);

    // re-init library without blowing away cache. rebuild plan and
    // check that the kernel was not recompiled.
    rocfft_setup();
    build_plan();
    rocfft_cleanup();
    ASSERT_FALSE(fft_kernel_was_compiled());

    // blow away cache again, deserialize one-kernel cache. re-init
    // library and rebuild plan - kernel should again not be
    // recompiled
    remove(rtc_cache_path.c_str());
    rocfft_setup();
    ASSERT_EQ(rocfft_cache_deserialize(onekernel_cache, onekernel_cache_bytes),
              rocfft_status_success);
    rocfft_cleanup();
    ASSERT_FALSE(fft_kernel_was_compiled());
    rocfft_setup();
    build_plan();
    rocfft_cleanup();
    ASSERT_FALSE(fft_kernel_was_compiled());

    // use the cache as a system cache and make the user one an empty
    // in-memory cache. kernel should still not be recompiled.
    EnvironmentSetTemp cache_sys_env("ROCFFT_RTC_SYS_CACHE_PATH", rtc_cache_path.c_str());
    EnvironmentSetTemp cache_empty_env("ROCFFT_RTC_CACHE_PATH", ":memory:");
    rocfft_setup();
    build_plan();
    rocfft_cleanup();
    ASSERT_FALSE(fft_kernel_was_compiled());

    // check that the system cache is not written to, even if it's
    // writable by the current user. after removing the cache, the
    // kernel should always be recompiled since rocFFT has no durable
    // place to write it to.
    remove(rtc_cache_path.c_str());
    rocfft_setup();
    build_plan();
    rocfft_cleanup();
    ASSERT_TRUE(fft_kernel_was_compiled());
    rocfft_setup();
    build_plan();
    rocfft_cleanup();
    ASSERT_TRUE(fft_kernel_was_compiled());
}

// run the main body of rtc cache tests twice to uncover potential
// problems with thread reuse between iterations
TEST(rocfft_UnitTest, rtc_cache_iter_1)
{
    rtc_cache_main();
}
TEST(rocfft_UnitTest, rtc_cache_iter_2)
{
    rtc_cache_main();
}

// make sure cache API functions tolerate null pointers without crashing
TEST(rocfft_UnitTest, rtc_cache_null)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }

    void* buf = nullptr;
    size_t buf_len = 0;
    ASSERT_EQ(rocfft_cache_serialize(nullptr, &buf_len), rocfft_status_invalid_arg_value);
    ASSERT_EQ(rocfft_cache_serialize(&buf, nullptr), rocfft_status_invalid_arg_value);
    ASSERT_EQ(rocfft_cache_buffer_free(nullptr), rocfft_status_success);
    ASSERT_EQ(rocfft_cache_deserialize(nullptr, 12345), rocfft_status_invalid_arg_value);
    ASSERT_EQ(rocfft_cache_deserialize(&buf_len, 0), rocfft_status_invalid_arg_value);
}

// make sure RTC gracefully handles a helper process that crashes
TEST(rocfft_UnitTest, rtc_helper_crash)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }

    // the crashing helper is looked up next to the test executable
#ifdef WIN32
    char filename[MAX_PATH];
    GetModuleFileNameA(NULL, filename, MAX_PATH);
    fs::path test_exe = filename;
    fs::path crasher_exe = test_exe.replace_filename("rtc_helper_crash.exe");
#else
    fs::path test_exe = program_invocation_name;
    fs::path crasher_exe = test_exe.replace_filename("rtc_helper_crash");
#endif

    // use the crashing helper
    EnvironmentSetTemp env_helper("ROCFFT_RTC_PROCESS_HELPER", crasher_exe.string().c_str());
    // don't touch the cache, to force compilation
    EnvironmentSetTemp env_read("ROCFFT_RTC_CACHE_READ_DISABLE", "1");
    EnvironmentSetTemp env_write("ROCFFT_RTC_CACHE_WRITE_DISABLE", "1");
    // force out-of-process compile
    EnvironmentSetTemp env_process("ROCFFT_RTC_PROCESS", "2");

    rocfft_plan plan = nullptr;
    ASSERT_TRUE(rocfft_status_success
                == rocfft_plan_create(&plan,
                                      rocfft_placement_inplace,
                                      rocfft_transform_type_complex_forward,
                                      rocfft_precision_single,
                                      1,
                                      &RTC_PROBLEM_SIZE,
                                      1,
                                      nullptr));

    // alloc a complex buffer
    gpubuf_t> data;
    ASSERT_EQ(data.alloc(RTC_PROBLEM_SIZE * sizeof(rocfft_complex)), hipSuccess);

    std::vector ibuffers(1, static_cast(data.data()));

    ASSERT_EQ(rocfft_execute(plan, ibuffers.data(), nullptr, nullptr), rocfft_status_success);

    rocfft_plan_destroy(plan);
    plan = nullptr;

    // restart the library before the second configuration
    rocfft_cleanup();
    rocfft_setup();

    // also try with forcing use of the subprocess, which is a
    // different code path from the default "try in-process, then
    // fall back to out-of-process"
    EnvironmentSetTemp env_force("ROCFFT_RTC_PROCESS", "1");
    ASSERT_TRUE(rocfft_status_success
                == rocfft_plan_create(&plan,
                                      rocfft_placement_inplace,
                                      rocfft_transform_type_complex_forward,
                                      rocfft_precision_single,
                                      1,
                                      &RTC_PROBLEM_SIZE,
                                      1,
                                      nullptr));
    ASSERT_EQ(rocfft_execute(plan, ibuffers.data(), nullptr, nullptr), rocfft_status_success);

    rocfft_plan_destroy(plan);
    plan = nullptr;
}

// Generate kernel test harness sources for a variety of plans and check that
// every generated file compiles.
TEST(rocfft_UnitTest, rtc_test_harness)
{
    if(hash_prob(random_seed, ::testing::UnitTest::GetInstance()->current_test_info()->name())
       > unittest_prob)
    {
        GTEST_SKIP();
    }

    // check that amdclang++ is available since this test requires it
    //
    // NOTE: using system() for launching subprocesses for simplicity
    // and portability
#ifdef WIN32
    static const char* test_command = "amdclang++ --version > NUL";
#else
    static const char* test_command = "amdclang++ --version > /dev/null";
#endif
    if(std::system(test_command) != 0)
        GTEST_SKIP();

    rocfft_cleanup();
    BOOST_SCOPE_EXIT_ALL()
    {
        // reinit rocFFT so caching goes back to normal
        rocfft_cleanup();
        rocfft_setup();
    };

    // extra scope to control lifetime of env vars
    {
        // create a temporary directory to hold all of the temp files
        // that get created
        const fs::path tmp_path = std::tmpnam(nullptr);
        try
        {
            fs::create_directory(tmp_path);
        }
        catch(fs::filesystem_error& e)
        {
            GTEST_SKIP() << "unable to create temp dir for test harnesses: " << e.what();
        }

        // activate writing of rtc test harnesses
        EnvironmentSetTemp env_harness("ROCFFT_DEBUG_GENERATE_KERNEL_HARNESS", "1");
        // set path for writing rtc test harnesses source files
        EnvironmentSetTemp env_harness_path("ROCFFT_DEBUG_KERNEL_HARNESS_PATH",
                                            tmp_path.string().c_str());
        // ensure every kernel gets compiled once
        EnvironmentSetTemp env_cache("ROCFFT_RTC_CACHE_PATH", ":memory:");
        EnvironmentSetTemp env_sys_cache("ROCFFT_RTC_SYS_CACHE_PATH", ":memory:");

        rocfft_setup();

        // construct a few different types of plans to try to get all
        // different kernels compiled
        auto create_destroy_plan
            = [](rocfft_transform_type type, const size_t dim, const size_t* lengths) -> void {
            rocfft_plan plan = nullptr;
            ASSERT_EQ(rocfft_plan_create(&plan,
                                         rocfft_placement_inplace,
                                         type,
                                         rocfft_precision_single,
                                         dim,
                                         lengths,
                                         1,
                                         nullptr),
                      rocfft_status_success);
            ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success);
            plan = nullptr;
        };

        // large 1D R2C + C2R
        const size_t L1D_PROBLEM_SIZE[1] = {16384};
        create_destroy_plan(rocfft_transform_type_real_forward, 1, L1D_PROBLEM_SIZE);
        create_destroy_plan(rocfft_transform_type_real_inverse, 1, L1D_PROBLEM_SIZE);

        // small bluestein R2C + C2R (also covers odd length)
        const size_t SMALL_BLUESTEIN_PROBLEM_SIZE[1] = {37};
        create_destroy_plan(rocfft_transform_type_real_forward, 1, SMALL_BLUESTEIN_PROBLEM_SIZE);
        create_destroy_plan(rocfft_transform_type_real_inverse, 1, SMALL_BLUESTEIN_PROBLEM_SIZE);

        // large bluestein C2C
        const size_t LARGE_BLUESTEIN_PROBLEM_SIZE[1] = {8191};
        create_destroy_plan(
            rocfft_transform_type_complex_forward, 1, LARGE_BLUESTEIN_PROBLEM_SIZE);

        // L1D_TRTRT
        const size_t L1D_TRTRT_PROBLEM_SIZE[1] = {680};
        create_destroy_plan(rocfft_transform_type_complex_forward, 1, L1D_TRTRT_PROBLEM_SIZE);

        // small 3D (exercises 2D_SINGLE)
        const size_t SMALL_3D_PROBLEM_SIZE[3] = {25, 25, 25};
        create_destroy_plan(rocfft_transform_type_complex_forward, 3, SMALL_3D_PROBLEM_SIZE);

        // larger 3D
        const size_t LARGE_3D_PROBLEM_SIZE[3] = {200, 200, 200};
        create_destroy_plan(rocfft_transform_type_complex_forward, 3, LARGE_3D_PROBLEM_SIZE);

        // now try to compile each file - they'd need hand-editing to test
        // something useful, but we can at least ensure they build.

        // enumerate all the files; each entry pairs a source path with the
        // exit status of its compile (-1 until the compile has run)
        std::vector> files;
        size_t i = 0;
        for(;; ++i)
        {
            // construct name of main file
            fs::path main_file
                = tmp_path / ("rocfft_kernel_harness_" + std::to_string(i) + ".cpp");

            // files are numbered consecutively, so stop at the first gap
            if(!fs::exists(main_file))
                break;

            files.emplace_back(main_file.string(), -1);
        }

        // we should have generated at least a few kernels
        ASSERT_FALSE(files.empty());

#ifdef _OPENMP
#pragma omp parallel for num_threads(rocfft_concurrency())
#endif
        for(i = 0; i < files.size(); ++i)
        {
#ifdef WIN32
            const std::string command
                = "amdclang++ -x hip -c -std=c++17 -o NUL " + files[i].first;
#else
            const std::string command
                = "amdclang++ -x hip -c -std=c++17 -o /dev/null " + files[i].first;
#endif
            files[i].second = std::system(command.c_str());
        }

        // check that all compiles succeeded
        for(const auto& file : files)
        {
            ASSERT_EQ(file.second, 0);
        }

        // clean up temporary files
        try
        {
            fs::remove_all(tmp_path);
        }
        catch(fs::filesystem_error&)
        {
            // this should work, but ignore errors as the build
            // status is what matters for this test
        }
    }
}
rocFFT-rocm-7.1.0/clients/tests/validate_length_stride.cpp000066400000000000000000000075601506652163400236330ustar00rootroot00000000000000// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/accuracy_test.h" #include "../../shared/array_validator.h" #include #include #include inline auto generate_valid_length_stride() { // Array of tuples of length, stride. 
std::vector, std::vector>> vals = { {{8}, {1}}, {{8, 2}, {1, 0}}, {{8, 8}, {8, 1}}, {{8, 8, 8}, {64, 8, 1}}, {{8, 8, 8}, {64, 7, 1}}, {{8, 8, 8, 8}, {512, 64, 7, 1}}, {{8, 8, 8, 8}, {512, 64, 8, 1}}, {{8, 8, 8, 8, 8}, {4096, 512, 64, 8, 1}}, {{8, 8, 8, 8, 8}, {4096, 512, 64, 7, 1}}, {{8, 8, 8, 8, 8, 8}, {32768, 4096, 512, 64, 8, 1}}, {{299, 307, 495}, {1006, 50, 674}}, }; return vals; } class valid_length_stride : public ::testing::TestWithParam, std::vector>> { protected: void SetUp() override {} void TearDown() override {} public: static std::string TestName(const testing::TestParamInfo& info) { return info.param.token(); } }; auto direct_validity_test(const std::vector& length, const std::vector& stride, const int verbose) { std::unordered_set vals{}; std::vector index(length.size()); std::fill(index.begin(), index.end(), 0); do { const int i = std::inner_product(index.begin(), index.end(), stride.begin(), (size_t)0); if(vals.find(i) == vals.end()) { vals.insert(i); } else { return false; } } while(increment_rowmajor(index, length)); return true; } TEST_P(valid_length_stride, direct_comparison) { const std::vector length = std::get<0>(GetParam()); const std::vector stride = std::get<1>(GetParam()); if(verbose) { std::cout << "length:"; for(const auto i : length) std::cout << " " << i; std::cout << "\n"; std::cout << "stride:"; for(const auto i : stride) std::cout << " " << i; std::cout << "\n"; } auto test_val = array_valid(length, stride, verbose); if(verbose) { std::cout << "test value is: " << (test_val ? "valid" : "invalid") << "\n"; } auto ref_val = direct_validity_test(length, stride, verbose); if(verbose) { std::cout << "reference value is: " << (ref_val ? 
"valid" : "invalid") << "\n"; } EXPECT_EQ(test_val, ref_val); SUCCEED(); } INSTANTIATE_TEST_SUITE_P(reference_test, valid_length_stride, ::testing::ValuesIn(generate_valid_length_stride())); rocFFT-rocm-7.1.0/cmake/000077500000000000000000000000001506652163400146705ustar00rootroot00000000000000rocFFT-rocm-7.1.0/cmake/get-cli-arguments.cmake000066400000000000000000000041601506652163400212220ustar00rootroot00000000000000# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# Attempt (best effort) to return a list of user specified parameters cmake was invoked with # NOTE: Even if the user specifies CMAKE_INSTALL_PREFIX on the command line, the parameter is # not returned because it does not have the matching helpstring function( append_cmake_cli_arguments initial_cli_args return_cli_args ) # Retrieves the contents of CMakeCache.txt get_cmake_property( cmake_properties CACHE_VARIABLES ) foreach( property ${cmake_properties} ) get_property(help_string CACHE ${property} PROPERTY HELPSTRING ) # Properties specified on the command line have boilerplate text if( help_string MATCHES "variable specified on the command line" ) # message( STATUS "property: ${property}") # message( STATUS "value: ${${property}}") list( APPEND cli_args "-D${property}=${${property}}") endif( ) endforeach( ) # message( STATUS "get_command_line_arguments: ${cli_args}") set( ${return_cli_args} ${${initial_cli_args}} ${cli_args} PARENT_SCOPE ) endfunction( )rocFFT-rocm-7.1.0/cmake/package-functions.cmake000066400000000000000000000040651506652163400213000ustar00rootroot00000000000000# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ######################################################################## # A helper function to generate packaging scripts to register libraries with system # ######################################################################## function( write_rocm_package_script_files scripts_write_dir library_name library_link_name ) set( ld_conf_file "/etc/ld.so.conf.d/${library_name}-dev.conf" ) file( WRITE ${scripts_write_dir}/postinst "#!/bin/bash set -e do_ldconfig() { echo ${CPACK_PACKAGING_INSTALL_PREFIX}/${LIB_INSTALL_DIR} > ${ld_conf_file} && ldconfig } case \"\$1\" in configure) do_ldconfig ;; abort-upgrade|abort-remove|abort-deconfigure) echo \"\$1\" ;; *) exit 0 ;; esac " ) file( WRITE ${scripts_write_dir}/prerm "#!/bin/bash set -e rm_ldconfig() { rm -f ${ld_conf_file} && ldconfig } case \"\$1\" in remove|purge) rm_ldconfig ;; *) exit 0 ;; esac " ) endfunction( ) rocFFT-rocm-7.1.0/cmake/sqlite.cmake000066400000000000000000000061621506652163400172000ustar00rootroot00000000000000# Copyright (C) 2025 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. 
# # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. include( ExternalProject ) # SQLite is used for rtc_cache. Require a safe baseline (>= 3.50.2). # Note: the backup API we rely on has been enabled by default since 3.36.0. option( SQLITE_USE_SYSTEM_PACKAGE "Use SQLite3 from find_package" OFF ) if( SQLITE_USE_SYSTEM_PACKAGE ) # Require a safe baseline (fixes truncation/memory-corruption issues). find_package(SQLite3 3.50.2 REQUIRED) list(APPEND static_depends PACKAGE SQLite3) set(ROCFFT_SQLITE_LIB SQLite::SQLite3) else() include( FetchContent ) # embed SQLite amalgamation (version 3.50.2 -> serial 3500200). # allow override via environment variable for mirrors/airgapped builds. 
if(DEFINED ENV{SQLITE_3_50_2_SRC_URL}) set(SQLITE_3_50_2_SRC_URL_INIT $ENV{SQLITE_3_50_2_SRC_URL}) else() set(SQLITE_3_50_2_SRC_URL_INIT https://www.sqlite.org/2025/sqlite-amalgamation-3500200.zip) endif() set(SQLITE_3_50_2_SRC_URL ${SQLITE_3_50_2_SRC_URL_INIT} CACHE STRING "Location of SQLite source code") set(SQLITE_SRC_3_50_2_SHA3_256 75c118e727ee6a9a3d2c0e7c577500b0c16a848d109027f087b915b671f61f8a CACHE STRING "SHA3-256 hash of SQLite source code") # use extract timestamp for fetched files instead of timestamps in the archive if(CMAKE_VERSION VERSION_GREATER_EQUAL 3.24) cmake_policy(SET CMP0135 NEW) endif() FetchContent_Declare(sqlite_local URL ${SQLITE_3_50_2_SRC_URL} URL_HASH SHA3_256=${SQLITE_SRC_3_50_2_SHA3_256} ) FetchContent_MakeAvailable(sqlite_local) if(NOT TARGET sqlite3) add_library( sqlite3 OBJECT ${sqlite_local_SOURCE_DIR}/sqlite3.c ) target_include_directories( sqlite3 PUBLIC ${sqlite_local_SOURCE_DIR} ) set_target_properties( sqlite3 PROPERTIES C_VISIBILITY_PRESET "hidden" VISIBILITY_INLINES_HIDDEN ON POSITION_INDEPENDENT_CODE ON ) endif() # we don't need extensions, and omitting them from SQLite removes the # need for dlopen/dlclose from within rocFFT target_compile_options( sqlite3 PRIVATE -DSQLITE_OMIT_LOAD_EXTENSION ) set(ROCFFT_SQLITE_LIB sqlite3) endif() rocFFT-rocm-7.1.0/cmake/std-filesystem.cmake000066400000000000000000000036521506652163400206540ustar00rootroot00000000000000# Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.

include(CheckCXXSourceCompiles)
include(CMakePushCheckState)

# Minimal translation unit used to probe for a usable <filesystem> header.
# The preprocessor directive must sit on its own line to be valid C++.
set(HAVE_STD_FILESYSTEM_TEST [[
#include <filesystem>

int main()
{
    std::filesystem::path p{"/"};
    return 0;
}
]])

# Compile the probe as C++17 and store the outcome in HAVE_STD_FILESYSTEM.
# CMAKE_REQUIRED_FLAGS is global state read by every check_* macro, so it is
# saved before the probe and restored afterwards; otherwise later, unrelated
# checks would silently be compiled with -std=c++17 as well.
cmake_push_check_state()
set(CMAKE_REQUIRED_FLAGS -std=c++17)
check_cxx_source_compiles("${HAVE_STD_FILESYSTEM_TEST}" HAVE_STD_FILESYSTEM)
cmake_pop_check_state()

if(NOT HAVE_STD_FILESYSTEM)
  message(STATUS "std::filesystem include not present, will use std::experimental::filesystem")
endif()

# Link to the experimental filesystem library if it's not available
# in the standard library.  Experimental filesystem library is not
# ABI-compatible with later libstdc++ so link that statically too.
function(target_link_std_experimental_filesystem target) if(NOT HAVE_STD_FILESYSTEM) target_link_options( ${target} PRIVATE "SHELL:-lstdc++fs -static-libstdc++ -Xlinker --exclude-libs=ALL") endif() endfunction() rocFFT-rocm-7.1.0/custom.properties000066400000000000000000000001351506652163400172370ustar00rootroot00000000000000booktitle=rocFFT API Guide spreadsheet.xml=docs/classification-map.xml document.locale=enusrocFFT-rocm-7.1.0/deps/000077500000000000000000000000001506652163400145435ustar00rootroot00000000000000rocFFT-rocm-7.1.0/deps/CMakeLists.txt000066400000000000000000000074671506652163400173210ustar00rootroot00000000000000# Copyright(C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# Helper cmake script to automate building dependencies for rocfft # This script can be invoked manually by the user with 'cmake -P' # The ROCm platform requires Ubuntu 16.04 or Fedora 24, which has cmake 3.5 cmake_minimum_required( VERSION 3.5 ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../cmake ) # Consider removing this in the future # It can be annoying for visual studio developers to build a project that tries to install into 'program files' if( WIN32 AND CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
) endif() # The superbuild does not build anything itself; all compiling is done in external projects project( rocfft-dependencies NONE ) option( BUILD_BOOST "Download and build boost library" ON ) # option( BUILD_VERBOSE "Print helpful build debug information" OFF ) # if( BUILD_VERBOSE ) # message( STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}" ) # message( STATUS "CMAKE_BINARY_DIR: ${CMAKE_BINARY_DIR}" ) # message( STATUS "CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}" ) # message( STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}" ) # message( STATUS "CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}" ) # message( STATUS "CMAKE_CURRENT_LIST_DIR: ${CMAKE_CURRENT_LIST_DIR}" ) # message( STATUS "CMAKE_CURRENT_LIST_FILE: ${CMAKE_CURRENT_LIST_FILE}" ) # endif( ) # This module scrapes the CMakeCache.txt file and attempts to get all the cli options the user specified to cmake invocation include( get-cli-arguments ) # The following is a series of super-build projects; this cmake project will download and build if( BUILD_BOOST ) set(ext.BUILD_BOOST "static") include( external-boost ) list( APPEND rocfft_dependencies boost ) set( boost_custom_target COMMAND cd ${BOOST_BINARY_ROOT}$ ${Boost.Command} install ) endif( ) # POLICY CMP0037 - "Target names should not be reserved and should match a validity pattern" # Familiar target names like 'install' should be OK at the super-build level if( POLICY CMP0037 ) cmake_policy( SET CMP0037 OLD ) endif( ) add_custom_target( install ${boost_custom_target} ${gtest_custom_target} ${lapack_custom_target} DEPENDS ${rocfft_dependencies} ) rocFFT-rocm-7.1.0/deps/external-boost.cmake000066400000000000000000000166071506652163400205250ustar00rootroot00000000000000# Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. message( STATUS "Configuring boost external dependency" ) include( ExternalProject ) set( PREFIX_BOOST ${CMAKE_INSTALL_PREFIX} CACHE PATH "Location where boost should install, defaults to /usr/local" ) # We need to detect the compiler the user is attempting to invoke with CMake, # we do our best to translate cmake parameters into bjam parameters enable_language( CXX ) include( build-bitness ) # TODO: Options should be added to allow downloading Boost straight from github # This file is used to add Boost as a library dependency to another project # This sets up boost to download from sourceforge, and builds it as a cmake # ExternalProject # Change this one line to upgrade to newer versions of boost set( ext.Boost_VERSION "1.64.0" CACHE STRING "Boost version to download/use" ) mark_as_advanced( ext.Boost_VERSION ) string( REPLACE "." 
"_" ext.Boost_Version_Underscore ${ext.Boost_VERSION} ) message( STATUS "ext.Boost_VERSION: " ${ext.Boost_VERSION} ) if( WIN32 ) # For newer cmake versions, 7z archives are much smaller to download if( CMAKE_VERSION VERSION_LESS "3.1.0" ) set( Boost_Ext "zip" ) else( ) set( Boost_Ext "7z" ) endif( ) else( ) set( Boost_Ext "tar.bz2" ) endif( ) if( WIN32 ) set( Boost.Command b2 --prefix=${PREFIX_BOOST} ) else( ) set( Boost.Command ./b2 --prefix=${PREFIX_BOOST} ) endif( ) if( CMAKE_COMPILER_IS_GNUCXX ) list( APPEND Boost.Command cxxflags=-fPIC -std=c++11 ) elseif( XCODE_VERSION OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang") ) list( APPEND Boost.Command cxxflags=-std=c++11 -stdlib=libc++ linkflags=-stdlib=libc++ ) endif( ) include( ProcessorCount ) ProcessorCount( Cores ) if( NOT Cores EQUAL 0 ) # Travis can fail to build Boost sporadically; uses 32 cores, reduce stress on VM if( DEFINED ENV{TRAVIS} ) if( Cores GREATER 8 ) set( Cores 8 ) endif( ) endif( ) # Add build thread in addition to the number of cores that we have math( EXPR Cores "${Cores} + 1 " ) else( ) # If we could not detect # of cores, assume 1 core and add an additional build thread set( Cores "2" ) endif( ) message( STATUS "ExternalBoost using ( " ${Cores} " ) cores to build with" ) message( STATUS "ExternalBoost building [ serialization, filesystem, system, regex ] components" ) list( APPEND Boost.Command -j ${Cores} --with-serialization --with-filesystem --with-system --with-regex ) if( BUILD_64 ) list( APPEND Boost.Command address-model=64 ) else( ) list( APPEND Boost.Command address-model=32 ) endif( ) if( MSVC10 ) list( APPEND Boost.Command toolset=msvc-10.0 ) elseif( MSVC11 ) list( APPEND Boost.Command toolset=msvc-11.0 ) elseif( MSVC12 ) list( APPEND Boost.Command toolset=msvc-12.0 ) elseif( MSVC14 ) list( APPEND Boost.Command toolset=msvc-14.0 ) elseif( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) ) list( APPEND Boost.Command toolset=clang ) elseif( CMAKE_COMPILER_IS_GNUCXX ) list( 
APPEND Boost.Command toolset=gcc ) endif( ) if( WIN32 AND (ext.Boost_VERSION VERSION_LESS "1.60.0") ) list( APPEND Boost.Command define=BOOST_LOG_USE_WINNT6_API ) endif( ) if( NOT DEFINED ext.Boost_LINK ) if( ${BUILD_SHARED_LIBS} MATCHES "ON" ) set( ext.Boost_LINK "shared" CACHE STRING "Which boost link method? static | shared | static,shared" ) else( ) set( ext.Boost_LINK "static" CACHE STRING "Which boost link method? static | shared | static,shared" ) endif( ) endif() mark_as_advanced( ext.Boost_LINK ) if( WIN32 ) # Versioned is the default on windows set( ext.Boost_LAYOUT "versioned" CACHE STRING "Which boost layout method? versioned | tagged | system" ) # For windows, default to build both variants to support the VS IDE set( ext.Boost_VARIANT "debug,release" CACHE STRING "Which boost variant? debug | release | debug,release" ) else( ) # Tagged builds provide unique enough names to be able to build both variants set( ext.Boost_LAYOUT "tagged" CACHE STRING "Which boost layout method? versioned | tagged | system" ) # For Linux, typically a build tree only needs one variant if( ${CMAKE_BUILD_TYPE} MATCHES "Debug") set( ext.Boost_VARIANT "debug" CACHE STRING "Which boost variant? debug | release | debug,release" ) else( ) set( ext.Boost_VARIANT "release" CACHE STRING "Which boost variant? 
debug | release | debug,release" ) endif( ) endif( ) mark_as_advanced( ext.Boost_LAYOUT ) mark_as_advanced( ext.Boost_VARIANT ) list( APPEND Boost.Command --layout=${ext.Boost_LAYOUT} link=${ext.Boost_LINK} variant=${ext.Boost_VARIANT} ) message( STATUS "Boost.Command: ${Boost.Command}" ) # If the user has a cached local copy stored somewhere, they can define the full path to the package in a BOOST_URL environment variable if( DEFINED ENV{BOOST_URL} ) set( ext.Boost_URL "$ENV{BOOST_URL}" CACHE STRING "URL to download Boost from" ) else( ) set( ext.Boost_URL "http://sourceforge.net/projects/boost/files/boost/${ext.Boost_VERSION}/boost_${ext.Boost_Version_Underscore}.${Boost_Ext}/download" CACHE STRING "URL to download Boost from" ) endif( ) mark_as_advanced( ext.Boost_URL ) set( Boost.Bootstrap "" ) set( ext.HASH "" ) if( WIN32 ) set( Boost.Bootstrap "bootstrap.bat" ) if( CMAKE_VERSION VERSION_LESS "3.1.0" ) # .zip file set( ext.HASH "b99973c805f38b549dbeaf88701c0abeff8b0e8eaa4066df47cac10a32097523" ) else( ) # .7z file set( ext.HASH "49c6abfeb5b480f6a86119c0d57235966b4690ee6ff9e6401ee868244808d155" ) endif( ) else( ) set( Boost.Bootstrap "./bootstrap.sh" ) # .tar.bz2 set( ext.HASH "7bcc5caace97baa948931d712ea5f37038dbb1c5d89b43ad4def4ed7cb683332" ) if( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) ) list( APPEND Boost.Bootstrap --with-toolset=clang ) endif( ) endif( ) # Below is a fancy CMake command to download, build and install Boost on the users computer ExternalProject_Add( boost PREFIX ${CMAKE_BINARY_DIR}/boost URL ${ext.Boost_URL} URL_HASH SHA256=${ext.HASH} UPDATE_COMMAND ${Boost.Bootstrap} LOG_UPDATE 1 CONFIGURE_COMMAND "" BUILD_COMMAND ${Boost.Command} stage BUILD_IN_SOURCE 1 LOG_BUILD 1 INSTALL_COMMAND "" ) set_property( TARGET boost PROPERTY FOLDER "extern" ) ExternalProject_Get_Property( boost install_dir ) ExternalProject_Get_Property( boost binary_dir ) # For use by the user of ExternalGtest.cmake set( BOOST_INSTALL_ROOT ${install_dir} ) 
set( BOOST_BINARY_ROOT ${binary_dir} ) rocFFT-rocm-7.1.0/designdocs/000077500000000000000000000000001506652163400157325ustar00rootroot00000000000000rocFFT-rocm-7.1.0/designdocs/bluestein.rst000066400000000000000000000372511506652163400204660ustar00rootroot00000000000000.. meta:: :description: rocFFT documentation and API reference library :keywords: rocFFT, FFT, ROCm, API, documentation .. _bluestein: ******************************************************************** Bluestein Design Document ******************************************************************** Summary ======= This document describes the implementation of the Bluestein algorithm for prime-length discrete Fourier transforms (DFTs) in the rocFFT library. An optimization of the Bluestein algorithm for large length DFTs provides several benefits, including significantly improved performance, and the ability to reuse the design to perform fast convolutions without any major design modifications. Background and notation ======================= Let :math:`\mathbf{X} = \mathcal{F}\left\{ \mathbf{x} \right\}` denote the DFT of :math:`\mathbf{x}`, which maps an :math:`N`-length input sequence :math:`\mathbf{x} = \begin{bmatrix} x_0 & \cdots & x_{N-1} \end{bmatrix}` into an :math:`N`-length output sequence :math:`\mathbf{X} = \begin{bmatrix} X_0 & \cdots & X_{N-1} \end{bmatrix}` with .. math:: X_k = \sum_{n=0}^{N-1}{x_n e^{-\frac{2 \pi \jmath}{N}nk}}, \qquad k = 0, \ \ldots, \ N-1. Conversely, let :math:`\mathbf{x} = \mathcal{F}^{-1}\left\{ \mathbf{X} \right\}` denote the inverse DFT, which maps the sequence :math:`\mathbf{X}` into sequence :math:`\mathbf{x}` as follows .. math:: x_k = \frac{1}{N}\sum_{n=0}^{N-1}{X_n e^{\frac{2 \pi \jmath}{N}nk}}, \qquad k = 0, \ \ldots, \ N-1. Bluestein algorithm =================== In Bluestein's algorithm, the following identity is considered for the DFT computation .. math:: nk = \frac{-(k-n)^2}{2} + \frac{n^2}{2} + \frac{k^2}{2}. 
Substituting this identity into the DFT equation, the DFT can then be expressed as .. math:: X_k = e^{-\frac{\pi \jmath}{N}k^2} \sum_{n=0}^{N-1}{\left( x_n e^{-\frac{\pi \jmath}{N}n^2} \right) e^{\frac{\pi \jmath}{N} (k-n)^2}}, \qquad k = 0, \ \ldots, \ N-1. +++++ Chirp +++++ Bluestein's algorithm is frequently used to compute the DFT, but it can also be used to compute the more general z-transform. This transform is similar to the DFT equation with the difference that the term :math:`e^{-\frac{2\pi \jmath}{N}}` is replaced by :math:`z`, where :math:`z` is an arbitrary complex number. Let :math:`\mathbf{c} = \begin{bmatrix} c_0 & \cdots & c_{N-1} \end{bmatrix}` denote an :math:`N`-length sequence of the form .. math:: c_n = e^{\frac{\pi \jmath}{N}n^2}, \qquad n = 0, \ \ldots, \ N-1. The sequence :math:`\mathbf{c}`, which is present in the Bluestein DFT equation, is also known as a chirp because it defines a complex sinusoid of linearly increasing frequency. Bluestein's algorithm is also known as the chirp z-transform for this reason. +++++++++++ Convolution +++++++++++ Now let :math:`\left(\mathbf{a} \ast \mathbf{b}\right)_k` for :math:`k = 0, \ \ldots, \ M-1` denote the convolution of two :math:`M`-length input sequences :math:`\mathbf{a} = \begin{bmatrix} a_0 & \cdots & a_{M-1} \end{bmatrix}` and :math:`\mathbf{b} = \begin{bmatrix} b_0 & \cdots & b_{M-1} \end{bmatrix}` with .. math:: \left(\mathbf{a} \ast \mathbf{b} \right)_k = \sum_{m=0}^{M-1}a_m b_{k-m}, \qquad k = 0, \ \ldots, \ M-1. The DFT in the Bluestein DFT equation can be expressed in terms of the convolution sum in the above equation as .. math:: X_k = b_k^{-1} \sum_{m=0}^{M-1}{a_m b_{k-m}}, \qquad k = 0, \ \ldots, \ M-1, with :math:`M=N`, :math:`a_m = x_m / c_m`, and :math:`b_m = c_m` for :math:`m = 0, \ \ldots, \ M-1`. 
From the convolution theorem it is known that, under suitable conditions, the convolution sum in the convolution definition equation can be evaluated by computing the point-wise product of the DFTs of :math:`\mathbf{a}` and :math:`\mathbf{b}` and taking the inverse DFT of the product .. math:: \left(\mathbf{a} \ast \mathbf{b} \right) = \mathcal{F}^{-1}\left\{ \mathcal{F}\left\{ \mathbf{a} \right\} \cdot \mathcal{F}\left\{ \mathbf{b} \right\} \right\}. Note, however, that Bluestein's DFT equation in terms of the convolution sum cannot be used to directly evaluate the DFT equation under the values of :math:`M`, :math:`a_m` and :math:`b_m` provided. ++++++++++++ Zero padding ++++++++++++ Consider instead that the DFT in the Bluestein DFT convolution equation is evaluated with .. math:: M \geq 2N-1 and the sequences :math:`\mathbf{a}` and :math:`\mathbf{b}` are zero-padded as follows .. math:: a_m = \begin{cases} x_m / c_m & \text{for $m = 0, \ \ldots, \ N-1$},\\ 0 & \text{otherwise} \end{cases} and .. math:: b_m = \begin{cases} c_m & \qquad \text{for $m = 0, \ \ldots, \ N-1$},\\ c_{M-m} & \qquad \text{for $m = M - N + 1, \ \ldots, \ M - 1$},\\ 0 & \qquad \text{otherwise.} \end{cases} In Bluestein's algorithm, the above conditions ensure that the convolution theorem holds and, therefore, Bluestein's DFT equation can be properly employed for the DFT computation. +++++++++++++++++ DFT via Bluestein +++++++++++++++++ Based on the two conditions for the sequences :math:`\mathbf{a}` and :math:`\mathbf{b}` obtained above, and the convolution theorem, the DFT can be computed as follows in Bluestein's algorithm .. math:: X_k = b_k^{-1} \mathcal{F}^{-1}\left\{ \mathcal{F}\left\{ \mathbf{a} \right\} \cdot \mathcal{F}\left\{ \mathbf{b} \right\} \right\}, \qquad k = 0, \ \ldots, \ N-1. There are quite a few operations involved in this computation.
More specifically, it requires computation of the chirp sequence, two :math:`N`-length plus one :math:`M`-length point-wise multiplications, zero-padding of two :math:`M`-length sequences, and two forward DFTs of length :math:`M` plus an inverse DFT also of length :math:`M`. The main reason for using Bluestein's algorithm is that it applies for the DFT computation of any input length :math:`N`, including prime lengths. When a fast Fourier transform (FFT) algorithm is used to compute the DFT, such as Stockham or Cooley-Tukey, it provides optimized length support via a given radix or combination of radices, e.g., :math:`N = 2, \ 3, \ 5, \ 25 \times 2, \ 16 \times 9`, and so on. Considering that the DFTs via Bluestein can be carried out with any length satisfying :math:`M \geq 2N-1`, a suitably chosen value of :math:`M` can be used to compute the convolution via an FFT with existing radix support. However, it should be mentioned that the Bluestein DFT computation is much slower than directly computing the DFT equation via an FFT with a supported length, even though both computations possess the same complexity of :math:`O(N \log N)`. Implementation -------------- An illustration of the steps required for Bluestein's algorithm is given in the figure below. .. figure:: images/bluestein_fig1.png Diagram of computations involved in Bluestein's algorithm A few observations can be made from the diagram. First, it can be seen that there are no direct dependencies between the two branches that compute :math:`\mathcal{F}\left\{ \mathbf{a} \right\}` and :math:`\mathcal{F}\left\{ \mathbf{b} \right\}`; therefore, the two sequences of operations can be performed independently, and parallelization can speed up the computations. Second, the chirp sequence is used multiple times throughout the diagram; therefore, re-utilizing the computed chirp sequence across the operations where possible may also be advantageous.
Finally, there are many operations in the algorithm, so it is preferable to combine these operations into the minimum number of kernels to reduce kernel launch overhead. +++++++++++++++++++++++++++ Device Kernel Configuration +++++++++++++++++++++++++++ Important factors to consider for an efficient implementation of Bluestein's algorithm are (1) the length of the DFT to be performed, (2) the size of available shared memory for the compute device at hand, and (3) the latency for launching device kernels. For instance, when the DFT length is small, all the operations in Bluestein's algorithm may be performed in a single device kernel, if data can fit into shared memory. This minimizes kernel launching overhead and provides the best performance. In the case where the DFT length is large, and the entire data does not fit into shared memory, a hierarchical approach is used where the large FFT is decomposed into smaller FFT device kernels that fit into shared memory for improved performance. In this large length DFT scenario it is important to minimize the number of device kernels in the implementation to reduce kernel launch overhead. The default implementation for Bluestein's algorithm when applied to large length DFTs is illustrated in the diagram below. .. figure:: images/bluestein_fig2.png Default device kernel configuration for Bluestein's algorithm and large length DFTs As can be seen from the diagram, Bluestein's algorithm is performed with (at least) six kernels in a single device stream. The chirp sequence is computed in a single chirp kernel, and the sequence is re-utilized at later stages via a temporary device buffer. The two forward DFTs are joined together in one fft device node. This is possible because the padded sequences :math:`\mathbf{a}` and :math:`\mathbf{b}` are contiguous in the temporary device buffer used in the implementation, thus allowing for a single fft node to perform the two fft operations. 
The inverse FFT operation requires a separate ifft device node. Similarly, the three point-wise multiplications are carried out with separate kernels, pad\_mul, fft\_mul, and res\_mul. Note that the fft (or ifft) nodes are usually split into at least two device kernels for large length DFTs. For example, a large 1D input data vector is viewed as a matrix (with same number of elements as the large vector), and the first FFT device kernel operates on rows of the data matrix while the second device kernel operates on the columns of the data matrix. In this scenario, a total of 8 device kernels are used to perform Bluestein's algorithm. ++++++++++++++++++++++++++++++++++++++++++ Optimizing Bluestein for large length DFTs ++++++++++++++++++++++++++++++++++++++++++ The default implementation of Bluestein's algorithm for large length DFTs can be optimized by following the design principles: #. Use the convolution as a building block for the implementation. #. Minimize the number of device kernels by fusing FFT read and write operations with Bluestein operations. #. Move computation of the chirp sequence from the FFT execution phase to the plan creation phase in rocFFT. The convolution building block is shown in the diagram below. .. figure:: images/bluestein_fig3.png Proposed configuration of device kernels for fast convolution In the building block, two independent FFT device nodes are used to carry out the forward DFTs. The point-wise multiplication of the two forward DFTs is fused with the read operation of the iFFT device node. Arranging the convolution in this configuration has two advantages. The independence of the two forward FFT nodes means that parallelism may be leveraged, since the two forward FFT nodes may be executed concurrently if required. Fusing the point-wise multiplication of the two forward DFTs means that a separate kernel for performing the point-wise multiplication is no longer required, thus reducing device kernel launch latency. 
A typical use case of the rocFFT library is to create an FFT plan device handle once, and perform FFTs on multiple input data using this same plan handle. As shown in the diagram of Bluestein's algorithm, the chirp sequence :math:`\mathbf{c}` is independent of the input sequence :math:`\mathbf{x}`. Since the execution phase of rocFFT depends only on the input sequence, it is advantageous to precompute :math:`\mathbf{c}` at the plan creation phase of the library. That way, :math:`\mathbf{c}` does not need to be recomputed every time an FFT is executed, thus reducing the overall amount of computation. Based upon the three design principles above, an optimized implementation of Bluestein's algorithm is described in the diagram below. .. figure:: images/bluestein_fig4.png Proposed configuration of device kernels for Bluestein's algorithm As can be seen from the diagram, the implementation of Bluestein's algorithm is similar to the fast convolution implementation. The main difference between the two implementations is that the forward/inverse DFT stages have additional fused operations in them. Compared to the default Bluestein implementation, at least three device nodes are used in the optimization. When using the row/column FFT decomposition for large lengths, this brings the total to 6 device kernels in the optimization, a significant reduction in the number of kernels compared to the default configuration. The read operation of the first DFT stage is fused with chirp + point-wise multiplication + padding. The read operation of the second DFT stage is fused with the chirp + padding. Similarly, the point-wise multiplication of the two forward DFTs is fused with the read operation of the inverse DFT node, and the chirp + point-wise multiplication is fused with its write operation. Since the chirp sequence is computed at the plan level, the chirp operations are performed by simply loading the computed chirp table into device registers.
Parallelization of the first two FFT nodes can be employed in the optimized implementation; however, preliminary tests have shown that little performance is gained by executing the two nodes simultaneously. The main reason for this is that a synchronization step is required after the two forward DFT stages. This is denoted by the thin solid rectangle in the diagram. Another factor is that the amount of computation performed on the second FFT node is usually much smaller than on the first FFT node. A typical use case of the rocFFT library is to perform batched FFTs. In this scenario, the amount of computation in the two forward FFT nodes is unbalanced since multiple FFTs are performed on the first node while only a single FFT is performed on the second node. This imbalance between the independent nodes makes the benefits of parallelization less pronounced. One last technical aspect of the optimization is the need to have separate transform and contiguous data indices across the multiple FFT nodes. Since the FFT nodes decompose a large length FFT into a column and a row FFT, the device kernels need to keep track of a global transform index to properly perform the fused read/write Bluestein operations. A similar concept is required for the data index, as the temporary buffers utilized for the computations are accessed in a contiguous fashion for minimal storage requirements. Copyright and disclaimer ======================== The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc.
makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of non-infringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. Terms and limitations applicable to the purchase or use of AMD’s products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale. AMD is a trademark of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. Copyright (c) 2024 Advanced Micro Devices, Inc. All rights reserved. rocFFT-rocm-7.1.0/designdocs/buffer_assignment.rst000066400000000000000000000445051506652163400221750ustar00rootroot00000000000000.. meta:: :description: rocFFT documentation and API reference library :keywords: rocFFT, ROCm, API, documentation .. _buffer_assignment: ******************************************************************** Buffer assignment design document for rocFFT ******************************************************************** Summary ======= Buffer assignment in rocFFT is the work of coordinating the input and output buffers of each step in a rocFFT plan. Observations ============ Some observations can be made about the FFT planning and buffer assignment process: 1. Buffer assignment begins after the plan structure is decided. This means all of the node schemes, as well as input lengths and input/output strides are known. Note that at this time, output lengths are not directly known. 2. The first child of any non-leaf node must read from its parent's input, and the last child must write to its parent's output. 3. 
The input of any other child node must be the same as the output of its preceding sibling. There is one exception: a CS_KERNEL_CHIRP node is only used to populate the Bluestein temporary buffer, and can essentially be ignored during buffer processing as it does not actually read input. 4. The top-level node in the tree must read from the user-defined input buffer, and write to the user-defined output buffer. These buffers will be the same for in-place transforms. 5. When deciding buffer assignments for a node, only the output buffer for nodes besides the last requires actual decision-making. The input buffer follows either from the top-level input, a preceding sibling, or a parent node's input. The last node's output must be the user-defined output. 6. During buffer assignment, some number of buffers are available. At minimum, we have the user input and output buffers (which may be the same for in-place transforms) whose sizes are defined by the user. Zero or more temporary buffers may be needed, whose sizes are dynamic and can be as big as necessary for the transform to succeed. 7. Some choices of output buffers are clearly invalid. For example: * Transpose nodes must always be out-of-place (i.e. input buffer cannot equal output buffer). * Some internal kernels only support interleaved formats as their input or output. For example, the input of a copy-kernel (like COPY_CMPLX_TO_R or COPY_CMPLX_TO_HERM) must be interleaved. * Internal temp buffers are allocated contiguously, so they can be used on both planar and interleaved formats. This is not always true for user-provided buffers. An obvious example of this is planar data: users typically create these using two buffers. * A node cannot write data to a buffer if the buffer is too small for that data. This really only applies to user input/output buffers, as temp buffers are always made large enough.
Solution ======== We implement a decision function that determines whether a buffer assignment is valid based on the observations above. Buffer assignment should do an exhaustive search through the space of possible buffer assignments for the tree, calling the decision function for each potential choice. If we arrive at the end of the tree and all assignments are valid, then the buffer assignment operation is complete. Returning the first valid buffer assignment found is a simple solution. However, not all valid buffer assignments are equal in terms of memory usage and/or performance: some buffer assignments allow more kernel fusions and/or use more in-place kernels. This implies that we should keep all valid assignment candidates in a list and subsequently return the "best" one. The first pass of buffer assignment shall attempt to assign buffers starting with just the user input buffer (and output buffer, if distinct) and a Bluestein temp buffer (if the plan requires it), but without any other temp buffers. If that first pass is not successful, we retry with one temp buffer added to the list of available buffers. If that is still not successful, we retry with a second temp buffer. Implementation ============== A structure storing a try ------------------------- We store our current assignment try in a tree-like structure. We don't assign to the tree-node directly since there could be many valid assignment paths for one plan. Once we determine the best assignment path from this tree, we fill the assignment back to the real tree-node. .. 
code-block:: cpp struct PlacementTrace { TreeNode* referedNode; Buffer inBuffer, outBuffer; ArrayType inArrayType, outArrayType; // each branch stands for a possible try on the next node vector<PlacementTrace*> branches; // a parent pointer for backtracking PlacementTrace* parent; // propagate these values from the root to leaves size_t numFusedNodes; size_t numInplace; size_t numTypeSwitching; } Exhaustive search ----------------- All possible assignments on each node are attempted. There are several limitations on each node that allow us to reject many illegal assignments and prevent the assignment tree from growing exponentially. For example, SBRC and transpose kernels can only be done using out-of-place buffers. The exhaustive search is described by the following pseudocode: .. code-block:: cpp // ------------------------------------------------------------------------------------ // Recursive function enumerates all the possible assignments // Returns a subtree, starting from execSeq[curSeqID], with input startBuf & startAType // ------------------------------------------------------------------------------------ Function: void Enumerate(PlacementTrace* parent, ExecPlan, curSeqID, startBuf, startAType) // for terminal condition: - if curSeqID is the last node - if the end buffer and array-type fit the root-plan setting - calculate the number of eligible kernel-fusions. - add this candidate to the winnerCandidates list. 
- finish this path, return // not terminal condition: // add a single assignment on current node and append to parent's branches - if current node->isPlacementAllowed(inplace) // add a branch which uses inplace (if allowed) on this node and test validity - if ValidOutBuffer(execPlan, *curNode, startBuf, startAType) - append an assignIP = PlacementTrace(curNode, startBuf, startBuf, startAType, startAType, parent) - call Enumerate(assignIP, execPlan, curSeqID + 1, startBuf, startAType); - if current node->isPlacementAllowed(out-of-place) // add branches which use out-of-place (if allowed) on this node and test validity - for each testOutputBuf in the availableBuffers set, (where testOutputBuf != startBuf) - if ValidOutBuffer(execPlan, *curNode, testOutputBuf, testOutType) - append an assignOP = PlacementTrace(curNode, startBuf, testOutputBuf, startAType, testOutType, parent) - call Enumerate(assignOP, execPlan, curSeqID + 1, testOutputBuf, testOutType); // -------------------------------------------------------- // Decision maker: choose the best one from all candidates // This function is a sorting function, pick the first one // -------------------------------------------------------- Function: bool ValidPathExists(ExecPlan) - if winnerCandidates is empty, simply return false - using std::sort, sort by: // the one that can fuse more kernels is better - lhs->numFusedNodes > rhs->numFusedNodes ? // if tie, compare inplace kernels, more is better - lhs->numInplace > rhs->numInplace ? // if tie, compare the number of array-type switches, fewer is better - lhs->numTypeSwitching < rhs->numTypeSwitching ?
- pick the first one, and do the Backtracking() - fill-in the assignment back to the real tree-nodes // --------------------------------------------------------- // Top-level function that assigns buffers on the root plan // --------------------------------------------------------- Function: void AssignBuffers(ExecPlan) - add rootPlan in/out buffer to availableBuffers set - Note: For C2C out-of-place, we can't add USER_IN to the set to prevent it from being modified. - add rootPlan in/out array-type to availableArrayTypes set - add OB_TEMP_BLUESTEIN to availableBuffers set, if plan uses Bluestein - initialize a winnerCandidates list to save all valid results. - initialize a dummyRoot of PlacementTrace as tree root, this dummyRoot pretends it's a parent of the first node (in execSeq). So dummyRoot.outBuf = rootPlan->obIn, and dummyRoot.oType = rootPlan->inArrayType // The 1st round try - call Enumerate(&dummyRoot, execPlan, 0, dummyRoot.outBuf, dummyRoot.oType) here 0 is curSeqID, which means starting from the first leafNode - call ValidPathExists() to pick the best solution - if successful, return // The 2nd round try - add OB_TEMP to availableBuffers - call Enumerate(&dummyRoot, execPlan, 0, dummyRoot.outBuf, dummyRoot.oType) here 0 is curSeqID, which means starting from the first leafNode - call ValidPathExists() to pick the best solution - if successful, return // The last round try - add OB_TEMP_CMPLX_FOR_REAL to availableBuffers - call Enumerate(&dummyRoot, execPlan, 0, dummyRoot.outBuf, dummyRoot.oType) here 0 is curSeqID, which means starting from the first leafNode - call ValidPathExists() to pick the best solution - if successful, return // Failed - if not found, throw exception. Decision function and output lengths ------------------------------------ Much of the remaining complexity lies in the ValidOutBuffer() decision function mentioned above. Output lengths often differ from input lengths on a node. 
For example, R2C/C2R transforms change the data length from the input, and transpose kernels swap dimension lengths between input and output. Tree nodes need to store their output length explicitly so that the decision function does not need to guess at what lengths any node will output. This information is also helpful to log, so humans reading the plan don't need to guess either. As the exhaustive search proceeds, it likely needs to call the decision function multiple times with identical inputs. This is because it might need to decide validity of two plans that might only have tiny buffer assignment differences. The results of the function are cached to reduce extra work during the search. Fusions ------- Kernel-fusion is essential for improving performance. Unfortunately fusion depends heavily on buffer assignment. Two (or more) kernels can be fused into one kernel only when the resulting buffer assignment remains valid. To maximize kernel fusion, we also implement a FuseShim framework. A FuseShim class is a container/shell indicating that there is a potentially-fusable kernel-fusion. Each FuseShim defines its own requirements to fulfill the fusion, including the expected buffer assignment. During the buffer assignment process, we can use the test function to get the final number of the achievable kernel fusions. This number plays the most important role when making the final decision: we always pick the one which can fuse the most kernels. Padding ------- We have cases where reading/writing along certain strides is bad for performance (e.g. power-of-2). While we are unable to adjust strides for user-provided input and output buffers, we can potentially pad temp buffers to avoid bad strides. Once a plan candidate is constructed and buffers are assigned (including any kernel fusion), a padding pass can adjust the output strides of any node that writes to a temp buffer with bad strides. 
The padding pass must also consider the input lengths and strides of subsequent nodes that continue to use the same temp buffer, and adjust them accordingly. The writing and reading nodes might also decompose the problem differently, so the logic needs to be aware that a change to one dimension's stride on the write side may affect multiple dimensions' strides on the reading side, and vice-versa. Padding example ^^^^^^^^^^^^^^^ For example, consider this excerpt of a large plan: .. code-block:: scheme: CS_KERNEL_TRANSPOSE length: 4096 262144 outputLength: 262144 4096 iStrides: 1 4096 oStrides: 1 262144 OB_USER_OUT -> OB_TEMP scheme: CS_KERNEL_STOCKHAM_BLOCK_CC length: 512 512 4096 outputLength: 512 512 4096 iStrides: 512 1 262144 oStrides: 512 1 262144 OB_TEMP -> OB_TEMP scheme: CS_KERNEL_STOCKHAM_BLOCK_RC length: 512 512 4096 outputLength: 512 512 4096 iStrides: 1 512 262144 oStrides: 1 512 262144 OB_TEMP -> OB_USER_OUT The first kernel writes 262144 elements on the fastest dimension, and the higher dimension of 4096 elements is written along large power-of-2 strides, making it a good candidate for padding. The following two kernels decompose the 262144 length to 512x512 along their fastest dimensions. Padded output of the first kernel needs to modify the following strides using the same buffer, until the data leaves that temp buffer: .. code-block:: scheme: CS_KERNEL_TRANSPOSE length: 4096 262144 outputLength: 262144 4096 iStrides: 1 4096 oStrides: 1 262208 OB_USER_OUT -> OB_TEMP scheme: CS_KERNEL_STOCKHAM_BLOCK_CC length: 512 512 4096 outputLength: 512 512 4096 iStrides: 512 1 262208 oStrides: 512 1 262208 OB_TEMP -> OB_TEMP scheme: CS_KERNEL_STOCKHAM_BLOCK_RC length: 512 512 4096 outputLength: 512 512 4096 iStrides: 1 512 262208 oStrides: 1 512 262144 OB_TEMP -> OB_USER_OUT The second kernel is in-place, and would need iStrides == oStrides. 
The padding pass would need to continue through the execution plan to keep the third kernel's input strides consistent with the second's output. The output of the third kernel is a user buffer, so we cannot change its padding. When to pad ^^^^^^^^^^^ The exact criteria for when to add padding to a temp buffer (and how much) are an implementation detail, but ad-hoc planning we've done in the past has padded strides if higher dimension data longer than a threshold is written along sufficiently large powers of two. The decision logic around padding is centralized in one place in this design, making it more feasible to have per-architecture decisions around padding, should they become necessary. Choosing a winner ----------------- The exhaustive search is a depth-first-search that produces a list of valid plans, each of which would produce correct results. The list is sorted to decide which option is best, and the best plan is ultimately given to the user for execution. The sort criteria are: 1. Number of fused kernels (more is better, to minimize kernel launches and global memory reads/writes) 2. Number of buffers used (fewer is better, to minimize temporary memory usage) 3. Number of padded reads/writes (more is better, to maximize use of padding once we've accepted the memory cost) 4. Number of in-place operations (more is better) 5. Number of type changes (e.g. planar -> interleaved, or vice-versa) in the plan (fewer is better, as a tiebreaker) Future work =========== Strides ------- Currently, rocFFT does not guarantee that strides on user buffers are respected if temporary data is written to those buffers. With this implementation, it would be simpler to begin enforcing such a guarantee. Enforcing read-only input ------------------------- rocFFT may currently overwrite user input buffers for out-of-place real-transforms (not C2C-transform). 
Although we have documented this behavior, and it is common practice in other libraries, it might still be unintuitive for some users. If we ever wanted to start guaranteeing that user input is left unmodified, this buffer assignment implementation would make that work trivial - only the decision function needs to be made aware of this policy change, and buffer assignment will work fine. However, we may need to introduce yet another temp buffer, since we'd be taking away a potential work space from existing plans. Flexibility between minimizing memory or maximizing fusions ----------------------------------------------------------- We can't always expect that there is a perfect assignment that maximizes kernel fusions while also minimizing temporary buffers. In some cases, these two goals are contradictory: if we choose an assignment using minimal buffers, we may lose the opportunity to fuse more kernels. On the other hand, if we are allowed to use more memory, we have more buffers available for out-of-place kernel-fusions. With this implementation, it is possible to introduce an optimization strategy option to users. For example, if the memory usage is the main concern of users, we can return the assignment with the least buffer usage. Otherwise, we return the result which maximizes the kernel fusions regardless of the memory consumption. Make C Buffer as Temp2 Buffer ----------------------------- There is no reason to limit the "C" buffer to real-transforms only. We can make the C buffer another generic temporary buffer throughout; this can also avoid any confusion about the purpose of C and T. Copyright and disclaimer ------------------------ The information contained herein is for informational purposes only, and is subject to change without notice.
While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of non-infringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. Terms and limitations applicable to the purchase or use of AMD’s products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale. AMD is a trademark of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved. rocFFT-rocm-7.1.0/designdocs/codegen.rst000066400000000000000000000244671506652163400201050ustar00rootroot00000000000000.. meta:: :description: rocFFT documentation and API reference library :keywords: rocFFT, ROCm, API, documentation .. _codegen: ******************************************************************** Code Generator Design Document for rocFFT ******************************************************************** Proposal ======== Create a new code generator for rocFFT. Rationale --------- The current code generator: * dates from clFFT * is based on string concatenation Ideally, a new code generator: * based on an abstract-syntax-tree (AST) * generates faster, more robust kernels ASTs allow generated code to be transformed and manipulated before being emitted. 
A concrete example of this for FFT kernels would be: automatically translating a kernel from interleaved to planar format. How the generator is designed and implemented is crucial for both conciseness and ease-of-use. Required kernels (scope) ======================== For rocFFT, we need/want to generate: * Host functions to launch the FFT kernels * Tiled (row/column) + strided + batched Stockham kernels for arbitrary factorization * May want to extend to Cooley-Tukey kernels as well Kernels need to handle all combinations of: * single/double precision (and be extendable to half-float and bfloat) * in-place/out-of-place * planar/interleaved * real/complex * small/large twiddle tables * unit/non-unit stride * transposed output, including with twiddle multiplies for large 1D * fusing with pre and post-processing kernels (e.g. real even-length) Ideally any configuration/runtime parameters required by the kernels would be defined in a single place to avoid repetition between rocFFT and the generator. We have flexibility in handling these combinations at compile-time or run-time. For example, multiple kernels could be generated for single/double precision, but unit/non-unit stride could be handled at runtime. Fundamentally, all multidimensional and batched FFTs can be written in terms of 1D transforms (with affine indexing). As such, an FFT is broken down into: * A *host* function that is aware of dimensions, strides, batches, and tiling. This function would be responsible for determining how the problem will be broken down into GPU thread blocks. * A *global* function that is aware of GPU thread blocks, dimensions, strides, batches, and tiling. This function would be responsible for determining offsets and strides for the device function, and declaring LDS memory buffers. * A *device* function that is passed offsets and strides, and is aware of GPU threads. The device function would perform a (short) 1D transform. 
A device function may be called so that a thread block is actually transforming multiple batches. As such, indexes (the spatial index in the FFT) should be computed as: .. code-block:: c int fft_index = threadIdx.x % width; Tiling ====== Launching device kernels in a way that traverses memory in tiles will be handled at the host/global level. Kernels need to support reading/writing in columns/rows. These are the block CC/RC/CR flavors (where C and R refer to column and row) of the existing kernels. Strides and batches =================== Host ---- Host/global functions should support arbitrary dimensions, lengths, strides, offsets, and batches. Users should be allowed to store their arrays arbitrarily. For an :math:`N`-dimensional dataset, the flat index :math:`a` corresponding to indices :math:`(i_1,\ldots,i_N,i_b)`, where :math:`i_b` is the batch index, is given by .. math:: a(i_1,\ldots,i_N,i_b) = s_b i_b + \sum_{d=1}^N s_d i_d where :math:`s_d` is the stride along dimension :math:`d` and :math:`s_b` is the batch stride. To support these strides, the device function to compute the FFT along dimension :math:`D` would be passed: .. code-block:: c int offset = 0; offset += batch_index * batch_stride; for (int d=0; d < N; ++d) if (d != D) offset += spatial_index[d] * strides[d]; int stride = strides[D]; For example, in three dimensions, to compute the FFT along the y-dimension given x and z indices ``i`` and ``k`` for batch ``b``, the device function would be passed: .. code-block:: c int offset = 0; offset += b * batch_stride; offset += i * strides[0]; offset += k * strides[2]; int stride = strides[1]; Device ------ Device functions should support arbitrary offsets and strides. Array indexes in device functions should be computed as, e.g.: .. code-block:: c int fft_index = threadIdx.x % width; int array_index = offset + fft_index * stride; Large twiddle tables ==================== Large 1D transforms are decomposed into multiple transforms.
To reduce the size of twiddle tables, rotations can be decomposed into multiple stages as well. For example, the rotation through :math:`2\pi \cdot 280 / 256^2` can be decomposed into :math:`2\pi \cdot 1 / 256 + 2\pi \cdot 24 / 256^2`. The resulting twiddle table contains 512 entries instead of 65536 entries. Generated kernels should support these "large twiddle tables". Launching ========= For a specific transform length, the generator is free to choose among several algorithms and related tuning parameters. These choices may influence how the kernel is launched. The generator will create both the kernel and the accompanying struct, which gives indications of how the kernel may be used in both rocFFT and other applications. The generator will populate a function pool with structs of the form .. code-block:: c++ struct ROCFFTKernel { void *device_function = nullptr; std::vector<int> factors; int transforms_per_block = 0; int workgroup_size = 0; // ... }; This moves the responsibility of figuring out how a kernel should be launched to the generator. Currently, kernels are launched with: * dimension * number of blocks (batches) * number of threads (threads per batch; kernel parameter) * stream * twiddle table * length(s) * strides * batch count * in/out buffers Implementation ============== The code generator will be implemented in Python using only standard modules. The AST will be represented as a tree structure, with nodes in the tree representing operations, such as assignment, addition, or a block containing multiple operations. Nodes will be represented as objects (e.g., ``Add``) extending the base class ``BaseNode``. Operands will be stored in a simple list called ``args``: .. code-block:: python class BaseNode: args: List[Any] To facilitate building ASTs, the base node will have a constructor that simply stores its arguments as operands: ..
code-block:: python class BaseNode: args: List[Any] def __init__(self, *args, **kwargs): self.args = list(args) To facilitate rewriting ASTs, node objects' constructors should accept a simple list of arguments/operands. This, for example, allows a depth-first tree re-write to be implemented trivially as: .. code-block:: python def depth_first(x, f): '''Depth first traversal of the AST in 'x'. Each node is transformed by 'f(x)'.''' if isinstance(x, BaseNode): y = type(x)(*[ depth_first(a, f) for a in x.args ]) return f(y) return f(x) To emit code, each node must implement ``__str__``. For example: .. code-block:: python class Add(BaseNode): def __str__(self): return ' + '.join([ str(x) for x in self.args ]) Stockham tiling implementation ------------------------------ To support tiling, the *global* function is responsible for loading data from global memory into LDS memory in a tiled manner. Once in LDS memory, a singly strided *device* function performs an interleaved, in-place FFT entirely within LDS. Polymorphism will be used to abstract tiling strategies. Different tiling strategies should extend the ``StockhamTiling`` object and override the ``load_from_global`` and ``store_to_global`` methods. For example: .. code-block:: python tiling = StockhamTilingRR() scheme = StockhamDeviceKernelUWide() body = StatementList() body += tiling.compute_offsets(...) body += tiling.load_from_global(out=lds, in=global_buffer) body += scheme.fft(lds) body += tiling.store_to_global(out=global_buffer, in=lds) Different tiling strategies may require new template parameters and/or function arguments. Tiling strategies can manipulate the following methods: * ``add_templates`` * ``add_global_arguments`` * ``add_device_arguments`` * ``add_device_call_arguments`` Each of these methods is passed a ``TemplateList`` or ``ArgumentList`` argument, and should return a new template/argument list with any extra parameters added.
Large twiddle tables -------------------- Device kernels may need to apply additional twiddles during their execution. These extra twiddle tables are implemented similarly to tiling. Different twiddle table strategies should extend the ``StockhamLargeTwiddles`` object and overload the ``load`` and ``multiply`` methods. Twiddle tables may also require additional templates and arguments. See the Stockham tiling implementation section. Copyright and disclaimer ======================== The information contained herein is for informational purposes only, and is subject to change without notice. While every precaution has been taken in the preparation of this document, it may contain technical inaccuracies, omissions and typographical errors, and AMD is under no obligation to update or otherwise correct this information. Advanced Micro Devices, Inc. makes no representations or warranties with respect to the accuracy or completeness of the contents of this document, and assumes no liability of any kind, including the implied warranties of non-infringement, merchantability or fitness for particular purposes, with respect to the operation or use of AMD hardware, software or other products described herein. No license, including implied or arising by estoppel, to any intellectual property rights is granted by this document. Terms and limitations applicable to the purchase or use of AMD’s products are as set forth in a signed agreement between the parties or in AMD's Standard Terms and Conditions of Sale. AMD is a trademark of Advanced Micro Devices, Inc. Other product names used in this publication are for identification purposes only and may be trademarks of their respective companies. Copyright (C) 2021 - 2024 Advanced Micro Devices, Inc. All rights reserved. rocFFT-rocm-7.1.0/designdocs/design.rst000066400000000000000000000011551506652163400177370ustar00rootroot00000000000000.. 
meta:: :description: rocFFT documentation and API reference library :keywords: rocFFT, FFT, ROCm, API, documentation .. _design_documents: ******************************************************************** Design Documents ******************************************************************** The Design Documents contain proposals for features of the rocFFT library. They are intended as development proposals for engineering and contributors to the Open Source library. The current proposals in this directory include the following: * codegen * runtime_compilation * buffer_assignment * bluestein rocFFT-rocm-7.1.0/designdocs/images/000077500000000000000000000000001506652163400171775ustar00rootroot00000000000000rocFFT-rocm-7.1.0/designdocs/images/bluestein_fig1.png000077500000000000000000002564451506652163400226300ustar00rootroot00000000000000PNG  IHDR !o'iCCPkCGColorSpaceAdobeRGB1998(c``RH,(a``+) rwRR` `\\0|/” RR N.(*a``K @ [$)^bdo!`5 g ͗f3K@l 蘒򽆡&~ JR+J@s~AeQfzF#0Rþ9ɥEePcv1HJ_G9+8eXIfMM*i Ҟ@IDATx ]?_BbDSJDj)5![Lz<KPEiShbӘ*cbȿkssI鳞>kY}ku  @ @ @ @ Ю۵6 @ @ @ @ @ @ @ @ @A@@/TU @ @ @ @ @= @ @ @ @ ^$@ @ @ @ @z> @ @ @ @ @ PUI @ @ @ @| @ @ @ @ @@z9 @ @ @ @  @ @ @ @ @r@U% @ @ @ @ @ @ @ @ @9倪J @ @ @ @ 3@ @ @ @ @rU @ @ @ @ @@@g @ @ @ @ *  @ @ @ @ @ @ @ @ @A@@/TU @ @ @ @ @= @ @ @ @ ^$@ @ @ @ @z> @ @ @ @ @ PUI @ @ @ @| @ @ @ @ @@z9 @ @ @ @  @ @ @ @ @r@U% @ @ @ @ @ @ @ @ @9倪J @ @ @ @ 3@ @ @ @ @rU @ @ @ @ @@@g @ @ @ @ *  @ @ @ @ @ @ @ @ @A@@/TU @ @ @ @ @= @ @ @ @ ^$@ @ @ @ @z> @ @ @ @ @ PUI @ @ @ @| @ @ @ @ @@z9 @ @ @ @  @ @ @ @ @r@U% @ @ @ @ @ @ @ @ @9倪J @ @ @ @ 3@ @ @ @ @rU @ @ @ @ @@@g @ @ @ @ *  @ @ @ @ @ @ @ @ @A@@/TU @ @ @ @ @= @ @ @ @ ^$@ @ @ @ @z> @ @ @ @ @ PUI @ @ @ @| @ @ @ @ @@z9 @ @ @ @  @ @ @ @ @r@U% @ @ @ @ @ @ @ @ @9倪J @ @ @ @ 3@ @ @ @ @rU @ @ @ @ @@@g @ @ @ @ *  @ @ @ @ @ @`>+E@. 
,@tAԭR @ @_;^z+V#fmbWȵ]T@)8"@@5 ;6_~jnh8޽{:L @ @Vvu׸kM@w{79j4%n{ @ @ @ @rmN%@6t8zK8xwEB @ @@[ѣG[|*,0lذK+ '@+ WceԐRK-C k*8#CzC @ @rXr%L&LPz %nsV= @ @ @ @4^c^ @ @ @ @ @@z9 @ @ @ @S@@1]  @ @ @ @ @ gUO @ @ @ @) ט @ @ @ @ ^'@ @ @ @ @kqk @ @ @ @Y@@/g` @ @ @ @ @@c 55 @ @ @ @, 3  @ @ @ @ @1s @ @ @ @rX @ @ @ @ Иz9zM @ @ @ @9  z @ @ @ @hLw&@ @ @ @ @rV= @ @ @ @4^c^ @ @ @ @ @@z9 @ @ @ @S@@1]  @ @ @ @ @ gUO @ @ @ @) ט @ @ @ @ ^'@ @ @ @ @kqk @ @ @ @Y@@/g` @ @ @ @ @@c 55 @ @ @ @, 3  @ @ @ @ @1s @ @ @ @rX @ @ @ @ Иz9zM @ @ @ @9  z @ @ @ @hLw&@ @ @ @ @rV= @ @ @ @4^c^ @ @ @ @ @@z9 @ @ @ @S@@1]  @ @ @ @ @ gUO @ @ @ @) ט @ @ @ @ ^'@ @ @ @ @kqk @ @ @ @Y`W= @ @ @ 0'رc^ɓ'G>}o߾2Ă .8w{Vj"@ @ @ @;OѣGϴlI{챱[/ @) WU @ @ @ P~ 2$}Ѭ7_ZꪫƄ 瞋_|1{mĈZvmֱ @Ы @ @ @rI&[lO?tt%vmSNe-裏^x!;>f̘r-cԨQѳgϲs @)б:U @ @ @ @>.,zfaklM<ѿc  @*0^ @ @ @ @>,\'q]t` >(C='N[.viXc5\W%p/^{Eo[ .-k*ĴiӲ=1nܸ۷o @@u V"@ @ @ @ 6,8rߴ7pCώ?>O:t3fd=3#s9uUĩSo]l?{,gׯ_ 0 F]|ȑ#6 @@ XzF @ @ @ Pq*gz-:5a„V/w19X|k>,|رc;s㮻jU}>sαb5#cv%͠WZJzm @%`!@ @ @ @@|ױ{M7TlS.]bUWX Fi%X"ps=wٹSNk_zxG}tOW^ٹVX!^zl?-}δjy!wI'!  
m&MTvNU @Ы1B @ @ @TDc-=8蠃P\  n8㨣*1-;{qo]vYx[7R K.]_ z~ʹ暫]x`Αf',g洬SO=UvJ^CT^  @ @ @ @z>8c+2cƥo^?%iI zl|u e4_-ݻ_}[niq Pz;6ZF @ @ @ t-{l[Y;n:uj+f,x5Ҷ믿~q?o16llV[-zORoGqi5{ @oL#0lذXfeE92&L|Own+zǥ^J;|/r\{/6 @ @ @ \|goҥK(kZü/|p S< 6x5K73J.+߱{d?{,RYnZz1 @l@4"nmI&e;s\qq駟^駟N;zkq0 䥻;참۳^x"97tSq @ @ zxr>fRIG}|_ϥۦ >y\#] M4)ݺu?qEE']wݕ+tc @@ ]ehpc=6:Lo߾qWo^Tb-җBe]SN_G׮]^ 7P cv+z&@ @ @Z) fg.qQG]'pB|,t+B.[[iW^3mǜPzTGsZ_K8qb裏f/~UңL>=s:uj_ʖرcPy  @Ыq*4}W 筸EYE5o饗7x#{-җTZW_3f^+3nܸ¦g @ @ @ if^KKj~/cƌm&Ѿ 7#F_#8gf[lL_/V/FzI֜ @z'@}~Ǝ;͢־5-o)S~2.h}R8Ў>Y|NΌ7a„x66p¦g @ @ @ /x$kkޒBdў/Y}Vȑ# 0[-{㦛nzO ۥYRn}:[/Ճ1 0`N} @,[ @ W/8[7M/7Jwyٴŗ]vY,첳l /}vmWo,ofc/ 饥jlis@~L_o +rkNs PzU0@@ Ҳ+B\z_Ԗ7ޘ՛Y^#QoӣG4hPٱtgZ vi漻+w^v @ @ @:~_СC87n\a)8w5߃枻]@o&D8?v)޸ mK}hi5tB/M4B~s8ӊײAT@JuWh 7x#IzjyqD.]Hvv/b࣏>zSz@kZ暫b-G @ @ @NsM)ȖnO=#sx'U3<GRH/szSNvۭx^xWMP&b4s@C Sk%Ȗ6M_իWzȐ!϶qߛz  @ @ P뮻n)w)o[mU{ѩS8ce#GƴiӊmHIATҪ=nip;cƌy4G}6ҹsl¾g @q̠׸cNO>.(.zcg_:L;Z[Z el @ @ @,ؖfK.+lMz٤ ymxrH\uUeO8l:ꪫF 奥kSI  @ '0cƌ{GϞ=cѿO<9#ݻwM @ @Ԡ .sNH~ѣG( ٭4@Z6͖W_}u;[wذa&b-l  @ЫT_`ĉqfUVY%vmu]cei]y䑘>}zY]ożv @ @ @@+ lwqG|駭|7;mf;\7W^yeWMoH;8餓Ԥ46O>d{~_e-\tMk^  @T@@T6 -3Dz/kVlnCT[GѬIlIc@ r-OĒK.kFva~hZ~_WL0!]tXveeR  @ @HK׾o cҤIqYgehE_x:Eis{FӦMk}cǎ>8A-29H P;B"@@q^;*=]Qi6-2|6R9zi*w1b)wXmղ?x'?x^o~XgusioG -av};s9 %8㌜jW- @ @@%gifo+B ewIW{.x}Yҍqg3/7&MWQT{K4G;hP;4׿66H?+;vW^9jlJѮꩧʺ Gkf9|q{s5WY^x!v},wGfwzo/?"s%ı՗ {zkYIwCw9s][ 5;|'OfA @ P;SG-}Fz4Zina;q7_ofƒz뮻n:S׮]jM>i"@;U PiiJ P]t*5[_MTI#5mQʗ_~-[B/\sMtqKr)1eʔ,wf}L3祒ڜfK_ @ @ @:$)tW(;wnOi&:*L\niKieZgѩSBu  @~hTtMY%5䢋.ګm- pԳ>Km> +Đ!Co6 i{,z![qk %,x[*oVR) ,@q;mp^*^zixvvb@/6f̘Vi=.š}Q*,~K3|* @ @jY d~jZo3[ݦ鹅TWK+H\*?;5lW_=w޲wesw/Yqw6R xl*l& Р5Kw'R9矿x/Ғ jh뗻}߿ׯs8묳/~^~峻Ϛ_f-hʹ~[,[ofurKvV[-[Ƹpѣ 紤yĉӓ- ߨo eU-*#@ @ y睗27ˤvi3{;rF4]Z&>b뭷{}dgeoO+TZ^ܖ꫷c~QT@M L[?4-yrwt \i8/]^˾T<ؘ4ދ/X/lꪅ iT~_d_[:qȑPx鎹Ғ\rIfۅ^H1 @ @Գ@aI&ŴiӚuώ=#d + \H%24=Vmj?/l?c2; @a:6luHwZ Vb7ߌ3hnq>L;v.,{2$ݻw#Fonaٱ;i9'xxxW_o @ @Ի?񏬋MM;_a;NvGp}/;B+iʼno )@X(kFt֭[Zk?3~Zv?i0lhf$@ -zuer)bA^YKҝq,H6^C9$;~JKBx(N8Xowމ/nl4 ?yɫ-o42h !g @ @԰3jn ŦemfyxegV9rd{94]iIϬl馑n/O>9['cԨQһ7|se @TBp 
P)Zi<:]d뮻n<ѯ_x饗b„ ghKsz-ij>ӦM^K3&/uYXw[6Jzi…?Am @ @?8OǏ-: tZZvHSȟq,y'O~()eieYpM/Bj %}9Oͪ1|)imݶ6N @ @OW_~[N[t,\wuWj{emzj3&&M={nOzh{EO?=gXb%Z(ėM7U6d,Tkc%d_~yx1zԩSjgV @z yc]v%LJm],*]uƍO>$iJkZҲp@v/tHskiٕ4_nPy)W(it7e+SO-L @ @OcC۵kHߩXcܹs<#q%d@V[m=Yw UiN:餸wJΥc*]R.=RZjH 0+Yxj}=f+݅fK.iҒB+MHHʌ3P^ p @ @#GfMe #֣Gi֟gرc#ͺ?)JD!WK7اUq~nlu 6ؠxt#^Tnk  @$< @D }/i7h3o+ҽ{l ?vyq @ @ P9x [Ҵ4~+Gp^zgQF6ygt -i:oKW)}駟?8uJK  @5% WSå m/&M*^*}OΪyxi{ԩq 'c=_~ٶ @ @ @r>`lv1eʔF\s5 / eɅ͆~=zt./СC\tE 婧ݲiR벓 @j@ @xG..wg+;\O?G?7x#Ydl۲ @ @ @.n{/(vZ5oM6lX5\ ^dumnp/ipóF_xg @@5 UhTTg)~Re/쬸⊱;oXuUC6K/t @ @ Pc9&y睲'.²c3IK|ŗK.lB@oV3<3tRԙ8qbr)Re'!@Ԁ^ & @+P:J+ol׿۲mǏ:^}Hup;(g @ @.W\:G}t,B͎ꫯ"̶z(,k.ĬiӧCulLڻw8^K⥗^*3fL|ѭ[XeUm @jQ`Zl6 @<9w|IlV1r-Xw}f{#ѳglfS @ @ @]w]̘1!;ww߽Xڙ2eJvCC={oJc` ksmVY=ztZѣG 0`=㪫nrO'N6-ұdJa&7ޘm& @jyrԩS 7hR .(-~ذaqwvmÇ*53/ Pz=~ZO @ @ @@+xxwZ<{ȑ-OӍkFҬn&RBnVۖƠA{)>#b7/.+WA԰jt @ @ @Z/0nܸO:th3-#ܭ[7d*!@j]@@GP  @ @ @ho%\2:E]T\w7.n^'@T^ @ @ @ .kVϛoXkvuHέ}[M7y?~|;ѳgXjo߾ѧOݻw,B3O~~zVN;Ԧ: PsWs㴍 @ @ @%袋+?|ʧ~ݖ4܎;sO.]5{n^r-cP @$`zM}!@ @ @ @`\pAF1cƔNuoРApr-{u @kAe @ @ @*0p2dHY_W_}UvSi+R1";egK @[@@_  @ @ @4iB_I&O4Sމ'y5\qGO<-X| @J.ݱM @ @ @zHA~:kx6,w͎7eʔ۷o?>z뭘 @ @ @u/f⬳Ί . 
{H=sYǎ[f.[|-{ @ Lq @ @ @Zcǎqqewy'U/ͬƒK.K/t,2٣k׮ums @V @ @ @jH`4hP ZS  @]c7P @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @T^ @ @ @ @ @@- i3 @ @ @ @TUB $@@ tMqw@K5@} |'1"@ @ @:4Pu@}| @9 ZK`ڴi  @ @ 0gnĝ37"@n#@:w jѣGuZ  @ @԰@>}^ ,"M'@ / ̩RK->윾 @ @ @@ }ّ  @zX/ @ @ @ @ PIJ6 @ @ @ @ԭ^ @ @ @ @ @@%* @ @ @ @ Pzu;:F @ @ @ @Ыk @ @ @ @ @@  @ @ @ @TR@@M @ @ @ @u+ WCc @ @ @ @ PIJ6 @ @ @ @ԭ^ @ @ @ @ @@%* @ @ @ @ Pzu;:F @ @ @ @Ыk @ @ @ @ @@  @ @ @ @TR@@M @ @ @ @u+ WCc @ @ @ @ PIJ6 @ @ @ @ԭ^ @ @ @ @ @@%* @ @ @ @ Pzu;:F @ @ @ @Ыk @ @ @ @ @@  @ @ @ @TR@@M @ @ @ @u+ WCc @ @ @ @ PIJ6 @ @ @ @ԭ^ @ @ @ @ @@%* @ @ @ @ Pzu;:F @ @ @ @Ыk @ @ @ @ @@  @ @ @ @TR@@M @ @ @ @u+ WCc @ @ @ @ PIJ6 @ @ @ @ԭ^ @ @ @ @ @@%* @ @ @ @ Pzu;:F @ @ @ @Ыk @ @ @ @ @@  @ @ @ @TR@@M @ @ @ @u+ WCc @ @ @ @ PIJ6 @ @ @ @ԭ^ @ @ ػ+_ebFʭRP5ke;,"dV$ve_ !R_92sΜoΜ9q~sf>,|/    TSjn@@@@@@@@@ ]K@@@@@@@@@)@^5Y7         @jKb          @@@@@@@@@@ vR1@@@@@@@@@j WM}֍         ZRk         @5Ы>F@@@@@@@@HzݵT @@@@@@@@@USu#        VZ*         PMϺ@@@@@@@@@R+@^jw-C@@@@@@@@zg          @/!        TSjn@@@@@@@@@ ]K@@@@@@@@@)@^5Y7         @jKb          @@@@@@@@@@ vR1@@@@@@@@@j WM}֍         ZRk         @5Ы>F@@@@@@@@HzݵT @@@@@@@@@USu#        VZ*         PMϺ@@@@@@@@@R+@^jw-C@@@@@@@@zg          @/!        TSjn@@@@@@@@@ ]K@@@@@@@@@)@^5Y7         @jKb          @@@@@@@@@@ vR1@@@@@@@@@j WM}֍         ZS[3*  @Lj̙hk^|EFvȐ!/: #@@@@@HW_}e~{?VoɌ+]wݕy//y]k>!A *t  S`w6'N 4~kfΜ;.k;     Uyk׮fܸq=J=a8&@Q @@ `z=̡siӦMs19     8SMfB\rX PLb:C@@ $1c4xIZ2G}t@@@@@H@˖-7q&l0? PRDL  @&-lЂ&۟{N.߿*A%@-ZLTجYfbhAGa~嗂)xTK.n%z@@p.  
@@@H?Һg /oWVXӧ`8@Ƞ4A ~fvH QE]PZ#<2gڵkg:u3E`ڴifm PE6<3UV]L`̙s&aTH@K?*t[.SA]=zt5O@ ^]v5ꖚ@E/e7jD+Z @6uEz 4n,8_c&b$Wv#* 6 g(xCbS$$@v*\c9 d,%  @1u=92Qd^X@ oo~,\sP$@pY-  @ŁtG@@@@@VZ={f&h֬f4x @Z@@8p`Ƣe˖92#x     %o6f%,9  @,@@;w6[Sn6~٘@@@@@j̐癁3Oo\vefڴiW_5}]N4i 6l7o^_'裏6+L]]] @@@3|S '~-Bpg1,@.@N;ͼ⋾Uݻ96l; #Hww]e]vKJ+VZ<[l1pBgyg̢EKz[̈́ lf33!   @l8ͮ`C@@TE#@,Ћn`#@hi+4ib6pCӹsg駟y>jҥ; ne pM7#F/2R_|qӭ[7ӷo_ӯ_?ӽ{wKם~3f2=ySO=eTyGLN̘1cafHA@@HOl%#W_5l-Y8kg) PMyϺ@ *J  @KۦM)S {̸qlv^{L>t!S×^zlVFY(Es={(8oV3_~/̓O>i8 馛"cƍm߉'hf̘atrĉ}%y{sGm|%g@@@ pY]֎ ̛7nE) P~ z@@W@Axg϶ow .Rϳ>wٌ@ I SIJn:czoC94k֬1BufZli_~y3zhO?eX+ '!@5,rF! TICN[jezYf#ڷ~v6?YlL֭*bڶmk3xe pY63 @ԍ[źo gWc[j+X@樣Yܛouuuuv1c\pAei%\Ҭ9TԩSN2{^fc<L5i\r%栃k{o߾F7MkolO?mZk-i  <eǻ+̈́ l@W 7nlXK/mt{z̜9y?8>܊qԢ矵ש3%ТE {]H~V0z \I/ubE&4.R&e'U @  Pw￿Bݨz=7'Iଳ2첋 T~++o)UҊ O=Ts۠Á`R IPe,Es=gFƭ~Ν;f]pL+HOG)  P<5XhQNEjw]v5]tw9Ѝs[F_}U{l|*jvmرcp @o cnL6mjQYfSO٠W^9'xF7t1d%Xfnh){ S=iӦMPW@nj~u}=U6 Q0[b601k֬IQ@>}?_tuS"uU];f{neߣ  iP{6P-+UN:sP5D[ncРA3ΰҺÂ_||:t'| ӹW_}Y̥^jt,,(O@يT\!1O5Y'M@  SkC=Ź7;Δ)Slw;sLfap 'vvQ_2-ys2SE7e\f?h馛3ml ^jy3b3~2  Du!6ey+S6 s=_|o FjМ)ѷo_7dqy=zs@ @I?K1 86 PcYV6 @O/4]tmWnfU NQr-Gg}־^ir/[.\_{L}+)+2*dtE ̞=ی=:1H +=/.Uv*{0@@ L:va棏>z]^~eᄀn 7`eǜs99zFK.f/S83;,̼q6ֶLړpSD qM EZhaI׮q[GMVȑ#oo V~ui]tc$E@ֵ`LUO=T^{e^ (29%NAz szuެY3w@@D ({zܺ]u sF{a;i3f̰]WrU!Pe?9[7@ y@YVLԾ}jbpS @/  p }7vҤIv[lsCB>c;I&ѯ_?W[gu<K~橧U͘s=ެIjP^zɳʺꨣ5\S0 P+{fUVFUTI஻ SWb ZZ4n˩םwޙ: rq2XK|͜IXc +g`/E>lB}-[d.첡!F?9LT yka?[(ܽqcG> @/Jmօ Pq>վJ߾}s֧.ܒݐxG3J{E'xֹu7${F]2{ݸl9dȐ λk+W^u0aM=( PeQĴiwm͛gHFzh#// %k˖-nW&޽{e+5-nѱVmpo}/Txe^+Z@8crrUV2܎ PacJf+&Sq$G.g@IDAT1O"ޮl$ 3+AJϷ]h}>f(ST%ʜ9sYt}^U:P (8ϯk?N{Z鳩mVٳ9ӌ*gqF1 (^]+@ ?5ye4rx>/K.fVJӺ^xk uݣ/^\M_y;Z w+yM6v_54Cr뭷.fw1>* 8`:ud3敞Ø`M6٤`X~Rfb5jReC(0% Jׁ#P ?TO [2{OTgVLg<rejySw@ eVA(*:H#nqaGLk2+5#.Ҷ̃:(Q7Ͽ۽.*PUWYwuy睗3*NAzCͩÉ'h|Aϛ ޽{<@Z*0o桇z]߇~hu晽3J/ffw7[l kQ/G;Cԫe}J`W4z-:_T l~l~@ [l c8U|j=3o|L T\}˚'בc\^!Pj@ }^x駟lżX]uU3V ݻロy֓ۧ2ΤIMo)ЬYL~p<՝o64,s@C9)9 nWF&M87C+U7tg4ZQu B?Xs63鲾yu[1}T^wߞߢJg+t@8:rShjMZHŮLj~Q{bӮIZ|G3Hc<]R`];#3x`ߛj̙3M^26IvZNZ>3{C駟<ʀzdu@'e{*CٓX L(9P) 
4hY-袋L޽.G}u\E^ gt+=ƶ!<?g&/P̼ 2zP@j p<U- H{ қ8qQh^EC^㽆 λ+A5{a^kZE+o]qצvڕ/9Fmd.’1AxM4ize_1G}Tt$Lza+O>>|+x?`t_vaAFk!oBd;]gkI9<A??(+:RKP uMg:JgU>s衇xw5dm 0 Ot0^zymfihet.)cnT_|F <6m;v߿0)Ң-}8쳃ib$KNl7 ,і)@wp߷o, y_~6lXٯ6 A- ‰ T0k?j"+l R@LզMnԍ(%:'>JCZ>qlCѣGoRWI@SjV-bԎ/lU({̙3S[q?sw-矹Iyc7 <ؼ[IlW1y+X)й?7B,61O. <B?K.6IAqYv5%@du\z  n)C8{ϬV]^wuV ge2Sf@b=UW]ծK]ՑNgM61BAbn6Zs ikf{y@of&L`/(E]FYf[n nAzA.2s7x{WYe^^̰7 . kH1Rfp Fnq(x>+5?iKNXc5JnW6mJNTRo5r>묳̢E*:]W]wݵ%0)4T6 UB g3#it^wݷc7LM@t8j({N{L|M*Ԇc]1O(WMRZkߨW~jT@Ylf Lx7jTjG%8(r-g_>Y$^6 ξga[T)OAx2ťwygs >eSV0ezm6=efm2[tIIe|m.W>}`ې Rym ;CF*oL]i)HKɪkffu)ST^\RJuɭ [*/œznRWqݑ:.NZف^7n\*DWJ@ u^{#VXύ;ϪpJTS@AΝ;u z?;J ݿQ"O>9UsS*! /kF"?|bQO&g@ ʨwUWǛ~ÇI ޽qTp^}U3c K,a:(]/nֹق2 вZhau\ǡ pٳgT_|f}wTO_q// wyǾw7ӡCMyAt Puo ;RP %Z~ rD5a6 ^ȣ}饗.D鷂@C죇~?pfJ6d՚O? kͼCz:SO j\̼y].0QH IJ =z$>0 DPd7ߛ 2GB^RۉEt1@7(PWpԢ _o36*n~(+.6u)Էvm-ޥKi_/8O.9/g^ O>cVAg_"/e^p C)E @+׿e]t1ݫYyM˰HZz袶> g7Euױ_]ƍ Dx3qFHߺ]MWМl2QPߒV*8jhQДTBڔ*? W?uEt7ߴח Sp26V tW->=r-Kr=XZ..䨡EЮtw%?e~믿nXPѴAy2^y3rHsqϳ* jdֺz[,zkh z8ߠ[2^_g{k~ǖ^a{ 5P\umԶmRb9矅ϪpJU/9:Q@> ߅d{kV+yLE ^{5>{xϫ~]3({1c3Pؒғ8<+?)LzlS*L^8H1TAz:l-ߩ:؟:uGUn& 'P~aveoDgERY^U(I^Æ s-Vo\w5L P V 0BT ƥJO]J}(x(YeOd(xLS\ʪjf? Sﭷ޲-W^Yjܥ@?]((#;2W2-j ^yZׅ} ׅ_m.{5,PͯnJi-Apע*6s<*PD… oڢs_QlXneQVK_uv1=tAQZrc& {O eQN}ee20xBܢu.~?G>j p]WR\8]g;OK͜9衬:_uv ӆڧ!3oC Jͯ ʈc]PCepjsP5hҵ*5Z˿.^j}aVb2 5J.uN8gW䘧=5B P)'xOԠPǞJTc'.:җ.($? ZE7$f *:֍$]'I.lJ. .U{ugFP"wn顋lzV/M~Xm磒Azcǎ5ޣEu$ 2pUzX~ W fmUAq2½LqWձtβ"4NZ7usM6^KSeP r T*:Q8Z^uUp T]Vs#DՠBK)uѱnzh8!Ee^_lH}(@ :je\!&>{\s}^_)3d[*^*&)09HwHi GU=F]w@ɓ'GULǩꫯfߤ[n@%g[nQ0^A ؆[Oo4չmPc6~^ ٘&i[i{JRld*N;r۫6Z $n 9~pP*%{ZӦM5bb-v{oj-@LL-pAdss `=e*C_ TMD#VQԕT0ʍ7h/\ԺuI Sꗹ>K<(X*Mqne~9sn EZ[tнi$wZp+khCu֥ =ụQiR=w* r<(ˍ $ױ.h7>? 
eRװ{Qcw 9ݗ+1cz@f9?.vϼ{=2 *zV<r2{[j=@Y2_fQ ZTgW^yeAuPt/V|A$0Gٞ~1!>řgimsJ_FW涂Q+='S| fʙAǫdUpuTEYWʼVw9cW7'Nqu1.]zv 4ֹm-iq%s}WrIX(Z8﹎eޫWHVgȟx1S Ι9rYs5iyLa 6DeI0t]lIT9a?j(c~=R9B hE9 PRY%<0H{ ֫ҥKݸq,'N2g JHyo:75%W\̣rfsy81qp%2}9s.p9Uy9<=#q2MynG{ɒ69t< t;Wdܦl'Wsz_^'6{P\l{L跳X:'Xs]N`Y B+x4s.]e;.7̙>ѭ *:/A\t9NQ9[n s+ ݻwVYN`e9-2=7z.ǹQgOZ -v bKR~?W;: ֫sti/RoDC8NFXLW6\9U~9Qn綾^Zb뮌C>}2ϳ+~FOuN}`d3ȱ}vlu'|r]g{&Ovfw۝̼uN@'|]޽>躹sƹ5mNJ`UQmW)0Xf]Hh鹪`;7][]sM]uT1z܎ytw_'5^{>O>29JiEv+j (ˇZ]ZkL =e)q3:]xa!`ucL*nQ#="-Ybe}ٲ*{[zֹqKU݋94cq,^ۧL*VZciext{C߫ٿI^ӤqwU~ s.GKk~C}gQz zyv1ms.ﹽ*[Y]?RqxӡISƪnV\'-S1ߟocL4aeYw ~G>a"Y^vcOy 9!pF %_`Y@MunZfGH+j5O-˯87Tq˼/:e+Unrqn:̢>jA逸J{uU?'%Y}ʺtW\׶Tz&VΉiWkLIMK.de$xns!%Vg| ptS,Sѣ3[c=6mutg)YZYkXe5wl<ڂI?cdiM>=l'pv8i96衇f AOgg!rw`+C݂ 2.DR\ȶYMtSLQAtSYl֜qNrA/gcyx}>_62i;,>2y+9#^j^@{N^I1hӰ#97 \e6fw,vayٿ]2a[ R%OŮTzsס ~:;7]~wΝ> &%[q\3ggЫowVF[۞.t}*c1 C(&PIy;k%f*ƑAKHmQ֒]v8]0#8YON s70 ='ۖgࠬ( rبR(tZ=,VwךM,87wRz vUpj*}`;Lu+RxAAPX^lNj'[IR[n}J^cj)Un#@[N͜*,i]u]9^a 0 %\R /4NVAܠ t5mxGxrV:HOu*~V^2,~ ,W% b^[^񉜱P/9C }*K&MWC]s~y 0B)([4@ [M)̜9>oeW`~8u 5x|u쎨/0nܸHsЋ.Ⱦo+t]o3esexοYI6l@׮]́h'5kFs*? wU @>}U5d]I[n=ka8f+&Sq-.kuNnu3qyrϪpJ uUb^NMZ!@"tpذa6J.FVE]5fwmCSuaڶE:@]Ynit?bW} e7o&K(P }wX3OT[cþ48/X =ep>ݑ?P%(ٗ^3~;A])wy% :A;er.)+@4K]J6Tc2$*+_wW sOy]03!OR-m $MM'DlIΰPN8dwu[z)K~Qf3 a! >(3P 矅{B$QC5uKrUd-52qFlN5u:{ '$#|DVT@RL-<̚ z0ҝÕ]Ī[[e.jᯬrAsoxfHEKL@묳NN }VԊsḿ@1UVY\z饶KXeHzpl#^Ea邌WvwLzӦMs߯~V.avi S~JeT nU*NהF*]-/jJ82ezm1첋H_ gѯ,Zok@@]qw5s ysϪRv9S_&򹾃:t萳駟rUX.W…e6L@c=X w]Hzp4Ω+g}嘯-[4g}?m@UKy2vC98딭҅Lŕ7xcs]w#GL|pD9)|_Y.! ʶ c&.8O 7Pщ'h 3Qhxݖ.+n*J-Tgʸq XSfUs=X.*"}_E(X21b%/r˒U`n 2ľg^0aO˸l@@1A oS`I qIAz*Qfꛋ^'0޿qjVH7tK{c5u_ Y ߍ~*r@<_7LRKyLA۷]nyh;؛ S+ ueDT@/4kT, nQwF+|pfϓ3f1QHoov}z$ԩ8pi&lBpϾ 0P@;cttP4SN5'|.[c H@MswVkc^7@/iӦsΛo_{]udmET _v+xhk4\^JM0*x~0Rx#PK_}ouu;([o5_~yfzSO4輙D)wn{)x@e[4->牕?+2=Oi??UYP;[2]wn}_7R1w4W7eʘ1:tȩN1$[O`wossByh|o3fLYudp! 
һ ̯&_]ynVdw} }g&ooIvPIl lO:餜n\3fاGd {2e[oy*EpҰ>3oLRaXP]Cee+=s+@OA>^)]]G5o9 Awź|zuWO?,^ĞŖ`Phbύ7޸duV,HOI  V0q.]J6AÄ @r{5\̼ 7d  @<'x;QG}Um۶5\rI08y=zd7@XPe5m4C.Unt^~e:|/E]dZk-;#8¨k%R,<'pB@5uQ~ܸq~6o}b߱ }]i/)Ol:tF]r +Vc/6wWu1ަMHOO{Z!@}|Fi<|p dN0˝~s* 2?9L[:!3o*v#@ p1Oޒl(}86ʆ L]]]NEՍWVe ߇~fPK1-YY~ju>(@IDATsv ZMYոëH \{s&Qݯꪜa el2sLץlU ((R ڀO^ÕKT]hzSA:άoqj TF߫Hc猂]=g Cp3pfҤI+Wf^s8  <60Z @8E 膚X 8(%Eu @)t)/_\L6dj=+k"ٻᄏJ= qcWհ_s'^xL2jx֬YE;MӦM3hʜvUǃ~EYDͺcG.V8ӧOLY:t`QS>{Nu똩yh^ Tc-?HQV,W] k^n׮] QiEHWY<ȂJw?Шu;lݲ_x.#z2]{F Dػ0q'C fh!TA(I RQ@T;t顇^!wϸevwv{g=3sswnDRs̑k LDߟMߟM=,@!3oO|;E~Z*l93U&逹ia@ 4|_!@f zEiY,4n5|h8-ڐ2 Z7`3( S"55|(=[)ʮˎ8蠃ڐ&? Ӓ\4dr*^{w~̞qltާ&lS!1kWTZݿ!hxt5!H|&k!3]i|t!kZC&~ޓO>I}-4NkrNkOgznk瞛6UO!nZCpgV_Çe( Yvw霣 }٧x_8.U.馛ǽrWty 7lqw}+:ڢI'Tс5ޯo=u@vXw9)P^<=q6ZEuau= qS׿(%t'>s9c̘1骫2D&^x!!/ziĈo_Q u0DHֻȢX&si.#Ϟ>-nXq["32W5dS16𦘶WљI!HC T~S*u+ Ltm9D@ӛ6][lbz.,7*i/2dH)#u9-0vyfua WXalhC%,z #P=#j+kgtzGߟp#Գ{{z)wMDq4D8qbs/)D~EQC%& }+8J"_*tYf%p -ׂ 5%Y:\@7maϛlIn @Jz-=SU{Y{gF @@ZV\ɶT\I]uw{w>* @/#]ԝ@`K~x4|g_~)vi+S-e/MSO=5eu.oyFw$H/~E@ @:V_},s|ֿki 6Ȧn+^OJFjRU|0W蘀?;eo@kي++%ޞpӓM@^e$(5eԩS%hOW矧2?拏n΋gy"w*"RǔC-.UԈ 7ܐ^x6{mӓwqvO7|s. @  jƻ;[K/M] /rmzwnrX8 z-X Ю?%z\TCpSTU Pun4Z2dH-֕"Q[nYaUG}&Mfq4s\0-"iiyMqHׯC}`?>w-бv&@z}#}gmV;Si/riܸqS~ض߿tgmg @@o&tUWɓ'٩iӦiCIZZtE e9H]w]: < AvfV,Sq7u{nv2r( @/Iԧ\s͕o7c=d܅&7bĈ-}iZ3z)n:|^lzWdkiuI#GLrg 6o߾)DB PwwimɾhVYdQG6|tg6|Xs5neEޘ1cRLJgV$@@OT:[L=1yzB9 ȋ@߼4D; @ N;-zE{7=i/kE?Mwygk|KSNi0  @@|(p><;%HYvΘzG:묳z%w{.xolL1w|8;lذ6 @zZkg^z믧'f_`K,DmQc ,@> mVf#e ,&; @G ي㳗?B)d+>+#};InaD6+=)Qd* }fPA0 @뮻nGeplGm&b:>,ueIzk4hP[Fm|O1dOmeX 7MZuUۨ6=˂ֶJ>~mf @@ 6liwTOfitwX yvΑ@![q矟}n_d xű⤓NJrH&2ذ22ijsf֎@5穆QFZ]->L};P 7ܐ{|ᇅUN1Ȕ!קOt|Pp^Q t^`ςʙ6޷c7j;G_JlI/4o/8/Qo +i- @ <6 @@![%\x4f̘tGv-bKAlŅLu?o;tg'|2s=ٗ,_<52A@mc*; @@W D]|jvȦUzǎ3_jYdlZ^{-}GMNbH @@ 9sL-)SYyLgzk瞻ZS7totͯqO:Ӏo @r(& @@gT b] yu' @\ 04觝vZzSdˋoOEytcEM?O6l};F  e7o۴馛vک ^}td[veСC|͗eLkϱw?&MJ{oz衇͒׼!νk6d @r, @]&(UD@|4ݻwq:gg @Sߟ"@V=Oz| xh  @@ Ĵ䧞zjz饗/ ׯ_s=?Yv*c\ @hGߟLԄ{F @f   @tF`yɦ}7ӥ^FfqTU11a҅^s=7D @U+Ϫ: '@: Xv%@ }sB $@ PE H[ou裏u]ƍz*}WM>}Ґ!C +6t4st. 
@ @@ P @od/ @/c @U*0,>{D&O|#I&>cڴiiYgM7/[nԿ*l @?+n @ /y2A@GuT @:)Я_+fNV0 @ Ю?% Pyj`u@~& @ @ @ @Vzd @ @ @ @ P/e @ @ @ @U@^r; @ @ @ @ԋzi$@ @ @ @ @n׭NF @ @ @ @" @^FZ?  @ @ @ @ @[u+ @ @ @ @ @@ЫO @ @ @ @Vzd @ @ @ @ P/e @ @ @ @U@^r; @ @ @ @ԋzi$@ @ @ @ @n׭NF @ @ @ @" @^FZ?  @ @ @ @ @[u+ @ @ @ @ @@ЫO @ @ @ @Vzd @ @ @ @ P/e @ @ @ @U@^r; @ @ @ @ԋzi$@ @ @ @ @n׭NF @ @ @ @" @^FZ?  @ @ @ @ @[u+ @ @ @ @ @@ЫO @ @ @ @Vzd @ @ @뮻zN @:WG @ @ @>;M6  @tn@v  @ @ @@Nn4~<4G @ Pj~u @ @ _Gy$} @z݀ @ @ @<~Y3n<4G @ Pj~u @ @ _B`^d믱 @ @@UX @ @ @<|Κyh6 @ @ @ @ @cǎM'O.r\{^ @ @@eUU @ @ @\ 4kj  @jDoC7 P1gy&s=_t믿='rԸ,R=>=$裏O>[駟oϭ)Sҍ7ؤ}=\2dHiӊ8q/jxA 1mB' @<'{ PvX^  @M:蠦+, @@.vi\K @w}E.4r z*/KB @Lq[# @ @ @v׿K/-J=!8^O9  @TB@JXeU!RJ ?7XK%V[m5F%e$@ zϼ\ 뮻n0`@^4B`ꫯ.ٖ6F_~Kn@w ,Np"ZTAj[W7Qv @ @ PW^yeb-Z8蠃ҨQZn @t^W;oH @ @ @@.6%M2}l$@ @ 뜛 @ @ @xرclon6 @:' @sn"@ @ @^`̘1믿nGnw; @ @@zMk(? @ @ @y:ujZl/n3&Lv @ @d+ʞ @ @ @ ΋E>j @e @ @ Z+7#ck}  @ @m @ @ PuOSXL @@MD @;_})*{ /AK4 }r08/&h,пzA&@ CSpP4 @ @ @ @_@C= @ @z&NXe\#STJ>8∴^{qZE@ tAK.s:YfmV$@^~juY}*"@ @*v @իW4hPV =9Kfes0`@uzo~u T^] Z @u.`:t @ @ @ @*# @2j%@ @ @ @ @:W @ @ @ @ @@eUU @ @ @ @ P} @ @ @ @ʸ @ @ @ @\@^_O @ @ @ @WW @ @ @ @ @@ Ы @  @ @ @ @ @2*V @ @ @ @szu~> @ @ @ @TF@^e\J @ @ @ @u. @/'@ @ @ @ @ЫZ  @ @ @ @ @ @ @ @ @ PzqU+ @ @ @ @Թ:t @ @ @ @*# @2j%@ @ @ @ @:W @ @ @ @ @@eUU @ @ @ @ P} @ @ @ @ʸ @ @ @ @\@^_O @ @ @ @WW @ @ @ @ @@ Ы @  @ @ @ @ @2*V @ @ @ @szu~> @ @ @ @TF@^e\J @ @ @ @u. 
@/'@ @ @ @ @ЫZ  @ @ @ @ @ @ @ @ @ PzqU+ @ @ @ @Թ:t @ @ @ @*# @2j%@ @ @ @ @:W @ @ @ @ @@eUU @ @ @ @ P} @ @ @ @ʸ @ @ @ @\@^_O @ @ @ @WW @ @ @ @ @@ > @ H୷J/bzG}Yd gў^ @ @ @ kOv @ P_}:##|x?JiKofz'O?m;S<_~tWE]t @ @tD@^GK @jH?L묳NzӟfI/otg}Ѵz{7=M@ @ @O^zE @ԓYgE#3ވ#ZŶ 6 =i饗Ŭi6*,z&@sSf'MTW_}x4y @U@j9&@ @@ Y@5\iV~?[o[,miVp]y9+6evHoU03g.)>L^z)-"z @}:x+H/rz믿^O7;.r&Nq^ @ @Ui3 @@C-~i΂ :F:ꠃ*wvXdM鷊:"~ߧYg5vm༨&zw5ƇX~eFBa[L&?p]gk @, g|qܗ;6u(}ݴ.΋,]4_u]Scu20O:?/L @jhjv @ @@{SO2e\tE)>쌬 vM7e]{ȖcJ)vZvC7 Ћ~Zc1ZN?я^ @Mkɦf~{48/t7wDVO?I?d]W-OoQ]dS @ PqԴ @5.0쳧?ik+2M<9~̓*@Hg}vg}Z?m\Z/1][L;ָwqٴjyMڼ馛7x#7].|+K+]v4s69qR% nr2  @BSv! @ @VZiV+KmkV#ޤImor6)J7hРR̺)'NXȑ#SdQ @v{31"-Q=# ks5Wxׯ_ӧO1sUW]UX̞#Qkm믿2ٽ)2=Mv]tEFmd]W,DeVhKpeIﻧM>43wTF @J Ы  @ @K^}3:M,o8+;SXUgϋA7|tf}L)'pB:C^ @@m~ק뮻.+Y"hN.馛fq^)dmJe[s5Sdhկ~e.ѣGۆ .2KS|-YA @ i @48䓋S[5p[o}׊SO=~b78}Y{4f̘jEnd4*b}^|GY /egJ?3U_{iĈe/z[nEZXA @WêS @=ѣG;[4+"@㡇JC-;=/OoJ^L.K/4>Ço?N7xc=yTԊ@d,s.owqG j2,.k5cm+_`f ЛaƵJ{lLoozҤIYa|SU-6:r._8qbVF&$ @ Pjvhu @%03gVtP*ˊ};q^d)[Y֤/J)2Gţq3ɓ' &72-ҍw@U DE&w޹̞2Ove vD7_S}Tm/8U{^d ]c5/FtH @y @()0nܸrj-+}Gl"CRWȊ E66ڨO?tG9eEi}W?A:W_}c#p7ow?;t@L{>=W^SOMkfDrjAmϳ@mM+6 @t@ﮭNm @?鷿mϪuf۶WM7ݴK|;S8EZwuwߝ"gVK[mU:^ڻ+xYֻSp%++˴k7 kkRۖYfRv)+5қUַ߱11cmN9唖@"no>հ m|w>"#O[#ߚM6٤* @ Pjt`u @bJ#8"-)LAY?iNwᇷ_\7x5\eg:3SL3{gS%F@Hd,9ROgqFZяrԩS{Wήom6TE]\rlJVAcO 芩cv7plFT>}f%XFHv}sLOzNں@d`/A /pu @ @v @e L4) Ist"b/N8K/S^xXǏϲƇ/Z(;OLӻfeF֭W_=Ŕaqe̋|c~e_J[neY1adыi;v(-]wKq]WJofus;vl+:ܷ=ܳUaJu ,6|yiw;@y}±>JN1O~, @ @@ Ы5 @% FƛqHW(6묳fv[6lE~la:9裳l#/b#Fk 7<|)OJ'tR5#{/=MN_=ۻw}~ȡ]o>D1mL3Ԣ/~W( @&z4ZJ @@ |gCMkb-RaNV1cdSPF&22E&,"ue_e[~s:AKdj-8/y晛[o5Y@" aӟ44w9}uYikwrvxӉ'lz֎?<˼Y} #h.뮻 7ܐbZﯾ* zEӟ fsOj/@/"X(?-Dyۗꚞ;{oݟY[1k)5mrJ|)(?y*ǏϦJo3cwWSg^x&eY&n mmq" @tV*bڍ}٧}t @*.0q#Xcf"C>}*ކZ8Ad`j. 
<^|^"Nm?ci tb^Ex0 {^`tP-͑E(ۦ7߼~w߽>;o7n\2dHq[4{AaªlܵZ+{C6wqc(^iSGƽւ"P/o2"[^kz5Y~4s׬CEcJEvD LG96 s>tQGhRLumqR %dJSdglZ"3Y#~dב2ϪWA44.1xGq:F#f;NYdEi, x}]wݵɔhVL^x}L9BE @@9UWNGC @|g41`-D^{핗1-f=zСC OK( |')l]wuW8#Sd{SLA ^\ޱ1it_4 +SF-RX2E(+ro|-"py`J<&yE_٣;묳1m=G%X"{5|G͕W^١,j9##`OKF@ߞZ5?<( |{6~|+;ډG9/*xr-GͥŋhSO=ekqSA3 @ @'"@/R= @԰@#8p`6lX>|x59}+O["𢣙>8T)dĴYܑѨ?I&l8h$?\h+S)-U=^oYPd)m l=1v{%` ۭ1][rڶ +#Z+3{Kd,3#k0_Dx0~Q$jzoF#1cv{aLżAx% @]hCW  @Q''[ouPl1k/7^/}_E-BΈ^TCyƵ^[}[W?g֪{ET:U256F0@Ty rb̨$rl1ocڵ^8Ԭ>,.pnV0Z{DTJP=:qzn굮bĈ՗ Yd;weMµH_T|}Vڣ/s1Gn\ꪫxt6pÖx{O/uT@PF@ܿ٨aTmzNoV?cw0g4o|p9SEE?>-Ֆ  @ 0  @Q 15cT|L349aQ <#STG!xţk5"DW DvS>c)eI5q/űկy&o\h;cQ?*M[8ҋk)+g6إX guVjc EWA}?_.znϽu򗿴ܵӰ[Svw74I.S5\ŔY֨5 /јV#Hxȋ`H[ߺqߊ Azn/ǟڪH?_VM J+ROx c{>V @ @`  ԑ/ @@T7Q<_|/?[n)*Tm⋽=#PWݧ\J!"_>_RuS|Ϩ0՛Zj8/[o0^,{i"8lذb?N;a5XZ|j:P ǖ'Gib=.Q3#BG @霾/}7*Sܹ,ejrؘ}M٢ƆnU!*Ҳ1C՝j\K/tۋ/b%_T \xiv- \ SZLAgݮopq[~_cZ_ 6W*^uUSOMo7er6 EY0aBM93i]wMV,z'pBQѴz'՗Y.ge wrc @ ^n# @tMWLa~[_1_Ec1_|9B"AtJ1iѢ?" bc9&k{SD>O}Ǘe[feҺ[l.,ŘXyqs=7ԣnD#`^闻qy ߧv-BZ]-n>c39>|xOFNqZ739΢Yk믟N94IEfc7=>t W`/ַV?=]s5SL:ZT: 3TWw}9z`Ӫe @r2:~ @ 0$b 6$lnWLڳźGyxD%\2H-X]vC=ŗ׭ZYo5H}0{뭷l;o@a:tjSS_`f*{W~ߤ.e=Ci__weYDܬZLuQa]tjژUkWa{Ķ x㍛V̋_뗿Zu,όޔs;~U 8==yg~""@}6ڨaz^ZmS5| N #$@ @ +<餓ҫZLA;. 
@"?9zuSz~zs96<@\ \L|ŻX; ԷՊ>i„ EuU2 ^f6f̘O}&ݱO>dfMI[:-*jsO͵m|j7 20ި;oyp{pmzgyfi?+TnPxכ^n8|{]|O?/_pC&^p  @ R@@% @z_}tMĉ.NQ=]/&nr})#rITnZveAN8.qfA2X @YSԶ}1ڪE]\.jb=u] *^}ugm_'vmoف&#H1ڤ~Lj@/*wqͺS[WT_~;ꨣRܳjZɏ}c颋.Jkv@ @CSߚ w @ Po߫ΨQ].^wum: d.TCu**MGVo,{V+{&@`tnmӊ+*{}-Glz `..-=ot.C5\sUlf nM>}[3o~UU}g?6L2 E='A 7|E5-آؑ @kWJ @Oz[h_ًD吲b>V?<<̵~)7fSVL@}mI/Rۋԏ~ۡZ=n'>S O=TZguc=VYx1m:餓׶ ;-sK/6pÞ^_zu{xO3L?׾ںF [ouԭ~ve"ta V_}{E'g\^xUKsUeYO))c @ L9.zE @).׿5~WWD} _H崶1W\PZkP3O:GhG{҄ ϞFQ[o'pbf) 94tӵڭ嶱c{Rz" /:k=شK/}KivK~{qkb*ǾѣG ԐO<ĦdM:l7n\|r+30C~x\qӎ;X\G|g?k{5k6~mկ~ULa?묳瞻Rz뭵Zi>,??χg}v}݋t#zgx)^i @ @@iN  @ @`P {?jQ吨n"OV[mVNsL9眳{g:3?^_WuQE(ZYo5l.)v{!Q~;woϝ:*4 >,QUUcGզxOܿ{VBHycjp9EUF-1 dZj]"riխ+_Ds=tԮ'|r`]Z#c*fW^)BͶ7Zb\c͞-|F2*2N;TcW g @IQ @ 2 .uU~lRT^TR{kX!DoJ}ߗP];vuץ*TVt{&@` lEب3/t׎kЋ~KW\qEE{キ~kqܨAr5ZT|]]tѢj|.V-Q}ݢ߲.b*[nIw}wqmnN;T"#EUWL1?ӒK.dϦ;XF%•=\ZnWS.ǵV[暦[] aguV`z^Mbco;-CY&@ @h" j @ %DT^>S~:f9x)][feRǰavmrf6|@ЛPk"IݕW^Y|?mjm^[1;=+oն  @% _K @Ǝ۰0G?ZlHT|eYRL洀 ;be]*z1aL'H;CQ%pgO#F(KU8gs<ȴ曧>. ׮OqWņn>ogy:ztM~_'?SU8ȑ#_;:)/.>WΦJ{1-mTЋϣfIkx ׿ui.,S}-G2BCG`iu}ozs+: @^@@o @ @}^'Y`RL8[L"(ЗK/]T:cjժE%F_Bv@@ U|Ǖ9^?V#Bs,#mbuSFp E-Bկj1k7S#J1uivᇧ8 Ŵ=Q5/G$u\K_˭)(cƌIO?tq 1mz!q/_dERTD-B{W)^ǣx.|>LnϙnxwM7Gy$7.8T-UOGc'NX<7 űsbѣG8*˅A#{NWXa/qx?1pLm_ZEpgCC ]=[L%~sIy=~^o+?{m @X@@o @ @`J 4oWRF_E5^;B4@L+ڍxD[e]&Tj" vۭ=ek9^.a<:iB1Q6QT5++vz 02kSG(ZL]SzW[Tm٪, @ @ ,t @C[Q$ ?+rKva'׋  @!,[?>E!RTU  @$ h+ @IQ/7HwyglC=TtK.I1eeLOؗ;VZbqĈ @ u#GX]m5 @ (# @q_ڙnVXBT݋cƌ)v]|Gm @ 4LYg믧wiرŸS9IDATK>lQ:]jZ^MldM/ -)Xs5_jy  @ ^F @O[/mfŗly~1b*V+x1U;S˧?iymV @P`fK;$|gLW]u$כ @ uX  @ @=,vW-xrU?O馛{쑢RPn(}y5-  @ @ @Ph]a(f @ a=iK_}!quץ9#-2iĉiE-zgқoY*O<1Ju @ @ @ E8ꮙ @-bJژ#Hw\?~|jyߊw=u^lV[mÇm @ @ 0v @4viѣ?_TՋzZZ` /Yd1,49 @ @ @- 7 @ @|͗ @ @ @ @ @ @ @ @n uKq @ @ @ @ @@E@@a @ @ @ @tK@@[C @ @ @ @*z  @ @ @ @ @[zݒt @ @ @ @ PЫ`X$@ @ @ @ @떤 @ @ @ @ @^" @ @ @ @薀^$ @ @ @ @T*  @ @ @ @ @@%8 @ @ @ @ @" WH @ @ @ @% -I!@ @ @ @ @ E @ @ @ @ -nI: @ @ @ @U0, @ @ @ @ @n uKq @ @ @ @ @@E@@a @ @ @ @tK@@[C @ @ @ @*z  @ @ @ @ @[zݒt @ @ @ @ PЫ`X$@ @ @ @ @떤 @ @ @ @ @^" @ @ @ @薀^$ 
@ @ @ @T*  @ @ @ @ @@%8 @ @ @ @ @" WH @ @ @ @% -I!@ @ @ @ @ E @ @ @ @ -nI: @ @ @ @U0, @ @ @ @ @n uKq @ @ @ @ @@E@@a @ @ @ @tK@@[C @ @ @ @*z  @ @ @ @ @[zݒt @ @ @ @ PЫ`X$@ @ @ @ @[r @L?L=Xg;ۋ ]?vO`t$Y L0!ο׃}]߀xlt @zM!@ @@ |i饗s88r8#S<4{WF @0mk[  @ @ @ @ @$ 7IlD @ l&ie~oܹB -FIuY5 6l@}#G  @ 0͇m9tT  @ @ @ @ @@Ni @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @ft @ @ @ @ri @ @ @ @k+%IENDB`rocFFT-rocm-7.1.0/designdocs/images/bluestein_fig2.png000077500000000000000000001217031506652163400226150ustar00rootroot00000000000000PNG  IHDR WV?†9&iCCPkCGColorSpaceAdobeRGB1998(c``RH,(a``+) rwRR` \\0|TuAfa\)@g'000f% v- f/ XMH3K@vC 6^tLOJU^BD?%% 9(3=DR yz: FF pOF31@͑``_!f@*BL͐A@aߜҢ21L@; J?C8eXIfMM*i WVqGU@IDATx }c?*YnPIR %u$**B&) +i4KH2!C !~ʘ_w=uy?~sZYz}G+E!@rחYfUey) -PYxᅫ~#6 @Cp54j+"@ @{^C575+D @ @ @ @ @ @ @ @ @ @5\]ÝI @ @ @ @ @ _@pͭ @ @ @ @ @ N$ @ @ @ @ @/ zH @ @ @ @ @@ Wph @ @ @ @ @\=|sk$@ @ @ @ @ @kS4 @ @ @ @ @ 5 @ @ @ @ @ PC5)D @ @ @ @ @W  @ @ @ @ @M"@ @ @ @ @ @`on @ @ @ @ @P@pu w& @ @ @ @ @ 0|7F @ @ @ @ @j( ;E @ @ @ @ @[# @ @ @ @ @5\]ÝI @ @ @ @ @ _@pͭ @ @ @ @ @ N$ @ @ @ @ @/ zH @ @ @ @ @@ Wph @ @ @ @ @\=|sk$@ @ @ @ @ @kS4 @ @ @ @ @ 5 @ @ @ @ @ PC5)D @ @ @ @ @W  @ @ @ @ @M"@ @ @ @ @ @`on @ @ @ @ @P@pu w& @ @ @ @ @ 0|7F @ @ @ @ @j( ;E @ @ @ @ @[# @ @ @ @ 
@5\]ÝI @ @ @ @ @ _@pͭ @ @ @ @ @ N$ @ @ @ @ @/ zH @ @ @ @ @@ Wph @ @ @ @ @\=|sk$@ @ @ @ @ @kS4 @ @ @ @ @ 5 @ @ @ @ @ PC5)D @ @ @ @ @W  @ @ @ @ @M"@ @ @ @ @ @`on @ @ @ @ @P@pu w& @ @ @ @ @ 0|7F @ @ @ @ @j( ;E @ @ @ @ @[# @ @ @ @ @5\]ÝI @ @ @ @ @ _@pͭ @ @ @ @ @ N$ @ @ @ @ @/ zH @ @ @ @ @@ Wph @ @ @ @ @\=|sk$@ @ @ @ @ @kS4 @ @ @ @ @ 5 @ @ @ @ @ PC5)D @ @ @ @ @W  @ @ @ @ @M"@ @ @ @ @ @`on @ @ @ @ @P@pu w& @ @ @ @ @ 0|7F @ @ @ @ @j( ;E @ @ @ @ @[# @ @ @ @ @5\]ÝI @ @ @ @ @ _@pͭ @ @ @ @ @ N$ @ @ @ @ @/ zH @ @ @ @ @@ Wph @ @ @ @ @\=|sk$@ @ @ @ @ @kS4 @ @ @ @ @ 5 @ @ @ @ @ PC5)D @ @ @ @ @W  @ @ @ @ @M"@ @ @ @ @ @`on @ @ @ @ @P@pu w& @ @ @ @ @ 0|7F @ @ @ @ @j( ;E @ @ @ @ @[# @ @ @ @ @5\]ÝI @ @ @ @ @ _@pͭ @ @ @ @ @ N$ @ @ @ @ @/ zH @ @ @ @ @@ Wph @ @ @ @ @\=|sk$@ @ @ @ @ @Oa4 0/xo 0JoyQ$Rsy+5$Zkj:5Y[G@`/F`Kl?XZh [R{ʽ.-򗿼l{#~Z o @BQ`#ZBwyq@:#$ 'PM%d'fv @ @ @ @ @FS8_m P#c=,5j9~zw}g z%-*7Xn.f~}Xb%qzUH@w_9S\({6lkP5n*E`RjW:e-D@{E]{Ej УX^X YF澁:Ҵ 5\,#-6t曧[wD^xk,)?}ի@O~˺2# ,T&! +8L"JU[/9ԧ:~!xӞ6Z+A o;sT652[bC @ @ @ @ @ Ѓ,J @ @ @ @ @}iK @ @ @ @ @A@pux%@ @ @ @ @ @`tWξ% @ @ @ @ @ < @ @ @ @ @ 0:Gg_ @ @ @ @ @z\E  @ @ @ @ @գ/m  @ @ @ @ @=Ϣ @ @ @ @ @ٗ @ @ @ @ @WgQ @ @ @ @ @FG@pK[B @ @ @ @ @@{( @ @ @ @ @# zt-!@ @ @ @ @ @=Y @ @ @ @ @\=:Җ @ @ @ @ @ Ѓ,J @ @ @ @ @}iK @ @ @ @ @A@pux%@ @ @ @ @ @`tWξ% @ @ @ @ @ < @ @ @ @ @ 0:Gg_ @ @ @ @ @z\E  @ @ @ @ @գ/m  @ @ @ @ @=Ϣ @ @ @ @ @ٗ @ @ @ @ @WgQ @ @ @ @ @FG@pK[B @ @ @ @ @@{( @ @ @ @ @# zt-!@ @ @ @ @ @=Y @ @ @ @ @\=:Җ @ @ @ @ @ Ѓ,J @ @ @ @ @}iK @ @ @ @ @A@pux%@ @ @ @ @ @`tWξ% @ @ @ @ @ < @ @ @ @ @ 0:Gg_ @ @ @ @ @z\E  @ @ @ @ @գ/m  @ @ @ @ @=Ϣ @ @ @ @ @ٗ @ @ @ @ @WgQ @ @ @ @ @FG@pK[B @ @ @ @ @@{( @`}r/宻NZ\ve.7tS9=G]N?QLF @ @ O$A @Nf*g}v9餓mV5~vyk^3y娣*_}k2,SvqDz  VDS#p};:xo'<FY7)rJOG 6`h" @ @ @\=5J u]Wr5T?_+ `Js9l%*ǖ/| Uk1m @4x衇~r 7q)^ziyGViD߳?B @ @ 03Wn  @xWVrmqi-Rnr%y晧N:Q&D X`VݹȀ7Y @ @ @Ww2 @`f uY妛njW^yeI^<& @ @ _._|qwy/_V_}C9̞#vyO/ -PU?#U^ۙL$@i!J+:hv[O8ᄲ[//OzF~/|a8= @_ <}/RI-~WI#8"K/UZ6X @AW{JfȲK?q3K @^6t^/zы3\sZ<& @dMgõ/.hy//*!@`P;:%@x/~ew26]d`#=i @ A\ 
.6, @`\ w PfȲgbz\dh<)C P``WꪫVَ<Ȧ /K-Ti$@-0<'=IOS]#ga(0{s&@߫?9ޟo/pw:!_Yf;\p==AFZ 0)[_^{'>?Q2 {~}}ooVԧ>zY^yԋ!Zᙖk-{nPy%<εbU?\rɲ: 6ؠ,­!M!@}1.C),H?_p-O~r?\k4S; @`ylӤ:L2$@`;:@a뮲[:/"Ѩ<9yW>Ta vۭ3k֬ri{g46V Ps9:e _B9餓k1I>A9  ݑS\lC矿05þ?9"CW)TO~{UgV* 8ǰ (K\mFe7//~[U} {RSTw}UW]U'T猍Nuiv @@fBNK[B`Tqwu @y0y2yw+7ʡZ1]~ח6}裏._򗫄7y/lt؏hOӳ7o//| x6{_ 0 {キle|`uJ"ۮ\r%%3z)>Dle  @nn)Jz(/}K{޲V[0cȲ?~o}[m;)%_\/U`YL'@$(mm!@ @` ^ H dd_xslռ[~曯zWF#{ォVvYߣ>ZP#(?˃>8G:HQb+ !n首U]y嗿eY{[n Qgm0d S\깝v]vYu!,{nYzV9#L\?5!Ѷ1S1[nmnʞLl,L5TYJVe2Yph @ @Y`zQ~uiwVgK_OK^J{;Q?fCoe򔧌FB&Q5ە^`uf]wF3]5`?S\]veV-JIV%d8=iO}j5[Dk֬YU.ʸyVU7\6dя~>&!1!ˆ)[ŠOpoJV/|M퓡!O>MDF*5 @ @ NɇI3 @A';gI-G&n32'̶^`jۈzmi6Y tly)dN~57(Cu-rk?i^]w(?ʡZ9vVxo~SN8 ,vi .ne2BC^zrgVlG>Iכuf9K)R3CƮ:Uf9fǝwY3Ψ{_=nRw%T#Vvm9w:]ehe]vt,|܆y]?;n(_}IۖXb(NJ\,]o[3뮻zzyFzs[u^pqO]@+ +{$ykt\_jؔыZsꫯ1r篎'WoFMdG?*կ+҄}Z|焖73B`zU~SN {Ɨsԉ~)[36P`&j2Cxы^vm{)>~c  @@g zv٘%\mpD6xy-ozӛW\Q=L;s;VUuǙh~Y6E1tëNpofmM@lRvaVXat/toN#kAglfkO{7{dwZ-b>m5Ko$(+Ee"\t嗗_ h^Օi<LG=Graoǖ,> Kݍ I ,@\}e]wݵږu=C~+_)Ǿw@+u2bN* z~]vӪw:W~2j[uZ[_~e:'pހyW!긍hS?7Zw7͎?Vʞ'lB2rzzt)بc#>t n(ǖd%vԨ{dg~^Wb*,AW H˃dNpVyN$`-a虺nU_SO=q8Ar7Aqi[֮$^k5'; 9䐒`k'P2m?pҗT̛@M7tx`u ߋJӡ|;ߩockM?Sϻ*+unSK9#:$;nX]V{^u<2:]S+׌cm10~6-0{wO'@ LT=mjN @EY$Z#i˓^ڸ~^4V:]w4d$@ P $'?jxw޹j(?u em1Fxb?w~=ޒ+Cj7++7dkW)8t @_\`[w1mv%-~'H5l)~{TAƭ;S{V'~2@'3j%7Djթ#d[=r-U9OHw[);YsV!0r-˳ۼ:~WF|hHlFItHI׿Do.2g:,[O1’=UzU|-c&3yFlIg>.n9Zcjz;4~NAg͹E`j'3α|P#!@`X~x5r-7UZOST8ફ*I<$euN Y? 
VoF|6NϸN'CaD~Y~Yk{f2ae7ΒaUYz饫 V۽ Ϟ ݔt}!@`f $6lSJ~:3?êu76 "#c#o;SLnŪcMQ/_Pz; rlͪw}):A'h/py߾$LAgMϘ--tfuM:xXBnjr9{k7hкtL'3<nW%8ה- ^jJs=cuB:'qM7TuK:eUVg9ɃpWU5J+4ψy.ψZkw2f 0]r/:I?=8v&Ⱥ>hltdEc$0rQ3zd:p$!L*I1' Uةdn^՝oJ: +Ss/_2jq/%r Sg֛g92j_ƫ u Rkz]adO>{mWI=dY[ zpo-K.duC,w!XT:4': \xՍ 6ؠ6m봈'No|K=K,DQ%/yIm J8$]v2k֬j)+yvFAJ9Pqfdz_^Ms[yk^S6ojg;ۖWf3x,.TeW"7pÒFiq'خ?nDv3A:4 }ғT:\>O~Piڜʑl:\틬#n& _r{3ۚH$##FLf4uto_z92}cKY~# ?9dĚkN 5@>O`N:%Rhvکj`\5 FXYr ֪lw2L?yTя%:W+hk享4tȽe>Nq[{~^4:]w4d$0k /xA|-(Hx:wE'΂Ζ_G}'ykԟ}چ5a/_V#|SUkR޻ӧy[ftpurvqrA5w/Seuڶ֖-[Jz[%@l7o5K;tH;K~+y^xϽi`u@@F{yehL*}sg>_Up n<,ͻ\&H]IV,L+%oq9SUrZRzNfb>yEYc ]|e8,kTA1Snt*Ũg`h $(3A_f0lm}ݿ+^Uǻ>xGlass|I^{LDԸk[o=5iw[J2%I Hr;|nNd6;N??:5c+r+%$V_}rW1?z:(c=3aGmqu7j83j7ؖcA~5ZN8*Kr6% 8r.t>L[]_0vs8 =$|jL'\5;_>Y]w]4 n'A@%ZzrO)5kIٻyt뭷VrӉ}T} g?%4۵1-*(EY>wsmXO"Ho}[>>l@?N Y? W +jno;p[73`:WF.}\k׿zI~b:aNڭ0נgb%}FT'}ugV# <,Gnd?+w;~}3t'a \`]wnDaGHJ2$[eP MtGp29OuaT#[eJȅmNub/ILOdiHخc:j v 9^lcYNЪtȃt CwFGsle2%Mq|^JY]_ӸRdJFfcl_i%sxFhIo|uD5?S @ c[r鸞 IWYKFHpdo9f@cF"y^W=7_oFrr, JZf3KF}yؒd`'˚kYM3&ߓY{[V#TbX("{WKo:WnL,CՑĝsMFcC!2YI1pnQk*{KvN%7r4'?#hݫè /3N~@.R!%:~s!4SKpsoV*[ B9L6Stʃœs4+-L:+9p3YBӪ3[V/7]s.DZ!0U+-뵯}mQqu9 m]V}M6'U5}2<4+T,ZlsQ]pu7˷ `r#{d,\Is>9W*Jquz߯^ZC`$IF +c4,_#8/MLpuKTziRK-^62RmF,KF￿g?qh֬Yi5fNI'f%$J' >\JUw^|V=3Z /d=FDӄ28|7'VF[k,tӚ~_duG7{<'9w+:r,yTEײkJ[ iFwj Ne|puEi|YlJF{ǕI \/%ux'U9_rMwA7|1y%:)k$N||3˰'nW۩ # ,v&PzxT;dn05CtaUW].i^ڛ y%-'$/tn( a_|C`.첪Q] 4{F~{%UYb%ZM$si.'{ vY }]y5H=Үnʍ_rUW}3F 9`֪tHGBzH27@[涺F첁sg#ڕ^U?{Z=c_z]E&0ypNy{6$Ӂ48;OĽ~ٯ~hh*ӟT8ꕑzr]D,LF8'[9tjUr#ϸ^e=iW o={d2vT_ծ9Ȩv1LCˑG9 Hvi^v-PFNfFv|CsPbx*<4GcG=' dSO=z!B:5)?[ HƻVOp0ʳitr!Lwy֫ȅlnZn M/~+9svJH u2]tEի]vA8VM}16vQպ;-_om> o~+C{ofձ,C&2AwSyo8{ZLo^3RO^ HAqfR_[?nmy׻5v2q7{=<g= u,y:&olr>98묳k6iScų7|%?ǜf%Yx饗n6y{oLt6K ™}F $0C^/Gzo3#x}}D;b&!s=zUuYr-7HOiW3j&ԧ?*6 \}d|!0%J32AoccdˮSpco޸p s*W?םJH?!m+_*6I3lWwJ2+?O2_]u|unt2N;T .h24G+A_~>9WfwzO$CLul_6rL%zk!"ژg? 
G^jʙgY=Q$2 ~=, =W{q<ewE:3Ψ'M9<k ͨ=Jk^.ߺe4馛JTf@%wG]r^69}seHm-0{Zk2dYmb2uZfz 4Nz2[s_+Yr@ {ӟ>j-34kvM j;OƓ5O'Qs馛VYǾ??;mZޏ:Z]zN{djr5eyvpaU矿u KFM6٤A/ϴN<d6i t12"5L[|/mu/~W\qzU<^_ҡNz-9#-9lUs>)7|:6V풀]ɨ-cr$O(2Rꪳ)p 2\A`J!}^5-r$+o\=JCt3Srckw._W횮%0!lַ2(24G3A_ͼ^2\wyV}`K.4XJ\$+Zk:m0ڒ+ڕQ ιA=ndؒ ,ܓ 7 dXȰvwO.:V2|߱Oݦ>eu)#Ҟ{NJP2\s 9oiZcoN!8[NN{h+ [ne<>H:4G+a_uʯNOOV+lĒmWJ5o J ]Й}Q$V= :j`gǃtFȱ dY걌ovXz qʖ[ci6h}:dV(K: *XV=3v XU(DzȐ4>oٯ;:;{Gթxc~I҃>x;~@7ZXƵMJPFTFSwIմ+͒- c^oVgvΔm4c /\=#1$/|:YԽ-9.zn Q &6p\JmjN-_KWXargB5MK=},(W3-4 FF6@%s/;5~(v: iVד[N'.(򦏆@4F_uVYeO)X;=/&w/]%}Mj@K?9K啯|eu=QZ%/PI:t[ꖳ~.sN5|U?:Wf&`:ש$/^_Wٟwem-GT:]CtN˻$8 ~l}zagÓ)lIݩ䧿Mo*}k˱[:reM1K&{u<*톂cm+ W@HoxJzΔ VѮ*홀@W\ Xu9tfTT&]q-_ {}ݷtspl5^/oo:όȓ@K=M\wulV@7m٦ֹ,@2y%!ǃdz}fieM`WId)9Kokt:=DC:YIu<m:G%A,o{ʞ{9*_WIo}P;jP._CMp瞎l+hN,Ha֬YM,ݔ^{|7mew^\pA9'\IoRN2Tǁ8:&`}\ס9&+tB[&~@or*8l 7PX߯6$_޴_WմQtG͛nB?-oyKh&>4w\Yɹ|-:_s:f=C=|ӟ=7VܼVh-g<U Ϫ @[L]|=$mX^׍]L~ٯLg뵔"cPKf&fọOHfNwy|{FiݷzkQ7xNOAnF ~lWp whI_w3E]T2\)r@2O}lm'tR^'4kMfpu64ޫunW:tڦN˷Zة^ LjVvi*cB -4M: Upuorתӝgt<`u]N&"NAgrwYgՒ,C3ʑGYSg૮w}2m ?u\륗^q3iy3o}[fm6c;δn{  0^ m7WǃQ @@7ɘ73s5W7FL I4bBFl927K .`3VYejT$hmV!Ϯ5@tU 3 zdsM/eǯgے;P<_ '`.+Ccviˉ'X hu]W =.Z^uOS%޺-/yK\ooKVZiZ_ <_zӉ{Wuat7 @Ιs)rk|W$:s[^O\_{ݪYvX&N rڬoY(ڬXE)64dzdS @`z΍֏-"x|oz/xy;Y @FTyws1չe{s11syfu,wqǤx?A5ݤ*ЄZ}N&Tk!{cSp^;kP_TC(4]|G m٦5\u`u7yFW TA9x [tȗ7q0 q>3,\%*;8M7t&FoVQoPt]w&% {A`ұ&C_}}G>z9#zv!@KDۗ_U׽uLhvk&:fV_}*)W-;Nȏz׻DrJ5"8v foIc}x5Ld|$(/2] v5|-uOfe_]xՐpW^ye*sSnu-mQ׾V5Yf*~/Yd^gԮe饗n7KM}4,Hmݪln VZ 0'|r˙~tIeei9_ zpvޮL{ᇫnn[Wind?e#8ed8czM`: CA[o:矿d*##$%{%o[~ݮdnU7k5O_|K2%vOs=ݢմv .wiz;mV/K.iYݶn[gڳI0r~/ݶ!˶Uu{0V]uXNM @ !3Tw:KOײ.M` tw?}ppdel @DrO?|(-D1' J[s9YEJY2c_Tc֬Y%s|u n~Y=p 2:\)tu)/˛Hߴ!=uUW^V[^ZO~xaf;rW*{Wɍdx=m~v٩sR5Tj7%0(|'{3J2ҝX.i:i/&H4XC/K}sekL~s=s=N?\|"]2Ui72}w7[:ꨒ? SOFƾ]'زq7DA\%g:2[b%J26+vy_]f;tz?7n{o92sLi>L6{mWiN7q]I;yFq& m7.? 
S9yNԧ>hdlu @>`i>g>s+POSV_z1Qhڹ7Wb6 @@餹.T 챁 o$-SIЪ$Cv֯jgvay[l1ͱ~$KIƲs1A` R&9k{I_^p4U[ȓKx vO  $LX&X3C{yۯ261Jw*lM?[ͷ I4\dFg>J*ajg/|%dr᜛Mo?jRw\f?9:q]ӭ Wmi/~ l%/e\hwSn*y^{J+Uo(9NWwkf}:S%sDJ̷VmHݔ.|{߫΋~_tƍ6ڨ'pQOX=r뭷V^΍rO\gާ<)e7N]dJF񼚕t:蠃 oLG(s }9묳\Q2BΑGYuj@`oյz:/KFZjU0Y%r/}~&wV!]zFjxBxc"lW:nUVwLGtMN;m "Y߉nwr 7T&}n|Ipt:eD;1*sg=Yt6JɹMFuHfӮSIfԓkZ䳮@V @`f ,G?FF[?c4jvWMmdQjΔζ\rI59h{b|ɽ7CD[yl @X~13&믿~gyJg pjlpu5&J2aBykQ4uކ\Le.pm$'{ؽ[N8:x衇&ٚ-̓pCtklVW_rt+_toK@Y%=nݐGZ~p,6sd+%D&\Vk|ARnĒ?b~V{w uܶ.'c\l@~Wd}k%:twg:$[F ͂@6!7$V_MKvƨniW74;V:&zLIGuTQ7gw2OǁT"@8+r.=va_z/&/Տ=XƸ8åץԐsV' M6l3AӼ5_V=/wnmۜSww^P]J!XTw.Zi֛Z^[oֶzHCIi8`<:HV2ZFqWRdt#8HpJ^NsH^. 7ni[f~xGydyhsGO uMa& @%  L㰇zhYoz:q$#P vN/_|qөg]|[cX;qqZԑ1S#EHu9qӪ K/8s_fo}z$7vyw?<˝p]kєrsRJ˚&itߴ&fɏnҺ>Ak|ֲꪉ=gnax]߫m_I Ny X v:3Eok->XO&z=(T/ףpeߔ@<Ҁ%=Mhw7ͻᄏIOzR;Pe,GUƪ@rJy3%ƕ#eܸqcY*GLppG>R\p[|M zN~?HPjRz~_]HX9<蠃^jVaYz 0}E;it9 d EzPNaU>;(ٗ<37P\ve{r\t9NF=Ju{&@@___ mկcIs|#AY6{%i9&7b]wU{ս܇?ʀ05szʹ|j͟s56. kwZ__|_~\f(&o޼-@nMJuA2PIٙ%X.Ll]>L_%{Rwz[Z$׽uc?Qiiu\l]]UWrUM]ޙo:Hу^XijҠ- J/M0v%lRJïl@SNK6Mo%ԞyH$0}Ef9&i><6/~G3φ ΚEI~?߬~.[;euy|ߪ$ƭopAӛxգH~s]$ Rt]wQih"Pwݤɵqe7#@@%__wQOIC\csYWs9#dT4 l2bZVtܓ/ץ4ZMG<}nH65_ ?[{-=вIVk+ d=أ{9Nǚd4O#M N|=زINҐ7$XfRJ>0OMW7"s;՜XW^YOJ=ʹ _²d5O>eiLKٿ򗿬-ь^C7M'Wm8$}7w\]'Nk Oh;y¢ -dpp8I&7=9Eⷿ-PLjttկ~itZ/o(Ҕ )%\uKuɎ敛9yG'*H|Z, MR!S#?M[#ٌn޺iL -7 bT> ]ME2;񎲥Tz욖Rra_iLaʓdhy|W{U J @ @$Pw]ɵqVs-6 oxY{m hPW:qKBz{k^3neq=Yo1c o|CjܞG]өU#)۽,3|z\.!IJeP?\HFȾI Ωy[N=nQI"@@@nڸK >餓Vk dNc*Zz尭udZz:쳋|rӼ-׬,K IYgN/uu3A54D״{KCq 9X8ӊWUe`p]ESZg}kÓ6?ߏofա9V߫iN]g< @A >sU,˙ޤ=Yf)/XY]ޔY9~? 
,#iy:q{ }ΌqI r}et)Ur˞uz$]n~3CNuH =-{N>u݌gJ-?& ]ctӟH`޴4󃬯9FUuQ=N94!ӧ ,Tp"AMr$߮iV;=_ԛBz4yt֐esu׍n׹0 Mޥ3<)RW=ۧ!j a^' yB>prGh:/8:ah{ǡ3F?W^ @XT -8WʛFႫҕ/'Ƶ6YuOi{ 1#0$l##ViU>G?Ѳ+~Vvm}P]z~ pصm)9OӈG)%T//."ue󜺩OdjU9\tba:%H:zjy/o/~ߗs3r܏NYt5הNe4۲0M=Ãc٥4Il{+_Y|s+瞲>"]2U'?)By9)>7Mq7mu1.L7QvaónzM vL띴Zoƭsxj;l%?Z<"@hZoyh^^ @@Q,{-Ooy,7%7izY:>k},wܱiq;C<*%c)kUw%'hʿ^j]^7|&_e]5ߕ->pMj#_[xpZA˯7^ @]?9qM\z)P?K E @ @ @ su @Vu] n"e+`_~ޖ @ ${έeE @,sū%ZlՋ]?JG @knӟT\yƍ'$%/yIҗ}݋vک8Ӌ,K[k @qW<([C @ 8YتQ\X @!IDAT*K.ꪫSo"*(cgz*`_ӊY @ @ @U` @X;evaVW \l @ @ @l% @V.h82Hzym-3~Njc=V>WN;[Wt @ @ @\h @"n{,  WH @ @ @F[5L @ @ @ @ @蹀W#@ @ @ @ @ @fN"@ @ @ @ @ @{^6 @ @ @ @ @f9 @ @ @ @ @ y< @ @ @ @ @ nd. @ @ @ @ @z. l @ @ @ @ @h&  @ @ @ @ @蹀W#@ @ @ @ @ @fN"@ @ @ @ @ @{^6 @ @ @ @ @f9 @ @ @ @ @ y< @ @ @ @ @ nd. @ @ @ @ @z. l @ @ @ @ @h&  @ @ @ @ @蹀W#@ @ @ @ @ @fN"@ @ @ @ @ @{^6 @ @ @ @ @f9 @ @ @ @ @ y< @ @ @ @ @ nd. @ @ @ @ @z. l @ @ @ @ @h&  @ @ @ @ @蹀W#@ @ @ @ @ @fN"@ @ @ @ @ @{^6 @ @ @ @ @f9 @ @ @ @ @ y< @ @ @ @ @ nd. @ @ @ @ @z. l @ @ @ @ @h&  @ @ @ @ @蹀W#@ @ @ @ @ @fN"@ @ @ @ @ @{^6 @ @ @ @ @f9 @ @ @ @ @ y< @ @ @ @ @ nd. @ @ @ @ @z. 
l @ @ @ @ @h&  @ @ @ @ @蹀W#@ @ @ @ @ @fN"@ @ @ @ @ @{^6 @ @ @ @ @f4\ @+ظqcDH`MqGN72|gmMFr<)Xg=XP\kd\k 81m'm]FO WgZ  @u$aÆu6 |[*)8 @@7neݨ*$@@guVD#p7:Xy  SO<[fV}(D @ @ @ @ @ 6kb'@48蠃:f!@`^Ve=Jo31D`GqDcޫ>jva&d6g# s_pg l֋W(%~|5lL8'O4H(" @ @ @ @ @ @UZ] @ @ @ @ @舀Tb @ @ @ @ @ Юv}N @ @ @ @ @@GWw @ @ @ @ @vW+w @ @ @ @ @:" # @ @ @ @ @+ ]_ @ @ @ @ @ ($@ @ @ @ @ @]ʝ @ @ @ @ @HE)& @ @ @ @ @ nW @ @ @ @ @tD@puG*J1  @ @ @ @ @hW@pur'@ @ @ @ @ @#;RQI @ @ @ @ @@; @ @ @ @ @\ݑRL @ @ @ @ @\ݮ  @ @ @ @ @舀Tb @ @ @ @ @ Юv}N @ @ @ @ @@GWw @ @ @ @ @vW+w @ @ @ @ @:" # @ @ @ @ @+ ]_ @ @ @ @ @ ($@ @ @ @ @ @]ʝ @ @ @ @ @HE)& @ @ @ @ @ nW @ @ @ @ @tD@puG*J1  @ @ @ @ @hW@pur'@ @ @ @ @ @#;RQI @ @ @ @ @@; @ @ @ @ @\ݑRL @ @ @ @ @\ݮ  @ @ @ @ @舀Tb @ @ @ @ @ Юv}N @ @ @ @ @@GWw @ @ @ @ @vW+w @ @ @ @ @:" # @ @ @ @ @+ ]_ @ @ @ @ @ ($@ @ @ @ @ @]ʝ @ @ @ @ @HE)& @ @ @ @ @ nW @ @ @ @ @tD@puG*J1  @ @ @ @ @hW@pur'@ @ @ @ @ @#;RQI @ @ @ @ @@; @ @ @ @ @\ݑRL @ @ @ @ @\ݮ  @ @ @ @ @舀Tb @ @ @ @ @ Юv}N @ @ @ @ @@GWw @ @ @ @ @vW+w @ @ @ @ @:" # @ @ @ @ @+ ]_ @ @ @ @ @ ($@ @ @ @ @ @]ʝ @ @ @ @ @HE)& @ @ @ @ @ nW @ @ @ @ @tD@puG*J1  @ @ @ @ @hW@pur'@ @ @ @ @ @#;RQI @ @ @ @ @@; @ @ @ @ @\ݑRL @ @ @ @ @\ݮ  @ @ @ @ @舀Tb @ @ @ @ @ Юv}N @ @ @ @ @@GWw @ @ @ @ @vW+w @ @ @ @ @:" # @ @ @ @ @+ ]_ @ @ @ @ @ ($@ @ @ @ @ @]ʝ @ @ @ @ @HE)& @ @ @ @ @ nW @ @ @ @ @tD@puG*J1  @ @ @ @ @hW@pur'@ @ @ @ @ @#;RQI @ @ @ @ @@; @ @ @ @ @\ݑRL @ @ @ @ @\ݮ  @ @ @ @ @舀Tb @ @ @ @ @ Юv}N @ @ @ @ @@GWw @ @ @ @ @vW+w @ @ @ @ @:" # @ @ @ @ @+ ]_ @ @ @ @ @ ($@ @ @ @ @ @]ʝ @ @ @ @ @HE)& @ @ @ @ @ nW @ @ @ @ @tD@puG*J1  @ @ @ @ @hW@pur'@ @ @ @ @ @#;RQI @ @ @ @ @@; @ @ @ @ @\ݑRL @ @ @ @ @\ݮ  @ @ @ @ @Nsg"IENDB`rocFFT-rocm-7.1.0/designdocs/images/bluestein_fig3.png000077500000000000000000001403071506652163400226170ustar00rootroot00000000000000PNG  IHDRj_2V#&iCCPkCGColorSpaceAdobeRGB1998(c``RH,(a``+) rwRR` Ƞ\\0|TuAfa\)@g'000f% v- f/ XMH3K@vC 6^tLOJU^BD?%% 9(3=DR yz: FF pOF31@͑``_!f@*BL͐A@aߜҢ21L@;  zJMp&8eXIfMM*ij_7@&@IDATx 
cOVP$Ț=kv٩}#Ⱦ-nٵٲE"*i(BI{r9t\s59˷\K?JFB@@@@@@ș@!      8jy#       9 Pcp      =      X@m9       @       c9p       Z       @!      jy       9 Pcp      =      X@m9       @       c9p       Z       @!      jy       9 Pcp      =      X@m9       @       c9p       Z       @!      jy       9 Pcp      =      X@m9       @       c9p       Z       @!      jy       9 Pcp      =      X@m9       @       c9p       Z       @!      jy       9 Pcp      =      X@m9       @       c9p       Z       @!      jy       9 Pcp      =      X@m9       @       c9p       Z       @!      jy       9 Pcp      =      X@m9       @       c9p       Z       @!      jy       9 Pcp      =      Xj@@@@]&N Ξ=j7w\3gNcҤI6n8[lYe|}G6o<) ]zukܸmsfͬK.No iRO@@" P[$j"  TlN=Ԝd:uawuWc=v!gŁ˝\.WlhݺYتV}馛gu7" 87zbW\ve;#N:yF@@ miӱ#   +b ;vpb}1cn]w[1=[."{Nj5 Ih޶{oVJ؎  @@6"   Ν;oK,ɓ'z*ˤ!ahڴ=N\?cֳg =jN*KC5~ר.WÆ ?.2ghr"^l+''nX賧ŋvqixrNʇC:d@@͍3GA@@@ P@>}믭m۶K|} dl3/_y(!\?3'Ӽɤ9kM;(̱̣޽iԵkW{G`̙3GyV\I^0tرcܐe3x/FVIs/Rw6m3/Hۭ[7´JS]ջ|ȑ6sr(WyV  ڂT@@@ NֲerAXL&Z^ySRC%߿}/ +? )6k֬iծ]Y/׻ˮj[pk...4nؙXL7W_իW@m6z÷UW]eAjl66b{駝 iwv_>!rlD@(i)S @@@*@f";A,O>d3f|Րɤ_~nd-'xb *>ic+馛uݺu֤xyoC )U g3)ME&onZp¤Įի)tvp3T Q1Β@L:(;S2-s,@6@ +V!5@b;i( _~haĈ6n8HD믿޽{e5jd7pg驧ʖ,Y⵫N 8Zn|d>3kڴiɇ\@(? ~V!`_ (i&% #yP@z&O~H~Vr z!]s6/)@ j)rJӱ\jUת߾|70O DkA2~K/9A7CAV7[kԓrƌZw$KX6WXs" l+iTÒ{g_o'p}3>vq3qMWu5YoM[o%V%z>L鄟yK+]ؠ2ds9^AI\pNoٛo*>}$5$op=N9Abam1؎@ 8XT 墲   nJ+XSQ]vE==+K;vthA`Gm?*g )\_y gΜiW^y)P+իWkO֯_?4h+3SJ}viN0v-,Sn5 kIn-ݳ… 4,)K.1}we4tIUZ=tsyh(tS>=3IWA 9 UoYf3;ӝ!}O4fU[n: g={jq饗:oz;銑@*@6@@%a5dEMɧޑ$R\a),PjguVлnytM7z.'ڻ,_|yYΝRo.o\A +=#OxmJz=3IT5? 
[NdM2긚u=tMz_J )hy]RJ?pk/jyuSh)wh~<vaNo|cjw?]ɏ T@m    0-'h݁:È7}Vj;s_8ՐÇ+NC38Q$@mbA`]&MgM hѢĬq˚+UCEoqU͕!5sSO=ՙV"8qb\^PoI3I͛7d}.'2]'|ynT zHEGޞc=fAl K^\|^gJQOs5)͓Iw>.yٲe$AqN6  ڒԜ(   k+)@ Һy4<o*Y~W֦M[vTUDs%0^iZN=ӽT|'.hn@GQM<h*iv2=4GrWu\ T/X/wܱܺtW.z("\޷~?#<2nCL%y5k,6L| Enݺ.z>aʭ|57^{5SC)nasu7Nv}'t1@[@mq_?j   @d5k -kM7sכP Fs<餓<'O ԪF56 J .͞kgz)SJKno{*U8A|jUf=\YU;ʖc_k.yAcOxmܸq`?ٲ|Izq@pr^l24P CUsMM >g3c6=FcIz곪ve;uFu]}W#fm @=u@@@bVXk\YȦM垻w^n] >uܤv7^b= TCZP^mQWv1W/X J5kGUdR]c9&Md~w}mС 65)yll6"1z% Uw,Ym  PB-ͩ"  @@ժU!Ds5kX>Y@mXWVsnذ!1y]RFh~C*Gj8~Aĺr! J^ 5ݴFkOc=֞}YL=Hꑪ5m.R5V/w{)6{yk.?S: /mAIk.YF/~X %&@.8   l p˪Uu)*%3/W+xwY $4h`3+-ZkuSoMul2:)X ˟=˫h:3t| 6 e2\ޯ7|Q].]X23NygJ8tbLo7 @Z    ̙3'Tb.gֹ+ bcǎ5-[vmYop~AZB{VPvĉvU ȆhZ=nV7o1y|CSIofAJfn Zh&^ϤX +@x5G@@@ Rq^fmڴ)'(кrJ1cF}iO}]=z͜9jN}ofFr>|Pޱ5k3dFc(TBLe:u" @fj3co@@@*@XGO?ydz*(6l&wNOGyy睴%P\86xcٳX`oCVp6|fm|}^34t]O -@G@@@ ]pa`Y dyuy.[w 'N|zկ_ynԨmZn_?Ǐ:˦N{8ͭ۵kW;3Wf#V[me|3Sl||^L؈ %,Js   a.]~f{WwWҰO?KR;~sHNNoc96l믿tX\?W/k֙}֯_?Ӑۧz]xiGVYTNM>ןBL셡b  P5FI^"N@@@ On4sݷ6w{~Aګھ ;sʂa߰aCX?sv6 L6͙OO>ɓ'oLqf͚vq3\~^ 3U`  @ -7   -b+VPN:nl|iرcxTo٪URu׮]Rj{?X1c7ܷ mkҤBݐk!~ P/@(My9k@@@#F؝wѢE {|kî}РA eѣ}rGM5Y&]/Z(nkaҤI^Kr^v~A I>}o?2ڷ,n?SYh@@ e)   C`ʔ)viٟyBrȑvswl&Lp YՉ'-݀\2ZjW{ k~%xFcˏ"P9oI')t}l=C3G!W|~"&8@@ kjFK   5?W^⋶;xm[׭[7h%.ҥKWofׯ;瞳>sVwC˖-c^+8jԨu} &s wt9Kbo쭷JpÇ7½6l8>{׹3xΩ.G~OG@ P[zל3F@@(:N~qv9تU<׼wxnO\٨Q#;W-ϟ?o?9sfz f>rV^]n"(s:u6~wo-Es^{Nl ?جZ],W^Up}6urcW렼 vkذܳ˯jQ3gNYQ={ǗK9={w]wZTA&޹3 ~V~؛#g; @*5E@@@bN^gL/ 5q쩧/ҫu{K曗Ke]f>l0]qNx-+Z!UGhΝlS=3.__`S@5TF S̮`ѣ͛74u<Ʈ\dIsR>V[m
+ֿ'>_tMNNf=/^l-˾&X6m%.|4ѣ@Vw-!ʬ <~A+ym喾B ,0=b nΘ1#vuk`VOf͚zT wqGbU/tG^P[gSO=4 ޯIe!Cb7ŽgK.[>%<7Rn}>L|PRÕ>7>AI{AYi(F@Pj+d@@@*믿n\pAhUF)WV=x5ׄ28Mfh[ K;q"?|x|g+z)(4?v>})XްzVy$ & }']b-C=_wi'gN:ywwʸݒzV߷3<7Pν{ִiSvWIsׯQ/nV9 wwwٶzk_y/Ҍ*[VC=ԠAk ORJַo_UTR)}>#nNk׮=zOw[3R.>Lx㍦^j71h >m۶5}<6ydǼ߼/w75(uXcw_=?oР]yuk@( J64ND@jy4 #ТE &qR:tblRƒ:(vUְ^7=Pc= Tz}WT*)ׯ_?CAMdG~衇/Nry;v{tÆ NPsʔ)I\Ѻuk~;찃U^=0ۥKz*a2im{ϹvꅪS2IzWۀ٭\ 㗞{{}}5bܯ$"א r*`U58piXTS&5xX5)*lP5ڵ37_cƌ޻,(`lVOdUիT3XޞuYNo$M9ÜvoeJb!V6۟|~C$ԐzvZGժU]ҰJK=?5ܹsmҥNoGOBճW÷f͚UU 0&!4x{k׮{^5ꬡ͛g jXY MIC0ks:Ӱg jh i=m4=@~7eaV&5 7tիWO~pRr0辖qLǢyh(尤ೆyU`{ԩw}ae]?tz*k؇|ֹ>X!Wz\+Xe-U ^}g:~;P*VʛϫgJU}c{YG@Wvw? *C=ֱ4w2c~?賭rH УO @-k (@6E0#EYĥPDd@@(p*@@/L bȊ      D!@6 E@@@@@@@R PY@@@@@@@(FH       @ jS"+       ()@@@@@@HA@m XdE@@@@@@ P"e       )M       @jP @@@@@@@ )`@@@@@@B@m       @6,"      QB2@@@@@@@ԦEV@@@@@@@ Q(R       Ȋ      D!@6 E@@@@@@@R PY@@@@@@@(FH       @ jS"+       ()@@@@@@HA@m XdE@@@@@@ P"e       )M       @jP @@@@@@@ )%+E)0ydf͚9Pi@ ]&Nlve@@@Xd֤I޽GV D}'bk PyJ?FaÇ;R]xEu.T+Wk~a[zu8z.lqɃ    _:Yݿ?~i]׮].PtR{ꩧlVrevm3ϴ1==Q '@6·… /XQzFٟiV{4nڵkq -ZdSNuǬV_֬Y<2}J-Z,@2sL3g͝;f͚M49|֍c# |N VGQfTu@@@(-ݷk{ƍe˖=رcgϞv:Q.3zعFxSOMZjل lvJz'x3kUW]廽6[vGCUZW^y:h}:u +p=_ue23#   @ n}_mm۶ >쳶{\Po*/:꨸ǭ9rdMDw 6+ۢe+ } cĈII @! 
{{7Uk׮裏~C5<#ΐjd._~;4RÆ _~A`    ))ݴxbرs/]W 4U|6F}'Tm!\Sݨ(bO @@@BnݺQ2vygݻ=uKyVFyGLjjEXF@@@t¦yJ|fm挎n: ?#iߣD].+cuN5H:zh袋ҭZ1-+u;ݗ<#P!.;S!`* qKea-Y5jW}#@ƌczAZZpD@@ sw}>sӧҽ("ݿzp 6m4RiwB~iQ=$>)UOLpL\ (;su4b̜9D4v:)27x*У6D.o9[je;:r@@@Xl]y啶v!V؍,THl۵kgmڴI;H+(dzhgnPYqzKGtV^>}5kN:ΰܶm[\9gvkСNGkZnܫ~׭Cv秴E̬ň#l-4}RIQq.{H\M"@6}ѡ((zaG}daUGTb&MdvLv c$١i gyw;ë_5aL_= 5\cW\qEqޙלXǏO>E H@JG@-z!{m>J="  `>޽{ۑGixRlIUK'(ܸqLz4 믝ԨQ#jwygr6^ Ҫ'P/_ ; K%uy;㗥{v}whիo4(Wp+Lg$i5}3ązpyk^[d_@xWਅ`c^jAN;iczݾ}{gue5~{5yP֫NM6ل 3&jFZl 6,,S@@To7xg cǎ &x4qAZ4߮y^ "~O剐{HEF UGm/RK Ӗ0EY驩@駟  PU=5ݪ n۩':(K^lY.\nTWknyn~s4u 䥒4LߠUQF\ʕ+;_{5g>dt͵瞛3V.:WhknȌ{ԳF CIsArT3z(SzohtR{4RΕetԦ}ڵkQVnfMcՖ_R1>|*W^uCf͚ PnVyްJ@Z:q;ǾW^˖ *Dj˸<_Dz Z8&;JRj\?ՠBvm7gr#E/Ĺ_xU lsϕR t~ۜNeEJU!3\ujڴ;b-CۨW(B@*:,Z(SRfݪ>J/f5LA55VG]v%^N,5F׫F⚾om)p^ԉNrM7P=QߓVq)񪳜 PṂ:ʙSk -A%G}FDS\i Ԇ_(#p9B P[W!믿6ԂGN=Z3Ij94ԩS}3fL@zy$34Xo9ϬۤIk>=Q׺kʔ)2wF$Pw}g# @^Yf# Uv=s뤩j?(P6[7'RLp;S9%{ lH~LzI ^1iUϞ=>ZkhbMcB?0aMbÆ W,u50P駟sȐ!?[neX{ǒ^)àxu>{ryN ˊ>`]_HV@#qASu$[@@Ȇ[o8SE8DesJP sJjժ㶅K\fL(2U?]x) 7`z$֭[q'dE>JU@m^yꫯZǎo ᣏ>j,/!ld]~rΝ[n"Wӧ,@رcuk׮ cd,ﻳ>N?t j՗(@`XGӆ<@u|V! z}߭N:K)u ߫hXż_~)Sw P > 6{o[rehu*%~I?sn1L uyV)n ƌ)HvR@u#o߾N/BhQ @rmi .Klȅ  PQ7nl(FlIF83:8;#9 P#t.;c}?uU! fZ+V0 aE ._aϟ_n_u_f٫omڴqJZe]fo}2p @ 7@ {eaã<  yq_~v 'w>ooEE$@/1c7x]{5U#8694sLիW,yTOd!_wVZwߕ˟ajqtPtzm۶֭[3~w9cW\qEʍOBf3 @ kήb WD" %/n?gz%vHI@mJ\ӦNj+5,֣Glq֭5dW5kz.[^z,U6)O * hA iyuי "!@V[o={&g  Pꨡ[<Ȣ/7;k5Pǎqg#VwyNCd RJYɇi M.׻ 4fϞm&L=tAU ákAs&[qmѢ-^86[~4mezAJϝ;aWOhD^xnD(%C9vx  @! 
lcYge Vd}Vju<^ƨQlݣ%"@ޓs*PF :thֆlgҥݍa?j-[t=oe^<#^˭?Ӟk7ǎ[x@h4;AhM) (6N:م^v/  Pb]tql"H[1.|=meP'Y @K>n6l0kn#&1jx ߡc˚6mﱫ~뮻Uf     @XhQ M4)4@tԦ>֭˽~T41-Ik92p1lNt?s=!|}Qma:w?NjB؈ PT-Ee@@@~7ߴ 6m)S:   PRy'"_jUoc=nˉ-C *__\tr5\MtSnLs?z 6@XpaqU"   P@Pה|nQhԨQbܳp'tq볱PH^8\&*GDz lrW\qwyIoorH`y/Rf͚囧_~tOV[m1p ϊ@@@"xwkS1bկ_s{J׹,Z(vu5kڠAK.vW=ܹP Yfq5ēR'NooqH?tP'`<+1S-l7 kРU^yTWB g}fŬzj4}t/tg햍2 ;'6͓ZCk#   Mo4V1,{N;Y۶mMB|IӽyꚩSi3:xݯuz=?{W^yխ[+hKoR=l)3ϴ3fxf4x(6m= F[N@uQ9rСCt}Jݣj8WCx wagZCw{{.ϭg3b5x@-<<3ةґ57jЗsKQȥKۥyArZaMks82dMv'ʱ-UUJ*0JUqbULRNEșkzEUVTQ-ݹIN?tgpTNC_|V b[f >svI'%\>CH+3('P+ՈH}^X+ñ q;/[FAVZyMY{GW! hTgURCcjԨ:ױ`Tk׮=-u%ޯUCpOsz>A'Jfl+L۷w}NVu0`@2sG;mHro^.pw:#B&YD\L(+JC<υxI=k/jpm%V.cqZQꁤ]t&!C k6դ/5jTJAZGI&9-*UxhG_qUp &[7ղJ_pn˧dU˯>}zWkliUUXqxZ(Ѻu벇5DzZwmfʧ$@  ӔXFr:凕q '8n4cL6Hm   (}N@ݯM,RI)zOj0sUJK~QKeŖ:ؼɔ=X1^G%@ڨ$ ՠhj}gtSW=U\P uaRE 5KNrڴiγaCh 0i~՜:/ ^j-Q$lzMC%Ky5샾5'mbpQu:,{ ]ֹ.Y=rj_s)5 | =zxf=Gsw] bk׮Wlw[4l҄ LzwZ p԰Eo=[{էTѣTta'=j P+@@x\}e=kh{ϭ{zr-ϯ\ҹq=뾀{Veh==~ᆵ':_d9rd=t}n{iZ9 gK@un37  Sy]gCQ D)8{ĞѵO5T!E]^{!CGmЕ*mj:QK@CGJQd @6f͚-kV6mZ+vKz S  @` \s5)~ P]wݕ^|O1cٔ  @V[m5έ @l[v @(Wnx-u @ @(`Dm+>}"@BFBP9=Xr%{v 06^>{%@O-2 >?mFm4z]I{ @h@kg-W_MClstB @s @_?oVj @ @ֿ @(pvN!@ @Fm( @e iVHW^y4um @ @)`Dm{>wwM @@ W]uU66mZKh] @ @@+ j[ @Z`U?nܸ$@ @G) @% ̘1#Ji޼y5*M>=-VA @D_AM @ѴBE&L(CZ#@ @ZJ@PRCg @,_|q[袋w @> @ &MJcǎ /FY<  @ @)`D| @~ >k֬4nܸ9 @F @'wy'5*kxi @ @PAP @'p%t+>iĉY- @ @& 7z  @ 0͛:֙g٣L @ S. 
@ho9mv=ɐ!CҴig>]d @ @`` Q; @ BvZ{3wtg: @ @[ځ @4iR;vlzbej^z^]" @ @` Q; @ Ђ'tR{&kυ @ @`@ Q; N @ J?pZӼyzݭZ*M:59u @8Fg @QGէ6n7HrJޡn @ @0٢#@ @n64垇 ~4jԨԧ @ @un @/0nܸ/}nGcjyĈi5׬{^  @ @/`D @Xxܹso&M= @ @ kԖ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @i݀f?IDATԖ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2e(k @ @ @9Am& @ @ @Ԗ  @ @ @9  @ @ @(C@P[6 @ @ @ 0l @ @ @ @ Am @ @ @ @@N@PðI @ @ @2ш6 @@+ L:5;/^`K'On{ @ @# mgN  @sM/ZH`MBCW @ @JԖ h]_<-청A=#0M  @ @vԶw I`]vI_|q} Pѣs=W^Z"@ @ "k @ @ @};%@ @ @ @E- t @ @ @Զϳv @ @ @En @ @ @ >y @ @ @Զȃ  @ @ @G@P>ڝ @ @ @ "yA @ @ @@jYS @ @ @ZD@P"B7 @ @ @hAmy @ @ @Զȃ  @ @ @G@P>ڝ @ @ @ "yA @ @ @@jYS @ @ @ZD@P"B7 @ @ @hAmy @ @ @Զȃ  @ @ @G@P>ڝ @ @ @ "yA @ @ @@jYS @ @ @ZD@P"B7 @ @ @hAmh@.h"CyٳgwYnҺ۠ƇgΜhͅ^8͝;7}O,؎JGn\y SO{.=gI&MJ?pv-u]#@O,}gQDM뜊 @ @ 0yw}٧ >zXuN&@ @ @@=jO-Xw{} 
)SzlqcOkVn|)F\+([?LiMQg:" @ @րM6$y_N'ON1¶Qہ.4]sqeSO>={Կ]+鐿򕯤_?Ŕ[ouS~z /nlw#}prJ/Mk]_|1ϝwޙyꩧ\1.믧o|P*/˵^w~ᇧB @ @f j(jC=t~=XZguW\qElA4ڞlNNwS1RnHlM^;WG'|2nG.9q{MuSkǚ<@k;~j-L\pAZuU;y%@@!e=?[;/Xw /L{nY'KFJ|7%\}*>2ˤ:^7#@ @ @@IՕ.@.:th{FmTpxY`qQLNX`]uΜ9u9}{gϞ[/Dςj7Ə_qxUVI&LH+R1; Vgy&Ũ(o⋳mS@95mڴN 6,+iĈiƌiu?Yu 03ꫯN~&py\ѣGg i̙ͭ\mZH ;wnxӤIZgB @p+BX68ҵW\qЇaawW}mvKrK͐6kǍN<Ċ[:uj~SL @1{QF;_+BO}nH:cX >#їX.@!@ @ @@"TsuhfM^SE];do^cWLeʦ )?Ϫ8TH >ᆰzSYG}T ں< @ @ AmsiO;3|ݩ9]慃`9Q)Q;n|Uύ1B/bۍikj#Xk:Gb{eg @ @! mb: RhCiV`7vmnT'N8w1vF\Ӎv-z/ (K`Kg}iSO=U~5;:_|zM @ @E@P^\5T8tUW f'x"o1iu91vتM0~; !+C=4i)}+_wiM4XûV1bDw @ @ @/ھ_WYeW}' uBLm  @ @ |^6{.O:]wuiM6I#YkGHS'?i}Ws-KVn֣6KՊ}-0}tac=6CFwO @ @" ^/<ȆN7#&V^{-kmpf)S:(Op7eܹSOM?|P8o{NOF{qǥ_hFϞ=; .tg߾%=> s=7Ya ,'tRJV{wMnfgT|6:Jk k믿>;#dkYͮsԩ_/*_c ZA& @ @46$*#8"=cK/mȭޚ"?)V+tÇϦC)w3f: VoW;X5w}]weUYlA{'#cM-Rp"*qZ:¿u~n|/K{ォя~/3]T팝 @ @ HCA7֫N7;tiu^#. ic$pQŚ12VQRx`jROOnaՐ67fqeOא6[oswk[Dԧ>6tԛJ1s#]!0b\f 1wݭQ>J_c>7 iz뭗:c}_8_LAc @ @ <#j1r7;=dG7cѣG2R3e]ҫ\ׯ~??СC_jQGbdg=~?F=)z%N8p)cN^~_"s9矟 ]_~wܑbݎAi7SVU!v]w]%-xԢ]Kju]#Q[U/WĴ5w|dJ]GgvNLgI9ڌ6_K͗d^;]#U ( @ @ڼF?m/rY_XD0yi6wj]tEy]6bԍ?uYzG}t:Z(kRv-.lǔ1im1UgS\Cڎsbjm6|:1 _B&SN9E]4 +#Ϝ7?1v-H7ӬfE`Bci|'U>f_v3uFb 7fCɿͶc4|8cƌi~j$@ @ Тy0뮻n˲m?k֬lؤIR%Fz51?? jܰa*ډ鄫q׿ #Cy}Ijqǧ7xpL ^{Uq6Ήv[%=ëOV\qŎ[J`P [;#'f'_={׵ԯ +tPCQ_Щ/~7Ǘ @ @! -BuN餓NJsLbX{m:y~{zjAh6![oݱkkQ>ͺo=:΋녘1J^yG.Xզ$mRY"4[` nk6MIuctM6?+@L Z~ZR܊*  @ @> jc{,:JL! 
7ܐbDk_JL\|_w8;kt?xs6XZu%c}ɢW]g65jTcr1[oUuݱcǦK.Co}[)~5IeӮ ]z* @ @ ,ط]]@z;[n;;u9&L{2,SxX^+7.źtg>jC:/59sfce5vcpĿe*@Al|vK馛nI䧪 @ @}9}#8o4y睗V_}ṵNh4 hw]Z}; jLdksO#XrbZV-gqF~ooy6{M`@ 3&c/:K @ @-o BXq6Kfj#<2=: ϭvBFjL:5_EFu:oO.KvXĔW\qE>|x,H}ݳQ!@ @ @U-/1u{M-}6:[ovE=ӦMױ7,H߾6lUo1~=ꨣB^bt'O,LJi @ @hMa,]v%H{lFb׾,kxr?f4K-T Z@Xo ׮֭1V@v%Lq?:iN~mo=iܹYX3|@XkGsϴ + @ @+?)R.9Ug?YzSڨivhtZ#GڛjSW;oذavWmK,je K/4 :t}Fw1n7[oӟ4ŗUA @ @ 6 O /LoqzP t8ݭzn;朗uְu]OHZ%~kn1u^֯W^9{ C+2GiљXuU_zlT c @ @v9bE ėLkn]T?K @ @Ԗܤ6VXal F_{܆nXx񎃍7߼ .`4/OIZp]믿~1KLۦ7x#f8O0[ =Xv]/wyL1_F%@ @ @# -YF/sͿl.䒦6k~ 7Э6ZS7AN?wy ϩu&lRP?Fw}u鯃O=Tz+u!®[n%՛9G}4͙3'%&@f͚ @ @F~fϞ}oSNͥ5~놿Oy}ǁnc56uرcSu(9昺ag⧷e-l8ovKF ^se!/UqL1Uvw~ jS @@1c @ @& -W5'?IuڭJcv=-z%}gjrQGU=mUwva;NѽoVz7ߜ6l jye]6]ӦMK[mUMS}g7:ǧO{.Vy)t袋>,"iѽ  PMgooghf,LngbDW_] @ @\11_"DkV9sRu_U!裏zimI1Z9묳+_J?ӕW^mޔ7|e~x+G}TGy$p) Η_=]|)ڎgm>oϜ93 iN:.,OokZh^:TVfֹꪫm.su]72R|A( [l:H @ @zF?)FaK^~W#T_u|uwGydۊXo7]6NHֈ]mҥ^ZQ_~ꫯ9?^swZ>?~6mYp uYvNFUx$5i^Q=s\o.s j |xeDmsB_~>~:M}GpQ"$Sp  @ @]*1p,HZtEסCfԍ1]nLEiľYU}DtO>#jg @ @w@m' mG[X@PG @ @ 0q*'@ @ @ @@ @ @ @* -W @ @ @VC @ @ @B @ @ @J{ @ @ @ PP^ @ @ @ @R@P[ib @ @ @ ʫr @ @ @T j+M!@ @ @ @@ByUN @ @ @JAm= @ @ @(T@P[(  @ @ @ P) 4 @ @ @ j U9 @ @ @*& @ @ @ @PAm*'@ @ @ @@ @ @ @* -W @ @ @VC @ @ @B @ @ @J{ @ @ @ PP^ @ @ @ @R@P[ib @ @ @ ʫr @ @ @T j+M!@ @ @ @@ByUN @ @ @JAm= @ @ @(T@P[(  @ @ @ P) 4 @ @ @ j U9 @ @ @*& @ @ @ @PAm*'@ @ @ @@ @ @ @* -W @ @ @VC @ @ @B @ @ @J{ @ @ @ PP^ @ @ @ @R@P[ib @ @ @ ʫr @ @ @T j+M!@ @ @ @@ByUN @ @ @JAm= @ @ @(T@P[(  @ @ @ P) 4 @ @ @ j U9 @ @ @*& @ @ @ @PAm*'@ @ @ @@ @ @ @* -W @ @ @VC @ @ @BZ  @@ < .h^+oޛsg @ @j8D_`ĉ)~ @ @ P @ @ @>0?h;[.]tEmwn@+ 6o @ @.OJkU! 
@ @ @0qM @ @ @ PW @ @ @ @& @ @ @(F@P[Z  @ @ @ PS@P[ @ @ @# -U @ @ @) I @ @ @V @ @ @֤q @ @ @jqU+ @ @ @j jk8@ @ @ @bŸ @ @ @55i @ @ @ @@1b\J @ @ @ښ4 @ @ @ @Am1j%@ @ @ @@MAmM @ @ @ PW @ @ @ @& @ @ @(F@P[Z  @ @ @ PS@P[ @ @ @# -U @ @ @) I @ @ @V @ @ @֤q @ @ @jqU+ @ @ @j jk8@ @ @ @bŸ @ @ @55i @ @ @ @@1b\J @ @ @ښ4 @ @ @ @Am1j%@ @ @ @@MAmM @ @ @ PW @ @ @ @& @ @ @(F@P[Z  @ @ @ PS@P[ @ @ @# -U @ @ @) I @ @ @V @ @ @֤q @ @ @jqU+ @ @ @j jk8@ @ @ @bŸ @ @ @55i @ @ @ @@1b\J @ @ @ښ4 @ @ @ @Am1j%@ @ @ @@MAmM @מ k.@U jG @ @qJ @ @ @+ ^ @ @ @qJ @ @ @+ ^ @ @ @qJ @ @ @+ ^ @ @ @qJ @ @ @+ ^ @ @ @qJ @ @ @+02ćb(XIENDB`rocFFT-rocm-7.1.0/designdocs/images/bluestein_fig4.png000077500000000000000000002207751506652163400226300ustar00rootroot00000000000000PNG  IHDR r&"&iCCPkCGColorSpaceAdobeRGB1998(c``RH,(a``+) rwRR` `\\0|TuAfa\)@g'000f% v- f/ XMH3K@vC 6^tLOJU^BD?%% 9(3=DR yz: FF pOF31@͑``_!f@*BL͐A@aߜҢ21L@; )PJ[ԓ8eXIfMM*i rV@IDATx ܕsOHI!dm eɾNvCdeF2-[vcʒ* HJ))_s}-gzǹ9s:(         Hvb@@@@@@@@@p#         Tr0@@@@@@@@@@@@@@@@@@ *@ZAY         k          p, @@@@@@@@@5@@@@@@@@@ VPn         @         @A\+(7 C@@@@@@@@ p}@@@@@@@@@!        >         PP @@@@@@@@@\c@@@@@@@@@(kfa                  Tr0@@@@@@@@@@@@@@@@@@ *@ZAY         k          p, @@@@@@@@@5@@@@@@@@@ VPn         @         @A\+(7 C@@@@@@@@ p}@@@@@@@@@!        >         PP @@@@@@@@@\c@@@@@@@@@(kfa                  Tr0@@@@@@@@@@@@@@@@@@ *@ZAY         k          p, @@@@@@@@@5@@@@@@@@@ VPn         @         @A\+(7 C@@@@@@@@ p}@@@@@@@@@!        >         PP @@@@@@@@@\c@@@@@@@@@(kfa                  Tr0@@@@@@@@@@@@@@@@@@ *@ZAY         k          p, @@@@@@@@@5@@@@@@@@@ VPn         @         @A\+(7 C@@@@@@@@ p}@@@@@@@@@!        
>         PP @@@@@@@@@\c@@@@@@@@@(kfa                  Tr0@@@@@@@@@@@@@@@@@@ *@ZAY         k          p, @@@@@@@@@5@@@@@@@@@ VPn         P@@@@@!b 0a{4h:t`]tZfdϷ_|Znm{΢fA@@@?Z^F0 @@@@@2~'4h 2Ė/_RC~% o~W2e V>|a={1cT$_XB@@JVk%ih    5K`ܹ6ydW^n]\j\HVZkZƍC:zhx_}駕߬Y>zL8ƍg-ʸrO>zfϞmZw=ZǞׯo뮻l=mz25k֬V6"  keh*    ,kG]UlԨ&k׮vmg;v, 믿=\T2:ud?s5;*`.~zU ?=ivڶ[nmh@@@Z@@@@Y`ɒ%6vXcvO?: y_~SO=rj6mY@Yك>Xec0a{p ?Z:ufc<   0@@@@ %;M7dϷI&2)E(ѦM]B.?e}vۘ1c⨮83M^tԨQ?e}oms3/ JTϞ{i+VHKeQS nFAǝc”WWfSM;@@@0ƙ     @g_5G}d[loUR_h{^{mkܸxbCHwTʕ+S}_Z>} /Ae[yj}{m24c gvO?dOF+WZnmR7U#|pB]@Y\-͚5fYczM6e Ze]s]ը*s=WmZjU6Xamb   @\+6     #o\-+q_NYl 6.BC]|6e{'2։? 6R@Ҝ9snР=ְaC7\^{6w\jo]w]ڵk΁k TCO?m˗/O"UZ>2ށ `{~mi׮1xSV:ye-^>!qD@@( yDi    T@۶mc[Epyo`4uNyꩧK.IgҢM ;ZК?*h-͛7'h-kVʐߞtI֫W/{G6((=zS?c?L:t8}b;$   )@ZinZ     (,s6f K/_|1MNe?~o]mڴqtjժ' w?7|թS'y`ĻVZY-Zs5^(CX   Pv     # gPݺu.3Du{5Ku}UeC)VZ97#*k[bX K@gNklJӦMfS=b:>A@@(k@@@@\V4:S#[7i${"+3f\8 De$g5ukTڵkq5f͚y/ >a   @ _)$@P0wqGUR  jݻw>*B瞳ŋOnki1,bY@XF Zx24k֬Ν8.jD۶mqnLj^]ao <#@ ̙3X+Ț! @Y >vGUKʖ EG-ʲY(  %{x@ &wqv7oڊ+lV #,YREmmMZe?m &hСCMLJ12i'"@y ,\h5 )ЦMײܲe l     PYvZd޳v)㕟7o=Xo&lbnu:t` 4ȸ^/d5_&3͞=>M`~-XZnm۷7ضrK[o22pZ=3zРA֩SiF,]Fv֭o,/ԩXoT39vڡSx֥K+DvP QF?'ĉ_4==za4MЛ|,qߺѣGۅ^>{ۣ Ǝk)(HY6pCܹ3W^IO/5jd[mӶb ?*   jff%@k]Lg>A@x衇GUR @ c̙s(#ɓ'ەW^iO=T?կ_?hW0u]RU]PQSN9hִiӠ|+c_wk2J>}"ԮS3˦@2pkw1u* R2 S ѪUL[_^yk[oVcǕ)S #FظqLV*RЬG}dw}׽ӢE K| ` SPe]a /K/4ٙ@JpYL@j.m۶+  Pl&M ,m]dSJY R,6߲e\З@n&k&Ϻ IUR^(`E䣠~ɴ,=׭[e5KŽ}wO:PLWJX|ILz( Ibqi?v뽼K(VPQWmUϺ&\ɔ/h_.}6lSȨCYXnaE~8z饗MͰPǪs_ TB=

묳l@uךnQ{yM֫W/PٖnS&t3: ߠ5ձkVUcU .quǺ.:}W4eM@L_n{챇n6eH(8vo6v1  Oٳd@@@@.\2CTblt90ۇvה㏯ҫpuֱ.̔J^}U^A~夓Nw]wݤʘu9fA&K.~vwWɀ nSVJ/r]jdS^{휃ִ=z;ຄ̤ڧO)sYjժY]1~W⋁ \ؘR;Uۘ}ekL>{g:+#  @ VUB@@@@2%СC]{l^{5Kqv-=Pn2L] ><3g5_LoR'Uv%u׺ukUiL]~W&MݪnIFNs][L>hԩ)pFY&LPmLZ 2̥lᆹ̞4^fF]{ceSq;();4e * @*k8VڔJ4BIMSUSu/hѢYRlt~e%$p- @@ՈJ"    @:|UYʯ(`D]wuMD6rHoP#?Ν;W W՟ګ /j| eRן~{~mvm4hkOI7q;tPY۶ms^61pPǪgJ]wze]v={@+˵kv]?а]2ms'Z&sAY4@@jk5{     РAw/7o;⢋.rRG*8#H\ &4iRRڨn?e|ɉZw~Er,M~hɓM /-lquqA \W=UMTW_]>V[m2a%[nu1ИreKUojt_mƑk mm5e+^rVc5ȸUb_;V_}uSתAڦ:^njc[o dWü~I]k׮+V۵kW5//@@@f p;K5    ,Ygh jd\ jӦs޽ KD,zc9eK*Eu.'Jb@QԴA#r,iFuu։mmP|Ȑ!ՙnEQ'ھvyg{gLt z:VKRĠKeB +82 ^?~a @@@'@ƵYc@@@@ # 7К6m<Şx֬Y._ .l_/lVu־ý-Zp٫ UֲBO>+-(cҩlj=ߑGir=v7QLuBm,})(+ѣYBWa^i\Rߨ{.uptWAeҥA  5HYU@@@@ u xMg޼y"xH Iouu ,H\z]j/ͫqƶh"ߦ)p(d:$NuUkuyqǹ;c> HʥkU5a۸Gt[jU_ 6 lt/[,~F"  @ pflg@@@@"޽{iO^m7GcZ aÆY=ּ:JYAj&L=/^ؼuW\ag϶#F믿n+Vh#G,hZXtl[JX-**6;j\2,F-   P>϶     gqƅ.AY:w\m~>jT6f1cukNQN Ǐ?hFrm><͜9un^˩ʱZU:9S@@ p-{;D@@@@  \{}@e+S\>}r&yW\2q~+l*"p-ϳkq3τ6d6gk׮]t6j9WQ]v=   Pv     2͝;7V_~opհ}VN}5e'jҤ{nѢm֩S'[p}];mԩMzvv}w\U#^ e믿 6N?t[BkRnk:V9Bw%F"  @ (U j#    @i DvuWߕhܸ)_QWh;SQ.랲F*8VK* D@@@F ԮQk"    >?{>ctUW&2͚5+ht W,; 'PUVEM9ᅬ '0m4=z;6iҤX4h`zhh^bD%6jW%Yh  Pjg@@@@@/ܖ,YǀW0ۧ :WcǎZZnf|lszbŊQiO'pM(2y~2-[Ϋq[_# }qUۇv!  
@ pfmo@@@@RFa\sM?޶o?ڵ?s=)Wcƌ lفh>2?g:KjRL81uP|tҪ B ӧOl4jWyZ@@@ #2bb@@@@$ɓ'1cj5n؞{Ȯ1wqGo?{[r 00aB` PJ7pm5\ȑ#-QmCuY%ߙ#p d=EHlgԴq/Z*n;C@@ p-nQC@@@@:t~mo';O.j8keZ`A0+W6W^裏믿;Nq#6xķI5jԨa@)*xeܘI]sdMVO>^z):n +ڵ :oavH\VB>R=q,i@@@f V3;k    @Y(#o~g'p-[̷֭[m|ǧlѢ7up9sخj3fHFX>r-~ [|yaހ4oFvꫯ_|MnK,A>/|S'СCꠤGq;6itbSNM&&i׬Yj+駟z>Jm>}ݪaP={o&Be߻k#!l;fj10/m9j|TQ;HGx@@(LZ    5Q 1o ੉'ڸq)SUU5lv'|ZlY5,s=ÑZws뭷^Rʘ6*`m֬YnaK}5N),`/wRu{o={_~wāO|zM6 Z?O뻇lfаi%֣lda娣SO=նj+9s+O?Դ+)zҥK۹ 4h8m~\sx~p¤eG)ԱZ*K>/e ?G.Z(j#  @  pldV@@@@r\`OX;)HD묳{ke9CWW/hSW2O#<8*鵎:+iXЛ8tl8*ǏwϺ4IB:%1(k$:vÊ?& Ö:3#@@@b\MɊ     P9/|ɑgssD7E-Hu(, Օ Bfkf4 6̔IJ23L>b gͽk&MrYG6=.+6lc uTphb=M6;mix  /@Zoc@@@@P6*1(K]_*(kv݌87V&PR)V&E, dYvjAd'pvm6yȪ;u:vx͠B)8*5X.qA?MNhlnEu;lq>VP^<-?uR~}+O~ʨ >aÆ.S_ao5=)C5uIAnQחv+ \+ıRPbXg=+Ud*pM{P~II_pHA@@%Pf.k    .iQm3e:C;p~ u 7K1)o\Գ2;F+?;l2uPSˢvۅ֧`-uͨ&uCLde&Χ nUԖm/p5kiܸ 0S8p.U6=쳏1uљiКb8xNgXU[}\yaϞ68}*P  d*PK31= "nVGյ@(ZR|A;ꨣ]}[n[o^ڈEabQPb(/.4?SS0ZkoB)pJ=4\u.a̙3]f0u9k,[`ˈ)k2 &ekРAdSUndet{VWW ij={)0L]0*+G4+^:Sm`ժU֡CL,Ǫ׈BW:^u{N:^I/j}X,oo~k~XPmM 3 gmCA ]};if7|s2  ;c;찃sύ%v, +J8/\@@@@LHl馑+ Uz^\EuF(`$Ӣ`-uoG:Ew~EWPn Z`9?Ek^OP_Xtbz(W^Q6h8(2d՛c[V!+eA +>n9uit͵Rدgk7a   Pr?^XU@@@@@@@@@\ݐ@@@@@@@@@2 p-,&E@@@@@@@@] @@@@@@@@@ 2bR@@@@@@@@@\ݐ@@@@@@@@@2 p-,&E@@@@@@@@] @@@@@@@@@ 2bR@@@@@@@@@\ݐ@@@@@@@@@2 p-,&E@@@@@@@@] @@@@@@@@@ 2bR@@@@@@@@@\ݐ@@@@@@@@@2 p-,&E@@@@@@@@] @@@@@@@@@ 2bR@@@@@@@@@\ݐ@@@@@@@@@2 p-,&E@,X`˗//@ -[fK,p.&G@@@@@*Onk @e (SOc=\e(kT;`fͲvm%ͪ!   
@Ϸ_|Znm{΢5ˊ+l„ ѠAСuZk!Qg֍7?~vQGYӦM+|+c|j+sZͫ g@~BJZY Y,\E}YԒY8ʯoj*_53\4ivX۶mCW-^ɓV[-t02qW   @~W2e V>| Ѱ={ezMO?u7-nFvqY&Mb%駟lРA6dȐj :Qgƍ3,Z]O1b7߸` l/@ՂP4x`UbS)p?J?^@ |ĉvז'W%rj'W9e=sOخYof\t.N̙3ͧ rJhÆ nݺټy3&kXÆ Cԑq_ Jug@@@(2gU]7nZs%clQ5\wuZ[o88ׯ LSp\\%uնrg̙6}tݳn:M׈Y sl+T~'Z5Te*oN*oF $wm7S6mƞ|I[9 'W٦>bƷ]g1cAeՕ+Wf ed{glw6?j(;C\ |I%qW   @:u䮉jy\qՂԆsn ?_v~@IDAT: i_ܹsWKy6Բ~}"zW$#+ @~N:餪7؞ykԨQ~Q+'W@e8lB\]ۮ/R^R;upGƎ9N;4k|и+q>   %(0f/[6uTSݻN6_~SO=뮰2:3jE jQV U~5?!-@ZS  P[oz!5XhѢ(m)Յrrۖ*7RZ[&]Gu}GvW~o-RX|~s.cM6;}"ܬ @ 7:묪P`prULUA5ޡC;Mk׭*6]$<;vqGkTOCϳ'3    x7yٶnky *BM6 ]ʕ+5Љ|Fʧw'$ٳqFٌ3{1. ׯ4m꛸\pvM7٬Y5kfL]t{UWU!C\ּEz}HSž{7T[>SmrTܵ+oX#&Md7|sh^9V`v衇Ȩ(A]J'KJprV*{L՝wi=M>_LH{zkTiMn^oF{My>i͗Dk׶x2ŝxUSbHM+XXu\X>C;MYFU.] 7pk:Ȯ4   yXwuk׮y8Unv ֵnݺN:RA`=z֠AK˦njճ7ܮZ3gM8є?Ue=ڎ8kժ ZӲFIk]>"J;|peM/Kl+El0Rk!YK._~ep K.uPYk` PSN9Ν^V-4R*\Ȯ-\e\>b溅ӧOY]k2lЫ:Jw.zkg}\`TyF@@@86R)e-~noe?\0Z<ӟaÆyV[U ΓN:ze<-_[DճR\sEzZ(bb tcr݋ҞJQ[:JW(e8 Lbt_}ݳjF-[?/.Sej2eԳ?{(Zo>EtZ\ٙ3gڻ&zS@raÆU5裏m٦}(ۗ& T@Nayrvvy}8#\/{e>۶mbRWCuկ].u]~røH@@@b˸ \\gu\3Wn3n3kЦMQsժUJuRWie={M<)C:u g@tĻfFZ^{Дn G_:57|3*uՀ2&~饗l/fd{ ~ulPmQt_.YJ~$I)}9*]#_5έ\.4`3<."`ݡ.+սC8/"Z{G@@(wRm$WOxW%\bӦMs,;vu3Y~رp8qˆ8댺AE KO8RY|q-s}wJFCt>hS{@C9Ė,YӺ:N gоƃ>Uj LI#8 0}t{'/ڵz_J/JJu'NtYOG PJj 'WM|\]u|m387n\ռCŋޗ8/ Z{F@@@:wuj8T_Q3g2nǾZ2+ >{VH_ 뮱SvM7mҤI lQAe VP75v ][ʔcw],r/T+釽FU-+({k̙SU]%Z2@l+^򜆀wrƤ5~N*s`vUК.R".,9眲^8/ "ZֻG@@@ #%vM^Q~w5oTRωmaq֩^łJÆ -l||QmFG`ѢEWkpϞ=-j31\KFmd}mT?uN>r$w'>sֻwAeSNv7Vdz>kU#+ŲeLD2,XךC2>S{GVfM7EprU2"ֆ]c,Ze: sYg B]bL/j.j=(    Pxo͛`%($HosW\j6 bD K/Tk~Wm@ 32j76xcO"?3mmnݺEN5֨QѧO(t~e۟}ŁB2nZOk^ 'WqjN]le˖fԩS]5!w1et"_Cew}Ÿ>@@@ 7e=z$ݘXƌ\…nFrz%NZ<ޛ MJeQڵk'͓͛Yf3?tm]vֱcGvcf&me_M\?n}Nz{sfͬGva%MFZtꫮ}ݷd'N>7߸ѥKyپ}>+z΋jSմ qmWA@@jofSLqH1nܸiW~ٻH֢E K/k ?u}ɓ'Wͮ Qk2wH?;8L4n]|A4aWAo7z4 \R\[oi^@(021`lҤIvgv-L]r-ַo_oP+Wtr FNz s="iMk) z۠AVխ&J>ZךL}!UiwЗE+(@' NLkO?VZa$'X\qr/s1.U$4ɿ ]wȡSN9ܴiӠW^y=CII1Ϣ̷zk"Q@@@@x=MYM<ÂGy(LXQ='|˦x߆6vm7zʓO>KMxA A$WnNV)ͯNR}}B7BΘ1UZ}z0(@H~gXl{u,eK-qשO*xbg]SF8-;:>kګL}>(bk~~( 8X.}U> #jzI-^U>_u).{vmE~)C2=S6dȐ)o& _>**VU4hL?؝zvwT+L}n=8]M,]ķFvvE[jB]" Z3X'X\qrw 
s1O%a4ɿݟ~ty 7<]{dwWnzд1C+r;${A{ӦMs@ML{/]V/9wuΝK4@@ &N:)2SW9 {i'L5 ]PL_n{챇riϮg}@]-rUWe;{Χ.O;Opw:HXQp~E]&T7ʜZAEuQ:,ŦL_eZ2p՗燲&jQWPk_U2nQpA:NunP"~~z92uM]Ygf j_/]EZ +d9MY:tࢄif2SgS' _ݝJ$#]Fآ?z_\io6N0a&'ͥx;|=8Xws1VZN:B?.֑͚5sݠ}^y\Rbhɯ&\`-e&;^z8@@ (t}[&o]ugy~}SF#]_=L&M@KQL*qש-.Φkv)Aί([~ *t燶.r_b *^Zxxm1e˦|W.X)~?Nmȥ䲽<@RkPMiOZUEF6,i$ nh]vYEOӱc6NtWPT:lOe;; UT}oB=*;NQw}Z.6,ְ3|}{ۻ,⃂  @ ({z3gN51(ۘ0*;P:5it+h^HaΉ/>hT*q|QhW^Y7y̙k\/p-:u*MEjQfC9eK Zkݺ^RNXxO-[lE꠬ޯZ[FmVתU7A%}U%$6dŹ {Cծ]yt4۠5G}pU1Rӵc^aԽQd\x {X4Z~Xсݶm[֭[ZC @ ( /t_\:A> iu\Ӊ{"?. ҉~]/q`J-Z݉$ONrx'W?og\߻Le *uxrN3k֬j-`\q.G#9ŵ-L6*X2_"Ÿyn7tSTSNbR&Я3ݮ:/3M]bn^3j_~] JůX;Z;찤(Vٍn֤| R?L|[q4PXO.Rz1o(.b)KV[mU>+@K@@jn82dI*ۛdKL7yE7[g]q.G7QE7MiO<;7 ˉ])bvjFxݨJ]pT\HZ(H]s־ٙ]TPuTu ^ JT뮯uXׅn&,բ2I(cXoS?i+lu=Rx=:2)6{n>iW-h+!nV*WxF.馛_W7*XZrRwQE< nKL3/E^(ע4^]*V>DVOLjcԩc֨iӦ.gϞIXW_m#Gt?dGV[u/ f| -_u rsTxmРAA(.t[2GZ)`qrIs{Rm .= g] |%3+bEJnNtQZ" wi޼y"rM0Oo~/iRm \z}?>y6f! @eҍbz&R] aA꫻ 0eQcƌqRP~&\A+I[Ee+  ]Kulת65?hnG+9ꫯ&=l{H@otﻴk6wou+lյkC1׿:ujf`]Ë \禌v[w~/p-^ / /o}.顛ܹsZڗNE_NkDTx`(HL_h5.'Nti9tjf3ΰ;5. O(/~EQ Kn)*> C_*8=]G& B2衻't'NCt9e~I,-[sN8V䊋qw>^r\JgsYgy/uc7Y ]Mߙ8,t~Ak*hZ])_syBWƉu]7qpɽи/֫W*k4o须u X  @G6=tY~/Ƭ%5eIK ZKV<֥nxu(C^{+xR=>*p-jR{G~0\o QC%{U5LXsl>?թ7pC`bLόk_砃2a*UE8,}ܰzx5jLkʺ:57't-4E k@횲ʤ0ͺkC86O;轺 E{7|3qTF2R Ӵv(.] kYl)3"MAMuzͫ8e(er)q`ym+ -TOȭ;Bub;3<0> ;RNڟŸGĨ5>:YN];;ӎ9=hbT +6\uUA #ӝ?֥o!)m~tQ˲UVQiHuU-<}5,5o,@f?(ͯիߘ.uۊ}nuuд~::KH6/-2PU|"[3j(25@YLe _?iuϩq}hPRl!CH~u 4ii=\ݢ!/mOݝ3Æ su=#|rW_~ե|՝sBO+%5pMwR:\N=z'}W7/ތQ_颞: ;LN;. 
urW~N"MJT_+̠zSmi >6SO݁tr+A!fΜi>WJJ찢|S¦; a%SP (Vz]~d/mF@@(sSuGÆ MuwfQlkQ߹q.:+S#\珪?~)3i4\rn5ZEG֭[JPF\_ K8ε&쳠G lJX֧B8AeڴiAlĈ鷳DGCE(+UvXW['kyyV )X=~xouX8}S6]4O\6CZ97~T 2eMjK5eDR𠺜;vB%D( e[]oEܡR,ʘXҹC$qq`y.tg[G}ִ\%N¦*LNju3I:f,"jiٹI3V_ޅ|@#ڧί,GAP eP6ur!_  5F@.o߾f VA0A+55yx%=i{̚5+^;T~߿`]qhӦM`Uvӝf4"-"~WTRԲۺ^yR}}2/%j}_ȶ.0gPfolgSӠ {챇=Iw4ihʮv-.V|SF?#G3䤠;,iv\z.X%iDoj~])kf;v]bԕxa6"_> 0f f#d{ꩧCwf1o3gSo9# >@(%VGQF=tcWnݒDZBԡr)Ο˲J}3FYo|1 L]R^%kEօ*n\/z\+}`uuM J :ƌ㾰ǹ{q0Λn))h-q:E+~E?|͡)w?|+ͮelSIjQ׀ Sv4Džu:{eS~嬳J ZQnf#G%=++2u9i8o{w%IQ'8tE`A9%( r " px(x,9 "3܊ (r=_TWfewgwge^ʌ"2 t< "P@ >w/c"rm ,_G.FMh4YF<]veK/;~jPsԧw}w^[lز&,zu[+׆T-=9siӦe7\u(68nSmkUCY nֽk?/tvjks͆-{Cկ~5SThďƫceGl \կ~5n~Zh2?>3Cc :j뮻nЇ>vqǁ.>$Lc}qթٮe\\⪉ͣ9?=;aft̞=;v$s1U}Q?hUOzϦ~͗ @~0ߍWSɁ @L@[weKOz}T`@nZB`עϦpGhUIOʫRyʂF nEG8,Y>^{qٝ)ac:^Mr5Ӡ\\5ygf]6_i^pN>t}[SL:ީ{= P$S~yF.:  @'[lE5FYuT&^WW 9J$"*km;˻9眓﫯z*.}k_YۯgE]3??\> <ݽWxӛޔN=J_cw9߾b:FhO=3lY'ȟccq oZk%^EѶS]`ƌL{o7ъhV[mXӖܵVf^6\U= WE)+Zߔθ&^4믿{A \YX|Ep<{`M} R[2.w=|iܹ 0Xq#G>,#_bJЩ#85!wE)J.䒞z9f$;5jsXT\>VYu#7xWy{ 曧|3=+kƓ|j*8*'$ pm`.=yO:C+_#7;Fb~{m",lQ'~{1%a߬Y:O?=Z6taz'0bᘾ-[}pjA_{C=4ct5jIIJj"/y<͛rYgʡO:Lַ K׎9ז_~ح}t RkWսl@̻^{;#ňkFˏ @ @+ō`n:=ƺxW~^NcWYe";S~0_Ys5-2Uەgս&.;u^oWY:|_;1~>s-oM6$m6US1{XRo}}?sk$(;^7 q3 /PH8Eowϣ38gq[G[w 6ؠl_D~q+>ݯn[LZ8wd_<̾{~M7cGiӦujB -Ի,5"[#]8ke#2;__鏣s{dK^~ kMo~^I~U4*S_`]xᅩyE.z[+z/}Yu.;W'tR*vUwK ]vn\4茋=O8׿|Zظ,oمh=ca~_XdEG#p}mV _B:;j8i+EgC|'> $:#=~eg+|,kz\x{Ee&`*1uD qw?7J-M: @&#kRܴo^u:MϙW~}Ѫ~_&Uk$Pl]w}̿l]&Y+XZ%,F*{ߘ|.yYW6)y)W_묳Nvk}q ߞ:ꨴ 竆#SC<8ϦO^㯋Dzu-^)?K,1__|q~Ga{ ƈG;MhMT2ϲu^4R्J WXcE |5Z>MWZv%ʸ=/ Q<⋹D Dh˿뿶>h-$;du9byXfY/b*NqþG~__{yePe*x_\#<]Wmٰ߸ ]\ T팋ѭSnK/h- /;kSl]]^3-첽v-`u]6bÇ~C9{cl1{|'Q>weow(kl}v2t]^֘6 FRsmo{[h>m;o|EZ3D- @zAxqǵ6h-;n+K7)w5Z<_C dߝ="а_bΠ)E/wz;w+ݢr#aX:]uU qJ)c֢tWdA;E{-~*WĹvg1xR 070K=X"3n*WXa7Ƶ{y~Uʨ\~_V'?I,Ce矟OЇs--ro[qzEf<_<ԯ-|^,s*~^|j6jG;/-o b]PøJkz|tR;0 ė/ZA@%E]tJ X-"x˫^Ve|N\#8nt Ѧ6_>,yheßT 2./SNSH&JʵD~ @ DJ[ӡ\}@b1;).?s쬏vn9u??وV} 7du)~?8>~m ʢ$nJ|;ޑ7E I v(,%Ykr.~qq"3\wu[VdvWge//?rb# R]JܸO:ev oxC~[#}*qb^% h1J[w :S7l??zwuGj <8Wc ̼bq^zY?\eEtJ1ӗ1waE F>/R֟>U>/ 
N, pmOގ7pTe!X"$R{w7]'EG^Y[EP^oVA @[ .bs>]k*ѩ14X#j-7)73摟E]g1uѡiRu?w[}2cYGw[!cx;Fz.Ǿ3<:(SfZwyٝ}|^ @ @@=s9}\Qtt'1}H\?MiQZlӹ} \|x` *3D-.˂vguVl"%\9vwqk1b\,[5E`9眓e<(?_,_ $y0eLc6evV0nsq8Gӊ$nnJ+OeGo61 /XjXvicً3/M \+wqYqYXMw4#mo)s Wq}=GU*_t#RzvF婲M<믿З^c~#vegEL[S~ec90_|aaM˟D:" w6{s%pm<.FeUg⪿eθ.KZ25FxZNgr~Y7p _bVyc <mio3st| fo1o޹#&SD#F cFZHL}# \޾%]E.q5ܙ:GQ\>}{ⷮ?}, 7t1EQz;ߙhؽ>Fňibߢ6z8+ :M!3\ k1x^,tk-0k֬b{L ,WUθɾh<󳳜I1."@(EWLm7}W,UV:d;tfbytbLљbJHWd>w|$etRDwvs @ /"ⷥMgYfc?< TecGCl}iB^xlJ>~3:ĬM{dgJ/@|)bʘ)6b:Y3{^6JNQo?88o]:'?G:=XQN_ܴٙƣK^u-?b8O:n鳟l6i#}=m]hw=6Ĝ;wn#d| + _uG#m-~Ϗ鈣{Ol3Tʹp8xh^GXSD?2n6c)cd o$)w:߫bx\wuGRLbbD.^7_j\rM74=_K^)2XV6f^WYY͞=;>1>ov|Ac^)pRlA!,_󧗥 9S˲$‹yR\ď^{euq6l3_Hl|ߘnv;a[lf_P\$+:+lWթp]wUKo#.Byg\t9/t?)f>θxo]M&s?wߝuBj|i,7bs!$@<"fK./way^!1?1^qNĹݱܹ^° ]}3E't> @4S A⷗c=U+@Y{?Qk#u \{mo{[V>FO\q]-(/~`Lo:_l҈T-KU'8Ƙ{ȏ׺:X,uЎ~6a=mڴa ;bў1UdJT=:ˈj︊meZawzn̢v-C8?}TUIEq>QQq~_QEbcM݁"xX?GG޺ˋ2ҚkO"@ @ @&F.,NS!bT:R5FBgõ卑b9;|ŏ77u0뮻ηl/bd#ňeo|ccb:(01"FE;D0DnpyZ GZ6]}v}ow!zz^ 01P|6r#W'Lrѿz  @ @ @mF\Gj(َ3W_7:)_~#6-"1W(]Ve]5va馛nJ1d#a1zO4hESTO1]3z7߲Jx{44na: G=[/ .D{MSƴ"{naY!_|ьӳ1 y睗Eb=ܓIqޝbFm6˦:) c9'R1+1ϡ?OT[{#@Rzꩡ]p)F @ @ @m2-kѷ;wno%SZRKY@c=6l# \??}Hs̰bALA`W6|tgfAbeu1ZW\MEzK#,b^LiV}[ʂH˩o}kg}fq;F]]p= k=i,$0|Yg j9gΜ.wy; @ T~Ad_z$@ @ @RTЬ@6YiUWF6Ǻ3<ӽuLA/})]uU)FUiZ.Ak>VYet7vۭH-*52"8+,JvGk&mEz.A{?яK/3_ cķ1-FN~1rS8uV_}ŲV[-Ԭ1r-Xb#EΩ 먟2.'pwuW:Ӈ^{Bю}ѡ={vC=!@ @ @ ơb4}"xE@#EQdJ1]hLyuץ_W[o{Ul.y+2{)Fu{dz VX!mFY[E%-ԟ19眓.#p#i}/j5LrK,91cJ3۞{]zM]0tee#ŔaӦF}"P,9b:owY:Fhnf͚MGuᇧvaS#H |zU4Fn^{B @ @hqjŪ(E , ̘1(ۘJ)Lh]1D5l7+E XR|d,ؖ}/|;Yl- @@;f:18F>ꨣqp @ @hR P# S- H F ^8ӡڢ#:j sr[FIZ,R) @ @SY',9͞=;IL;Ak,Lj  @ @}z'?} `3g^{jA V26W9y x=cG}tp ^{B @ @$0wt-?{^(<jh&lf̘Yd4k֬ ncܶQ3 |ς>gy&m'?IZve 5PUECK;cAw\G?:ic @ @L@^pqW{b/= 0@Tn햦Mvi]v%͛7o4]\5qT- r?'¾!/O'FN @ @XtEG]WF[ Lo5 @~y-oyKmdxWonխ+gL AkgN뭷xF @ @`]wbοOϧ{.OK.d6] 08 @b-RIpqܶQrτ9sLn @ @ >}zz;ѰZ! 
pm"2.LS:?T @ @ J @ @ @ @4V@ZcF @ @ @ @ NklWGE @ @ @ @ \klӨ @ @ @ @) p @ @ @ @ Xkm#@ @ @ @ @@;] @ @ @ @+ pMb @ @ @ @hv"@ @ @ @ @@c5iT @ @ @ @vuT @ @ @ @h6 @ @ @ @ @ٮ @ @ @ @ئQ1 @ @ @ @S@Z;Q @ @ @ @ @4*F @ @ @ @v \kg:* @ @ @ @4V@ZcF @ @ @ @ NklWGE @ @ @ @ \klӨ @ @ @ @) p @ @ @ @ Xkm#@ @ @ @ @@;] @ @ @ @+ pMb @ @ @ @hv"@ @ @ @ @@c5iT @ @ @ @vuT @ @ @ @h6 @ @ @ @ @ٮ @ @ @ @ئQ1 @ @ @ @S@Z;Q @ @ @ @ @4*F @ @ @ @v \kg:* @ @ @ @4V@ZcF @ @ @ @ NklWGE @ @ @ @ 5S1 @ @F @ @&A@$% @ @` @ @0Uh]!@ @ @ @ @J*1D @ @ @ @u \KR9 @ @ @ @ PI@Z%& @ @ @ @ @.Ur @ @[`ƌi{/?a{K9avD @ @>@V @ @^O})fΜ9a# @ @! p ~8y * @SYGʇا5\M&pm%@)^n"@* T. pZB_? @ꪫB -vy١ @NSOM' @ KjL @ 0~{ҼyPԟ @ @Qqmh6!@i֬Y9 GBJ`̙:CS`ܹ˻;iV\9 @B[Ln @D Z%/9h @ @@l .zǧ=s ꭒ @ @#`zB @ PQgo< @ @ k-hD@ @$y楧~zʗ_~yzꩧ^{B @ @@! @ @Q|yg_<2/ @ @hv#@ @4JOwް:} @ @+ pm @ @;wnzdž뢋.J- @ @v \kg:* @ @@#<̞z\s @ @O@Z @ @)3&4g?H @ @@ @ @)sNzꩧ s|p @ @G@Z{ґ @ @-p'OrJi+  @ @hK^sjǡ8  @ @ yiUVI"P/ @ @j0ZM!@ @(8餓ֿW\Q\5 @ @B@Z+A @ @+O\c=r^  @ @LknjM @38#=c{{キr~  @ @<kfjL @(c9fD}җm#3 @ @%jK @ 0(sM[lň;}t=iӦx[ @ @ |#5Ԑ @ 0sUݟxtǏj[ @ @ |#5Ԑ @ 0\sMZG]%\2}iu6$@ @ @F\kf @ @`;1?  @ @ L#5]Ԋ @ 0W\qEdM| K,D;Ӣ.:@ @ @@s֜P @ @@+^||cyGGQKY !@ @ @9F\kN[  @ @W_}uG?Zz,1 hE9sfz+_Y?͛7ϨkBV @ @<kfjL @xKsɎ+Ln @ @ @B[I @ @ @ @5\Q @ @ @ @ P]@Zu+9  @ @ @ @ @k5 * @ @ @ @ \n%' @ @ @ @ pDE @ @ @ @ @@ukխ$@ @ @ @ @Հ @ @ @ @. p @ @ @ @ PA @ @ @ @U @ @ @ @jV" @ @ @ @ @Vr @ @ @ @ @@ j@T @ @ @ @TVJN @ @ @ @A@Z @ @ @ @ @ת[I @ @ @ @5\Q @ @ @ @ P]@Zu+9  @ @ @ @ @k5 * @ @ @ @ \n%' @ @ @ @ pDE @ @ @ @ @@ukխ$@ @ @ @ @Հ @ @ @ @. p @ @ @ @ PA @ @ @ @U @ @ @ @jV" @ @ @ @ @Vr @ @ @ @ @@ j@T @ @ @ @TVJN @ @ @ @A@Z @ @ @ @ @ת[I @ @ @ @5\Q @ @ @ @ P]@Zu+9  @ @ @ @ @k5 * @ @ @ @ \n%' @ @ @ @ pDE @ @ @ @ @@ukխ$@ @ @ @ @Հ @ @ @ @. p @ @ @ @ PA @ @ @ @U @ @ @ @jV" @ @ @ @ @Vr @ @ @ @ @@ j@T @ @ @ @TVJN @ @ @ @A@Z @ @ @ @ @ת[I @ @ @ @5\Q @ @ @ @ P]@Zu+9  @ @ @ @ @k5 * @ @ @ @ \n%' @ @ @ @ pDE @ @ @ @ @@ukխ$@ @ @ @ @Հ @ @ @ @. 
p @ @ @ @ PA @ @ @ @U @ @ @ @jV" @ @ @ @ @Vr @ @ @ @ @@ j@T @ @ @ @TVJN @ @ @ @A@Z @ @ @ @ @ת[I @ @ @ @5\Q @ @ @ @ P]@Zu+9  @ @ @ @ @k5 * @ @ @ @ \n%' @ @ @ @ pDE @ @ @ @ @@ukխ$@ @ @ @ @Հ @ @ @ @. p @ @ @ @ PA @ @ @ @U @ @ @ @jV" @ @ @ @ @Vr @ @ @ @ @@ j@T @ @ @ @TVJN @ @ @ @A@Z @ @ @ @ @ת[I @ @ @ @5\Q @ @ @ @ P]@Zu+9  @ @ @ @ @k5 * @ @ @ @ \n%' @ @ @ @ pDE @ @ @ @ @@ukխ$@ @ @ @ @Հ @ @ @ @. p @ @ @ @ PA @ @ @ @U @ @ @ @jV" @ @ @ @ @Vr @ @ @ @ @@ j@T @ @ @ @TVJN @ @ @ @A@Z @ @ @ @ @ת[I @ @ @ @5\Q @ @ @ @ P]@Zu+9  @ @ @ @ @k5 * @ @ @ @ \n%' @ @ @ @ pDE @ @ @ @ @@ukխ$@ @ @ @ @Հ @ @ @ @. p @ @ @ @ PA @ @ @ @U @ @ @ @jV" @ @ @ @ @Vr @ @ @ @ @@ j@T @ @ @ @TVJN @ @ @ @A@Z @ @ @ @ @ת[I @ @ @ @5\Q @ @ @ @ P]@Zu+9  @ @ @ @ @k5 * @ @ @ @ \n%' @ @ @ @ pDE @ @ @ @ @@ukխ$@ @ @ @ @Հ @ @ @ @. p @ @ @ @ PA @ @ @ @U @ @ @ @jV" @ @ @ @ @Vr @ @ @ @ @@ j@T @ @ @ @TVJN @ @ @ @A@Z @ @ @ @ @ת[I @ @ @ @5\Q @ @ @ @ P]@Zu+9  @ @ @ @ @k5 * @ @ @ @ \n%' @ @ @ @ pDE @ @ @ @ @@uU*' k6}ká8b~z<Ŏ @ @ @v \kg:* P(Ϧ @M0 @ @hv @6mZ>}<=S+ @ @ @&P@bh^{fϞݴj?:ꨡמ @ @ @xi;Q @ @ @ @ @\RO @ @ @ @D@ZKa @ @ @ @ @`P JK' @ @ @ @Z" p% 0 @ @ @ @ 0(ԓ @ @ @ @-֒t @ @ @ @kRI @ @ @ @\kIC:  @ @ @ @ Ai)$@ @ @ @ @@K! @ @ @ @E@ڠz @ @ @ @ @%ZҐ @ @ @ @" pmPZJ=  @ @ @ @ k-iHA @ @ @ @A6(- @ @ @ @h4 @ @ @D[@IDAT @ @\RO @ @ @ @D@ZKa @ @ @ @ @`P JK' @ @ @ @Z" p% 0 @ @ @ @ 0(ԓ @ @ @ @-֒t @ @ @ @kRI @ @ @ @\kIC:  @ @ @ @ Ai)$@ @ @ @ @@K! @ @ @ @E@ڠz @ @ @ @ @%ZҐ @ @ @ @" pmPZJ=  @ @ @ @ k-iHA @ @ @ @A6(- @ @ @ @h4 @ @ @ @ @\RO @ @ @ @D@ZKa @ @ @ @ @`P JK' @ @ @ @Z" p% 0 @ @ @ @ 0(ԓ @ @ @ @-֒t @ @ @ @kRI @ @ @ @\kIC:  @ @ @ @ Ai)$@ @ @ @ @@K! 
@ @ @ @E@ڠz @ @{wXywwŶB -E"ťHq]lqwwYh۟'L&'d2!ɑs' W>< @ @ 0@2 @7*}/ @ @ @@ךh  @.b)"va @ @ @#3w @xotI'Ygm~J>`zҦn&`97'R] -"M4D[s'x"yiNSM5U Ge駟/=O:j}~mzG1#4LiyM;n.u$D @ @\[^z)-7LlI3Oΐ!CRL34ij٩}sM/rqOj4>H#/ҽޛ~uOco_ _~yZh{w>(Ų~u_~8t)eyim鰬ރfWx @ @ @" - @*3ϤVXku(kذaO/Bvպ~>oaxw[/=,,4|uX/W_oǢ.Z' /W^y%)~_?T"*]yieI17ޘ ,Ve;ȂiNmh|8's @ @ @5'@L /+bsf\rIKg93-=hEō7޸Sh-v{뭷=U#*ǯE7 0t^2A¨&fegq 7w1z Kӟm|CU @ @ @&0b!@?^{-*c U~~)~YԢWų>2Y ZVkTv٦n{Ivi裏.=wۦvۭ)f7P] @ @ k<8  0 |iWO^OO}rwN[mUZzKw{0 V駟6fy4묳V.f)Ni-L+raثMܡ۞{'g[K+R:3r͞/?1 6: @ @ @DϾ\'H'tRJ=m~ N.Ձ jX>]wR|Yn(igL= @ @ ٗ@ |馛nʪL;iyJkw: ~x[JW>DSO=/זּj}yұYdjw"B U[5\/ꫫ^6UYpUW~vwqӮZgIGuT@ꫯZ*K.'Z$_{iYfI2Jc91^㏧hZm4s zi7NO>y\p n  @ @ @=[DŽ&zSOmGn2ĨK$Lg|n߾Y8fnj/ 7|D~a}̾1_LJ/ xK͸fA^HvXS=#OaY_=gq;sm&uYvmae@vTZb%:nQ3^|%юc*~ż_*SM5U3'sE 70 UK.k)Sm4{ۮfn @ @ @% ,InKݢу>U _|w]wuiF^]ڧ6:thZk\,m 3ϜU.o) 4xn[l6<$O9_2M~{VN^KO>d*z#4Rt_ ^Eյ]v%xى}Yʨ>GWϿƔSNYmqe͞~y<M{tp;?6`BR̋fm/~4de]6;}; é@q;\}F}2~mj͋Q>K[Wjkqnkf9FWewN3<3}駽rVNzw.z]͞/BFTĬu @ @ @G}?fδ@`O'xbzOFtM7̃`kF3%a*T.M"C9$;sgw߽MϸӊVEcꩧ.ZUkZSa @ @ n٥O?V>3i袋K,?ʝ^nmfk& f-p kꪫRCi /_ .*#YeYfIъ6*=+ڇ/R: Z'*AQ5ZsNÑ-';ӴK/tWMN@|6̃kq"$vu׬xO젃ʪFK=3M4DFffXc}*Zo9 @ @ @k]}s1GqS]v%-Yz[Zku=Zk^uUm2WiKh!uY'Em&]r%嫳-\;TSMa1i7߼2 tG`СZK_ݙ>}(0餓g>*vZz:LŚ]^5'f7UB+ @ @ @n un Dڷ~ yW M9iFhV,*}7 vQm%mVOO2J$Q=/N<:lkV[-=5谓t[/Lm'xQtצ[o> 5䓧hշ"#XDEYg5*u\[oMC IqMWv/mgNqޫzVɲ;?x '0-Ri 7,ߤ?0;[n%s5]ceţs<;o ufvUW~j琯+<{^JzSSL*{4dU֣юnHÆ Kj*d{4#;G8g-mooJ;B=9dJsE^Ѣ,'@ @ @ X1yK7馛nJ#fe P?0sx]qYP-vg7LE#~衇:O?>{@@nNU -P7 <8飏>9o~ߥ}٧v0mi+_Q'tR.__=kyuץ[FFk?CbJF;<sOw^ӻ{bExcYƞx≴;Wm,3|)Zve|M>fi*wz.Q~b{O?p?Beo}О` :GGqD Rj#>E49"tx}Ź @ @ @_] ŗ;C)ԝS`d_JFjn/[tNEvYE{Z/~!V : @ @ @P@pƊ0ڴN[wm66V/KѲ1,T4ȫ=#iVJ,HVѪ'ᕾ|͝V߄&z 2ؠA AF|L1Y9Zsv2jf暫Ӳ,JC7_V^X.֦hĹN޸hQNG}ty+WwxA.òypmUGydn*78BF\d\r%)(y m"l~Uk<ǕW^9{qTjZ`_:=[ ☯:wҌj\&U>~m{ذa3d۝Z m]y>|#@ @ @Vm6_Q eOy B{ʭA\B_G[ɨ5H#ha5d^:E`wI^/|WӨrʬ:Ff|^kDfu֬cif.dINxm [[u~ B2-ZklFiGoũ1xNs>vڰbAT V^'oaV- dDC9$(}gi=HQ x衇m>[eU wV?_vztJ=y˃kw<~bļ2@C~/~fE)A8E;z##V ,  u;DZ{8 E+Z#B>)ڥV|A%XrUxS*hRqmf˞h+ys!SOA<38cSyEjSqOGw @ @׆-6^xaZguR%5"$kd_hG5VzhF 
/p*f7 ?v\+?p,3oQG?p;Ӛ6SpDyV @ @ @#kum(m{キE5h? *JY js{n]1>裬J>SmvmUHW(&8vekfi҂ .&l-X|wUn_|q֖믿0W_C9ڹƎq6^O>$k{ZoX Пбc\4hPvn8\1S|Oъy駯 cŃFZת֕ZšVoJRΨ5oEp6*K!CtZE:m :ΫhymYm6[_-s$Y,*3F2\kd|)~^ذѹlG @ @׆g1 /p*+:䋚v͢ZO6V^y+Wc{'\m6(ŗVh_aAr!o[ݏpZM:)*:묥_eLkѶ-EmqՄib-KV<0Osjű}B$a='%XaFvP9f0kZƼnzfZ*^s^4owwfwQoh-F`Y#]9Eo+Sqw<ՖEHG*->éV(\ٌߣj[y\j]wGc` |^ px. @ @\@p-pgr"UL|ɿ馛o=k̓>s/<;ʷJi~e|m7vDŶ#8|~T曳Iʉ'8 -)q=_Fjc]wZ˷+n|QۨO9眳z_ry䑝i"W#* tM}uxN7s'^^QQ+Z!hj;UEե}qvښGrD?\\sNZ;Qtn9ZF/F Dvq^kkswN @ @ @\@v\pAVIˬP#*-zU|:ꨢYjg.\m6|"*URbZjh-#D|jx`$jm7K,BsO{/aÆ/Xn媮khSU窍)Ҳږ=䓥m'|~u:||yyZ]Eky?{\VTJS&ݾ4Jpm馫, +VQh-. ڊ{,U Ze8^G~M |F6"L;u]7M8&]w]apJY>g|(rח&D6:~ߤ. 4)Rx7z_ 6 @ @ @/ wQ_Nk:thViZX o;S57yeT?U!塇0_\Oa|p뭷N_/ & EEjVUjƲ]wݵS!BkAR2~چnYgT-}QeE "*,6@,̒UR+¢6}\u5E6۬sA>RrEfS|>, E F}ZWaFt34Si @ @ @M)@"HtWYO?tG7mFעXtXꫯvx\GnYgM7_Zb%jNQ .*]D:LG?t9t8RK-ng "PTstǦF-آCkq^R9*+U|- FۡcY^_7,h6괮Zp-ZH|_^GI'*?'vz~mMjx az{@ @ @ Ж*&ttUWeA/.^{핢zHTWs(Ȩ]qj7W#6g /4:U[mwvۭ9Z颋.Jc9fc1FdMַ]i.V{ץӋpZh=t饗viN-Rd"h-2GâUhXeU𪜣rj-7ݩj^pJtPݪ=\1]TA @ @ пT\߀>/jǴ馛VEfU 6O;믿qV!*߷*tu ,]۵^jT^jΪ ?S@P4L<Ȭgch-^|֟՞]vY0wePk[GyG{ٻ\w\+m+FeeXZc{YB @ @(Pqƚ6Xwu!8g_ŗf+믿yTJku_sT+Ekz 6ؠ0P0A\{FԹ+6l)*ňJ7 *s<3pc_chDXX>»-X{:L6\[vekmķv[2dHÉT+L2$YEFBko[otT_G-$vu4{馛)lY /0k(Qm;^@V @ @ @Åp4/tʚ_*_?>_Y/63f*vi/zz>?x'+`+{ST78∬%1SoS 'zT-k탉]_T\ʈ~{oz4deh#O;)IoQurUk[_k|]#swޙ6ºv7{OnCl[㪫ʂOs1Goo*)<gUƊ?UkͼsP.Z67?%E'c]v{T+jWZsDzQQG)-K,Dqy,~{Jy E2`;蠃6,'@ @ @ \ m&xJng S4~6.-ps/kguV`jhW]ѺR+{{yTVZqK_sVjU)g6[- DPgxR{%N0AT׋/>1 7ir!Y寘o}_YU7ﳐ\6G#HQ3cmv;]wU:WZi?ҋ`^>OXNIFmy/WvT06v8ktv}g;͸` R^4>'뮅ᴷz+}jwQ.#G]w]ۖߏ)>?|Uby^CB yxQ^kӵtU_~)+Y Dexeq;8dsϝbmŕExy\1 @ @ @Z]"D"ߓ ,}ҸY#ZI^ve]s5SO-\߭zZTVvO򅧟~z~[J8 -Q'ۈpI ϭ:颋.J~uD$ku9dGhg c5V~KvβS^uUY?vpGg;? 
IDAT-IWJ @ @Kv:Wgn8h81L̗*3}42XbrDѶQ#ڔ⋝69蠃 CkO?}:;/qӖ[n?mT@5ngQo|* EsA^ EU5sDhZ]w Z'm*s"\dk8&駟;tZ9%A @ @ 0p_/sM| EhY9\^xi&kKU]W^E-"_UݺkBTS/Z$E/*+!Dhai @ @ @ \Dpc{|I馛&xbs 7d?\ySm҉'iX4#V-iWL_|q˗Foڐ[EuX+2vV-$a8mᆩ^G3m-;t7z+=v @ @ @``h:0Ƕ՘y晳fQr|Wj>'pB;j57WUV=ol(O1L3eڪnE8FTyۤFFGEgш_tUVYhˣ5!/M6dU1OŵD6 +7_b8ޫٶ9-W *~ .t-W#p6`.d'wyӳ>- hg7ߤ}sRMh>W~_}6lX[pю5?ZD׸j(zv|fk짟~j#@]!C83<3mY>>5'@y椓N*5ڥ @ @ @ \ŕL7t!QF% 6V f!O3GT⊟$֪ע]g4:*Ex'#m=ƾ(me_|,=c5+DNhR [lG/I @ @ @*- @^zꩳZo6L^voQٳhD2uQ97FKP @ @ @C@pm`<{+ :4ᵨv.O?-Z wεֺ>:uس:+ <8hG'|@]!@ @ @( # @^xt- & ;.ơҰa 7@vwv~C Im]iZjcw @ @ @\ _t뭷_?uQ2ytMgMoV+?Pxj[zӲ.4c7<:ꨅj@Vyq5lFat'߾7eN @ @ @6\k' @)3<^@Ѧ.>+O>]wRU|Y_ fs]vI_}UeQgbfMm @ @ @6\k' @M`q)>mQm-.`\G61ZFhmio3r  @ @ @\k @@T*r-Z,R>}nO Kanika Yadav (external) Microsoft Office User 2020-09-25T06:54:04Z 2021-12-22T19:07:50Z 16.00 true 2021-02-23T09:13:03Z Standard 90c2fedb-0da6-4717-8531-d16a1b9930f4 45597f60-6e37-4be7-acfb-4c9e23b261ea 0 true 2022-01-14T16:33:39Z Privileged AMD Official Use Only-AIP 2.0 3dd8961f-e488-4e60-8e11-a82d994e183d 3ab6c0f7-c658-4f6f-bd9d-6ef921551ff7 1 14235 32767 32767 32767 False False Filename Title Categories Version Doc Type MAP rocm;hip-sdk;hip;gpu;amd;rocfft;fft 4-5 apply-ALL default rocFFT API Guide reference