pax_global_header00006660000000000000000000000064144647362470014533gustar00rootroot0000000000000052 comment=7520fc656124d3094ecd99e260209d9b143a5248 rocFFT-rocm-5.7.1/000077500000000000000000000000001446473624700136265ustar00rootroot00000000000000rocFFT-rocm-5.7.1/.clang-format000066400000000000000000000065421446473624700162100ustar00rootroot00000000000000# Style file for MLSE Libraries based on the modified rocBLAS style # Common settings BasedOnStyle: WebKit TabWidth: 4 IndentWidth: 4 UseTab: Never ColumnLimit: 100 # Other languages JavaScript, Proto --- Language: Cpp # http://releases.llvm.org/6.0.1/tools/clang/docs/ClangFormatStyleOptions.html#disabling-formatting-on-a-piece-of-code # int formatted_code; # // clang-format off # void unformatted_code ; # // clang-format on # void formatted_code_again; DisableFormat: false Standard: Cpp11 AccessModifierOffset: -4 AlignAfterOpenBracket: Align AlignConsecutiveAssignments: true AlignConsecutiveDeclarations: true AlignEscapedNewlines: Left AlignOperands: true AlignTrailingComments: false AllowAllArgumentsOnNextLine: true AllowAllConstructorInitializersOnNextLine: true AllowAllParametersOfDeclarationOnNextLine: true AllowShortBlocksOnASingleLine: false AllowShortCaseLabelsOnASingleLine: false AllowShortFunctionsOnASingleLine: Empty AllowShortIfStatementsOnASingleLine: false AllowShortLoopsOnASingleLine: false AlwaysBreakAfterDefinitionReturnType: false AlwaysBreakAfterReturnType: None AlwaysBreakBeforeMultilineStrings: false AlwaysBreakTemplateDeclarations: true BinPackArguments: false BinPackParameters: false # Configure each individual brace in BraceWrapping BreakBeforeBraces: Custom # Control of individual brace wrapping cases BraceWrapping: { AfterCaseLabel: 'true' AfterClass: 'true' AfterControlStatement: 'true' AfterEnum : 'true' AfterFunction : 'true' AfterNamespace : 'true' AfterStruct : 'true' AfterUnion : 'true' BeforeCatch : 'true' BeforeElse : 'true' IndentBraces : 'false' # AfterExternBlock : 'true' } 
#BreakAfterJavaFieldAnnotations: true #BreakBeforeInheritanceComma: false #BreakBeforeBinaryOperators: None #BreakBeforeTernaryOperators: true #BreakConstructorInitializersBeforeComma: true #BreakStringLiterals: true CommentPragmas: '^ IWYU pragma:' #CompactNamespaces: false ConstructorInitializerAllOnOneLineOrOnePerLine: false ConstructorInitializerIndentWidth: 4 ContinuationIndentWidth: 4 Cpp11BracedListStyle: true SpaceBeforeCpp11BracedList: false DerivePointerAlignment: false ExperimentalAutoDetectBinPacking: false ForEachMacros: [ foreach, Q_FOREACH, BOOST_FOREACH ] IndentCaseLabels: false IndentPPDirectives: None #FixNamespaceComments: true IndentWrappedFunctionNames: true KeepEmptyLinesAtTheStartOfBlocks: true MacroBlockBegin: '' MacroBlockEnd: '' #JavaScriptQuotes: Double MaxEmptyLinesToKeep: 1 NamespaceIndentation: All ObjCBlockIndentWidth: 4 #ObjCSpaceAfterProperty: true #ObjCSpaceBeforeProtocolList: true PenaltyBreakBeforeFirstCallParameter: 19 PenaltyBreakComment: 300 PenaltyBreakFirstLessLess: 120 PenaltyBreakString: 1000 PenaltyExcessCharacter: 1000000 PenaltyReturnTypeOnItsOwnLine: 60 PointerAlignment: Left SpaceAfterCStyleCast: false SpaceBeforeAssignmentOperators: true SpaceBeforeParens: Never SpaceInEmptyBlock: false SpaceInEmptyParentheses: false SpacesBeforeTrailingComments: 1 SpacesInAngles: false SpacesInContainerLiterals: true SpacesInCStyleCastParentheses: false SpacesInParentheses: false SpacesInSquareBrackets: false #SpaceAfterTemplateKeyword: true #SpaceBeforeInheritanceColon: true #SortUsingDeclarations: true SortIncludes: true # Comments are for developers, they should arrange them ReflowComments: false #IncludeBlocks: Preserve --- rocFFT-rocm-5.7.1/.githooks/000077500000000000000000000000001446473624700155335ustar00rootroot00000000000000rocFFT-rocm-5.7.1/.githooks/install000077500000000000000000000002221446473624700171230ustar00rootroot00000000000000#!/usr/bin/env bash cd $(git rev-parse --git-dir) cd hooks echo "Installing hooks..." 
ln -s ../../.githooks/pre-commit pre-commit echo "Done!" rocFFT-rocm-5.7.1/.githooks/pre-commit000077500000000000000000000017671446473624700175500ustar00rootroot00000000000000#!/bin/sh # # This pre-commit hook checks if any versions of clang-format # are installed, and if so, uses the installed version to format # the staged changes. base=/opt/rocm/llvm/bin/clang-format format="" # Redirect output to stderr. exec 1>&2 # check if clang-format is installed type "$base" >/dev/null 2>&1 && format="$base" # no versions of clang-format are installed if [ -z "$format" ] then echo "$base is not installed. Pre-commit hook will not be executed." exit 0 fi # Do everything from top - level cd $(git rev-parse --show-toplevel) if git rev-parse --verify HEAD >/dev/null 2>&1 then against=HEAD else # Initial commit: diff against an empty tree object against=4b825dc642cb6eb9a060e54bf8d69288fbee4904 fi # do the formatting for file in $(git diff-index --cached --name-only $against | grep -E '\.h$|\.hpp$|\.cpp$|\.cl$|\.h\.in$|\.hpp\.in$|\.cpp\.in$') do if [ -e "$file" ] then echo "$format $file" "$format" -i -style=file "$file" fi done rocFFT-rocm-5.7.1/.github/000077500000000000000000000000001446473624700151665ustar00rootroot00000000000000rocFFT-rocm-5.7.1/.github/CONTRIBUTING.md000066400000000000000000000031411446473624700174160ustar00rootroot00000000000000 ## Contribution License Agreement 1. The code I am contributing is mine, and I have the right to license it. 2. By submitting a pull request for this project I am granting you a license to distribute said code under the MIT License for the project. ## How to contribute Our code contriubtion guidelines closely follows the model of [GitHub pull-requests](https://help.github.com/articles/using-pull-requests/). 
This repository follows the [git flow](http://nvie.com/posts/a-successful-git-branching-model/) workflow, which dictates a /master branch where releases are cut, and a /develop branch which serves as an integration branch for new code. * A [git extension](https://github.com/nvie/gitflow) has been developed to ease the use of the 'git flow' methodology, but requires manual installation by the user. Refer to the projects wiki ## Pull-request guidelines * target the **develop** branch for integration * ensure code builds successfully. * do not break existing test cases * new functionality will only be merged with new unit tests * new unit tests should integrate within the existing [googletest framework](https://github.com/google/googletest/blob/master/googletest/docs/Primer.md) * tests must have good code coverage * code must also have benchmark tests, and performance must approach the compute bound limit or memory bound limit. ## Interface * All public APIs are C89 compatible; all other library code should use c++17 * Our minimum supported compiler is clang 3.6 * Avoid CamelCase * This rule applies specifically to publicly visible APIs, but is also encouraged (not mandated) for internal code rocFFT-rocm-5.7.1/.github/ISSUE_TEMPLATE.md000066400000000000000000000004611446473624700176740ustar00rootroot00000000000000### What is the expected behavior - ### What actually happens - ### How to reproduce - ### Environment | Hardware | description | |-----|-----| | GPU | device string | | CPU | device string | | Software | version | |-----|-----| | ROCK | v0.0 | | ROCR | v0.0 | | HCC | v0.0 | | Library | v0.0 | rocFFT-rocm-5.7.1/.github/PULL_REQUEST_TEMPLATE.md000066400000000000000000000000701446473624700207640ustar00rootroot00000000000000resolves #___ Summary of proposed changes: - - - rocFFT-rocm-5.7.1/.github/dependabot.yml000066400000000000000000000010421446473624700200130ustar00rootroot00000000000000# To get started with Dependabot version updates, you'll need to specify 
which # package ecosystems to update and where the package manifests are located. # Please see the documentation for all configuration options: # https://docs.github.com/github/administering-a-repository/configuration-options-for-dependency-updates version: 2 updates: - package-ecosystem: "pip" # See documentation for possible values directory: "/docs/.sphinx" # Location of package manifests open-pull-requests-limit: 10 schedule: interval: "daily" rocFFT-rocm-5.7.1/.gitignore000066400000000000000000000007101446473624700156140ustar00rootroot00000000000000# Compiled Object files *.slo *.lo *.o *.obj # Precompiled Headers *.gch *.pch # Compiled Dynamic libraries *.so *.dylib *.dll # Fortran module files *.mod # Compiled Static libraries *.lai *.la *.a *.lib # Executables *.exe *.out *.app # vim tags tags .tags .*.swp # Visual Studio Code .vscode # install.sh build dir build/ # python bytecode __pycache__ # documentation artifacts _build/ _images/ _static/ _templates/ _toc.yml docBin/ _doxygen/ rocFFT-rocm-5.7.1/.jenkins/000077500000000000000000000000001446473624700153455ustar00rootroot00000000000000rocFFT-rocm-5.7.1/.jenkins/application.groovy000066400000000000000000000247451446473624700211330ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. 
import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'application') prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 prj.libraryDependencies = ['rocFFT', 'hipFFT'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def compileCommand = { platform, project-> def getDependenciesCommand = "" if (project.installLibraryDependenciesFromCI) { project.libraryDependencies.each { libraryName -> getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) } } def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix} ${getDependenciesCommand} git clone -b develop-2021 https://github.com/ROCmSoftwarePlatform/Gromacs.git cd Gromacs mkdir build_tmpi cd build_tmpi cmake -DCMAKE_HIP_ARCHITECTURES=gfx90a -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ -DGMX_MPI=OFF -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install .. make make install cd .. mkdir build_mpi cd build_mpi cmake -DCMAKE_HIP_ARCHITECTURES=gfx908 -DBUILD_SHARED_LIBS=ON -DGMX_BUILD_FOR_COVERAGE=ON -DCMAKE_BUILD_TYPE=Release -DCMAKE_C_COMPILER=mpicc -DCMAKE_CXX_COMPILER=mpic++ -DGMX_MPI=ON -DGMX_GPU=hip -DGMX_OPENMP=ON -DGMX_SIMD=AVX2_256 -DREGRESSIONTEST_DOWNLOAD=OFF -DGMX_GPU_USE_VKFFT=OFF -DCMAKE_PREFIX_PATH=/opt/rocm -DCMAKE_INSTALL_PREFIX=../gromacs-install .. make make install cd .. 
""" platform.runCommand(this, command) } def testCommand = { platform, project-> def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix} cd Gromacs source gromacs-install/bin/GMXRC gmx --version export LD_LIBRARY_PATH=\$LD_LIBRARY_PATH:/opt/rocm/lib echo \$LD_LIBRARY_PATH git clone https://github.com/jychang48/benchmark-gromacs.git cd benchmark-gromacs export GMX_MAXBACKUP=-1 echo "* Threaded MPI ******************************************************************************************************" #ADH_DODEC cd adh_dodec tar zxf adh_dodec.tar.gz gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 150 # 8 GPUs # STMV cd .. 
cd stmv/ tar zxf stmv.tar.gz gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 200 # 1 GPU gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 400 # 4 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 400 # 8 GPUs # CELLULOSE_NVE cd .. cd cellulose_nve/ tar zxf cellulose_nve.tar.gz gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 1 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr -nstlist 100 # 1 GPU gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 4 -ntomp 16 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr -nstlist 200 # 2 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr -nstlist 200 # 4 GPUs gmx --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntmpi 8 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr -nstlist 200 # 8 GPUs echo "* MPI ***************************************************************************************************************" # ADH_DODEC cd .. 
cd adh_dodec/ tar zxf adh_dodec.tar.gz mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs # STMV cd .. cd stmv/ tar zxf stmv.tar.gz mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -nstlist 400 -gpu_id 0 -s topol.tpr # 1 GPU mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs # CELLULOSE_NVE cd .. 
cd cellulose_nve/ tar zxf cellulose_nve.tar.gz mpirun -np 1 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 64 -noconfout -nb gpu -bonded gpu -pme gpu -v -gpu_id 0 -s topol.tpr # 1 GPU mpirun -np 4 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01 -s topol.tpr # 2 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 6 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 0123 -s topol.tpr # 4 GPUs mpirun -np 8 gmx_mpi --quiet mdrun -pin on -nsteps 100000 -resetstep 90000 -ntomp 8 -noconfout -nb gpu -bonded gpu -pme gpu -npme 1 -v -gpu_id 01234567 -s topol.tpr # 8 GPUs """ platform.runCommand(this, command) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 5')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu20:['8gfx90a']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['8gfx90a']], urlJobName) } } } rocFFT-rocm-5.7.1/.jenkins/common.groovy000066400000000000000000000145421446473624700201120ustar00rootroot00000000000000// This file is for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. 
def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false) { project.paths.construct_build_prefix() def getDependenciesCommand = "" if (project.installLibraryDependenciesFromCI) { project.libraryDependencies.each { libraryName -> getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) } } String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON' String warningArgs = '-DWERROR=ON' String buildTunerArgs = '-DROCFFT_BUILD_OFFLINE_TUNER=ON' String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : '' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_DIR/rocfft_build_cache.db" def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} ${getDependenciesCommand} set -e mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${warningArgs} ${buildTunerArgs} ${staticArg} ${amdgpuTargets} ${rtcBuildCache} ../.. make -j\$(nproc) sudo make install """ platform.runCommand(this, command) } def runCompileClientCommand(platform, project, jobName, boolean debug=false) { String sudo = auxiliary.sudo(platform.jenkinsLabel) project.paths.construct_build_prefix() String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON' String warningArgs = '-DWERROR=ON' String buildTypeArg = debug ? 
'-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' //String staticArg = buildStatic ? '-DBUILD_SHARED_LIBS=off' : '' String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' String buildTypeArgClients = debug ? '-DCMAKE_BUILD_TYPE=Debug' : '-DCMAKE_BUILD_TYPE=Release' String cmakePrefixPathArg = "-DCMAKE_PREFIX_PATH=${project.paths.project_build_prefix}" def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix}/clients mkdir -p build && cd build ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArgClients} ${cmakePrefixPathArg} ../ make -j\$(nproc) """ platform.runCommand(this, command) } def runTestCommand (platform, project, boolean debug=false) { String sudo = auxiliary.sudo(platform.jenkinsLabel) String testBinaryName = debug ? 'rocfft-test-d' : 'rocfft-test' String directory = debug ? 'debug' : 'release' def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix}/build/${directory}/clients/staging ROCM_PATH=/opt/rocm GTEST_LISTENER=NO_PASS_LINE_IN_LOG ./${testBinaryName} --precompile=rocfft-test-precompile.db --gtest_color=yes --R 80 """ platform.runCommand(this, command) } def runPackageCommand(platform, project, jobName, boolean debug=false) { String directory = debug ? 
'debug' : 'release' def packageHelper = platform.makePackage(platform.jenkinsLabel,"${project.paths.project_build_prefix}/build/${directory}",false) platform.runCommand(this, packageHelper[0]) platform.archiveArtifacts(this, packageHelper[1]) //trim temp files def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix}/build/${directory}/ rm -rf _CPack_Packages/ find -name '*.o' -delete """ platform.runCommand(this, command) } def runSubsetBuildCommand(platform, project, jobName, genPattern, genSmall, genLarge, boolean onlyDouble) { project.paths.construct_build_prefix() // Don't build clients, since we're just testing if the library can build String clientArgs = '' String warningArgs = '-DWERROR=ON' String buildTypeArg = '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = 'release' String genPatternArgs = "-DGENERATOR_PATTERN=${genPattern}" String manualSmallArgs = (genSmall != null) ? "-DGENERATOR_MANUAL_SMALL_SIZE=${genSmall}" : '' String manualLargeArgs = (genLarge != null) ? "-DGENERATOR_MANUAL_LARGE_SIZE=${genLarge}" : '' String precisionArgs = onlyDouble ? '-DGENERATOR_PRECISION=double' : '' String kernelArgs = "${genPatternArgs} ${manualSmallArgs} ${manualLargeArgs} ${precisionArgs}" String cmake = platform.jenkinsLabel.contains('centos') ? 'cmake3' : 'cmake' //Set CI node's gfx arch as target if PR, otherwise use default targets of the library String amdgpuTargets = env.BRANCH_NAME.startsWith('PR-') ? '-DAMDGPU_TARGETS=\$gfx_arch' : '' String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_DIR/rocfft_build_cache.db" def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix} rm -rf build/${buildTypeDir} mkdir -p build/${buildTypeDir} && cd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc ${buildTypeArg} ${clientArgs} ${kernelArgs} ${warningArgs} ${amdgpuTargets} ${rtcBuildCache} ../.. 
make -j\$(nproc) """ platform.runCommand(this, command) } return this rocFFT-rocm-5.7.1/.jenkins/debug.groovy000066400000000000000000000046401446473624700177060ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'Debug') prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName, true) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project, true) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project, jobName, true) } buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { 
runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu16:['any']], urlJobName) } } } rocFFT-rocm-5.7.1/.jenkins/performance.groovy000066400000000000000000000207771446473624700211320ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCompileCommand(platform, project, jobName, boolean debug=false, boolean buildStatic=false) { def reference = (env.BRANCH_NAME ==~ /PR-\d+/) ? 'develop' : 'master' project.paths.construct_build_prefix() def getDependenciesCommand = "" if (project.installLibraryDependenciesFromCI) { project.libraryDependencies.each { libraryName -> getDependenciesCommand += auxiliary.getLibrary(libraryName, platform.jenkinsLabel, null, false) } } dir("${project.paths.project_build_prefix}/ref-repo") { git branch: "${reference}", url: 'https://github.com/ROCmSoftwarePlatform/rocFFT.git' } String clientArgs = '-DBUILD_CLIENTS_SAMPLES=ON -DBUILD_CLIENTS_TESTS=ON -DBUILD_CLIENTS_RIDER=ON' String noclientArgs = '-DBUILD_CLIENTS_SAMPLES=OFF -DBUILD_CLIENTS_TESTS=OFF -DBUILD_CLIENTS_RIDER=OFF' String warningArgs = '-DWERROR=ON' String buildTypeArg = debug ? '-DCMAKE_BUILD_TYPE=Debug -DROCFFT_DEVICE_FORCE_RELEASE=ON' : '-DCMAKE_BUILD_TYPE=Release' String buildTypeDir = debug ? 'debug' : 'release' String rtcBuildCache = "-DROCFFT_BUILD_KERNEL_CACHE_PATH=\$JENKINS_HOME_DIR/rocfft_build_cache.db" String cmake = platform.jenkinsLabel.contains('centos') ? 
'cmake3' : 'cmake' def command = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} ${getDependenciesCommand} set -e mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${clientArgs} ${warningArgs} ${rtcBuildCache} ../.. make -j\$(nproc) popd cd ref-repo mkdir -p build/${buildTypeDir} && pushd build/${buildTypeDir} ${auxiliary.gfxTargetParser()} ${cmake} -DCMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc -DCMAKE_C_COMPILER=/opt/rocm/bin/hipcc -DAMDGPU_TARGETS=\$gfx_arch -DSINGLELIB=on ${buildTypeArg} ${noclientArgs} ${warningArgs} ${rtcBuildCache} ../.. make -j\$(nproc) """ platform.runCommand(this, command) } def runTestCommand (platform, project, boolean debug=false) { String sudo = auxiliary.sudo(platform.jenkinsLabel) String directory = debug ? 'debug' : 'release' def dataTypes = ['single', 'double'] for (def dataType in dataTypes) { def command = """#!/usr/bin/env bash set -ex pwd cd ${project.paths.project_build_prefix} ./scripts/perf/rocfft-perf run --rider ./build/${directory}/clients/staging/dyna-rocfft-rider --lib ./ref-repo/build/${directory}/library/src/librocfft.so --lib ./build/${directory}/library/src/librocfft.so --out ./${dataType}_ref --out ./${dataType}_change --device 0 --precision ${dataType} --suite benchmarks ls ${dataType}_change ls ${dataType}_ref mkdir ${dataType}_results ./scripts/perf/rocfft-perf post ./${dataType}_results ./${dataType}_ref ./${dataType}_change ls ${dataType}_change/*.mdat ./scripts/perf/rocfft-perf html ./${dataType}_results ./${dataType}_ref ./${dataType}_change mv ${dataType}_results/figs.html ${dataType}_results/figs_${platform.gpu}.html """ platform.runCommand(this, command) archiveArtifacts "${project.paths.project_build_prefix}/${dataType}_results/*.html" publishHTML([allowMissing: false, alwaysLinkToLastBuild: false, 
keepAll: false, reportDir: "${project.paths.project_build_prefix}/${dataType}_results", reportFiles: "figs_${platform.gpu}.html", reportName: "${dataType}-precision-${platform.gpu}", reportTitles: "${dataType}-precision-${platform.gpu}"]) } withCredentials([gitUsernamePassword(credentialsId: 'GitHub-ROCmMathLibrariesBot-Token', gitToolName: 'git-tool')]) { platform.runCommand( this, """ cd ${project.paths.build_prefix} git clone https://github.com/ROCmSoftwarePlatform/rocPTS.git -b release/rocpts-rel-1.1.0 cd rocPTS python3 -m pip install build python3 -m build python3 -m pip install . """ ) } writeFile( file: project.paths.project_build_prefix + "/record_pts.py", text: libraryResource("com/amd/scripts/record_pts.py")) def setupBranch = env.CHANGE_ID ? "git branch \$BRANCH_NAME" : "" def command = """#!/usr/bin/env bash set -ex cd ${project.paths.project_build_prefix} ${setupBranch} git checkout \$BRANCH_NAME benchmark_folder=rocFFT_Benchmark_Dataset_\$(date +%Y%m%d) mkdir -p \${benchmark_folder}/all_change \${benchmark_folder}/all_ref cp -uf ./*_change/* \${benchmark_folder}/all_change cp -uf ./*_ref/* \${benchmark_folder}/all_ref python3 ./record_pts.py --dataset-path \$PWD/\${benchmark_folder} --reference-dataset all_ref --new-dataset all_change -v 5.5 -l pts_rocfft_benchmark_data-v1.0.0 """ withCredentials([usernamePassword(credentialsId: 'PTS_API_ID_KEY_PROD', usernameVariable: 'PTS_API_ID', passwordVariable: 'PTS_API_KEY')]) { platform.runCommand(this, command) } } def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'Performance') prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def gpus = [] def dataTypes = ['single', 'double'] def compileCommand = { platform, project-> 
gpus.add(platform.gpu) commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" runCompileCommand(platform, project, jobName) } def testCommand = { platform, project-> runTestCommand(platform, project) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, null) def commentString = "Performance reports: \n" + "Commit hashes: \n" for(parentHash in prj.gitParentHashes) { commentString += "${parentHash} \n" } for (gpu in gpus) { for (dataType in dataTypes) { commentString += "[${gpu} ${dataType} report](${JOB_URL}/${dataType}-precision-${gpu})\n" } } boolean commentExists = false for (prComment in pullRequest.comments) { if (prComment.body.contains("Performance reports:")) { commentExists = true prComment.body = commentString } } if (!commentExists) { def comment = pullRequest.comment(commentString) } } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900','gfx906']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. 
compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx906']], urlJobName) } } } rocFFT-rocm-5.7.1/.jenkins/precheckin.groovy000066400000000000000000000050471446473624700207350ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'PreCheckin') prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName) commonGroovy.runCompileClientCommand(platform, project, jobName, false) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project, jobName) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = 
["compute-rocm-dkms-no-npi-hipclang":([ubuntu18:['gfx900'],centos7:['gfx906'],centos8:['gfx906'],sles15sp1:['gfx908']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu18:['gfx906']], urlJobName) } } } rocFFT-rocm-5.7.1/.jenkins/staticanalysis.groovy000066400000000000000000000062341446473624700216540ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCompileCommand(platform, project, jobName, boolean debug=false) { project.paths.construct_build_prefix() def yapfCommand = """#!/usr/bin/env bash set -x cd ${project.paths.project_build_prefix} yapf --version find . 
-iname '*.py' \ | grep -v 'build/' \ | xargs -n 1 -P 1 -I{} -t sh -c 'yapf --style pep8 {} | diff - {}' """ platform.runCommand(this, yapfCommand) } def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'StaticAnalysis') prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = true boolean staticAnalysis = true def compileCommand = { platform, project-> runCompileCommand(platform, project, jobName, false) } buildProject(prj , formatCheck, nodes.dockerArray, compileCommand, null, null, staticAnalysis) def kernelSubsetPrj = new rocProject('rocFFT-internal', 'BuildKernelSubset') def nodesForPrj2 = new dockerNodes(nodeDetails, jobName, kernelSubsetPrj) def commonGroovy def compileSubsetCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" // build pattern pow2,pow7 no manual small and large, dp only commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'pow2,pow7', null, null, true) // build large sizes, dp only commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'large', null, null, true) // build 2D sizes, dp only commonGroovy.runSubsetBuildCommand(platform, project, jobName, '2D', null, null, true) // put an extra unsupported size(10) in manual large to see if it will be filtered correctly commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', null, '10,50,100,200,336', true) // put an extra unsupported size(23) in manual small to see if it will be filtered correctly commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', '23,1024', '10,50,100,200,336', true) // all the manual sizes are not supported //commonGroovy.runSubsetBuildCommand(platform, project, jobName, 'none', '23', '10', true) } buildProject(kernelSubsetPrj , formatCheck, nodesForPrj2.dockerArray, compileSubsetCommand, null, 
null) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * 6')])])) stage(urlJobName) { runCI([ubuntu20:['any']], urlJobName) } } rocFFT-rocm-5.7.1/.jenkins/staticlibrary.groovy000066400000000000000000000046541446473624700215010ustar00rootroot00000000000000#!/usr/bin/env groovy // This shared library is available at https://github.com/ROCmSoftwarePlatform/rocJENKINS/ @Library('rocJenkins@pong') _ // This is file for internal AMD use. // If you are interested in running your own Jenkins, please raise a github issue for assistance. import com.amd.project.* import com.amd.docker.* import java.nio.file.Path def runCI = { nodeDetails, jobName-> def prj = new rocProject('rocFFT-internal', 'StaticLibrary') prj.defaults.ccache = true prj.timeout.compile = 600 prj.timeout.test = 600 prj.libraryDependencies = ['rocRAND','hipRAND'] // Define test architectures, optional rocm version argument is available def nodes = new dockerNodes(nodeDetails, jobName, prj) boolean formatCheck = false def commonGroovy def compileCommand = { platform, project-> commonGroovy = load "${project.paths.project_src_prefix}/.jenkins/common.groovy" commonGroovy.runCompileCommand(platform, project, jobName, false, true) } def testCommand = { platform, project-> commonGroovy.runTestCommand(platform, project) } def packageCommand = { platform, project-> commonGroovy.runPackageCommand(platform, project, jobName) } buildProject(prj, formatCheck, nodes.dockerArray, compileCommand, testCommand, packageCommand) } ci: { String urlJobName = auxiliary.getTopJobName(env.BUILD_URL) def propertyList = ["compute-rocm-dkms-no-npi-hipclang":[pipelineTriggers([cron('0 1 * * 0')])]] propertyList = auxiliary.appendPropertyList(propertyList) def jobNameList = ["compute-rocm-dkms-no-npi-hipclang":([ubuntu16:['gfx900']])] jobNameList = auxiliary.appendJobNameList(jobNameList) propertyList.each { jobName, property-> if (urlJobName == 
jobName) properties(auxiliary.addCommonProperties(property)) } jobNameList.each { jobName, nodeDetails-> if (urlJobName == jobName) stage(jobName) { runCI(nodeDetails, jobName) } } // For url job names that are not listed by the jobNameList i.e. compute-rocm-dkms-no-npi-1901 if(!jobNameList.keySet().contains(urlJobName)) { properties(auxiliary.addCommonProperties([pipelineTriggers([cron('0 1 * * *')])])) stage(urlJobName) { runCI([ubuntu16:['gfx906']], urlJobName) } } } rocFFT-rocm-5.7.1/.readthedocs.yaml000066400000000000000000000004171446473624700170570ustar00rootroot00000000000000# Read the Docs configuration file # See https://docs.readthedocs.io/en/stable/config-file/v2.html for details version: 2 sphinx: configuration: docs/conf.py formats: [htmlzip] python: version: "3.8" install: - requirements: docs/.sphinx/requirements.txt rocFFT-rocm-5.7.1/CHANGELOG.md000066400000000000000000000320621446473624700154420ustar00rootroot00000000000000# Change Log for rocFFT Full documentation for rocFFT is available at [rocfft.readthedocs.io](https://rocfft.readthedocs.io/en/latest/). ## rocFFT 1.0.24 for ROCm 5.7.0 ### Optimizations - Improved performance of complex forward/inverse 1D FFTs (2049 <= length <= 131071) that use Bluestein's algorithm. ### Added - Implemented a solution map version converter and finish the first conversion from ver.0 to ver.1. Where version 1 removes some incorrect kernels (sbrc/sbcr using half_lds) ### Changed - Moved rocfft_rtc_helper executable to lib/rocFFT directory on Linux. - Moved library kernel cache to lib/rocFFT directory. ## rocFFT 1.0.23 for ROCm 5.6.0 ### Added - Implemented half-precision transforms, which can be requested by passing rocfft_precision_half to rocfft_plan_create. - Implemented a hierarchical solution map which saves how to decompose a problem and the kernels to be used. - Implemented a first version of offline-tuner to support tuning kernels for C2C/Z2Z problems. 
### Changed - Replaced std::complex with hipComplex data types for data generator. - FFT plan dimensions are now sorted to be row-major internally where possible, which produces better plans if the dimensions were accidentally specified in a different order (column-major, for example). - Added --precision argument to benchmark/test clients. --double is still accepted but is deprecated as a method to request a double-precision transform. - Improved performance test suite statistical framework. ### Fixed - Fixed over-allocation of LDS in some real-complex kernels, which was resulting in kernel launch failure. ## rocFFT 1.0.22 for ROCm 5.5.0 ### Optimizations - Improved performance of 1D lengths < 2048 that use Bluestein's algorithm. - Reduced time for generating code during plan creation. - Optimized 3D R2C/C2R lengths 32, 84, 128. - Optimized batched small 1D R2C/C2R cases. ### Added - Added gfx1101 to default AMDGPU_TARGETS. ### Changed - Moved client programs to C++17. - Moved planar kernels and infrequently used Stockham kernels to be runtime-compiled. - Moved transpose, real-complex, Bluestein, and Stockham kernels to library kernel cache. ### Fixed - Removed zero-length twiddle table allocations, which fixes errors from hipMallocManaged. - Fixed incorrect freeing of HIP stream handles during twiddle computation when multiple devices are present. ## rocFFT 1.0.21 for ROCm 5.4.3 ### Fixed - Removed source directory from rocm_install_targets call to prevent installation of rocfft.h in an unintended location. ## rocFFT 1.0.20 for ROCm 5.4.1 ### Fixed - Fixed incorrect results on strided large 1D FFTs where batch size does not equal the stride. ## rocFFT 1.0.19 for ROCm 5.4.0 ### Optimizations - Optimized some strided large 1D plans. ### Added - Added rocfft_plan_description_set_scale_factor API to efficiently multiply each output element of a FFT by a given scaling factor. - Created a rocfft_kernel_cache.db file next to the installed library. 
SBCC/CR/RC kernels are moved to this file when built with the library, and are runtime-compiled for new GPU architectures. - Added gfx1100 and gfx1102 to default AMDGPU_TARGETS. ### Changed - Moved runtime compilation cache to in-memory by default. A default on-disk cache can encounter contention problems on multi-node clusters with a shared filesystem. rocFFT can still be told to use an on-disk cache by setting the ROCFFT_RTC_CACHE_PATH environment variable. ## rocFFT 1.0.18 for ROCm 5.3.0 ### Changed - Runtime compilation cache now looks for environment variables XDG_CACHE_HOME (on Linux) and LOCALAPPDATA (on Windows) before falling back to HOME. - Moved computation of the twiddle table from host to the device. ### Optimizations - Optimized 2D R2C/C2R to use 2-kernel plans where possible. - Improved performance of the Bluestein algorithm. - Optimized sbcc-168 and 100 by using half-lds. - Optimized length-280 2D/3D transforms. - Added kernels for factorizable 1D lengths < 128. ### Fixed - Fixed occasional failures to parallelize runtime compilation of kernels. Failures would be retried serially and ultimately succeed, but this would take extra time. - Fixed failures of some R2C 3D transforms that use the unsupported TILE_UNALIGNED SBRC kernels. An example is 98^3 R2C out-of-place. - Fixed bugs in SBRC_ERC type. ## rocFFT 1.0.17 for ROCm 5.2.0 ### Added - Packages for test and benchmark executables on all supported OSes using CPack. - Added File/Folder Reorg Changes with backward compatibility support using ROCM-CMAKE wrapper functions. ### Changed - Improved reuse of twiddle memory between plans. - Set a default load/store callback when only one callback type is set via the API for improved performance. - Updated googletest dependency to version 1.11. ### Optimizations - Introduced a new access pattern of lds (non-linear) and applied it on sbcc kernels len 64 and 81 to get performance improvement.
- Applied lds-non-linear and direct-load-to-register on sbcr kernels to get performance improvement. - Applied lds-non-linear and direct-store-from-register on sbrc kernels to get performance improvement. ### Fixed - Fixed correctness of certain transforms with unusual strides. - Fixed incorrect handling of user-specified stream for runtime-compiled kernels. - Fixed incorrect buffer allocation in rocfft-test on in-place transforms with different input and output sizes. ## rocFFT 1.0.16 for ROCm 5.1.0 ### Changed - Supported unaligned tile dimension for SBRC_2D kernels. - Improved (more RAII) test and benchmark infrastructure. - Enabled runtime compilation of length-2304 FFT kernel during plan creation. - Added tokenizer for test suite. - Reduced twiddle memory requirements for even-length real-complex transforms. - Clients can now be built separately from the main library. ### Optimizations - Optimized more large 1D cases by using L1D_CC plan. - Optimized 3D 200^3 C2R case. - Optimized 1D 2^30 double precision on MI200. - Added padding to work buffer sizes to improve performance in many cases. ### Fixed - Fixed correctness of some R2C transforms with unusual strides. ### Removed - The hipFFT API (header) has been removed from rocFFT after a long deprecation period. Please use the [hipFFT](https://github.com/ROCmSoftwarePlatform/hipFFT) package/repository to obtain the hipFFT API. ## rocFFT 1.0.15 for ROCm 5.0.0 ### Changed - Enabled runtime compilation of single FFT kernels > length 1024. - Re-aligned split device library into 4 roughly equal libraries. - Implemented the FuseShim framework to replace the original OptimizePlan. - Implemented the generic buffer-assignment framework. The buffer assignment is no longer performed by each node. We designed a generic algorithm to test and pick the best assignment path. With the help of FuseShim, we can achieve as many kernel fusions as possible.
- Do not read the imaginary part of the DC and Nyquist modes for even-length complex-to-real transforms. ### Optimizations - Optimized twiddle-conjugation; complex-to-complex inverse transforms should have similar performance to forward transforms now. - Improved performance of single-kernel small 2D transforms. ## rocFFT 1.0.14 for ROCm 4.5.0 ### Optimizations - Optimized SBCC kernels of length 52, 60, 72, 80, 84, 96, 104, 108, 112, 160, 168, 208, 216, 224, 240 with new kernel generator. ### Added - Added support for Windows 10 as a build target. ### Changed - Packaging split into a runtime package called rocfft and a development package called rocfft-devel. The development package depends on runtime. The runtime package suggests the development package for all supported OSes except CentOS 7 to aid in the transition. The suggests feature in packaging is introduced as a deprecated feature and will be removed in a future rocm release. ### Fixed - Fixed a few validation failures of even-length R2C inplace. 2D, 3D cubic sizes such as 100^2 (or ^3), 200^2 (or ^3), 256^2 (or ^3)...etc. We don't combine the three kernels (stockham-r2c-transpose). We only combine two kernels (r2c-transpose) instead. ### Changed - Split 2D device code into separate libraries. ## rocFFT 1.0.13 for ROCm 4.4.0 ### Optimizations - Improved many plans by removing unnecessary transpose steps. - Optimized scheme selection for 3D problems. - Imposed fewer restrictions on 3D_BLOCK_RC selection. More problems can use 3D_BLOCK_RC and have some performance gain. - Enabled 3D_RC. Some 3D problems with SBCC-supported z-dim can use fewer kernels and benefit from it. - Forced --length 336 336 56 (dp) to use the faster 3D_RC, preventing it from being skipped by the conservative threshold test. - Optimized some even-length R2C/C2R cases by doing more operations in-place and combining pre/post processing into Stockham kernels. - Added radix-17. ### Added - Added new kernel generator for select fused-2D transforms.
### Fixed - Improved large 1D transform decompositions. ## rocFFT 1.0.12 for ROCm 4.3.0 ### Changed - Re-split device code into single-precision, double-precision, and miscellaneous kernels. ### Fixed - Fixed potential crashes in double-precision planar->planar transpose. - Fixed potential crashes in 3D transforms with unusual strides, for SBCC-optimized sizes. - Improved buffer placement logic. ### Added - Added new kernel generator for select lengths. New kernels have improved performance. - Added public `rocfft_execution_info_set_load_callback` and `rocfft_execution_info_set_store_callback` API functions to allow executing extra logic when loading/storing data from/to global memory during a transform. ### Removed - Removed R2C pair schemes and kernels. ### Optimizations - Optimized 2D/3D R2C 100 and 1D Z2Z 2500. - Reduced number of kernels for 2D/3D sizes where higher dimension is 64, 128, 256. ### Fixed - Fixed potential crashes in 3D transforms with unusual strides, for SBCC-optimized sizes. ## rocFFT 1.0.11 for ROCm 4.2.0 ### Changed - Moved device code into the main library. ### Optimizations - Improved performance for single precision kernels exercising all except radix-2/7 butterfly ops. - Minor optimization for C2R 3D 100, 200 cube sizes. - Optimized some C2C/R2C 3D 64, 81, 100, 128, 200, 256 rectangular sizes. - When factoring, test to see if remaining length is explicitly supported. - Explicitly add radix-7 lengths 14, 21, and 224 to list of supported lengths. - Optimized R2C 2D/3D 128, 200, 256 cube sizes. ### Fixed - Fixed potential crashes in small 3D transforms with unusual strides. (https://github.com/ROCmSoftwarePlatform/rocFFT/issues/311) - Fixed potential crashes when executing transforms on multiple devices. (https://github.com/ROCmSoftwarePlatform/rocFFT/issues/310) ## rocFFT 1.0.10 for ROCm 4.1.0 ### Added - Explicitly specify MAX_THREADS_PER_BLOCK through `__launch_bounds__` for all kernels.
- Switch to new syntax for specifying AMD GPU architecture names and features. ### Optimizations - Optimized C2C/R2C 3D 64, 81, 100, 128, 200, 256 cube sizes. - Improved performance of the standalone out-of-place transpose kernel. - Optimized 1D length 40000 C2C case. - Enabled radix-7 for size 336. - New radix-11 and radix-13 kernels; used in length 11 and 13 (and some of their multiples) transforms. ### Changed - rocFFT now automatically allocates a work buffer if the plan requires one but none is provided. - An explicit `rocfft_status_invalid_work_buffer` error is now returned when a work buffer of insufficient size is provided. - Updated online documentation. - Updated debian package name version with separated '_'. - Adjusted accuracy test tolerances and how they are compared. ### Fixed - Fixed 4x4x8192 accuracy failure. ## rocFFT 1.0.8 for ROCm 3.10.0 ### Optimizations - Optimized 1D length 10000 C2C case. ### Changed - Added BUILD_CLIENTS_ALL CMake option. ### Fixed - Fixed correctness of SBCC/SBRC kernels with non-unit strides. - Fixed fused C2R kernel when a Bluestein transform follows it. ## rocFFT 1.0.7 for ROCm 3.9.0 ### Optimizations - New R2C and C2R fused kernels to combine pre/post processing steps with transpose. - Enabled diagonal transpose for 1D and 2D power-of-2 cases. - New single kernels for small power-of-2, 3, 5 sizes. - Added more radix-7 kernels. ### Changed - Explicitly disable XNACK and SRAM-ECC features on AMDGPU hardware. ### Fixed - Fixed 2D C2R transform with length 1 on one dimension. - Fixed potential thread unsafety in logging. ## rocFFT 1.0.6 for ROCm 3.8.0 ### Optimizations - Improved performance of 1D batch-paired R2C transforms of odd length. - Added some radix-7 kernels. - Improved performance for 1D length 6561, 10000. - Improved performance for certain 2D transform sizes. ### Changed - Allow static library build with BUILD_SHARED_LIBS=OFF CMake option. - Updated googletest dependency to version 1.10. 
### Fixed - Fixed correctness of certain large 2D sizes. ## rocFFT 1.0.5 for ROCm 3.7.0 ### Optimizations - Optimized C2C power-of-2 middle sizes. ### Changed - Parallelized work in unit tests and eliminated duplicate cases. ### Fixed - Fixed correctness of certain large 1D, and 2D power-of-3, 5 sizes. - Fixed incorrect buffer assignment for some even-length R2C transforms. - Fixed `<complex>` inclusion on C compilers. - Fixed incorrect results on non-unit strides with SBCC/SBRC kernels. rocFFT-rocm-5.7.1/CMakeLists.txt000066400000000000000000000251261446473624700163740ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE.
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # We use C++17 features, this will add compile option: -std=c++17 set( CMAKE_CXX_STANDARD 17 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() set( ROCFFT_BUILD_SCOPE ON ) project( rocfft LANGUAGES CXX C ) # This finds the rocm-cmake project, and installs it if not found # rocm-cmake contains common cmake code for rocm projects to help setup and install set( PROJECT_EXTERN_DIR ${CMAKE_CURRENT_BINARY_DIR}/extern ) find_package( ROCM 0.7.3 CONFIG QUIET PATHS ${ROCM_PATH} /opt/rocm ) if( NOT ROCM_FOUND ) set( rocm_cmake_tag "master" CACHE STRING "rocm-cmake tag to download" ) file( DOWNLOAD https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip STATUS status LOG log) list(GET status 0 status_code) list(GET status 1 status_string) if(NOT status_code EQUAL 0) message(FATAL_ERROR "error: downloading 'https://github.com/RadeonOpenCompute/rocm-cmake/archive/${rocm_cmake_tag}.zip' failed status_code: ${status_code} status_string: ${status_string} log: ${log} ") endif() message(STATUS "downloading... 
done") execute_process( COMMAND ${CMAKE_COMMAND} -E tar xzvf ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag}.zip WORKING_DIRECTORY ${PROJECT_EXTERN_DIR} ) execute_process( COMMAND ${CMAKE_COMMAND} -DCMAKE_INSTALL_PREFIX=${PROJECT_EXTERN_DIR}/rocm-cmake . WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}/rocm-cmake-${rocm_cmake_tag} ) execute_process( COMMAND ${CMAKE_COMMAND} --build rocm-cmake-${rocm_cmake_tag} --target install WORKING_DIRECTORY ${PROJECT_EXTERN_DIR}) find_package( ROCM 0.7.3 REQUIRED CONFIG PATHS ${PROJECT_EXTERN_DIR}/rocm-cmake ) endif( ) include( ROCMSetupVersion ) include( ROCMCreatePackage ) include( ROCMInstallTargets ) include( ROCMPackageConfigHelpers ) include( ROCMInstallSymlinks ) include( ROCMCheckTargetIds ) include( ROCMClients ) include( ROCMHeaderWrapper ) # Using standardized versioning from rocm-cmake set ( VERSION_STRING "1.0.23" ) rocm_setup_version( VERSION ${VERSION_STRING} ) # Append our library helper cmake path and the cmake path for hip (for # convenience). # Users may override HIP path by specifying their own in CMAKE_MODULE_PATH list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) # Enable verbose output option( BUILD_VERBOSE "Output additional build information" OFF ) # Build the reference C++ kernel implementations. 
option( BUILD_CPUREF "Build CPU reference debug code" OFF ) # BUILD_SHARED_LIBS is a cmake built-in; we make it an explicit option # such that it shows in cmake-gui option( BUILD_SHARED_LIBS "Build rocFFT as a shared library" ON ) option( WERROR "Treat warnings as errors" OFF ) option(BUILD_ADDRESS_SANITIZER "Build with address sanitizer enabled" OFF) option(ROCFFT_RUNTIME_COMPILE "Enable runtime compilation of kernels" ON) option(ROCFFT_RUNTIME_COMPILE_DEFAULT "Compile kernels at runtime by default" OFF) # Using -DROCFFT_BUILD_OFFLINE_TUNER=ON to compile an executable, # Set default to OFF since users are not likely to tune option(ROCFFT_BUILD_OFFLINE_TUNER "Build with offline tuner executable rocfft_offline_tuner" OFF) if(BUILD_ADDRESS_SANITIZER) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fsanitize=address -shared-libasan") set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fsanitize=address -shared-libasan") add_link_options(-fuse-ld=lld) endif() # FOR HANDLING ENABLE/DISABLE OPTIONAL BACKWARD COMPATIBILITY for FILE/FOLDER REORG option(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY "Build with file/folder reorg with backward compatibility enabled" ON) if(BUILD_FILE_REORG_BACKWARD_COMPATIBILITY AND NOT WIN32) rocm_wrap_header_dir( ${CMAKE_SOURCE_DIR}/library/include PATTERNS "*.h" GUARDS SYMLINK WRAPPER WRAPPER_LOCATIONS ${CMAKE_INSTALL_INCLUDEDIR} ) endif() set( WARNING_FLAGS -Wall -Wno-unused-function -Wimplicit-fallthrough -Wunreachable-code -Wsign-compare ) if( WERROR ) set( WARNING_FLAGS ${WARNING_FLAGS} -Werror ) endif( ) # Use target ID syntax if supported for AMDGPU_TARGETS rocm_check_target_ids(DEFAULT_AMDGPU_TARGETS TARGETS "gfx803;gfx900;gfx906;gfx908;gfx90a;gfx940;gfx941;gfx942;gfx1030;gfx1100;gfx1101;gfx1102") set(AMDGPU_TARGETS "${DEFAULT_AMDGPU_TARGETS}" CACHE STRING "List of specific machine types for library to target") list(LENGTH AMDGPU_TARGETS AMDGPU_TARGETS_LENGTH) if(SINGLELIB AND ${AMDGPU_TARGETS_LENGTH} GREATER 2 ) message( FATAL_ERROR "SINGLELIB build requires 
at most two architecture in AMDGPU_TARGETS. " "A multi-arch SINGLELIB library may exceed size limits and fail to build. " "SINGLELIB builds are only for benchmarking and should be built for exactly " "the architectures being benchmarked." ) endif() # HIP is required - library and clients use HIP to access the device find_package( HIP REQUIRED ) # The nvidia backend can be used to compile for CUDA devices. # Specify the CUDA prefix in the CUDA_PREFIX variable. # CUDA_ARCH (e.g. sm_75) is also required. if( USE_CUDA ) if( NOT DEFINED CUDA_PREFIX ) message( FATAL_ERROR "CUDA_PREFIX variable is required (e.g. /usr/local/cuda-11.4)" ) endif() if( NOT DEFINED CUDA_ARCH ) message( FATAL_ERROR "CUDA_ARCH variable is required. (e.g. sm_75)" ) endif() add_compile_options(-I${HIP_ROOT_DIR}/include -I${CUDA_PREFIX}/include -D__HIP_PLATFORM_NVIDIA__) add_link_options(-L${CUDA_PREFIX}/lib64 -pthread) endif( ) # hipcc automatically provides HIP include dirs and HIP platform, # but plain clang needs to be told if( NOT CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) include_directories( ${HIP_INCLUDE_DIRS} ) if( USE_CUDA ) add_compile_definitions( __HIP_PLATFORM_NVIDIA__ ) else() add_compile_definitions( __HIP_PLATFORM_AMD__ ) endif() endif() add_subdirectory( library ) include( clients/cmake/build-options.cmake ) # Build clients of the library if( BUILD_CLIENTS ) set( BUILD_CLIENTS_RIDER ON ) set( BUILD_CLIENTS_SAMPLES ON ) set( BUILD_CLIENTS_TESTS ON ) endif( ) if( BUILD_CLIENTS_SAMPLES OR BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_RIDER ) if(BUILD_CLIENTS_TESTS OR BUILD_CLIENTS_SELFTEST OR BUILD_CLIENTS_RIDER) find_package( Boost COMPONENTS program_options REQUIRED) set(BOOST_DEB "libboost-program-options${Boost_VERSION_MAJOR}.${Boost_VERSION_MINOR}.${Boost_VERSION_PATCH}") set(BOOST_RPM "boost-program-options = ${Boost_VERSION_MAJOR}.${Boost_VERSION_MINOR}.${Boost_VERSION_PATCH}") endif() if( NOT CLIENTS_OS ) rocm_set_os_id( CLIENTS_OS ) endif() if(BUILD_CLIENTS_TESTS AND (NOT DEFINED 
BUILD_CLIENTS_TESTS_OPENMP OR BUILD_CLIENTS_TESTS_OPENMP)) set(OPENMP_DEB "libgomp1") set(FFTW_DEB "libfftw3-bin") if(CLIENTS_OS STREQUAL "sles") set(OPENMP_RPM "libgomp1") set(FFTW_RPM "libfftw3-3") else() set(OPENMP_RPM "libgomp") set(FFTW_RPM "fftw-libs") endif() endif() if(CLIENTS_OS STREQUAL "sles") set(BOOST_RPM RPM "libboost_program_options${Boost_VERSION_MAJOR}_${Boost_VERSION_MINOR}_${Boost_VERSION_PATCH}") endif() rocm_package_setup_component(clients) if(BUILD_CLIENTS_TESTS) rocm_package_setup_client_component( tests DEPENDS DEB ${BOOST_DEB} ${OPENMP_DEB} ${FFTW_DEB} rocrand RPM ${BOOST_RPM} ${OPENMP_RPM} ${FFTW_RPM} rocrand ) endif() if(BUILD_CLIENTS_RIDER) rocm_package_setup_client_component( benchmarks DEPENDS DEB ${BOOST_DEB} rocrand RPM ${BOOST_RPM} rocrand ) rocm_install( DIRECTORY scripts/perf DESTINATION ${CMAKE_INSTALL_BINDIR} COMPONENT benchmarks ) endif() add_subdirectory( clients ) endif( ) if(WIN32) set(CPACK_SOURCE_GENERATOR "ZIP") set(CPACK_GENERATOR "ZIP") set(CMAKE_INSTALL_PREFIX "C:/hipSDK" CACHE PATH "Install path" FORCE) set(INSTALL_PREFIX "C:/hipSDK") set(CPACK_SET_DESTDIR OFF) set(CPACK_PACKAGE_INSTALL_DIRECTORY "C:/hipSDK") set(CPACK_PACKAGING_INSTALL_PREFIX "") set(CPACK_INCLUDE_TOPLEVEL_DIRECTORY OFF) endif() # Package specific CPACK vars string( TOLOWER "${HIP_RUNTIME}" HIP_RUNTIME_LOWER ) if( HIP_RUNTIME_LOWER STREQUAL "rocclr" ) rocm_package_add_dependencies("hip-rocclr >= 3.5.0") endif( ) set( CPACK_RESOURCE_FILE_LICENSE "${CMAKE_CURRENT_SOURCE_DIR}/LICENSE.md" ) set( CPACK_RPM_PACKAGE_LICENSE "MIT" ) set( CPACK_RPM_EXCLUDE_FROM_AUTO_FILELIST_ADDITION "\${CPACK_PACKAGING_INSTALL_PREFIX}" ) set( ROCFFT_CONFIG_DIR "\${CPACK_PACKAGING_INSTALL_PREFIX}/${CMAKE_INSTALL_LIBDIR}" CACHE PATH "Path placed into ldconfig file" ) set( package_name rocfft ) rocm_create_package( NAME ${package_name} DESCRIPTION "ROCm FFT library" MAINTAINER "rocfft-maintainer@amd.com" LDCONFIG LDCONFIG_DIR ${ROCFFT_CONFIG_DIR} ) 
rocFFT-rocm-5.7.1/CppCheckSuppressions.txt000066400000000000000000000003421446473624700205040ustar00rootroot00000000000000// generator uses implicit constructors for convenience noExplicitConstructor:library/src/device/generator/generator.h // has some false positives and isn't hard to run manually for periodic // dead code sweeps unusedFunction rocFFT-rocm-5.7.1/LICENSE.md000066400000000000000000000021151446473624700152310ustar00rootroot00000000000000Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. rocFFT-rocm-5.7.1/README.md000066400000000000000000000076511446473624700151160ustar00rootroot00000000000000# rocFFT rocFFT is a software library for computing Fast Fourier Transforms (FFT) written in HIP. It is part of AMD's software ecosystem based on [ROCm][1]. In addition to AMD GPU devices, the library can also be compiled with the CUDA compiler using HIP tools for running on Nvidia GPU devices. 
## Installing pre-built packages

Download pre-built packages either from [ROCm's package servers][2] or by clicking the GitHub releases tab and downloading the source, which could be more recent than the pre-built packages. Release notes are available for each release on the releases tab.

* `sudo apt update && sudo apt install rocfft`

## Dependencies

rocFFT requires python3 libraries to be present at runtime. These are typically present by default in most Linux distributions.

## Building from source

rocFFT is compiled with hipcc and uses cmake. There are a number of options that can be provided to cmake to customize the build, but the following commands will build a shared library for supported AMD GPUs:

```
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=hipcc -DCMAKE_C_COMPILER=hipcc ..
make -j
```

A static library can be compiled by using the option `-DBUILD_SHARED_LIBS=off`.

rocFFT enables use of indirect function calls by default and requires ROCm 4.3 or higher to build successfully. `-DROCFFT_CALLBACKS_ENABLED=off` may be specified to cmake to disable those calls on older ROCm compilers, though callbacks will not work correctly in this configuration.

There are several clients included with rocFFT:

1. rocfft-rider runs general transforms and is useful for performance analysis;
2. rocfft-test runs various regression tests; and
3. various small samples are included.

Clients are not built by default. To build them:

| Client | CMake option | Dependencies |
|-----------------|-------------------------------|------------------------------------------|
| rocfft-rider | `-DBUILD_CLIENTS_RIDER=on` | Boost program options |
| rocfft-test | `-DBUILD_CLIENTS_TESTS=on` | Boost program options, FFTW, Google Test |
| samples | `-DBUILD_CLIENTS_SAMPLES=on` | Boost program options, FFTW |

To build all of the above clients, use `-DBUILD_CLIENTS=on`. The build process will download and build Google Test and FFTW if they are not installed.

Clients may be built separately from the main library. For example, one may build all the clients with an existing rocFFT library by invoking cmake from within the rocFFT-src/clients folder:

```
mkdir build && cd build
cmake -DCMAKE_CXX_COMPILER=hipcc -DCMAKE_C_COMPILER=hipcc -DCMAKE_PREFIX_PATH=/path/to/rocFFT-lib ..
make -j
```

To install the client dependencies on Ubuntu, run:

```
sudo apt install libgtest-dev libfftw3-dev libboost-program-options-dev
```

We use version 1.11 of Google Test (gtest).

`install.sh` is a bash script that will install dependencies on certain Linux distributions, such as Ubuntu, CentOS, RHEL, Fedora, and SLES, and invoke cmake. However, the preferred method for compiling rocFFT is to call cmake directly.

## Library and API Documentation

Please refer to the [library documentation][4] for current documentation.

### How to build documentation

Please follow the steps below to build the documentation.

```
cd docs
pip3 install -r .sphinx/requirements.txt
python3 -m sphinx -T -E -b html -d _build/doctrees -D language=en . _build/html
```

## Examples

Examples may be found in the [clients/samples][5] subdirectory.

[1]: https://github.com/RadeonOpenCompute
[2]: https://rocmdocs.amd.com/en/latest/Installation_Guide/Installation-Guide.html
[3]: https://github.com/ROCm-Developer-Tools/HIP/blob/master/INSTALL.md#hip-clang
[4]: https://rocfft.readthedocs.io/
[5]: clients/samples

## Contribution Rules

### Source code formatting

* C++ source code must be formatted with clang-format with .clang-format
* Python source code must be formatted with yapf --style pep8
rocFFT-rocm-5.7.1/ValgrindSuppressions.txt000066400000000000000000000002671446473624700206000ustar00rootroot00000000000000{ Memcheck:Param sched_setaffinity(mask) ... fun:hipMalloc } { Memcheck:Param sched_setaffinity(mask) ... 
fun:hipMemGetInfo }rocFFT-rocm-5.7.1/clients/000077500000000000000000000000001446473624700152675ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/CMakeLists.txt000066400000000000000000000076431446473624700200410ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) set( CPACK_PACKAGING_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) set( CPACK_PACKAGING_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() set( ROCFFT_CLIENTS_BUILD_SCOPE ON ) # This project may compile dependencies for clients project( rocfft-clients LANGUAGES CXX ) set(CMAKE_CXX_STANDARD 17) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) # This option only works for make/nmake and the ninja generators, but # no reason it shouldn't be on all the time. # This tells cmake to create a compile_commands.json file that can be # used with clang tooling or vim. set( CMAKE_EXPORT_COMPILE_COMMANDS ON ) if(NOT ROCFFT_BUILD_SCOPE AND NOT BUILD_CLIENTS_SAMPLES AND NOT BUILD_CLIENTS_TESTS AND NOT BUILD_CLIENTS_RIDER) set( BUILD_CLIENTS_SAMPLES ON ) set( BUILD_CLIENTS_TESTS ON ) set( BUILD_CLIENTS_RIDER ON ) endif() # each backend requires different libraries for host and device code if( USE_CUDA ) if( NOT DEFINED CUDA_PREFIX ) message( FATAL_ERROR "CUDA_PREFIX variable is required." 
) endif() if( NOT DEFINED CUDA_ARCH ) message( FATAL_ERROR "CUDA_ARCH variable is required." ) endif() add_compile_options(-I${HIP_ROOT_DIR}/include -I${CUDA_PREFIX}/include -D__HIP_PLATFORM_NVIDIA__) add_link_options(-L${CUDA_PREFIX}/lib64 -pthread) add_compile_options(--cuda-path=${CUDA_PREFIX} --cuda-gpu-arch=${CUDA_ARCH} -xcuda) set( ROCFFT_CLIENTS_HOST_LINK_LIBS -lcudart -ldl -lrt ) else() set( ROCFFT_CLIENTS_HOST_LINK_LIBS hip::host ) set( ROCFFT_CLIENTS_DEVICE_LINK_LIBS hip::device ) endif() if( BUILD_CLIENTS_SAMPLES ) add_subdirectory( samples ) endif( ) if( BUILD_CLIENTS_TESTS ) add_subdirectory( tests ) endif( ) if( BUILD_CLIENTS_RIDER ) add_subdirectory( rider ) endif( ) rocFFT-rocm-5.7.1/clients/cmake/000077500000000000000000000000001446473624700163475ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/cmake/build-gtest.cmake000066400000000000000000000046041446473624700216000ustar00rootroot00000000000000# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. include( ExternalProject ) option( BUILD_GTEST "Download and build GoogleTest" OFF ) if( NOT BUILD_GTEST ) find_package( GTest 1.11.0 ) endif() if( (BUILD_GTEST OR NOT GTEST_FOUND) AND (NOT TARGET gtest) ) set(GTEST_INCLUDE_DIRS ${CMAKE_CURRENT_BINARY_DIR}/src/gtest/googletest/include) set(GTEST_LIBRARIES ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest${CMAKE_STATIC_LIBRARY_SUFFIX} ${CMAKE_CURRENT_BINARY_DIR}/src/gtest-build/lib/${CMAKE_STATIC_LIBRARY_PREFIX}gtest_main${CMAKE_STATIC_LIBRARY_SUFFIX}) set(GTEST_SRC_URL https://github.com/google/googletest/archive/release-1.11.0.tar.gz CACHE STRING "Location of GTest source code") set(GTEST_SRC_SHA256 b4870bf121ff7795ba20d20bcdd8627b8e088f2d1dab299a031c1034eddc93d5 CACHE STRING "SHA256 hash of GTest source code") ExternalProject_Add(gtest URL ${GTEST_SRC_URL} URL_HASH SHA256=${GTEST_SRC_SHA256} PREFIX ${CMAKE_CURRENT_BINARY_DIR} CMAKE_ARGS -DCMAKE_CXX_COMPILER_LAUNCHER=${CMAKE_CXX_COMPILER_LAUNCHER} INSTALL_COMMAND "" BUILD_BYPRODUCTS ${GTEST_LIBRARIES}) ExternalProject_Get_Property( gtest source_dir binary_dir ) endif() rocFFT-rocm-5.7.1/clients/cmake/build-options.cmake000066400000000000000000000035731446473624700221510ustar00rootroot00000000000000# Copyright(C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # This file is intended to be used in two ways; independently in a stand alone PROJECT # and as part of a superbuild. If the file is included in a stand alone project, the # variables are not expected to be preset, and this will produce options() in the GUI # for the user to examine. If this file is included in a superbuild, the options will be # presented in the superbuild GUI, but then passed into the ExternalProject as -D # parameters, which would already define them. 
if( NOT BUILD_CLIENTS_TESTS ) option( BUILD_CLIENTS_TESTS "Build rocFFT unit tests" OFF ) endif( ) if( NOT BUILD_CLIENTS_RIDER ) option( BUILD_CLIENTS_RIDER "Build rocFFT rider" OFF ) endif( ) if( NOT BUILD_CLIENTS_SAMPLES ) option( BUILD_CLIENTS_SAMPLES "Build rocFFT samples" OFF ) endif( ) rocFFT-rocm-5.7.1/clients/rider/000077500000000000000000000000001446473624700163745ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/rider/CMakeLists.txt000066400000000000000000000110511446473624700211320ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() project( rocfft-clients-rider LANGUAGES CXX ) set(CMAKE_CXX_STANDARD 17) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) if( NOT TARGET rocfft ) find_package( rocfft REQUIRED CONFIG PATHS ) endif( ) if( NOT HIP_FOUND ) find_package( HIP REQUIRED ) endif() if( NOT ROCM_FOUND ) find_package( ROCM 0.7.3 REQUIRED ) endif() if( NOT hiprand_FOUND ) find_package( hiprand REQUIRED ) endif() include( ROCMInstallTargets ) find_package( Boost COMPONENTS program_options REQUIRED) set( rider_list rocfft-rider dyna-rocfft-rider ) foreach( rider ${rider_list}) if(${rider} STREQUAL "rocfft-rider") add_executable( ${rider} ../../shared/array_validator.cpp rider.cpp rider.h ) else() add_executable( ${rider} ../../shared/array_validator.cpp dyna-rider.cpp rider.h ) endif() target_compile_options( ${rider} PRIVATE ${WARNING_FLAGS} -Wno-cpp ) # NB: hip-clang includes omp.h, so we need to specify the location # of ROCM_CLANG_ROOT at cmake config time if we are using clang++. 
target_include_directories( ${rider} PRIVATE $ $ ${HIP_CLANG_ROOT}/include ${ROCM_CLANG_ROOT}/include ) if(${rider} STREQUAL "rocfft-rider") target_link_libraries( ${rider} PRIVATE hip::device roc::rocfft hip::hiprand Boost::program_options ) else() target_link_libraries( ${rider} PRIVATE ${CMAKE_DL_LIBS} hip::device hip::hiprand ${Boost_LIBRARIES} ) # We need to include both rocfft.h and rocfft-export.h target_include_directories( ${rider} PRIVATE ${CMAKE_BINARY_DIR}/include/rocfft ${CMAKE_CURRENT_SOURCE_DIR}/../../library/include/ ${HIP_CLANG_ROOT}/include ) endif() target_link_libraries( ${rider} PUBLIC ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ) set_target_properties( ${rider} PROPERTIES DEBUG_POSTFIX "-d" CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( RIDER_OUT_DIR "/../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( RIDER_OUT_DIR "/../bin" ) else() set( RIDER_OUT_DIR "/bin") endif() string( CONCAT RIDER_OUT_DIR "${PROJECT_BINARY_DIR}" ${RIDER_OUT_DIR} ) set_target_properties(${rider} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${RIDER_OUT_DIR} ) rocm_install(TARGETS ${rider} COMPONENT benchmarks) endforeach() rocFFT-rocm-5.7.1/clients/rider/dyna-rider.cpp000066400000000000000000000663731446473624700211550ustar00rootroot00000000000000// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // This file allows one to run tests multiple different rocFFT libraries at the same time. // This allows one to randomize the execution order for better a better experimental setup // which produces fewer type 1 errors where one incorrectly rejects the null hypothesis. #include #include #include #include #include #ifdef WIN32 #include #include #else #include #include #endif #include "../../shared/gpubuf.h" #include "../../shared/rocfft_params.h" #include "rider.h" #include "rocfft.h" #include namespace po = boost::program_options; #ifdef WIN32 typedef HMODULE ROCFFT_LIB; #else typedef void* ROCFFT_LIB; #endif // Load the rocfft library ROCFFT_LIB rocfft_lib_load(const std::string& path) { #ifdef WIN32 return LoadLibraryA(path.c_str()); #else return dlopen(path.c_str(), RTLD_LAZY); #endif } // Return a string describing the error loading rocfft const char* rocfft_lib_load_error() { #ifdef WIN32 // just return the error number static std::string error_str; error_str = std::to_string(GetLastError()); return error_str.c_str(); #else return dlerror(); #endif } // Return true if rocfft_device is loaded, which indicates that the // library was not built with -DSINGLELIB=ON. 
bool rocfft_lib_device_loaded(ROCFFT_LIB libhandle) { #ifdef WIN32 DWORD arraySize = 0; EnumProcessModules(GetCurrentProcess(), NULL, 0, &arraySize); std::vector modules(arraySize); if(EnumProcessModules(GetCurrentProcess(), modules.data(), modules.size(), &arraySize)) { for(auto& mod : modules) { char name[MAX_PATH] = {0}; GetModuleFileNameA(mod, name, MAX_PATH); // poor man's stristr on windows std::transform(name, name + strlen(name), name, [](char c) { return std::tolower(c); }); if(strstr(name, "rocfft-device.dll")) return true; } } return false; #else struct link_map* link = nullptr; dlinfo(libhandle, RTLD_DI_LINKMAP, &link); for(; link != nullptr; link = link->l_next) { if(strstr(link->l_name, "librocfft-device") != nullptr) { return true; } } return false; #endif } // Get symbol from rocfft lib void* rocfft_lib_symbol(ROCFFT_LIB libhandle, const char* sym) { #ifdef WIN32 return reinterpret_cast(GetProcAddress(libhandle, sym)); #else return dlsym(libhandle, sym); #endif } void rocfft_lib_close(ROCFFT_LIB libhandle) { #ifdef WIN32 FreeLibrary(libhandle); #else dlclose(libhandle); #endif } // Given a libhandle from dload, return a plan to a rocFFT plan with the given parameters. 
rocfft_plan make_plan(ROCFFT_LIB                    libhandle,
                      const rocfft_result_placement place,
                      const fft_transform_type      transform_type,
                      const std::vector<size_t>&    length,
                      const std::vector<size_t>&    istride,
                      const std::vector<size_t>&    ostride,
                      const size_t                  idist,
                      const size_t                  odist,
                      const std::vector<size_t>&    ioffset,
                      const std::vector<size_t>&    ooffset,
                      const size_t                  nbatch,
                      const rocfft_precision        precision,
                      const rocfft_array_type       itype,
                      const rocfft_array_type       otype)
{
    // Resolve a symbol from the library, or report which one is missing and
    // exit.  Every entry point below is called unconditionally, so a null
    // function pointer must never get past this point.
    auto require_symbol = [libhandle](const char* sym) -> void* {
        void* addr = rocfft_lib_symbol(libhandle, sym);
        if(addr == NULL)
        {
            std::cerr << "Failed to resolve symbol " << sym << std::endl;
            exit(1);
        }
        return addr;
    };

    auto procfft_setup = reinterpret_cast<decltype(&rocfft_setup)>(require_symbol("rocfft_setup"));
    auto procfft_plan_description_create
        = reinterpret_cast<decltype(&rocfft_plan_description_create)>(
            require_symbol("rocfft_plan_description_create"));
    auto procfft_plan_description_destroy
        = reinterpret_cast<decltype(&rocfft_plan_description_destroy)>(
            require_symbol("rocfft_plan_description_destroy"));
    auto procfft_plan_description_set_data_layout
        = reinterpret_cast<decltype(&rocfft_plan_description_set_data_layout)>(
            require_symbol("rocfft_plan_description_set_data_layout"));
    auto procfft_plan_create
        = reinterpret_cast<decltype(&rocfft_plan_create)>(require_symbol("rocfft_plan_create"));

    // Initialize this particular library instance before creating anything.
    LIB_V_THROW(procfft_setup(), "rocfft_setup failed");

    // The description only carries the data layout; it is destroyed again
    // once the plan has been created from it.
    rocfft_plan_description desc = NULL;
    LIB_V_THROW(procfft_plan_description_create(&desc), "rocfft_plan_description_create failed");
    LIB_V_THROW(procfft_plan_description_set_data_layout(desc,
                                                         itype,
                                                         otype,
                                                         ioffset.data(),
                                                         ooffset.data(),
                                                         istride.size(),
                                                         istride.data(),
                                                         idist,
                                                         ostride.size(),
                                                         ostride.data(),
                                                         odist),
                "rocfft_plan_description_data_layout failed");

    rocfft_plan plan = NULL;
    LIB_V_THROW(procfft_plan_create(&plan,
                                    place,
                                    rocfft_transform_type_from_fftparams(transform_type),
                                    precision,
                                    length.size(),
                                    length.data(),
                                    nbatch,
                                    desc),
                "rocfft_plan_create failed");

    LIB_V_THROW(procfft_plan_description_destroy(desc), "rocfft_plan_description_destroy failed");

    return plan;
}

// Given a libhandle from dload and a rocFFT plan, destroy the plan.
void destroy_plan(ROCFFT_LIB libhandle, rocfft_plan& plan) { auto procfft_plan_destroy = (decltype(&rocfft_plan_destroy))rocfft_lib_symbol(libhandle, "rocfft_plan_destroy"); LIB_V_THROW(procfft_plan_destroy(plan), "rocfft_plan_destroy failed"); auto procfft_cleanup = (decltype(&rocfft_cleanup))rocfft_lib_symbol(libhandle, "rocfft_cleanup"); if(procfft_cleanup) LIB_V_THROW(procfft_cleanup(), "rocfft_cleanup failed"); } // Given a libhandle from dload and a rocFFT execution info structure, destroy the info. void destroy_info(ROCFFT_LIB libhandle, rocfft_execution_info& info) { auto procfft_execution_info_destroy = (decltype(&rocfft_execution_info_destroy))rocfft_lib_symbol( libhandle, "rocfft_execution_info_destroy"); LIB_V_THROW(procfft_execution_info_destroy(info), "rocfft_execution_info_destroy failed"); } // Given a libhandle from dload, and a corresponding rocFFT plan, return how much work // buffer is required. size_t get_wbuffersize(ROCFFT_LIB libhandle, const rocfft_plan& plan) { auto procfft_plan_get_work_buffer_size = (decltype(&rocfft_plan_get_work_buffer_size))rocfft_lib_symbol( libhandle, "rocfft_plan_get_work_buffer_size"); // Get the buffersize size_t workBufferSize = 0; LIB_V_THROW(procfft_plan_get_work_buffer_size(plan, &workBufferSize), "rocfft_plan_get_work_buffer_size failed"); return workBufferSize; } // Given a libhandle from dload and a corresponding rocFFT plan, print the plan information. void show_plan(ROCFFT_LIB libhandle, const rocfft_plan& plan) { auto procfft_plan_get_print = (decltype(&rocfft_plan_get_print))rocfft_lib_symbol(libhandle, "rocfft_plan_get_print"); LIB_V_THROW(procfft_plan_get_print(plan), "rocfft_plan_get_print failed"); } // Given a libhandle from dload and a corresponding rocFFT plan, a work buffer size and an // allocated work buffer, return a rocFFT execution info for the plan. 
rocfft_execution_info make_execinfo(ROCFFT_LIB libhandle, const size_t wbuffersize, void* wbuffer) { auto procfft_execution_info_create = (decltype(&rocfft_execution_info_create))rocfft_lib_symbol( libhandle, "rocfft_execution_info_create"); auto procfft_execution_info_set_work_buffer = (decltype(&rocfft_execution_info_set_work_buffer))rocfft_lib_symbol( libhandle, "rocfft_execution_info_set_work_buffer"); rocfft_execution_info info = NULL; LIB_V_THROW(procfft_execution_info_create(&info), "rocfft_execution_info_create failed"); if(wbuffer != NULL) { LIB_V_THROW(procfft_execution_info_set_work_buffer(info, wbuffer, wbuffersize), "rocfft_execution_info_set_work_buffer failed"); } return info; } // Given a libhandle from dload and a corresponding rocFFT plan and execution info, // execute a transform on the given input and output buffers and return the kernel // execution time. float run_plan( ROCFFT_LIB libhandle, rocfft_plan plan, rocfft_execution_info info, void** in, void** out) { auto procfft_execute = (decltype(&rocfft_execute))rocfft_lib_symbol(libhandle, "rocfft_execute"); hipEvent_t start, stop; HIP_V_THROW(hipEventCreate(&start), "hipEventCreate failed"); HIP_V_THROW(hipEventCreate(&stop), "hipEventCreate failed"); HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed"); procfft_execute(plan, in, out, info); HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed"); HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed"); float time; HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed"); return time; HIP_V_THROW(hipEventDestroy(start), "hipEventDestroy failed"); HIP_V_THROW(hipEventDestroy(stop), "hipEventDestroy failed"); } // Load python library with RTLD_GLOBAL so that rocfft is free to // import python modules that need all of the symbols in libpython. // Normally, dyna-rider will want to dlopen rocfft's with RTLD_LOCAL. 
// If libpython is brought in this way, python modules might not be // able to find the symbols they need and import will fail. #ifndef WIN32 static void* python_dl = nullptr; void load_python(const std::vector& libs) { // dlopen each lib, taking note of the python library that it needs std::string pythonlib; for(const auto& lib : libs) { void* handle = dlopen(lib.c_str(), RTLD_LAZY); if(handle) { // look through the link map to see what libpython it needs (if any) struct link_map* map; if(dlinfo(handle, RTLD_DI_LINKMAP, &map) == 0) { for(struct link_map* ptr = map; ptr != nullptr; ptr = ptr->l_next) { std::string libname = ptr->l_name; if(libname.find("/libpython3.") != std::string::npos) { if(!pythonlib.empty() && pythonlib != libname) throw std::runtime_error("multiple distinct libpythons required"); pythonlib = libname; } } } } dlclose(handle); } if(!pythonlib.empty()) { // explicitly dlopen python with RTLD_GLOBAL python_dl = dlopen(pythonlib.c_str(), RTLD_LAZY | RTLD_GLOBAL); } } #endif int main(int argc, char* argv[]) { // Control output verbosity: int verbose{}; // hip Device number for running tests: int deviceId{}; // Number of performance trial samples: int ntrial{}; // Test sequence choice: int test_sequence{}; // Vector of test target libraries std::vector libs; // FFT parameters: fft_params params; // Token string to fully specify fft params. std::string token; // Declare the supported options. 
// clang-format doesn't handle boost program options very well: // clang-format off po::options_description opdesc("rocfft rider command line options"); opdesc.add_options()("help,h", "Produces this help message") ("version,v", "Print queryable version information from the rocfft library") ("device", po::value(&deviceId)->default_value(0), "Select a specific device id") ("verbose", po::value(&verbose)->default_value(0), "Control output verbosity") ("ntrial,N", po::value(&ntrial)->default_value(1), "Trial size for the problem") ("sequence", po::value(&test_sequence)->default_value(0), "Test sequence: random(0), alternating(1) sequential(2)") ("notInPlace,o", "Not in-place FFT transform (default: in-place)") ("double", "Double precision transform (deprecated: use --precision double)") ("precision", po::value(¶ms.precision), "Transform precision: single (default), double, half") ("transformType,t", po::value(¶ms.transform_type) ->default_value(fft_transform_type_complex_forward), "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ( "batchSize,b", po::value(¶ms.nbatch)->default_value(1), "If this value is greater than one, arrays will be used ") ( "itype", po::value(¶ms.itype) ->default_value(fft_array_type_unset), "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ( "otype", po::value(¶ms.otype) ->default_value(fft_array_type_unset), "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ("lib", po::value>(&libs)->multitoken(), "Set test target library full path(appendable).") ("length", po::value>(¶ms.length)->multitoken(), "Lengths.") ("istride", po::value>(¶ms.istride)->multitoken(), "Input strides.") ("ostride", po::value>(¶ms.ostride)->multitoken(), "Output strides.") ("idist", po::value(¶ms.idist)->default_value(0), "Logical distance between input batches.") ("odist", 
po::value(¶ms.odist)->default_value(0), "Logical distance between output batches.") ("isize", po::value>(¶ms.isize)->multitoken(), "Logical size of input buffer.") ("osize", po::value>(¶ms.osize)->multitoken(), "Logical size of output.") ("ioffset", po::value>(¶ms.ioffset)->multitoken(), "Input offsets.") ("ooffset", po::value>(¶ms.ooffset)->multitoken(), "Output offsets.") ("scalefactor", po::value(¶ms.scale_factor), "Scale factor to apply to output.") ("token", po::value(&token)); // clang-format on po::variables_map vm; po::store(po::parse_command_line(argc, argv, opdesc), vm); po::notify(vm); if(vm.count("help")) { std::cout << opdesc << std::endl; return EXIT_SUCCESS; } if(vm.count("notInPlace")) { std::cout << "out-of-place\n"; } else { std::cout << "in-place\n"; } if(vm.count("ntrial")) { std::cout << "Running profile with " << ntrial << " samples\n"; } if(token != "") { std::cout << "Reading fft params from token:\n" << token << std::endl; try { params.from_token(token); } catch(...) { std::cout << "Unable to parse token." << std::endl; return 1; } } else { if(!vm.count("length")) { std::cout << "Please specify transform length!" << std::endl; std::cout << opdesc << std::endl; return EXIT_SUCCESS; } params.placement = vm.count("notInPlace") ? 
fft_placement_notinplace : fft_placement_inplace; if(vm.count("double")) params.precision = fft_precision_double; if(vm.count("notInPlace")) { std::cout << "out-of-place\n"; } else { std::cout << "in-place\n"; } if(vm.count("length")) { std::cout << "length:"; for(auto& i : params.length) std::cout << " " << i; std::cout << "\n"; } if(vm.count("istride")) { std::cout << "istride:"; for(auto& i : params.istride) std::cout << " " << i; std::cout << "\n"; } if(vm.count("ostride")) { std::cout << "ostride:"; for(auto& i : params.ostride) std::cout << " " << i; std::cout << "\n"; } if(params.idist > 0) { std::cout << "idist: " << params.idist << "\n"; } if(params.odist > 0) { std::cout << "odist: " << params.odist << "\n"; } if(vm.count("ioffset")) { std::cout << "ioffset:"; for(auto& i : params.ioffset) std::cout << " " << i; std::cout << "\n"; } if(vm.count("ooffset")) { std::cout << "ooffset:"; for(auto& i : params.ooffset) std::cout << " " << i; std::cout << "\n"; } } std::cout << std::flush; // Fixme: set the device id properly after the IDs are synced // bewteen hip runtime and rocm-smi. 
// HIP_V_THROW(hipSetDevice(deviceId), "set device failed!"); params.validate(); if(!params.valid(verbose)) { throw std::runtime_error("Invalid parameters, add --verbose=1 for detail"); } std::cout << "Token: " << params.token() << std::endl; if(verbose) { std::cout << params.str() << std::endl; } // Check free and total available memory: size_t free = 0; size_t total = 0; HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed"); const auto raw_vram_footprint = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); if(!vram_fits_problem(raw_vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } const auto vram_footprint = params.vram_footprint(); if(!vram_fits_problem(vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } std::vector plan; size_t wbuffer_size = 0; #ifndef WIN32 load_python(libs); #endif // Set up shared object handles std::vector handles; for(unsigned int idx = 0; idx < libs.size(); ++idx) { auto libhandle = rocfft_lib_load(libs[idx]); if(libhandle == NULL) { std::cout << "Failed to open " << libs[idx] << ", error: " << rocfft_lib_load_error() << std::endl; return 1; } if(rocfft_lib_device_loaded(libhandle)) { std::cerr << "Error: Library " << libs[idx] << " depends on librocfft-device.\n"; std::cerr << "All libraries need to be built with -DSINGLELIB=on.\n"; return 1; } handles.push_back(libhandle); } // Set up plans: for(unsigned int idx = 0; idx < libs.size(); ++idx) { std::cout << idx << ": " << libs[idx] << std::endl; plan.push_back(make_plan(handles[idx], rocfft_result_placement_from_fftparams(params.placement), params.transform_type, params.length_cm(), params.istride_cm(), params.ostride_cm(), params.idist, params.odist, params.ioffset, params.ooffset, params.nbatch, rocfft_precision_from_fftparams(params.precision), 
rocfft_array_type_from_fftparams(params.itype), rocfft_array_type_from_fftparams(params.otype))); show_plan(handles[idx], plan[idx]); wbuffer_size = std::max(wbuffer_size, get_wbuffersize(handles[idx], plan[idx])); } std::cout << "Work buffer size: " << wbuffer_size << std::endl; // Allocate the work buffer: just one, big enough for any dloaded library. gpubuf wbuffer; if(wbuffer_size) { HIP_V_THROW(wbuffer.alloc(wbuffer_size), "Creating intermediate Buffer failed"); } // Associate the work buffer to the invidual libraries: std::vector info; for(unsigned int idx = 0; idx < libs.size(); ++idx) { info.push_back(make_execinfo(handles[idx], wbuffer_size, wbuffer.data())); } // GPU input buffer: auto ibuffer_sizes = params.ibuffer_sizes(); std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed"); pibuffer[i] = ibuffer[i].data(); } // Input data: params.compute_input(ibuffer); if(verbose > 1) { // Copy input to CPU auto cpu_input = allocate_host_buffer(params.precision, params.itype, params.isize); for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { HIP_V_THROW(hipMemcpy(cpu_input.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost), "hipMemcpy failed"); } std::cout << "GPU input:\n"; params.print_ibuffer(cpu_input); } // GPU output buffer: std::vector obuffer_data; std::vector* obuffer = &obuffer_data; if(params.placement == fft_placement_inplace) { obuffer = &ibuffer; } else { auto obuffer_sizes = params.obuffer_sizes(); obuffer_data.resize(obuffer_sizes.size()); for(unsigned int i = 0; i < obuffer_data.size(); ++i) { HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]), "Creating output Buffer failed"); } } std::vector pobuffer(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } if(handles.size()) { // Run the plan using 
its associated rocFFT library: for(unsigned int idx = 0; idx < handles.size(); ++idx) { run_plan(handles[idx], plan[idx], info[idx], pibuffer.data(), pobuffer.data()); } } // Execution times for loaded libraries: std::vector> time(libs.size()); std::vector testcase(ntrial * libs.size()); switch(test_sequence) { case 0: { // Random order: for(int itrial = 0; itrial < ntrial; ++itrial) { for(size_t ilib = 0; ilib < libs.size(); ++ilib) { testcase[libs.size() * itrial + ilib] = ilib; } } std::random_device rd; std::mt19937 g(rd()); std::shuffle(testcase.begin(), testcase.end(), g); break; } case 1: // Alternating order: for(int itrial = 0; itrial < ntrial; ++itrial) { for(size_t ilib = 0; ilib < libs.size(); ++ilib) { testcase[libs.size() * itrial + ilib] = ilib; } } break; case 2: // Sequential order: for(int itrial = 0; itrial < ntrial; ++itrial) { for(size_t ilib = 0; ilib < libs.size(); ++ilib) { testcase[ilib * ntrial + itrial] = ilib; } } break; default: throw std::runtime_error("Invalid test sequence choice."); } std::cout << "test case:"; for(const auto i : testcase) std::cout << " " << i; std::cout << "\n"; // Run the FFTs from the different libraries in random order until they all have at // least ntrial times. 
std::vector ndone(libs.size()); std::fill(ndone.begin(), ndone.end(), 0); for(size_t itest = 0; itest < testcase.size(); ++itest) { const int idx = testcase[itest]; params.compute_input(ibuffer); // Run the plan using its associated rocFFT library: time[idx].push_back( run_plan(handles[idx], plan[idx], info[idx], pibuffer.data(), pobuffer.data())); if(verbose > 2) { auto output = allocate_host_buffer(params.precision, params.otype, params.osize); for(unsigned int iout = 0; iout < output.size(); ++iout) { HIP_V_THROW(hipMemcpy(output[iout].data(), pobuffer[iout], output[iout].size(), hipMemcpyDeviceToHost), "hipMemcpy failed"); } std::cout << "GPU output:\n"; params.print_obuffer(output); } } std::cout << "Execution times in ms:\n"; for(unsigned int idx = 0; idx < time.size(); ++idx) { std::cout << "\nExecution gpu time:"; for(auto& i : time[idx]) { std::cout << " " << i; } std::cout << " ms" << std::endl; } // Clean up: for(unsigned int idx = 0; idx < handles.size(); ++idx) { destroy_info(handles[idx], info[idx]); destroy_plan(handles[idx], plan[idx]); rocfft_lib_close(handles[idx]); } #ifndef WIN32 if(python_dl) dlclose(python_dl); #endif return EXIT_SUCCESS; } rocFFT-rocm-5.7.1/clients/rider/rider.cpp000066400000000000000000000313451446473624700202130ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include "../../shared/gpubuf.h" #include "../../shared/rocfft_params.h" #include "rider.h" #include "rocfft.h" #include namespace po = boost::program_options; int main(int argc, char* argv[]) { // This helps with mixing output of both wide and narrow characters to the screen std::ios::sync_with_stdio(false); // Control output verbosity: int verbose{}; // hip Device number for running tests: int deviceId{}; // Number of performance trial samples int ntrial{}; // FFT parameters: rocfft_params params; // Token string to fully specify fft params. std::string token; // Declare the supported options. 
// clang-format doesn't handle boost program options very well: // clang-format off po::options_description opdesc("rocfft rider command line options"); opdesc.add_options()("help,h", "produces this help message") ("version,v", "Print queryable version information from the rocfft library") ("device", po::value(&deviceId)->default_value(0), "Select a specific device id") ("verbose", po::value(&verbose)->default_value(0), "Control output verbosity") ("ntrial,N", po::value(&ntrial)->default_value(1), "Trial size for the problem") ("notInPlace,o", "Not in-place FFT transform (default: in-place)") ("double", "Double precision transform (deprecated: use --precision double)") ("precision", po::value(¶ms.precision), "Transform precision: single (default), double, half") ("transformType,t", po::value(¶ms.transform_type) ->default_value(fft_transform_type_complex_forward), "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ( "batchSize,b", po::value(¶ms.nbatch)->default_value(1), "If this value is greater than one, arrays will be used ") ( "itype", po::value(¶ms.itype) ->default_value(fft_array_type_unset), "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ( "otype", po::value(¶ms.otype) ->default_value(fft_array_type_unset), "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ("length", po::value>(¶ms.length)->multitoken(), "Lengths.") ("istride", po::value>(¶ms.istride)->multitoken(), "Input strides.") ("ostride", po::value>(¶ms.ostride)->multitoken(), "Output strides.") ("idist", po::value(¶ms.idist)->default_value(0), "Logical distance between input batches.") ("odist", po::value(¶ms.odist)->default_value(0), "Logical distance between output batches.") ("isize", po::value>(¶ms.isize)->multitoken(), "Logical size of input buffer.") ("osize", po::value>(¶ms.osize)->multitoken(), "Logical size of 
output buffer.") ("ioffset", po::value>(¶ms.ioffset)->multitoken(), "Input offsets.") ("ooffset", po::value>(¶ms.ooffset)->multitoken(), "Output offsets.") ("scalefactor", po::value(¶ms.scale_factor), "Scale factor to apply to output.") ("token", po::value(&token)); // clang-format on po::variables_map vm; po::store(po::parse_command_line(argc, argv, opdesc), vm); po::notify(vm); if(vm.count("help")) { std::cout << opdesc << std::endl; return EXIT_SUCCESS; } if(vm.count("version")) { char v[256]; rocfft_get_version_string(v, 256); std::cout << "version " << v << std::endl; return EXIT_SUCCESS; } if(vm.count("ntrial")) { std::cout << "Running profile with " << ntrial << " samples\n"; } if(token != "") { std::cout << "Reading fft params from token:\n" << token << std::endl; try { params.from_token(token); } catch(...) { std::cout << "Unable to parse token." << std::endl; return 1; } } else { if(!vm.count("length")) { std::cout << "Please specify transform length!" << std::endl; std::cout << opdesc << std::endl; return EXIT_SUCCESS; } params.placement = vm.count("notInPlace") ? 
fft_placement_notinplace : fft_placement_inplace; if(vm.count("double")) params.precision = fft_precision_double; if(vm.count("notInPlace")) { std::cout << "out-of-place\n"; } else { std::cout << "in-place\n"; } if(vm.count("length")) { std::cout << "length:"; for(auto& i : params.length) std::cout << " " << i; std::cout << "\n"; } if(vm.count("istride")) { std::cout << "istride:"; for(auto& i : params.istride) std::cout << " " << i; std::cout << "\n"; } if(vm.count("ostride")) { std::cout << "ostride:"; for(auto& i : params.ostride) std::cout << " " << i; std::cout << "\n"; } if(params.idist > 0) { std::cout << "idist: " << params.idist << "\n"; } if(params.odist > 0) { std::cout << "odist: " << params.odist << "\n"; } if(vm.count("ioffset")) { std::cout << "ioffset:"; for(auto& i : params.ioffset) std::cout << " " << i; std::cout << "\n"; } if(vm.count("ooffset")) { std::cout << "ooffset:"; for(auto& i : params.ooffset) std::cout << " " << i; std::cout << "\n"; } } std::cout << std::flush; rocfft_setup(); // Fixme: set the device id properly after the IDs are synced // bewteen hip runtime and rocm-smi. 
// HIP_V_THROW(hipSetDevice(deviceId), "set device failed!"); params.validate(); if(!params.valid(verbose)) { throw std::runtime_error("Invalid parameters, add --verbose=1 for detail"); } std::cout << "Token: " << params.token() << std::endl; if(verbose) { std::cout << params.str(" ") << std::endl; } // Check free and total available memory: size_t free = 0; size_t total = 0; HIP_V_THROW(hipMemGetInfo(&free, &total), "hipMemGetInfo failed"); const auto raw_vram_footprint = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); if(!vram_fits_problem(raw_vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << raw_vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } const auto vram_footprint = params.vram_footprint(); if(!vram_fits_problem(vram_footprint, free)) { std::cout << "SKIPPED: Problem size (" << vram_footprint << ") raw data too large for device.\n"; return EXIT_SUCCESS; } auto ret = params.create_plan(); if(ret != fft_status_success) LIB_V_THROW(rocfft_status_failure, "Plan creation failed"); // GPU input buffer: auto ibuffer_sizes = params.ibuffer_sizes(); std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { HIP_V_THROW(ibuffer[i].alloc(ibuffer_sizes[i]), "Creating input Buffer failed"); pibuffer[i] = ibuffer[i].data(); } // Input data: params.compute_input(ibuffer); if(verbose > 1) { // Copy input to CPU auto cpu_input = allocate_host_buffer(params.precision, params.itype, params.isize); for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { HIP_V_THROW(hipMemcpy(cpu_input.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost), "hipMemcpy failed"); } std::cout << "GPU input:\n"; params.print_ibuffer(cpu_input); } // GPU output buffer: std::vector obuffer_data; std::vector* obuffer = &obuffer_data; if(params.placement == fft_placement_inplace) { obuffer = &ibuffer; } else { auto 
obuffer_sizes = params.obuffer_sizes(); obuffer_data.resize(obuffer_sizes.size()); for(unsigned int i = 0; i < obuffer_data.size(); ++i) { HIP_V_THROW(obuffer_data[i].alloc(obuffer_sizes[i]), "Creating output Buffer failed"); } } std::vector pobuffer(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } params.execute(pibuffer.data(), pobuffer.data()); // Run the transform several times and record the execution time: std::vector gpu_time(ntrial); hipEvent_t start, stop; HIP_V_THROW(hipEventCreate(&start), "hipEventCreate failed"); HIP_V_THROW(hipEventCreate(&stop), "hipEventCreate failed"); for(unsigned int itrial = 0; itrial < gpu_time.size(); ++itrial) { params.compute_input(ibuffer); HIP_V_THROW(hipEventRecord(start), "hipEventRecord failed"); params.execute(pibuffer.data(), pobuffer.data()); HIP_V_THROW(hipEventRecord(stop), "hipEventRecord failed"); HIP_V_THROW(hipEventSynchronize(stop), "hipEventSynchronize failed"); float time; HIP_V_THROW(hipEventElapsedTime(&time, start, stop), "hipEventElapsedTime failed"); gpu_time[itrial] = time; if(verbose > 2) { auto output = allocate_host_buffer(params.precision, params.otype, params.osize); for(unsigned int idx = 0; idx < output.size(); ++idx) { HIP_V_THROW(hipMemcpy(output[idx].data(), pobuffer[idx], output[idx].size(), hipMemcpyDeviceToHost), "hipMemcpy failed"); } std::cout << "GPU output:\n"; params.print_obuffer(output); } } std::cout << "\nExecution gpu time:"; for(const auto& i : gpu_time) { std::cout << " " << i; } std::cout << " ms" << std::endl; std::cout << "Execution gflops: "; const double totsize = std::accumulate(params.length.begin(), params.length.end(), 1, std::multiplies()); const double k = ((params.itype == fft_array_type_real) || (params.otype == fft_array_type_real)) ? 
2.5 : 5.0; const double opscount = (double)params.nbatch * k * totsize * log(totsize) / log(2.0); for(const auto& i : gpu_time) { std::cout << " " << opscount / (1e6 * i); } std::cout << std::endl; rocfft_cleanup(); HIP_V_THROW(hipEventDestroy(start), "hipEventDestroy failed"); HIP_V_THROW(hipEventDestroy(stop), "hipEventDestroy failed"); } rocFFT-rocm-5.7.1/clients/rider/rider.h000066400000000000000000000054371446473624700176630ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef RIDER_H #define RIDER_H #include "rocfft.h" #include #include #include // This is used to either wrap a HIP function call, or to explicitly check a variable // for an error condition. If an error occurs, we throw. 
// Note: std::runtime_error does not take unicode strings as input, so only strings // supported inline void hip_V_Throw(hipError_t res, const std::string& msg, size_t lineno, const std::string& fileName) { if(res != hipSuccess) { std::stringstream tmp; tmp << "HIP_V_THROWERROR< "; tmp << res; tmp << " > ("; tmp << fileName; tmp << " Line: "; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm << std::endl; throw std::runtime_error(errorm); } } inline void lib_V_Throw(rocfft_status res, const std::string& msg, size_t lineno, const std::string& fileName) { if(res != rocfft_status_success) { std::stringstream tmp; tmp << "LIB_V_THROWERROR< "; tmp << res; tmp << " > ("; tmp << fileName; tmp << " Line: "; tmp << lineno; tmp << "): "; tmp << msg; std::string errorm(tmp.str()); std::cout << errorm << std::endl; throw std::runtime_error(errorm); } } #define HIP_V_THROW(_status, _message) hip_V_Throw(_status, _message, __LINE__, __FILE__) #define LIB_V_THROW(_status, _message) lib_V_Throw(_status, _message, __LINE__, __FILE__) #endif // RIDER_H rocFFT-rocm-5.7.1/clients/samples/000077500000000000000000000000001446473624700167335ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/samples/CMakeLists.txt000066400000000000000000000046711446473624700215030ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

cmake_minimum_required( VERSION 3.16 )

# Default install location.  The cache entry is written without FORCE, so it
# has to be created before project() runs.
if( NOT WIN32 )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# Default to a Release build when neither a multi-config generator nor the
# user (via -D) has chosen a build type.  This also has to happen before
# project().  MSVC_IDE does not use CMAKE_BUILD_TYPE.
if( NOT ( DEFINED CMAKE_CONFIGURATION_TYPES OR DEFINED CMAKE_BUILD_TYPE ) )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif( )

# Tells the sample sub-projects that they are being built from this directory.
set( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ON )

project( rocfft-clients-samples LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Sample directories, added in this order.
list( APPEND samples_subdirs "fixed-16" "fixed-large" "rocfft" )

foreach( client IN LISTS samples_subdirs )
  add_subdirectory( ${client} )
endforeach( )
rocFFT-rocm-5.7.1/clients/samples/fixed-16/000077500000000000000000000000001446473624700202565ustar00rootroot00000000000000
rocFFT-rocm-5.7.1/clients/samples/fixed-16/CMakeLists.txt000066400000000000000000000074211446473624700230220ustar00rootroot00000000000000
# #############################################################################
# Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
# #############################################################################

# Build script for the three fixed-length (N = 16) rocFFT samples.  Each sample
# is a single .cpp file compiled into its own executable.

cmake_minimum_required( VERSION 3.16 )

# This should appear before the project command, because it does not
# use FORCE
# NOTE(review): PROJECT_BINARY_DIR is read here before this file's project()
# call -- confirm what it holds at this point when configured stand-alone.
if( WIN32 )
  set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" )
else( )
  set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" )
endif( )

# This has to be initialized before the project() command appears
# Set the default of CMAKE_BUILD_TYPE to be release, unless user
# specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE
if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE )
  set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." )
endif()

project( rocfft-clients-samples-fixed-16 LANGUAGES CXX )

list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake )

# Only look for an installed rocfft package when no `rocfft` target exists
# in the current build.
if( NOT TARGET rocfft )
  find_package( rocfft REQUIRED CONFIG PATHS )
endif( )

# Likewise, only search for HIP if it has not been found already.
if( NOT HIP_FOUND )
  find_package( HIP REQUIRED )
endif()

# One executable per entry; the source file is `<entry>.cpp`.
set( sample_list fixed-16-float fixed-16-double fixed-16-half )

foreach( sample ${sample_list} )

  add_executable( ${sample} ${sample}.cpp )

  # NOTE(review): the two include-directory arguments below appear truncated
  # in this copy (only `$` remains of each) -- restore them from the upstream
  # file before using this script.
  target_include_directories(
    ${sample}
    PRIVATE $ $
    )

  target_link_libraries(
    ${sample}
    PRIVATE roc::rocfft hip::device ${FFTW_LIBRARIES}
    )

  target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} )

  set_target_properties( ${sample} PROPERTIES
    DEBUG_POSTFIX "-d"
    CXX_STANDARD 17
    CXX_STANDARD_REQUIRED ON
    )

  # Pick the output directory relative to PROJECT_BINARY_DIR according to
  # which enclosing build set its *_BUILD_SCOPE variable.
  if( ROCFFT_BUILD_SCOPE )
    set( FIXED_16_OUT_DIR "/../../staging" )
  elseif( ROCFFT_CLIENTS_BUILD_SCOPE )
    set( FIXED_16_OUT_DIR "/../../bin" )
  elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE )
    set( FIXED_16_OUT_DIR "/../bin" )
  else()
    set( FIXED_16_OUT_DIR "/bin" )
  endif()
  # The relative suffix chosen above is turned into a full path here; it is
  # recomputed from scratch on every loop iteration.
  string( CONCAT FIXED_16_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_16_OUT_DIR} )

  set_target_properties(${sample}
                        PROPERTIES
                        RUNTIME_OUTPUT_DIRECTORY
                        ${FIXED_16_OUT_DIR})

  if( CUDA_FOUND )
    # NOTE(review): these include-directory arguments are truncated in this
    # copy as well -- restore them from the upstream file.
    target_include_directories( ${sample}
      PRIVATE
        $
        $
      )
    target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ )
  endif( )
  target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} )

endforeach( )
rocFFT-rocm-5.7.1/clients/samples/fixed-16/fixed-16-double.cpp000066400000000000000000000114251446473624700235600ustar00rootroot00000000000000
/******************************************************************************
* Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*******************************************************************************/ #include "rocfft.h" #include #include #include #include int main() { const size_t N = 16; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. double2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, NULL) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work buffer 
if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-5.7.1/clients/samples/fixed-16/fixed-16-float.cpp000066400000000000000000000114211446473624700234070ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include "rocfft.h" #include #include #include #include int main() { const size_t N = 16; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(float2); // Create HIP device object. float2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &length, 1, NULL) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw 
std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-5.7.1/clients/samples/fixed-16/fixed-16-half.cpp000066400000000000000000000116511446473624700232210ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include "rocfft.h" #include #include #include #include int main() { const size_t N = 16; std::vector<_Float16_2> cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = static_cast<_Float16>(i + (i % 3) - (i % 7)); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(_Float16_2); // Create HIP device object. 
_Float16_2* x = nullptr; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = NULL; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_half, 1, &length, 1, NULL) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(work_buf_size) { if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, NULL, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Clean up work buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector<_Float16_2> y(N); if(hipMemcpy(&y[0], x, Nbytes, 
hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << static_cast(cx[i].x) << "," << static_cast(cx[i].y) << ")" << " output: (" << static_cast(y[i].x) << "," << static_cast(y[i].y) << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-5.7.1/clients/samples/fixed-large/000077500000000000000000000000001446473624700211225ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/samples/fixed-large/CMakeLists.txt000066400000000000000000000073231446473624700236670ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() project( rocfft-clients-samples-fixed-large LANGUAGES CXX ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) if( NOT TARGET rocfft ) find_package( rocfft REQUIRED CONFIG PATHS ) endif( ) if( NOT HIP_FOUND ) find_package( HIP REQUIRED ) endif() set( sample_list fixed-large-float fixed-large-double ) foreach( sample ${sample_list} ) add_executable( ${sample} ${sample}.cpp ) target_include_directories( ${sample} PRIVATE $ ) target_link_libraries( ${sample} PRIVATE roc::rocfft ) target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} ) set_target_properties( ${sample} PROPERTIES DEBUG_POSTFIX "-d" CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( FIXED_LARGE_OUT_DIR "/../../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( FIXED_LARGE_OUT_DIR "/../../bin" ) elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ) set( FIXED_LARGE_OUT_DIR "/../bin" ) else() set( FIXED_LARGE_OUT_DIR "/bin" ) endif() string( CONCAT FIXED_LARGE_OUT_DIR "${PROJECT_BINARY_DIR}" ${FIXED_LARGE_OUT_DIR} ) set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${FIXED_LARGE_OUT_DIR}) if( 
CUDA_FOUND ) target_include_directories( ${sample} PRIVATE $ $ ) target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ ) endif( ) target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ) endforeach( ) rocFFT-rocm-5.7.1/clients/samples/fixed-large/fixed-large-double.cpp000066400000000000000000000116451446473624700252740ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*******************************************************************************/ #include #include #include #include "rocfft.h" #include #include int main() { // For size N >= 8192, temporary buffer is required to allocated const size_t N = 64 * 2048; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. double2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = nullptr; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, nullptr) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Setup work buffer void* workBuffer = nullptr; size_t workBufferSize = 0; if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); // Setup exec info to pass work buffer to the library rocfft_execution_info info = nullptr; if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(workBufferSize > 0) { printf("size of workbuffer=%d\n", (int)workBufferSize); if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, nullptr, info) != 
rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; if(workBuffer) if(hipFree(workBuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; // Copy result back to host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-5.7.1/clients/samples/fixed-large/fixed-large-float.cpp000066400000000000000000000116411446473624700251230ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. 
* * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. *******************************************************************************/ #include #include #include #include "rocfft.h" #include #include int main() { // For size N >= 8192, temporary buffer is required to allocated const size_t N = 64 * 2048; std::vector cx(N); for(size_t i = 0; i < N; i++) { cx[i].x = i + (i % 3) - (i % 7); cx[i].y = 0; } // rocfft gpu compute // ======================================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(float2); // Create HIP device object. 
float2* x; if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device if(hipMemcpy(x, &cx[0], Nbytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = nullptr; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &length, 1, nullptr) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Setup work buffer void* workBuffer = nullptr; size_t workBufferSize = 0; if(rocfft_plan_get_work_buffer_size(plan, &workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); // Setup exec info to pass work buffer to the library rocfft_execution_info info = nullptr; if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(workBufferSize > 0) { printf("size of workbuffer=%d\n", (int)workBufferSize); if(hipMalloc(&workBuffer, workBufferSize) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, workBuffer, workBufferSize) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Execute plan if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; if(workBuffer) if(hipFree(workBuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; // Copy result back to 
host std::vector y(N); if(hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-5.7.1/clients/samples/rocfft/000077500000000000000000000000001446473624700202165ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/samples/rocfft/CMakeLists.txt000066400000000000000000000103771446473624700227660ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() project( rocfft-clients-samples-rocfft LANGUAGES CXX ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) if( NOT TARGET rocfft ) find_package( rocfft REQUIRED CONFIG PATHS ) endif( ) if( NOT HIP_FOUND ) find_package( HIP REQUIRED ) endif() if( NOT hiprand_FOUND ) find_package( hiprand REQUIRED ) endif() find_package( Boost COMPONENTS program_options REQUIRED) set( Boost_DEBUG ON ) set( Boost_USE_MULTITHREADED ON ) if(NOT Boost_LIBRARIES) # Fixes a bug in Boost's CMAKE where Boost_LIBRARIES is not set. 
set(Boost_LIBRARIES Boost::program_options) endif() set( sample_list rocfft_example_complexcomplex rocfft_example_realcomplex rocfft_example_set_stream rocfft_example_callback ) foreach( sample ${sample_list} ) add_executable( ${sample} ${sample}.cpp ) target_include_directories( ${sample} PRIVATE $ $ ) target_link_libraries( ${sample} PRIVATE roc::rocfft hip::hiprand ${Boost_LIBRARIES} ) target_compile_options( ${sample} PRIVATE ${WARNING_FLAGS} -Wno-cpp ) set_target_properties( ${sample} PROPERTIES DEBUG_POSTFIX "-d" CXX_STANDARD 17 CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../../bin" ) elseif( ROCFFT_CLIENTS_SAMPLES_BUILD_SCOPE ) set( SAMPLES_ROCFFT_OUT_DIR "/../bin" ) else() set( SAMPLES_ROCFFT_OUT_DIR "/bin" ) endif() string( CONCAT SAMPLES_ROCFFT_OUT_DIR "${PROJECT_BINARY_DIR}" ${SAMPLES_ROCFFT_OUT_DIR} ) set_target_properties(${sample} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${SAMPLES_ROCFFT_OUT_DIR}) if( CUDA_FOUND ) target_include_directories( ${sample} PRIVATE $ $ ) target_compile_definitions( ${sample} PRIVATE __HIP_PLATFORM_NVCC__ ) endif( ) target_link_libraries( ${sample} PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} ) endforeach( ) rocFFT-rocm-5.7.1/clients/samples/rocfft/examplekernels.h000066400000000000000000000357701446473624700234220ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef EXAMPLEKERNELS_H #define EXAMPLEKERNELS_H #include "../../../shared/data_gen.h" #include #include #include // Kernel for initializing 1D real input data on the GPU. __global__ void initrdata1(double* x, const size_t Nx, const size_t xstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < Nx) { const auto pos = idx * xstride; x[pos] = idx + 1; } } // Kernel for initializing 2D real input data on the GPU. __global__ void initrdata2( double* x, const size_t Nx, const size_t Ny, const size_t xstride, const size_t ystride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; const size_t idy = blockIdx.y * blockDim.y + threadIdx.y; if(idx < Nx && idy < Ny) { const auto pos = idx * xstride + idy * ystride; x[pos] = idx + idy; } } // Kernel for initializing 3D real input data on the GPU. 
__global__ void initrdata3(double* x, const size_t Nx, const size_t Ny, const size_t Nz, const size_t xstride, const size_t ystride, const size_t zstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; const size_t idy = blockIdx.y * blockDim.y + threadIdx.y; const size_t idz = blockIdx.z * blockDim.z + threadIdx.z; if(idx < Nx && idy < Ny && idz < Nz) { const auto pos = idx * xstride + idy * ystride + idz * zstride; x[pos] = cos(cos(idx + 2)) * sin(idy * idy + 1) / (idz + 1); } } // Kernel for initializing 1D complex data on the GPU. __global__ void initcdata1(hipDoubleComplex* x, const size_t Nx, const size_t xstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; if(idx < Nx) { const auto pos = idx * xstride; x[pos].x = 1 + idx; x[pos].y = 1 + idx; } } // Kernel for initializing 2D complex input data on the GPU. __global__ void initcdata2(hipDoubleComplex* x, const size_t Nx, const size_t Ny, const size_t xstride, const size_t ystride) { const auto idx = blockIdx.x * blockDim.x + threadIdx.x; const auto idy = blockIdx.y * blockDim.y + threadIdx.y; if(idx < Nx && idy < Ny) { const auto pos = idx * xstride + idy * ystride; x[pos].x = idx + 1; x[pos].y = idy + 1; } } // Kernel for initializing 3D complex input data on the GPU. 
__global__ void initcdata3(hipDoubleComplex* x, const size_t Nx, const size_t Ny, const size_t Nz, const size_t xstride, const size_t ystride, const size_t zstride) { const size_t idx = blockIdx.x * blockDim.x + threadIdx.x; const size_t idy = blockIdx.y * blockDim.y + threadIdx.y; const size_t idz = blockIdx.z * blockDim.z + threadIdx.z; if(idx < Nx && idy < Ny && idz < Nz) { const auto pos = idx * xstride + idy * ystride + idz * zstride; x[pos].x = idx + 10.0 * idz + 1; x[pos].y = idy + 10; } } // Helper function for determining grid dimensions template Tint1 ceildiv(const Tint1 nominator, const Tint2 denominator) { return (nominator + denominator - 1) / denominator; } // The following functions call the above kernels to initalize the input data for the transform. void initcomplex_cm(const std::vector& length_cm, const std::vector& stride_cm, void* gpu_in) { switch(length_cm.size()) { case 1: { const dim3 blockdim(256); const dim3 griddim(ceildiv(length_cm[0], blockdim.x)); hipLaunchKernelGGL(initcdata1, blockdim, griddim, 0, 0, (hipDoubleComplex*)gpu_in, length_cm[0], stride_cm[0]); break; } case 2: { const dim3 blockdim(64, 64); const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y)); hipLaunchKernelGGL(initcdata2, blockdim, griddim, 0, 0, (hipDoubleComplex*)gpu_in, length_cm[0], length_cm[1], stride_cm[0], stride_cm[1]); break; } case 3: { const dim3 blockdim(32, 32, 32); const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y), ceildiv(length_cm[2], blockdim.z)); hipLaunchKernelGGL(initcdata3, blockdim, griddim, 0, 0, (hipDoubleComplex*)gpu_in, length_cm[0], length_cm[1], length_cm[2], stride_cm[0], stride_cm[1], stride_cm[2]); break; } default: std::cout << "invalid dimension!\n"; exit(1); } } // Initialize the real input buffer where the data has lengths given in length and stride given in // stride. The device buffer is assumed to have been allocated. 
void initreal_cm(const std::vector& length_cm, const std::vector& stride_cm, void* gpu_in) { switch(length_cm.size()) { case 1: { const dim3 blockdim(256); const dim3 griddim(ceildiv(length_cm[0], blockdim.x)); hipLaunchKernelGGL( initrdata1, blockdim, griddim, 0, 0, (double*)gpu_in, length_cm[0], stride_cm[0]); break; } case 2: { const dim3 blockdim(64, 64); const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y)); hipLaunchKernelGGL(initrdata2, blockdim, griddim, 0, 0, (double*)gpu_in, length_cm[0], length_cm[1], stride_cm[0], stride_cm[1]); break; } case 3: { const dim3 blockdim(32, 32, 32); const dim3 griddim(ceildiv(length_cm[0], blockdim.x), ceildiv(length_cm[1], blockdim.y), ceildiv(length_cm[2], blockdim.z)); hipLaunchKernelGGL(initrdata3, blockdim, griddim, 0, 0, (double*)gpu_in, length_cm[0], length_cm[1], length_cm[2], stride_cm[0], stride_cm[1], stride_cm[2]); break; } default: std::cout << "invalid dimension!\n"; exit(1); } } void impose_hermitian_symmetry_cm(const std::vector& length, const std::vector& ilength, const std::vector& stride, void* gpu_in) { switch(length.size()) { case 1: { hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1, dim3(1), dim3(1), 0, 0, (hipDoubleComplex*)gpu_in, length[0], stride[0], 1, 1, length[0] % 2 == 0); break; } case 2: { hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2, dim3(256), dim3(ceildiv(ceildiv(ilength[1], 2), 256)), 0, 0, (hipDoubleComplex*)gpu_in, length[0], length[1], stride[0], stride[1], 1, 1, length[0] % 2 == 0, length[1] % 2 == 0); break; } case 3: { hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3, dim3(64, 64), dim3(ceildiv(ilength[1], 64), ceildiv(ceildiv(ilength[2], 2), 64)), 0, 0, (hipDoubleComplex*)gpu_in, length[0], length[1], length[2], stride[0], stride[1], stride[2], 1, 1, length[0] % 2 == 0, length[1] % 2 == 0, length[2] % 2 == 0); break; } default: throw std::runtime_error("Invalid dimension"); } } // Initialize the real input 
buffer where the data has lengths given in length, the transform has // lengths given in length and stride given in stride. The device buffer is assumed to have been // allocated. void init_hermitiancomplex_cm(const std::vector& length, const std::vector& ilength, const std::vector& stride, void* gpu_in) { switch(length.size()) { case 1: { const dim3 blockdim(256); const dim3 griddim(ceildiv(ilength[0], blockdim.x)); hipLaunchKernelGGL( initcdata1, blockdim, griddim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], stride[0]); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_1, dim3(1), dim3(1), 0, 0, (hipDoubleComplex*)gpu_in, length[0], stride[0], 1, 1, length[0] % 2 == 0); break; } case 2: { const dim3 blockdim(64, 64); const dim3 griddim(ceildiv(ilength[0], blockdim.x), ceildiv(ilength[1], blockdim.y)); hipLaunchKernelGGL(initcdata2, blockdim, griddim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], ilength[1], stride[0], stride[1]); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_2, dim3(256), dim3(ceildiv(ceildiv(ilength[1], 2), 256)), 0, 0, (hipDoubleComplex*)gpu_in, length[0], length[1], stride[0], stride[1], 1, 1, length[0] % 2 == 0, length[1] % 2 == 0); break; } case 3: { const dim3 blockdim(32, 32, 32); const dim3 griddim(ceildiv(ilength[0], blockdim.x), ceildiv(ilength[1], blockdim.y), ceildiv(ilength[2], blockdim.z)); hipLaunchKernelGGL(initcdata3, blockdim, griddim, 0, 0, (hipDoubleComplex*)gpu_in, ilength[0], ilength[1], ilength[2], stride[0], stride[1], stride[2]); hipLaunchKernelGGL(impose_hermitian_symmetry_interleaved_3, dim3(64, 64), dim3(ceildiv(ilength[1], 64), ceildiv(ceildiv(ilength[2], 2), 64)), 0, 0, (hipDoubleComplex*)gpu_in, length[0], length[1], length[2], stride[0], stride[1], stride[2], 1, 1, length[0] % 2 == 0, length[1] % 2 == 0, length[2] % 2 == 0); break; } default: throw std::runtime_error("Invalid dimension"); } impose_hermitian_symmetry_cm(length, ilength, stride, gpu_in); } #endif /* EXAMPLEKERNELS_H */ 
rocFFT-rocm-5.7.1/clients/samples/rocfft/exampleutils.h000066400000000000000000000136441446473624700231130ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #ifndef EXAMPLEUTILS_H #define EXAMPLEUTILS_H std::ostream& operator<<(std::ostream& stream, hipDoubleComplex c) { stream << "(" << c.x << "," << c.y << ")"; return stream; } // Increment the index (column-major) for looping over arbitrary dimensional loops with // dimensions length. 
template <class T1, class T2>
bool increment_cm(std::vector<T1>& index, const std::vector<T2>& length)
{
    for(size_t dim = 0; dim < length.size(); ++dim)
    {
        // A digit that is already outside its range is left untouched.
        if(!(index[dim] < length[dim]))
            continue;

        ++index[dim];
        if(index[dim] != length[dim])
            break; // no carry needed: done

        // This digit wrapped around; reset it and carry into the next dimension.
        index[dim] = 0;
    }
    // Keep looping until every digit is back at zero (i.e. we returned to the start).
    return std::any_of(index.begin(), index.end(), [](int digit) { return digit != 0; });
}

// Output a formatted general-dimensional array with given length and stride in batches
// separated by dist, in column-major order.
//
// Each element is followed by a space; one newline is emitted for every leading
// dimension whose index has just reached its last value, and each batch is terminated
// with std::endl.
template <class Tdata, class Tint1, class Tint2>
void printbuffer_cm(const std::vector<Tdata>& data,
                    const std::vector<Tint1>& length,
                    const std::vector<Tint2>& stride,
                    const size_t              nbatch,
                    const size_t              dist)
{
    for(size_t batch = 0; batch < nbatch; ++batch)
    {
        std::vector<int> index(length.size(), 0);
        do
        {
            // Linear offset of the current element: batch offset plus index . stride.
            const auto offset
                = std::inner_product(index.begin(), index.end(), stride.begin(), batch * dist);
            assert(offset >= 0);
            assert(offset < data.size());
            std::cout << data[offset] << " ";

            // Emit a line break for each consecutive leading dimension that is at its end.
            for(size_t d = 0; d < index.size() && index[d] == (length[d] - 1); ++d)
            {
                std::cout << "\n";
            }
        } while(increment_cm(index, length));
        std::cout << std::endl;
    }
}

// Check that a multi-dimensional array of complex values with dimensions length
// and stride stride, with nbatch copies separated by dist, is Hermitian-symmetric.
// Column-major version.
template bool check_symmetry_cm(const std::vector& data, const std::vector& length_cm, const std::vector& stride_cm, const size_t nbatch, const size_t dist, const bool verbose = true) { bool issymmetric = true; for(size_t b = 0; b < nbatch; b++) { std::vector index(length_cm.size()); std::fill(index.begin(), index.end(), 0); do { bool skip = false; std::vector negindex(index.size()); for(size_t idx = 0; idx < index.size(); ++idx) { if(index[0] > length_cm[0] / 2) { skip = true; break; } negindex[idx] = (length_cm[idx] - index[idx]) % length_cm[idx]; } if(negindex[0] > length_cm[0] / 2) { skip = true; } if(!skip) { const auto i = std::inner_product(index.begin(), index.end(), stride_cm.begin(), b * dist); const auto j = std::inner_product( negindex.begin(), negindex.end(), stride_cm.begin(), b * dist); if((data[i].x != data[j].x) or (data[i].y != -data[j].y)) { if(verbose) { std::cout << "("; std::string separator; for(auto val : index) { std::cout << separator << val; separator = ","; } std::cout << ")->"; std::cout << i << "\t"; std::cout << "("; separator = ""; for(auto val : negindex) { std::cout << separator << val; separator = ","; } std::cout << ")->"; std::cout << j << ":\t"; std::cout << data[i] << " " << data[j]; std::cout << "\tnot conjugate!" << std::endl; } issymmetric = false; } } } while(increment_cm(index, length_cm)); } return issymmetric; } #endif /* EXAMPLEUTILS_H */ rocFFT-rocm-5.7.1/clients/samples/rocfft/rocfft_example_callback.cpp000066400000000000000000000156611446473624700255450ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
* * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*******************************************************************************/ #include "rocfft.h" #include #include #include #include #include #include #include // example of using load/store callbacks with rocfft struct load_cbdata { double2* filter; double scale; }; __device__ double2 load_callback(double2* input, size_t offset, void* cbdata, void* sharedMem) { auto data = static_cast(cbdata); // multiply each element by filter element and scale return hipCmul(hipCmul(input[offset], data->filter[offset]), make_hipDoubleComplex(data->scale, data->scale)); } __device__ auto load_callback_dev = load_callback; int main() { const size_t N = 8; std::vector cx(N), filter(N); // initialize data and filter for(size_t i = 0; i < N; i++) { cx[i].x = i; cx[i].y = i; filter[i].x = rand() / static_cast(RAND_MAX); filter[i].y = 0; } // rocfft gpu compute // ================== if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); size_t Nbytes = N * sizeof(double2); // Create HIP device object. 
double2 *x, *filter_dev; // create buffers if(hipMalloc(&x, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(hipMalloc(&filter_dev, Nbytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Copy data to device hipError_t hip_status = hipMemcpy(x, cx.data(), Nbytes, hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); hip_status = hipMemcpy(filter_dev, filter.data(), Nbytes, hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Create plan rocfft_plan plan = nullptr; size_t length = N; if(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, nullptr) != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); // Check if the plan requires a work buffer size_t work_buf_size = 0; if(rocfft_plan_get_work_buffer_size(plan, &work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); void* work_buf = nullptr; rocfft_execution_info info = nullptr; if(rocfft_execution_info_create(&info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); if(work_buf_size) { if(hipMalloc(&work_buf, work_buf_size) != hipSuccess) throw std::runtime_error("hipMalloc failed."); if(rocfft_execution_info_set_work_buffer(info, work_buf, work_buf_size) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_work_buffer failed."); } // Prepare callback load_cbdata cbdata_host; cbdata_host.filter = filter_dev; cbdata_host.scale = 1.0 / static_cast(N); void* cbdata_dev; if(hipMalloc(&cbdata_dev, sizeof(load_cbdata)) != hipSuccess) throw std::runtime_error("hipMalloc failed."); hip_status = hipMemcpy(cbdata_dev, &cbdata_host, sizeof(load_cbdata), hipMemcpyHostToDevice); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // Get a 
properly-typed host pointer to the device function, as // rocfft_execution_info_set_load_callback expects void*. void* cbptr_host = nullptr; hip_status = hipMemcpyFromSymbol(&cbptr_host, HIP_SYMBOL(load_callback_dev), sizeof(void*)); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpyFromSymbol failed."); // set callback if(rocfft_execution_info_set_load_callback(info, &cbptr_host, &cbdata_dev, 0) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_load_callback failed."); // Execute plan if(rocfft_execute(plan, (void**)&x, nullptr, info) != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); // Clean up work buffer if(work_buf_size) { if(hipFree(work_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_execution_info_destroy(info) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); info = nullptr; } // Destroy plan if(rocfft_plan_destroy(plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); plan = nullptr; // Copy result back to host std::vector y(N); hip_status = hipMemcpy(&y[0], x, Nbytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); for(size_t i = 0; i < N; i++) { std::cout << "element " << i << " input: (" << cx[i].x << "," << cx[i].y << ")" << " output: (" << y[i].x << "," << y[i].y << ")" << std::endl; } if(hipFree(cbdata_dev) != hipSuccess) throw std::runtime_error("hipFree failed."); if(hipFree(filter_dev) != hipSuccess) throw std::runtime_error("hipFree failed."); if(hipFree(x) != hipSuccess) throw std::runtime_error("hipFree failed."); if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-5.7.1/clients/samples/rocfft/rocfft_example_complexcomplex.cpp000066400000000000000000000251121446473624700270400ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro 
Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include namespace po = boost::program_options; #include #include #include #include #include #include #include #include #include "examplekernels.h" #include "exampleutils.h" #include int main(int argc, char* argv[]) { std::cout << "rocfft double-precision complex-to-complex transform\n" << std::endl; // Length of transform: std::vector length = {8}; // Gpu device id: int deviceId = 0; // Command-line options: // clang-format off po::options_description desc("rocfft sample command line options"); desc.add_options()("help,h", "produces this help message") ("device", po::value(&deviceId)->default_value(0), "Select a specific device id") ("outofplace,o", "Perform an out-of-place transform") ("inverse,i", "Perform an inverse transform") ("length", po::value>(&length)->multitoken(), "Lengths of the transform separated by spaces (eg: --length 4 4)."); // clang-format on po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); po::notify(vm); if(vm.count("help")) { std::cout << desc << std::endl; return 0; } // Placeness for the transform if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = vm.count("outofplace") ? rocfft_placement_notinplace : rocfft_placement_inplace; const bool inplace = place == rocfft_placement_inplace; // Direction of transform const rocfft_transform_type direction = vm.count("inverse") ? 
rocfft_transform_type_complex_forward : rocfft_transform_type_complex_inverse; // Set up the strides and buffer size for the input: std::vector istride = {1}; for(unsigned int i = 1; i < length.size(); ++i) { istride.push_back(length[i - 1] * istride[i - 1]); } const size_t isize = length[length.size() - 1] * istride[istride.size() - 1]; // Set up the strides and buffer size for the output: std::vector ostride = {1}; for(unsigned int i = 1; i < length.size(); ++i) { ostride.push_back(length[i - 1] * ostride[i - 1]); } const size_t osize = length[length.size() - 1] * ostride[ostride.size() - 1]; // Print information about the transform: std::cout << "direction: "; if(direction == rocfft_transform_type_complex_forward) std::cout << "forward\n"; else std::cout << "inverse\n"; std::cout << "length:"; for(const auto i : length) std::cout << " " << i; std::cout << "\n"; if(inplace) std::cout << "in-place transform\n"; else std::cout << "out-of-place transform\n"; std::cout << "deviceID: " << deviceId << "\n"; std::cout << "input strides:"; for(auto i : istride) std::cout << " " << i; std::cout << "\n"; std::cout << "output strides:"; for(auto i : ostride) std::cout << " " << i; std::cout << "\n"; std::cout << "input size: " << isize << "\n"; std::cout << "output size: " << isize << "\n"; std::cout << std::endl; // Set the device: if(hipSetDevice(deviceId) != hipSuccess) throw std::runtime_error("hipSetDevice failed."); // Create HIP device object and allocate data hipDoubleComplex* gpu_in = nullptr; if(hipMalloc(&gpu_in, isize * sizeof(hipDoubleComplex)) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // Inititalize the data on the device initcomplex_cm(length, istride, gpu_in); if(hipDeviceSynchronize() != hipSuccess) throw std::runtime_error("hipDeviceSynchronize failed."); hipError_t hip_status = hipGetLastError(); if(hip_status != hipSuccess) throw std::runtime_error("device error"); std::cout << "input:\n"; std::vector idata(isize); hip_status = 
hipMemcpy(idata.data(), gpu_in, isize * sizeof(hipDoubleComplex), hipMemcpyDefault); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(idata, length, istride, 1, isize); // Create the a descrition struct to set data layout: rocfft_plan_description gpu_description = nullptr; // rocfft_status can be used to capture API status info rocfft_status rc = rocfft_plan_description_create(&gpu_description); if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan description"); rc = rocfft_plan_description_set_data_layout(gpu_description, rocfft_array_type_complex_interleaved, rocfft_array_type_complex_interleaved, nullptr, nullptr, istride.size(), // input stride length istride.data(), // input stride data 0, // input batch distance ostride.size(), // output stride length ostride.data(), // output stride data 0); // ouptut batch distance if(rc != rocfft_status_success) throw std::runtime_error("failed to set data layout"); // We can also pass "nullptr" instead of a description; rocFFT will use reasonable // default parameters. If the data isn't contiguous, we need to set strides, etc, // using the description. 
// Create the plan rocfft_plan gpu_plan = nullptr; rc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms gpu_description); // Description if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan"); // Get the execution info for the fft plan (in particular, work memory requirements): rocfft_execution_info planinfo = nullptr; rc = rocfft_execution_info_create(&planinfo); if(rc != rocfft_status_success) throw std::runtime_error("failed to create execution info"); size_t workbuffersize = 0; rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to get work buffer size"); // If the transform requires work memory, allocate a work buffer: void* wbuffer = nullptr; if(workbuffersize > 0) { hip_status = hipMalloc(&wbuffer, workbuffersize); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed."); rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to set work buffer."); } // If the transform is out-of-place, allocate the output buffer as well: double2* gpu_out = inplace ? 
gpu_in : nullptr; if(!inplace) { hip_status = hipMalloc(&gpu_out, osize * sizeof(hipDoubleComplex)); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed."); } // Execute the GPU transform: rc = rocfft_execute(gpu_plan, // plan (void**)&gpu_in, // in_buffer (void**)&gpu_out, // out_buffer planinfo); // execution info if(rc != rocfft_status_success) throw std::runtime_error("failed to execute."); // Get the output from the device and print to cout: std::cout << "output:\n"; std::vector odata(osize); hip_status = hipMemcpy(odata.data(), gpu_out, osize * sizeof(hipDoubleComplex), hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(odata, length, istride, 1, isize); // Clean up: free GPU memory: if(hipFree(gpu_in) != hipSuccess) throw std::runtime_error("hipFree failed."); if(!inplace) { if(hipFree(gpu_out) != hipSuccess) throw std::runtime_error("hipFree failed."); } if(wbuffer != nullptr) { if(hipFree(wbuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); } // Clean up: destroy plans: if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); planinfo = nullptr; if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) throw std::runtime_error("rocfft_plan_description_destroy failed."); gpu_description = nullptr; if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-5.7.1/clients/samples/rocfft/rocfft_example_realcomplex.cpp000066400000000000000000000303621446473624700263170ustar00rootroot00000000000000// Copyright (C) 2019 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include namespace po = boost::program_options; #include #include #include #include #include #include #include #include #include "examplekernels.h" #include "exampleutils.h" #include int main(int argc, char* argv[]) { std::cout << "rocfft double-precision real/complex transform\n" << std::endl; // Length of transform: std::vector length = {8}; // Gpu device id: int deviceId = 0; // Command-line options: // clang-format off po::options_description desc("rocfft sample command line options"); desc.add_options()("help,h", "produces this help message") ("device", po::value(&deviceId)->default_value(0), "Select a specific device id") ("outofplace,o", "Perform an out-of-place transform") ("inverse,i", "Perform an inverse transform") ("length", po::value>(&length)->multitoken(), "Lengths of the transform separated by spaces"); // clang-format on po::variables_map vm; po::store(po::parse_command_line(argc, argv, desc), vm); po::notify(vm); if(vm.count("help")) { std::cout << desc << std::endl; return 0; } // Placeness for the transform if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); const rocfft_result_placement place = vm.count("outofplace") ? rocfft_placement_notinplace : rocfft_placement_inplace; const bool inplace = place == rocfft_placement_inplace; // Direction of transform const rocfft_transform_type direction = vm.count("inverse") ? rocfft_transform_type_real_inverse : rocfft_transform_type_real_forward; const bool forward = direction == rocfft_transform_type_real_forward; // Set up the strides and buffer size for the real values: std::vector rstride = {1}; for(unsigned int i = 1; i < length.size(); ++i) { // In-place transforms need space for two extra real values in the contiguous // direction. auto val = (length[i - 1] + ((inplace && i == 1) ? 
2 : 0)) * rstride[i - 1]; rstride.push_back(val); } // NB: not tight, but hey const size_t real_size = length[length.size() - 1] * rstride[rstride.size() - 1]; std::vector rdata(real_size); // host storage // The complex data length is half + 1 of the real data length in the contiguous // dimensions. Since rocFFT is column-major, this is the first index. std::vector clength = length; clength[0] = clength[0] / 2 + 1; std::vector cstride = {1}; for(unsigned int i = 1; i < clength.size(); ++i) { cstride.push_back(clength[i - 1] * cstride[i - 1]); } const size_t complex_size = clength[clength.size() - 1] * cstride[cstride.size() - 1]; std::vector cdata(complex_size); // host storage // Based on the direction, we set the input and output parameters appropriately. const size_t isize = forward ? real_size : complex_size; const size_t ibytes = isize * (forward ? sizeof(double) : sizeof(hipDoubleComplex)); const std::vector ilength = forward ? length : clength; const std::vector istride = forward ? rstride : cstride; const size_t osize = forward ? complex_size : real_size; const size_t obytes = osize * (forward ? sizeof(hipDoubleComplex) : sizeof(double)); const std::vector olength = forward ? clength : length; const std::vector ostride = forward ? 
cstride : rstride; // Print information about the transform: std::cout << "direction: "; if(forward) std::cout << "forward\n"; else std::cout << "inverse\n"; std::cout << "length:"; for(const auto i : length) std::cout << " " << i; std::cout << "\n"; if(inplace) std::cout << "in-place transform\n"; else std::cout << "out-of-place transform\n"; std::cout << "deviceID: " << deviceId << "\n"; std::cout << "input length:"; for(auto i : ilength) std::cout << " " << i; std::cout << "\n"; std::cout << "input buffer stride:"; for(auto i : istride) std::cout << " " << i; std::cout << "\n"; std::cout << "input buffer size: " << ibytes << "\n"; std::cout << "output length:"; for(auto i : olength) std::cout << " " << i; std::cout << "\n"; std::cout << "output buffer stride:"; for(auto i : ostride) std::cout << " " << i; std::cout << "\n"; std::cout << "output buffer size: " << obytes << "\n"; std::cout << std::endl; // Set the device: if(hipSetDevice(deviceId) != hipSuccess) throw std::runtime_error("hipSetDevice failed."); // Create HIP device object and initialize data // Kernels are provided in examplekernels.h void* gpu_in = nullptr; hipError_t hip_status = hipMalloc(&gpu_in, inplace ? 
std::max(ibytes, obytes) : ibytes); if(hip_status != hipSuccess) throw std::runtime_error("device error"); if(forward) { initreal_cm(length, istride, gpu_in); } else { init_hermitiancomplex_cm(length, ilength, istride, gpu_in); } // Print the input: std::cout << "input:\n"; if(forward) { hip_status = hipMemcpy(rdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(rdata, ilength, istride, 1, isize); } else { hip_status = hipMemcpy(cdata.data(), gpu_in, ibytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(cdata, ilength, istride, 1, isize); // Check that the buffer is Hermitian symmetric: check_symmetry_cm(cdata, length, istride, 1, isize); } // rocfft_status can be used to capture API status info rocfft_status rc = rocfft_status_success; // Create the a descrition struct to set data layout: rocfft_plan_description gpu_description = nullptr; rc = rocfft_plan_description_create(&gpu_description); if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan description"); rc = rocfft_plan_description_set_data_layout( gpu_description, // input data format: forward ? rocfft_array_type_real : rocfft_array_type_hermitian_interleaved, // output data format: forward ? rocfft_array_type_hermitian_interleaved : rocfft_array_type_real, nullptr, nullptr, istride.size(), // input stride length istride.data(), // input stride data 0, // input batch distance ostride.size(), // output stride length ostride.data(), // output stride data 0); // ouptut batch distance if(rc != rocfft_status_success) throw std::runtime_error("failed to set data layout"); // We can also pass "nullptr" instead of a description; rocFFT will use reasonable // default parameters. If the data isn't contiguous, we need to set strides, etc, // using the description. 
// Create the FFT plan: rocfft_plan gpu_plan = nullptr; rc = rocfft_plan_create(&gpu_plan, place, direction, rocfft_precision_double, length.size(), // Dimension length.data(), // lengths 1, // Number of transforms gpu_description); // Description if(rc != rocfft_status_success) throw std::runtime_error("failed to create plan"); // Get the execution info for the fft plan (in particular, work memory requirements): rocfft_execution_info planinfo = nullptr; rc = rocfft_execution_info_create(&planinfo); if(rc != rocfft_status_success) throw std::runtime_error("failed to create execution info"); size_t workbuffersize = 0; rc = rocfft_plan_get_work_buffer_size(gpu_plan, &workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to get work buffer size"); // If the transform requires work memory, allocate a work buffer: void* wbuffer = nullptr; if(workbuffersize > 0) { hip_status = hipMalloc(&wbuffer, workbuffersize); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed"); rc = rocfft_execution_info_set_work_buffer(planinfo, wbuffer, workbuffersize); if(rc != rocfft_status_success) throw std::runtime_error("failed to set work buffer"); } // If the transform is out-of-place, allocate the output buffer as well: void* gpu_out = inplace ? 
gpu_in : nullptr; if(!inplace) { hip_status = hipMalloc(&gpu_out, obytes); if(hip_status != hipSuccess) throw std::runtime_error("hipMalloc failed"); } // Execute the GPU transform: rc = rocfft_execute(gpu_plan, // plan (void**)&gpu_in, // in_buffer (void**)&gpu_out, // out_buffer planinfo); // execution info if(rc != rocfft_status_success) throw std::runtime_error("failed to execute"); // Get the output from the device and print to cout: std::cout << "output:\n"; if(forward) { hip_status = hipMemcpy(cdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(cdata, olength, ostride, 1, osize); } else { hip_status = hipMemcpy(rdata.data(), gpu_out, obytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy failed."); printbuffer_cm(rdata, olength, ostride, 1, osize); } // Clean up: free GPU memory: if(hipFree(gpu_in) != hipSuccess) throw std::runtime_error("hipFree failed."); if(!inplace) { if(hipFree(gpu_out) != hipSuccess) throw std::runtime_error("hipFree failed."); } if(wbuffer != nullptr) { if(hipFree(wbuffer) != hipSuccess) throw std::runtime_error("hipFree failed."); } // Clean up: destroy plans: if(rocfft_execution_info_destroy(planinfo) != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); planinfo = nullptr; if(rocfft_plan_description_destroy(gpu_description) != rocfft_status_success) throw std::runtime_error("rocfft_plan_description_destroy failed."); gpu_description = nullptr; if(rocfft_plan_destroy(gpu_plan) != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); gpu_plan = nullptr; rocfft_cleanup(); return 0; } rocFFT-rocm-5.7.1/clients/samples/rocfft/rocfft_example_set_stream.cpp000066400000000000000000000126401446473624700261510ustar00rootroot00000000000000// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "rocfft.h" #include #include #include #include #include struct fft_fixture_t { std::vector cpu_buf; double2* gpu_buf = nullptr; hipStream_t stream = nullptr; rocfft_execution_info info = nullptr; rocfft_plan plan = nullptr; }; int main(int argc, char* argv[]) { std::cout << "rocfft example of 2 inplace transforms with 2 streams.\n" << std::endl; size_t length = 8; size_t total_bytes = length * sizeof(double2); hipError_t hip_status; rocfft_status fft_status; fft_fixture_t ffts[2]; /// preparation if(rocfft_setup() != rocfft_status_success) throw std::runtime_error("rocfft_setup failed."); for(auto& it : ffts) { // create cpu buffer it.cpu_buf.resize(length); // init cpu buffer... 
// create gpu buffer if(hipMalloc(&(it.gpu_buf), total_bytes) != hipSuccess) throw std::runtime_error("hipMalloc failed."); // copy host to device if(hipMemcpy(it.gpu_buf, it.cpu_buf.data(), total_bytes, hipMemcpyHostToDevice) != hipSuccess) throw std::runtime_error("hipMemcpy failed."); // create stream if(hipStreamCreate(&(it.stream)) != hipSuccess) throw std::runtime_error("hipStreamCreate failed."); // create execution info fft_status = rocfft_execution_info_create(&(it.info)); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_create failed."); // set stream // NOTE: The stream must be of type hipStream_t. // It is an error to pass the address of a hipStream_t object. fft_status = rocfft_execution_info_set_stream(it.info, it.stream); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_set_stream failed."); // create plan fft_status = rocfft_plan_create(&it.plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_double, 1, &length, 1, nullptr); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_plan_create failed."); size_t work_buf_size = 0; fft_status = rocfft_plan_get_work_buffer_size(it.plan, &work_buf_size); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_plan_get_work_buffer_size failed."); assert(work_buf_size == 0); // simple 1D inplace fft doesn't need extra working buffer } /// execution for(auto& it : ffts) { fft_status = rocfft_execute(it.plan, (void**)&(it.gpu_buf), (void**)&(it.gpu_buf), nullptr); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execute failed."); } /// wait and copy back for(auto& it : ffts) { if(hipStreamSynchronize(it.stream) != hipSuccess) throw std::runtime_error("hipStreamSynchronize failed."); hip_status = hipMemcpy(it.cpu_buf.data(), it.gpu_buf, total_bytes, hipMemcpyDeviceToHost); if(hip_status != hipSuccess) throw std::runtime_error("hipMemcpy 
failed."); } /// clean up for(auto& it : ffts) { fft_status = rocfft_plan_destroy(it.plan); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_plan_destroy failed."); fft_status = rocfft_execution_info_destroy(it.info); if(fft_status != rocfft_status_success) throw std::runtime_error("rocfft_execution_info_destroy failed."); if(hipStreamDestroy(it.stream) != hipSuccess) throw std::runtime_error("hipStreamDestroy failed."); if(hipFree(it.gpu_buf) != hipSuccess) throw std::runtime_error("hipFree failed."); } if(rocfft_cleanup() != rocfft_status_success) throw std::runtime_error("rocfft_cleanup failed."); return 0; } rocFFT-rocm-5.7.1/clients/tests/000077500000000000000000000000001446473624700164315ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/tests/CMakeLists.txt000066400000000000000000000235211446473624700211740ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ############################################################################# cmake_minimum_required( VERSION 3.16 ) # This should appear before the project command, because it does not # use FORCE if( WIN32 ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" ) else( ) set( CMAKE_INSTALL_PREFIX "/opt/rocm" CACHE PATH "Install path prefix, prepended onto install directories" ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user # specifies with -D. MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." 
) endif() project( rocfft-clients-tests LANGUAGES CXX ) set(CMAKE_CXX_STANDARD 17) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR}/cmake ) if( NOT TARGET rocfft ) find_package( rocfft REQUIRED CONFIG PATHS ) endif( ) if( NOT HIP_FOUND ) find_package( HIP REQUIRED ) endif() if( NOT ROCM_FOUND ) find_package( ROCM 0.7.3 REQUIRED ) endif() if( NOT hiprand_FOUND ) find_package( hiprand REQUIRED ) endif() include( ROCMInstallTargets ) set( rocfft-test_source gtest_main.cpp rocfft_accuracy_test.cpp accuracy_test.cpp accuracy_test_1D.cpp accuracy_test_2D.cpp accuracy_test_3D.cpp accuracy_test_adhoc.cpp accuracy_test_callback.cpp accuracy_test_checkstride.cpp multithread_test.cpp hermitian_test.cpp hipGraph_test.cpp default_callbacks_test.cpp unit_test.cpp misc/source/test_exception.cpp validate_length_stride.cpp random.cpp ../../shared/array_validator.cpp ) set( rocfft-test_includes fftw_transform.h rocfft_against_fftw.h misc/include/test_exception.h ) add_executable( rocfft-test ${rocfft-test_source} ${rocfft-test_includes} ) add_executable( rtc_helper_crash rtc_helper_crash.cpp ) find_package( Boost COMPONENTS program_options REQUIRED) set( Boost_DEBUG ON ) set( Boost_USE_MULTITHREADED ON ) set( Boost_DETAILED_FAILURE_MSG ON ) set( Boost_USE_STATIC_LIBS OFF ) option( BUILD_FFTW "Download and build FFTW" OFF ) # look for installed FFTW if we weren't asked to build it if( NOT BUILD_FFTW ) find_package( FFTW 3.0 MODULE COMPONENTS FLOAT DOUBLE ) endif() include( ExternalProject ) # also try to build FFTW if FFTW isn't present if( BUILD_FFTW OR NOT FFTW_FOUND ) set(FFTW_LIBRARIES_DOUBLE ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_double-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3_threads${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_double-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3${CMAKE_SHARED_LIBRARY_SUFFIX}) set(FFTW_LIBRARIES_SINGLE 
${CMAKE_CURRENT_BINARY_DIR}/src/fftw_single-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3f_threads${CMAKE_SHARED_LIBRARY_SUFFIX} ${CMAKE_CURRENT_BINARY_DIR}/src/fftw_single-build/${CMAKE_SHARED_LIBRARY_PREFIX}fftw3f${CMAKE_SHARED_LIBRARY_SUFFIX}) set(FFTW_CMAKE_ARGS_COMMON -DDISABLE_FORTRAN=ON -DENABLE_AVX2=ON -DENABLE_THREADS=ON -DBUILD_SHARED_LIBS=ON -DBUILD_TESTS=OFF -DCMAKE_C_COMPILER_LAUNCHER=${CMAKE_C_COMPILER_LAUNCHER}) set(FFTW_SRC_URL http://www.fftw.org/fftw-3.3.9.tar.gz CACHE STRING "Location of FFTW source code") set(FFTW_SRC_SHA256 bf2c7ce40b04ae811af714deb512510cc2c17b9ab9d6ddcf49fe4487eea7af3d CACHE STRING "SHA256 hash of FFTW source code") # build double-precision FFTW ExternalProject_Add(fftw_double URL ${FFTW_SRC_URL} URL_HASH SHA256=${FFTW_SRC_SHA256} SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/fftw PREFIX ${CMAKE_CURRENT_BINARY_DIR} CMAKE_ARGS ${FFTW_CMAKE_ARGS_COMMON} INSTALL_COMMAND "" BUILD_BYPRODUCTS ${FFTW_LIBRARIES_DOUBLE}) ExternalProject_Get_Property( fftw_double source_dir binary_dir ) # also build single-precision fftw from the same source dir ExternalProject_Add(fftw_single DOWNLOAD_COMMAND "" SOURCE_DIR ${CMAKE_CURRENT_BINARY_DIR}/src/fftw PREFIX ${CMAKE_CURRENT_BINARY_DIR} CMAKE_ARGS ${FFTW_CMAKE_ARGS_COMMON} -DENABLE_FLOAT=ON INSTALL_COMMAND "" BUILD_BYPRODUCTS ${FFTW_LIBRARIES_SINGLE} DEPENDS fftw_double) ExternalProject_Get_Property( fftw_single source_dir binary_dir ) set(FFTW_INCLUDES ${CMAKE_CURRENT_BINARY_DIR}/src/fftw/api) set(FFTW_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} ${FFTW_LIBRARIES_SINGLE}) # FFTW we build is always threaded set( FFTW_MULTITHREAD TRUE ) add_dependencies( rocfft-test fftw_double fftw_single ) rocm_install( FILES ${FFTW_LIBRARIES} DESTINATION ${CMAKE_INSTALL_LIBDIR}/fftw COMPONENT clients-common ) endif() set( rocfft-test_include_dirs $ $ $ $ ${ROCM_CLANG_ROOT}/include ) set( rocfft-test_link_libs ${FFTW_LIBRARIES} Boost::program_options ) include( ../cmake/build-gtest.cmake ) if( BUILD_GTEST OR NOT GTEST_FOUND ) 
add_dependencies( rocfft-test gtest ) list( APPEND rocfft-test_include_dirs ${GTEST_INCLUDE_DIRS} ) list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} ) else() list( APPEND rocfft-test_include_dirs $ ) list( APPEND rocfft-test_link_libs ${GTEST_LIBRARIES} ) endif() target_compile_options( rocfft-test PRIVATE ${WARNING_FLAGS} -Wno-cpp ) if( ROCFFT_RUNTIME_COMPILE ) target_compile_options( rocfft-test PRIVATE -DROCFFT_RUNTIME_COMPILE ) endif() target_include_directories( rocfft-test PRIVATE ${rocfft-test_include_dirs} ) if( NOT BUILD_SHARED_LIBS ) list(APPEND rocfft-test_link_libs ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS}) endif() target_link_libraries( rocfft-test PRIVATE hip::device roc::rocfft hip::hiprand ${rocfft-test_link_libs} ) if( USE_CUDA ) target_include_directories( rocfft-test PRIVATE $ $ ) target_compile_definitions( rocfft-test PRIVATE __HIP_PLATFORM_NVCC__ ) endif( ) target_link_libraries( rocfft-test PRIVATE ${ROCFFT_CLIENTS_HOST_LINK_LIBS} ${ROCFFT_CLIENTS_DEVICE_LINK_LIBS} ) option( BUILD_CLIENTS_TESTS_OPENMP "Build tests with OpenMP" ON ) if( BUILD_CLIENTS_TESTS_OPENMP ) if( CMAKE_CXX_COMPILER MATCHES ".*/hipcc$" ) target_compile_options( rocfft-test PRIVATE -fopenmp ) target_link_libraries( rocfft-test PRIVATE -fopenmp -L${HIP_CLANG_ROOT}/lib -Wl,-rpath=${HIP_CLANG_ROOT}/lib ) target_include_directories( rocfft-test PRIVATE ${HIP_CLANG_ROOT}/include ) else() if(CMAKE_CXX_COMPILER_ID STREQUAL "Clang") target_compile_options( rocfft-test PRIVATE -fopenmp=libomp ) target_link_options( rocfft-test PRIVATE -fopenmp=libomp ) endif() endif() endif() if(FFTW_MULTITHREAD) target_compile_options( rocfft-test PRIVATE -DFFTW_MULTITHREAD ) endif( ) set_target_properties( rocfft-test PROPERTIES DEBUG_POSTFIX "-d" CXX_STANDARD_REQUIRED ON ) if( ROCFFT_BUILD_SCOPE ) set( TESTS_OUT_DIR "/../staging" ) elseif( ROCFFT_CLIENTS_BUILD_SCOPE ) set( TESTS_OUT_DIR "/../bin" ) else() set( TESTS_OUT_DIR "/bin" ) endif() string( CONCAT 
TESTS_OUT_DIR "${PROJECT_BINARY_DIR}" ${TESTS_OUT_DIR} ) set_target_properties(rocfft-test PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${TESTS_OUT_DIR}) set_target_properties(rtc_helper_crash PROPERTIES RUNTIME_OUTPUT_DIRECTORY ${TESTS_OUT_DIR}) rocm_install(TARGETS rocfft-test rtc_helper_crash COMPONENT tests) if (WIN32) # Ensure tests run with HIP DLLs and not anything the driver owns # in system32. Libraries like amdhip64.dll are also in the HIP # runtime, and we need run with those. But the only way to make a # same-named DLL override something in system32 is to have it next # to the executable. So copy them in. file( GLOB third_party_dlls LIST_DIRECTORIES OFF CONFIGURE_DEPENDS ${HIP_DIR}/bin/*.dll C:/Windows/System32/libomp140*.dll ) foreach( file_i ${third_party_dlls}) add_custom_command( TARGET rocfft-test POST_BUILD COMMAND ${CMAKE_COMMAND} ARGS -E copy ${file_i} $ ) endforeach( file_i ) endif() rocFFT-rocm-5.7.1/clients/tests/accuracy_test.cpp000066400000000000000000000623511446473624700217750ustar00rootroot00000000000000// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "accuracy_test.h" #include "../../shared/rocfft_complex.h" #include // load/store callbacks - cbdata in each is actually a scalar double // with a number to apply to each element template __host__ __device__ Tdata load_callback(Tdata* input, size_t offset, void* cbdata, void* sharedMem) { auto testdata = static_cast(cbdata); // multiply each element by scalar if(input == testdata->base) return input[offset] * testdata->scalar; // wrong base address passed, return something obviously wrong else { // wrong base address passed, return something obviously wrong return input[0]; } } __device__ auto load_callback_dev_half = load_callback<_Float16>; __device__ auto load_callback_dev_complex_half = load_callback>; __device__ auto load_callback_dev_float = load_callback; __device__ auto load_callback_dev_complex_float = load_callback>; __device__ auto load_callback_dev_double = load_callback; __device__ auto load_callback_dev_complex_double = load_callback>; // load/store callbacks - cbdata in each is actually a scalar double // with a number to apply to each element template __host__ __device__ Tdata load_callback_round_trip_inverse(Tdata* input, size_t offset, void* cbdata, void* sharedMem) { auto testdata = static_cast(cbdata); // subtract each element by scalar if(input == testdata->base) return input[offset] - testdata->scalar; // wrong base address passed, return something obviously wrong else { // wrong base address passed, return something obviously wrong return input[0]; } } __device__ auto load_callback_round_trip_inverse_dev_half = load_callback_round_trip_inverse<_Float16>; __device__ auto load_callback_round_trip_inverse_dev_complex_half = load_callback_round_trip_inverse>; __device__ 
auto load_callback_round_trip_inverse_dev_float = load_callback_round_trip_inverse; __device__ auto load_callback_round_trip_inverse_dev_complex_float = load_callback_round_trip_inverse>; __device__ auto load_callback_round_trip_inverse_dev_double = load_callback_round_trip_inverse; __device__ auto load_callback_round_trip_inverse_dev_complex_double = load_callback_round_trip_inverse>; void* get_load_callback_host(fft_array_type itype, fft_precision precision, bool round_trip_inverse = false) { void* load_callback_host = nullptr; switch(itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(precision) { case fft_precision_half: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_half), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_complex_half), sizeof(void*)), hipSuccess); } return load_callback_host; case fft_precision_single: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_float), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_complex_float), sizeof(void*)), hipSuccess); } return load_callback_host; case fft_precision_double: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_complex_double), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_complex_double), sizeof(void*)), hipSuccess); } return load_callback_host; } } case fft_array_type_real: { switch(precision) { case fft_precision_half: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_half), sizeof(void*)), hipSuccess); } else { 
EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_half), sizeof(void*)), hipSuccess); } return load_callback_host; case fft_precision_single: if(round_trip_inverse) { EXPECT_EQ( hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_float), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_float), sizeof(void*)), hipSuccess); } return load_callback_host; case fft_precision_double: if(round_trip_inverse) { EXPECT_EQ( hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_round_trip_inverse_dev_double), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&load_callback_host, HIP_SYMBOL(load_callback_dev_double), sizeof(void*)), hipSuccess); } return load_callback_host; } } default: // planar is unsupported for now return load_callback_host; } } template __host__ __device__ static void store_callback(Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem) { auto testdata = static_cast(cbdata); // add scalar to each element if(output == testdata->base) { output[offset] = element + testdata->scalar; } // otherwise, wrong base address passed, just don't write } __device__ auto store_callback_dev_half = store_callback<_Float16>; __device__ auto store_callback_dev_complex_half = store_callback>; __device__ auto store_callback_dev_float = store_callback; __device__ auto store_callback_dev_complex_float = store_callback>; __device__ auto store_callback_dev_double = store_callback; __device__ auto store_callback_dev_complex_double = store_callback>; template __host__ __device__ static void store_callback_round_trip_inverse( Tdata* output, size_t offset, Tdata element, void* cbdata, void* sharedMem) { auto testdata = static_cast(cbdata); // add scalar to each element if(output == testdata->base) { output[offset] = element / testdata->scalar; } // otherwise, wrong base address passed, just don't write 
} __device__ auto store_callback_round_trip_inverse_dev_half = store_callback_round_trip_inverse<_Float16>; __device__ auto store_callback_round_trip_inverse_dev_complex_half = store_callback_round_trip_inverse>; __device__ auto store_callback_round_trip_inverse_dev_float = store_callback_round_trip_inverse; __device__ auto store_callback_round_trip_inverse_dev_complex_float = store_callback_round_trip_inverse>; __device__ auto store_callback_round_trip_inverse_dev_double = store_callback_round_trip_inverse; __device__ auto store_callback_round_trip_inverse_dev_complex_double = store_callback_round_trip_inverse>; void* get_store_callback_host(fft_array_type otype, fft_precision precision, bool round_trip_inverse = false) { void* store_callback_host = nullptr; switch(otype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(precision) { case fft_precision_half: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_half), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_complex_half), sizeof(void*)), hipSuccess); } return store_callback_host; case fft_precision_single: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_float), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_complex_float), sizeof(void*)), hipSuccess); } return store_callback_host; case fft_precision_double: if(round_trip_inverse) { EXPECT_EQ(hipMemcpyFromSymbol( &store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_complex_double), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_complex_double), sizeof(void*)), hipSuccess); } return store_callback_host; } } case 
fft_array_type_real: { switch(precision) { case fft_precision_half: if(round_trip_inverse) { EXPECT_EQ( hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_half), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_half), sizeof(void*)), hipSuccess); } return store_callback_host; case fft_precision_single: if(round_trip_inverse) { EXPECT_EQ( hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_float), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_float), sizeof(void*)), hipSuccess); } return store_callback_host; case fft_precision_double: if(round_trip_inverse) { EXPECT_EQ( hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_round_trip_inverse_dev_double), sizeof(void*)), hipSuccess); } else { EXPECT_EQ(hipMemcpyFromSymbol(&store_callback_host, HIP_SYMBOL(store_callback_dev_double), sizeof(void*)), hipSuccess); } return store_callback_host; } } default: // planar is unsupported for now return store_callback_host; } } // Apply store callback if necessary void apply_store_callback(const fft_params& params, std::vector& output) { if(!params.run_callbacks && params.scale_factor == 1.0) return; callback_test_data cbdata; cbdata.scalar = params.store_cb_scalar; cbdata.base = output.front().data(); switch(params.otype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(rocfft_complex<_Float16>); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, 
element, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(rocfft_complex); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(rocfft_complex); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } } } break; case fft_array_type_complex_planar: case fft_array_type_hermitian_planar: { // planar wouldn't run callbacks, but we could still want scaling switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(rocfft_complex<_Float16>); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; } } break; } case fft_precision_single: { const size_t elem_size = sizeof(rocfft_complex); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; } } break; } case fft_precision_double: { const size_t elem_size = sizeof(rocfft_complex); for(auto& buf : output) { const size_t num_elems = buf.size() / elem_size; auto 
output_begin = reinterpret_cast*>(buf.data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; } } break; } } } break; case fft_array_type_real: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(_Float16); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast<_Float16*>(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(float); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(double); const size_t num_elems = output.front().size() / elem_size; auto output_begin = reinterpret_cast(output.front().data()); for(size_t i = 0; i < num_elems; ++i) { auto& element = output_begin[i]; if(params.scale_factor != 1.0) element = element * params.scale_factor; if(params.run_callbacks) store_callback(output_begin, i, element, &cbdata, nullptr); } break; } } } break; default: // this is FFTW data which should always be interleaved (if complex) abort(); } } // apply load callback if necessary void apply_load_callback(const fft_params& params, std::vector& input) { if(!params.run_callbacks) return; // we're applying callbacks to FFTW input/output which we can // assume is contiguous and non-planar callback_test_data cbdata; cbdata.scalar = params.load_cb_scalar; 
cbdata.base = input.front().data(); switch(params.itype) { case fft_array_type_complex_interleaved: case fft_array_type_hermitian_interleaved: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(rocfft_complex<_Float16>); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(rocfft_complex); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(rocfft_complex); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } } } break; case fft_array_type_real: { switch(params.precision) { case fft_precision_half: { const size_t elem_size = sizeof(_Float16); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast<_Float16*>(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_single: { const size_t elem_size = sizeof(float); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast(input.front().data()); for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } case fft_precision_double: { const size_t elem_size = sizeof(double); const size_t num_elems = input.front().size() / elem_size; auto input_begin = reinterpret_cast(input.front().data()); 
for(size_t i = 0; i < num_elems; ++i) { input_begin[i] = load_callback(input_begin, i, &cbdata, nullptr); } break; } } } break; default: // this is FFTW data which should always be interleaved (if complex) abort(); } } rocFFT-rocm-5.7.1/clients/tests/accuracy_test.h000066400000000000000000002211011446473624700214300ustar00rootroot00000000000000// Copyright (C) 2020 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef ACCURACY_TEST #define ACCURACY_TEST #include #include #include #include #include #include #include "../../shared/enum_to_string.h" #include "../../shared/fft_params.h" #include "../../shared/gpubuf.h" #include "fftw_transform.h" #include "rocfft_against_fftw.h" #include "test_params.h" extern int verbose; extern size_t ramgb; static const size_t ONE_GiB = 1 << 30; inline size_t bytes_to_GiB(const size_t bytes) { return bytes == 0 ? 
0 : (bytes - 1 + ONE_GiB) / ONE_GiB; } typedef std::tuple type_place_io_t; // Remember the results of the last FFT we computed with FFTW. Tests // are ordered so that later cases can often reuse this result. struct last_cpu_fft_cache { // keys to the cache std::vector length; size_t nbatch = 0; fft_transform_type transform_type = fft_transform_type_complex_forward; bool run_callbacks = false; fft_precision precision = fft_precision_single; // FFTW input/output std::vector cpu_input; std::vector cpu_output; }; extern last_cpu_fft_cache last_cpu_fft_data; struct system_memory { size_t total_bytes = 0; size_t free_bytes = 0; }; extern system_memory start_memory; system_memory get_system_memory(); // Estimate the amount of host memory needed for buffers. inline size_t needed_ram_buffers(const fft_params& params, const int verbose) { // This calculation is assuming contiguous data but noncontiguous buffers // are assumed to require a close enough amount of space for the purposes // of this estimate. 
size_t needed_ram = 6 * std::accumulate(params.length.begin(), params.length.end(), static_cast(1), std::multiplies()); // Account for precision and data type: if(params.transform_type != fft_transform_type_real_forward && params.transform_type != fft_transform_type_real_inverse) { needed_ram *= 2; } switch(params.precision) { case fft_precision_half: needed_ram *= 2; break; case fft_precision_single: needed_ram *= 4; break; case fft_precision_double: needed_ram *= 8; break; } needed_ram *= params.nbatch; if(verbose) { std::cout << "required host memory for buffers (GiB): " << bytes_to_GiB(needed_ram) << "\n"; } return needed_ram; } template bool fftw_plan_uses_bluestein(const typename fftw_trait::fftw_plan_type& cpu_plan) { #ifdef FFTW_HAVE_SPRINT_PLAN char* print_plan_c_str = fftw_sprint_plan(cpu_plan); std::string print_plan(print_plan_c_str); free(print_plan_c_str); return print_plan.find("bluestein") != std::string::npos; #else // assume worst case (bluestein is always used) return true; #endif } // Estimate the amount of host memory needed for fftw. 
template inline size_t needed_ram_fftw(const fft_params& contiguous_params, const typename fftw_trait::fftw_plan_type& cpu_plan, const int verbose) { size_t total_length = std::accumulate(contiguous_params.length.begin(), contiguous_params.length.end(), static_cast(1), std::multiplies()); size_t needed_ram = 0; // Detect Bluestein in plan if(fftw_plan_uses_bluestein(cpu_plan)) { for(size_t dim : contiguous_params.length) { unsigned int needed_ram_dim = dim; // Next-plus-one-power-of-two multiplied any other lengths needed_ram_dim--; needed_ram_dim |= needed_ram_dim >> 2; needed_ram_dim |= needed_ram_dim >> 4; needed_ram_dim |= needed_ram_dim >> 8; needed_ram_dim |= needed_ram_dim >> 16; needed_ram_dim++; needed_ram_dim *= 2 * (total_length / dim); if(needed_ram_dim > needed_ram) { needed_ram = needed_ram_dim; } } } // Account for precision and data type: if(contiguous_params.transform_type != fft_transform_type_real_forward && contiguous_params.transform_type != fft_transform_type_real_inverse) { needed_ram *= 2; } switch(contiguous_params.precision) { case fft_precision_half: needed_ram *= 2; break; case fft_precision_single: needed_ram *= 4; break; case fft_precision_double: needed_ram *= 8; break; } needed_ram *= contiguous_params.nbatch; if(verbose) { std::cout << "required host memory for FFTW (GiB): " << bytes_to_GiB(needed_ram) << "\n"; } return needed_ram; } // Base gtest class for comparison with FFTW. 
class accuracy_test : public ::testing::TestWithParam { protected: void SetUp() override {} void TearDown() override {} public: static std::string TestName(const testing::TestParamInfo& info) { return info.param.token(); } }; const static std::vector batch_range = {2, 1}; const static std::vector precision_range_full = {fft_precision_double, fft_precision_single, fft_precision_half}; const static std::vector precision_range_sp_dp = {fft_precision_double, fft_precision_single}; const static std::vector place_range = {fft_placement_inplace, fft_placement_notinplace}; const static std::vector trans_type_range = {fft_transform_type_complex_forward, fft_transform_type_real_forward}; const static std::vector trans_type_range_complex = {fft_transform_type_complex_forward}; const static std::vector trans_type_range_real = {fft_transform_type_real_forward}; // Given a vector of vector of lengths, generate all unique permutations. // Add an optional vector of ad-hoc lengths to the result. inline std::vector> generate_lengths(const std::vector>& inlengths) { std::vector> output; if(inlengths.size() == 0) { return output; } const size_t dim = inlengths.size(); std::vector looplength(dim); for(unsigned int i = 0; i < dim; ++i) { looplength[i] = inlengths[i].size(); } for(unsigned int idx = 0; idx < inlengths.size(); ++idx) { std::vector index(dim); do { std::vector length(dim); for(unsigned int i = 0; i < dim; ++i) { length[i] = inlengths[i][index[i]]; } output.push_back(length); } while(increment_rowmajor(index, looplength)); } // uniquify the result std::sort(output.begin(), output.end()); output.erase(std::unique(output.begin(), output.end()), output.end()); return output; } // Return the valid rocFFT input and output types for a given transform type. 
inline std::vector> iotypes(const fft_transform_type transformType, const fft_result_placement place, const bool planar = true) { std::vector> iotypes; switch(transformType) { case fft_transform_type_complex_forward: case fft_transform_type_complex_inverse: iotypes.push_back(std::make_pair( fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)); if(planar) { iotypes.push_back(std::make_pair( fft_array_type_complex_planar, fft_array_type_complex_planar)); if(place == fft_placement_notinplace) { iotypes.push_back(std::make_pair( fft_array_type_complex_planar, fft_array_type_complex_interleaved)); iotypes.push_back(std::make_pair( fft_array_type_complex_interleaved, fft_array_type_complex_planar)); } } break; case fft_transform_type_real_forward: iotypes.push_back(std::make_pair( fft_array_type_real, fft_array_type_hermitian_interleaved)); if(planar && place == fft_placement_notinplace) { iotypes.push_back(std::make_pair( fft_array_type_real, fft_array_type_hermitian_planar)); } break; case fft_transform_type_real_inverse: iotypes.push_back(std::make_pair( fft_array_type_hermitian_interleaved, fft_array_type_real)); if(planar && place == fft_placement_notinplace) { iotypes.push_back(std::make_pair( fft_array_type_hermitian_planar, fft_array_type_real)); } break; default: throw std::runtime_error("Invalid transform type"); } return iotypes; } // Generate all combinations of input/output types, from combinations of transform and placement // types. 
static std::vector generate_types(fft_transform_type transform_type, const std::vector& place_range, const bool planar) { std::vector ret; for(auto place : place_range) { for(auto iotype : iotypes(transform_type, place, planar)) { ret.push_back(std::make_tuple(transform_type, place, iotype.first, iotype.second)); } } return ret; } struct stride_generator { struct stride_dist { stride_dist(const std::vector& s, size_t d) : stride(s) , dist(d) { } std::vector stride; size_t dist; }; // NOTE: allow for this ctor to be implicit, so it's less typing for a test writer // // cppcheck-suppress noExplicitConstructor stride_generator(const std::vector>& stride_list_in) : stride_list(stride_list_in) { } virtual std::vector generate(const std::vector& lengths, size_t batch) const { std::vector ret; for(const auto& s : stride_list) ret.emplace_back(s, 0); return ret; } std::vector> stride_list; }; // Generate strides such that batch is essentially the innermost dimension // e.g. given a batch-2 4x3x2 transform which logically looks like: // // batch0: // A B A B // A B A B // A B A B // // A B A B // A B A B // A B A B // // batch1: // A B A B // A B A B // A B A B // // A B A B // A B A B // A B A B // // we instead do stride-2 4x3x2 transform where first batch is the // A's and second batch is the B's. struct stride_generator_3D_inner_batch : public stride_generator { explicit stride_generator_3D_inner_batch(const std::vector>& stride_list_in) : stride_generator(stride_list_in) { } std::vector generate(const std::vector& lengths, size_t batch) const override { std::vector ret = stride_generator::generate(lengths, batch); std::vector strides{lengths[1] * lengths[2] * batch, lengths[2] * batch, batch}; ret.emplace_back(strides, 1); return ret; } }; // Create an array of parameters to pass to gtest. Base generator // that allows choosing transform type. 
inline auto param_generator_base(const std::vector& type_range, const std::vector>& v_lengths, const std::vector& precision_range, const std::vector& batch_range, decltype(generate_types) types_generator, const stride_generator& istride, const stride_generator& ostride, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, const bool planar = true, const bool run_callbacks = false) { std::vector params; // For any length, we compute double-precision CPU reference // for largest batch size first and reuse for smaller batch // sizes, then convert to single-precision. for(auto& transform_type : type_range) { for(const auto& lengths : v_lengths) { // try to ensure that we are given literal lengths, not // something to be passed to generate_lengths if(lengths.empty() || lengths.size() > 3) { continue; } { for(const auto precision : precision_range) { for(const auto batch : batch_range) { for(const auto& types : types_generator(transform_type, place_range, planar)) { for(const auto& istride_dist : istride.generate(lengths, batch)) { for(const auto& ostride_dist : ostride.generate(lengths, batch)) { for(const auto& ioffset : ioffset_range) { for(const auto& ooffset : ooffset_range) { fft_params param; param.length = lengths; param.istride = istride_dist.stride; param.ostride = ostride_dist.stride; param.nbatch = batch; param.precision = precision; param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.idist = istride_dist.dist; param.odist = ostride_dist.dist; param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.ioffset = ioffset; param.ooffset = ooffset; if(run_callbacks) { // add a test if both input and output support callbacks if(param.itype != fft_array_type_complex_planar && param.itype != fft_array_type_hermitian_planar && param.otype != fft_array_type_complex_planar && param.otype != fft_array_type_hermitian_planar) { param.run_callbacks = true; } else { continue; 
} } param.validate(); // Keeping the random number generator here // allows one to run the same tests for a given // random seed; ie the test suite is repeatable. std::hash hasher; std::ranlux24_base gen(random_seed + hasher(param.token())); std::uniform_real_distribution<> dis(0.0, 1.0); if(param.is_planar()) { const double roll = dis(gen); if(roll > planar_prob) { if(verbose > 4) { std::cout << "Planar transform skipped " "(planar_prob: " << planar_prob << " > " << roll << ")\n"; } continue; } } if(run_callbacks) { const double roll = dis(gen); if(roll > callback_prob) { if(verbose > 4) { std::cout << "Callback transform skipped " "(planar_prob: " << planar_prob << " > " << roll << ")\n"; } continue; } } if(param.valid(0)) { params.push_back(param); } } } } } } } } } } } return params; } // Create an array of parameters to pass to gtest. Default generator // that picks all transform types. inline auto param_generator(const std::vector>& v_lengths, const std::vector& precision_range, const std::vector& batch_range, const stride_generator& istride, const stride_generator& ostride, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, const bool planar, const bool run_callbacks = false) { return param_generator_base(trans_type_range, v_lengths, precision_range, batch_range, generate_types, istride, ostride, ioffset_range, ooffset_range, place_range, planar, run_callbacks); } // Create an array of parameters to pass to gtest. 
Only tests complex-type transforms inline auto param_generator_complex(const std::vector>& v_lengths, const std::vector& precision_range, const std::vector& batch_range, const stride_generator& istride, const stride_generator& ostride, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, const bool planar, const bool run_callbacks = false) { return param_generator_base(trans_type_range_complex, v_lengths, precision_range, batch_range, generate_types, istride, ostride, ioffset_range, ooffset_range, place_range, planar, run_callbacks); } // Create an array of parameters to pass to gtest. inline auto param_generator_real(const std::vector>& v_lengths, const std::vector& precision_range, const std::vector& batch_range, const stride_generator& istride, const stride_generator& ostride, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range, const bool planar, const bool run_callbacks = false) { return param_generator_base(trans_type_range_real, v_lengths, precision_range, batch_range, generate_types, istride, ostride, ioffset_range, ooffset_range, place_range, planar, run_callbacks); } template auto param_generator_token(const Tcontainer& tokens) { std::vector params; params.reserve(tokens.size()); for(auto t : tokens) { params.push_back({}); params.back().from_token(t); } return params; } struct callback_test_data { // scalar to modify the input/output with double scalar; // base address of input, to ensure that each callback gets an offset from that base void* base; }; void* get_load_callback_host(fft_array_type itype, fft_precision precision, bool round_trip_inverse); void apply_load_callback(const fft_params& params, std::vector& input); void apply_store_callback(const fft_params& params, std::vector& output); void* get_store_callback_host(fft_array_type otype, fft_precision precision, bool round_trip_inverse); static auto allocate_cpu_fft_buffer(const fft_precision 
precision, const fft_array_type type, const std::vector& size) { // FFTW does not support half-precision, so we do single instead. // So if we need to do a half-precision FFTW transform, allocate // enough buffer for single-precision instead. return allocate_host_buffer( precision == fft_precision_half ? fft_precision_single : precision, type, size); } template inline void execute_cpu_fft(fft_params& params, fft_params& contiguous_params, typename fftw_trait::fftw_plan_type& cpu_plan, std::vector& cpu_input, std::vector& cpu_output) { // CPU output might not be allocated already for us, if FFTW never // needed an output buffer during planning if(cpu_output.empty()) cpu_output = allocate_cpu_fft_buffer( contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); // If this is either C2R or callbacks are enabled, the // input will be modified. So we need to modify the copy instead. std::vector cpu_input_copy(cpu_input.size()); std::vector* input_ptr = &cpu_input; if(params.run_callbacks || contiguous_params.transform_type == fft_transform_type_real_inverse) { for(size_t i = 0; i < cpu_input.size(); ++i) { cpu_input_copy[i] = cpu_input[i].copy(); } input_ptr = &cpu_input_copy; } // run FFTW (which may destroy CPU input) apply_load_callback(params, *input_ptr); fftw_run(contiguous_params.transform_type, cpu_plan, *input_ptr, cpu_output); // clean up fftw_destroy_plan_type(cpu_plan); // ask FFTW to fully clean up, since it tries to cache plan details fftw_cleanup(); cpu_plan = nullptr; apply_store_callback(params, cpu_output); } // execute the GPU transform template inline void execute_gpu_fft(Tparams& params, std::vector& pibuffer, std::vector& pobuffer, std::vector& gpu_output, bool round_trip_inverse = false) { gpubuf_t load_cb_data_dev; gpubuf_t store_cb_data_dev; if(params.run_callbacks) { void* load_cb_host = get_load_callback_host(params.itype, params.precision, round_trip_inverse); callback_test_data load_cb_data_host; if(round_trip_inverse) 
{ load_cb_data_host.scalar = params.store_cb_scalar; } else { load_cb_data_host.scalar = params.load_cb_scalar; } load_cb_data_host.base = pibuffer.front(); auto hip_status = hipSuccess; hip_status = load_cb_data_dev.alloc(sizeof(callback_test_data)); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP(); } else { GTEST_FAIL(); } } hip_status = hipMemcpy(load_cb_data_dev.data(), &load_cb_data_host, sizeof(callback_test_data), hipMemcpyHostToDevice); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP(); } else { GTEST_FAIL(); } } void* store_cb_host = get_store_callback_host(params.otype, params.precision, round_trip_inverse); callback_test_data store_cb_data_host; if(round_trip_inverse) { store_cb_data_host.scalar = params.load_cb_scalar; } else { store_cb_data_host.scalar = params.store_cb_scalar; } store_cb_data_host.base = pobuffer.front(); hip_status = store_cb_data_dev.alloc(sizeof(callback_test_data)); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP(); } else { GTEST_FAIL(); } } hip_status = hipMemcpy(store_cb_data_dev.data(), &store_cb_data_host, sizeof(callback_test_data), hipMemcpyHostToDevice); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP(); } else { GTEST_FAIL(); } } auto fft_status = params.set_callbacks( load_cb_host, load_cb_data_dev.data(), store_cb_host, store_cb_data_dev.data()); if(fft_status != fft_status_success) throw std::runtime_error("set callback failure"); } // Execute the transform: auto fft_status = params.execute(pibuffer.data(), pobuffer.data()); if(fft_status != fft_status_success) throw std::runtime_error("rocFFT plan execution failure"); // copy GPU output back ASSERT_TRUE(!gpu_output.empty()) << "no output buffers"; for(unsigned int idx = 0; idx < gpu_output.size(); ++idx) { ASSERT_TRUE(gpu_output[idx].data() != nullptr) << "output buffer index " << idx << " is empty"; auto hip_status = 
hipMemcpy(gpu_output[idx].data(), pobuffer.at(idx), gpu_output[idx].size(), hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemcpy failure"; } else { GTEST_FAIL() << "hipMemcpy failure"; } } } if(verbose > 2) { std::cout << "GPU output:\n"; params.print_obuffer(gpu_output); } if(verbose > 5) { std::cout << "flat GPU output:\n"; params.print_obuffer_flat(gpu_output); } } template static void assert_init_value(const std::vector& output, const size_t idx, const Tfloat orig_value); template <> void assert_init_value(const std::vector& output, const size_t idx, const float orig_value) { float actual_value = reinterpret_cast(output.front().data())[idx]; ASSERT_EQ(actual_value, orig_value) << "index " << idx; } template <> void assert_init_value(const std::vector& output, const size_t idx, const double orig_value) { double actual_value = reinterpret_cast(output.front().data())[idx]; ASSERT_EQ(actual_value, orig_value) << "index " << idx; } template <> void assert_init_value(const std::vector& output, const size_t idx, const rocfft_complex orig_value) { // if this is interleaved, check directly if(output.size() == 1) { rocfft_complex actual_value = reinterpret_cast*>(output.front().data())[idx]; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } else { // planar rocfft_complex actual_value{ reinterpret_cast(output.front().data())[idx], reinterpret_cast(output.back().data())[idx]}; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } } template <> void assert_init_value(const std::vector& output, const size_t idx, const rocfft_complex orig_value) { // if this is interleaved, check directly if(output.size() == 1) { rocfft_complex actual_value = reinterpret_cast*>(output.front().data())[idx]; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; 
ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } else { // planar rocfft_complex actual_value{ reinterpret_cast(output.front().data())[idx], reinterpret_cast(output.back().data())[idx]}; ASSERT_EQ(actual_value.x, orig_value.x) << "x index " << idx; ASSERT_EQ(actual_value.y, orig_value.y) << "y index " << idx; } } static const int OUTPUT_INIT_PATTERN = 0xcd; template void check_single_output_stride(const std::vector& output, const size_t offset, const std::vector& length, const std::vector& stride, const size_t i) { Tfloat orig; memset(static_cast(&orig), OUTPUT_INIT_PATTERN, sizeof(Tfloat)); size_t curLength = length[i]; size_t curStride = stride[i]; size_t nextSmallerLength = i == length.size() - 1 ? 0 : length[i + 1]; size_t nextSmallerStride = i == stride.size() - 1 ? 0 : stride[i + 1]; if(nextSmallerLength == 0) { // this is the fastest dim, indexes that are not multiples of // the stride should be the initial value for(size_t idx = 0; idx < (curLength - 1) * curStride; ++idx) { if(idx % curStride != 0) assert_init_value(output, idx, orig); } } else { for(size_t lengthIdx = 0; lengthIdx < curLength; ++lengthIdx) { // check that the space after the next smaller dim and the // end of this dim is initial value for(size_t idx = nextSmallerLength * nextSmallerStride; idx < curStride; ++idx) assert_init_value(output, idx, orig); check_single_output_stride( output, offset + lengthIdx * curStride, length, stride, i + 1); } } } template void check_output_strides(const std::vector& output, Tparams& params) { // treat batch+dist like highest length+stride, if batch > 1 std::vector length; std::vector stride; if(params.nbatch > 1) { length.push_back(params.nbatch); stride.push_back(params.odist); } auto olength = params.olength(); std::copy(olength.begin(), olength.end(), std::back_inserter(length)); std::copy(params.ostride.begin(), params.ostride.end(), std::back_inserter(stride)); if(params.precision == fft_precision_single) { if(params.otype == 
fft_array_type_real) check_single_output_stride(output, 0, length, stride, 0); else check_single_output_stride>(output, 0, length, stride, 0); } else { if(params.otype == fft_array_type_real) check_single_output_stride(output, 0, length, stride, 0); else check_single_output_stride>(output, 0, length, stride, 0); } } // run rocFFT inverse transform template inline void run_round_trip_inverse(Tparams& params, std::vector& obuffer, std::vector& pibuffer, std::vector& pobuffer, std::vector& gpu_output) { params.validate(); // Make sure that the parameters make sense: ASSERT_TRUE(params.valid(verbose)); // Create FFT plan - this will also allocate work buffer, but will throw a // specific exception if that step fails auto plan_status = fft_status_success; try { plan_status = params.create_plan(); } catch(fft_params::work_buffer_alloc_failure& e) { std::stringstream ss; ss << "Failed to allocate work buffer (size: " << params.workbuffersize << ")"; ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } ASSERT_EQ(plan_status, fft_status_success) << "round trip inverse plan creation failed"; auto obuffer_sizes = params.obuffer_sizes(); if(params.placement != fft_placement_inplace) { for(unsigned int i = 0; i < obuffer_sizes.size(); ++i) { // If we're validating output strides, init the // output buffer to a known pattern and we can check // that the pattern is untouched in places that // shouldn't have been touched. 
if(params.check_output_strides) { auto hip_status = hipMemset(obuffer[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemset failure"; } else { GTEST_FAIL() << "hipMemset failure"; } } } } } // execute GPU transform // // limited scope for local variables execute_gpu_fft(params, pibuffer, pobuffer, gpu_output, true); } // compare rocFFT inverse transform with forward transform input template inline void compare_round_trip_inverse(Tparams& params, fft_params& contiguous_params, std::vector& gpu_output, std::vector& cpu_input, const VectorNorms& cpu_input_norm, size_t total_length) { if(params.check_output_strides) { check_output_strides(gpu_output, params); } // compute GPU output norm std::shared_future gpu_norm = std::async(std::launch::async, [&]() { return norm(gpu_output, params.olength(), params.nbatch, params.precision, params.otype, params.ostride, params.odist, params.ooffset); }); // compare GPU inverse output to CPU forward input std::unique_ptr>> linf_failures; if(verbose > 1) linf_failures = std::make_unique>>(); const double linf_cutoff = type_epsilon(params.precision) * cpu_input_norm.l_inf * log(total_length); VectorNorms diff = distance(cpu_input, gpu_output, params.olength(), params.nbatch, params.precision, contiguous_params.itype, contiguous_params.istride, contiguous_params.idist, params.otype, params.ostride, params.odist, linf_failures.get(), linf_cutoff, {0}, params.ooffset, 1.0 / total_length); if(verbose > 1) { std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; std::cout << "GPU linf norm failures:"; std::sort(linf_failures->begin(), linf_failures->end()); for(const auto& i : *linf_failures) { std::cout << " (" << i.first << "," << i.second << ")"; } std::cout << std::endl; } EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); 
EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); switch(params.precision) { case fft_precision_half: max_linf_eps_half = std::max(max_linf_eps_half, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); max_l2_eps_half = std::max(max_l2_eps_half, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); break; case fft_precision_single: max_linf_eps_single = std::max(max_linf_eps_single, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); max_l2_eps_single = std::max(max_l2_eps_single, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); break; case fft_precision_double: max_linf_eps_double = std::max(max_linf_eps_double, diff.l_inf / cpu_input_norm.l_inf / log(total_length)); max_l2_eps_double = std::max(max_l2_eps_double, diff.l_2 / cpu_input_norm.l_2 * sqrt(log2(total_length))); break; } if(verbose > 1) { std::cout << "L2 diff: " << diff.l_2 << "\n"; std::cout << "Linf diff: " << diff.l_inf << "\n"; } EXPECT_TRUE(diff.l_inf <= linf_cutoff) << "Linf test failed. Linf:" << diff.l_inf << "\tnormalized Linf: " << diff.l_inf / cpu_input_norm.l_inf << "\tcutoff: " << linf_cutoff << params.str(); EXPECT_TRUE(diff.l_2 / cpu_input_norm.l_2 < sqrt(log2(total_length)) * type_epsilon(params.precision)) << "L2 test failed. 
L2: " << diff.l_2 << "\tnormalized L2: " << diff.l_2 / cpu_input_norm.l_2 << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) << params.str(); } // RAII type to put data into the cache when this object leaves scope struct StoreCPUDataToCache { StoreCPUDataToCache(std::vector& cpu_input, std::vector& cpu_output) : cpu_input(cpu_input) , cpu_output(cpu_output) { } ~StoreCPUDataToCache() { last_cpu_fft_data.cpu_output.swap(cpu_output); last_cpu_fft_data.cpu_input.swap(cpu_input); } std::vector& cpu_input; std::vector& cpu_output; }; // run CPU + rocFFT transform with the given params and compare template inline void fft_vs_reference_impl(Tparams& params, bool round_trip) { // Make sure that the parameters make sense: ASSERT_TRUE(params.valid(verbose)); size_t needed_ram = needed_ram_buffers(params, verbose); if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) { GTEST_SKIP() << "needed_ramgb: " << bytes_to_GiB(needed_ram) << ", ramgb limit: " << ramgb << ".\n"; } auto ibuffer_sizes = params.ibuffer_sizes(); auto obuffer_sizes = params.obuffer_sizes(); size_t vram_avail = 0; if(vramgb == 0) { // Check free and total available memory: size_t free = 0; size_t total = 0; auto hip_status = hipMemGetInfo(&free, &total); if(hip_status != hipSuccess || total == 0) { ++n_hip_failures; std::stringstream ss; if(total == 0) ss << "hipMemGetInfo claims there there isn't any vram"; else ss << "hipMemGetInfo failure with error " << hip_status; if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } vram_avail = total; } else { vram_avail = vramgb * ONE_GiB; } // First try a quick estimation of vram footprint, to speed up skipping tests // that are too large to fit in the gpu (no plan created with the rocFFT backend) const auto raw_vram_footprint = params.fft_params_vram_footprint() + twiddle_table_vram_footprint(params); if(!vram_fits_problem(raw_vram_footprint, vram_avail)) { GTEST_SKIP() << "Raw problem size (" << 
bytes_to_GiB(raw_vram_footprint) << " GiB) raw data too large for device"; } if(verbose > 2) { std::cout << "Raw problem size: " << raw_vram_footprint << std::endl; } // If it passed the quick estimation test, go for the more // accurate calculation that actually creates the plan and // take into account the work buffer size const auto vram_footprint = params.vram_footprint(); if(!vram_fits_problem(vram_footprint, vram_avail)) { if(verbose) { std::cout << "Problem raw data won't fit on device; skipped." << std::endl; } GTEST_SKIP() << "Problem size (" << bytes_to_GiB(vram_footprint) << " GiB) raw data too large for device"; } // Create FFT plan - this will also allocate work buffer, but // will throw a specific exception if that step fails auto plan_status = fft_status_success; try { plan_status = params.create_plan(); } catch(fft_params::work_buffer_alloc_failure& e) { ++n_hip_failures; std::stringstream ss; ss << "Work buffer allocation failed with size: " << params.workbuffersize; if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } ASSERT_EQ(plan_status, fft_status_success) << "plan creation failed"; if(!vram_fits_problem(vram_footprint, vram_avail)) { if(verbose) { std::cout << "Problem won't fit on device; skipped." 
<< std::endl; } GTEST_SKIP() << "Problem size (" << vram_footprint << ") too large for device"; return; } fft_params contiguous_params; contiguous_params.length = params.length; contiguous_params.precision = params.precision; contiguous_params.placement = fft_placement_notinplace; contiguous_params.transform_type = params.transform_type; contiguous_params.nbatch = params.nbatch; contiguous_params.itype = contiguous_itype(params.transform_type); contiguous_params.otype = contiguous_otype(contiguous_params.transform_type); contiguous_params.validate(); if(!contiguous_params.valid(verbose)) { throw std::runtime_error("Invalid contiguous params"); } if(verbose > 3) { std::cout << "CPU params:\n"; std::cout << contiguous_params.str("\n\t") << std::endl; } std::vector ibuffer(ibuffer_sizes.size()); std::vector pibuffer(ibuffer_sizes.size()); for(unsigned int i = 0; i < ibuffer.size(); ++i) { auto hip_status = ibuffer[i].alloc(ibuffer_sizes[i]); if(hip_status != hipSuccess) { std::stringstream ss; ss << "hipMalloc failure for input buffer " << i << " size " << ibuffer_sizes[i] << "(" << bytes_to_GiB(ibuffer_sizes[i]) << " GiB)" << " with code " << hipError_to_string(hip_status); ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } pibuffer[i] = ibuffer[i].data(); } // allocation counts in elements, ibuffer_sizes is in bytes auto ibuffer_sizes_elems = ibuffer_sizes; for(auto& buf : ibuffer_sizes_elems) buf /= var_size(params.precision, params.itype); // Check cache first - nbatch is a >= comparison because we compute // the largest batch size and cache it. Smaller batch runs can // compare against the larger data. 
std::vector cpu_input; std::vector cpu_output; std::shared_future convert_cpu_output_precision; std::shared_future convert_cpu_input_precision; bool run_fftw = true; std::unique_ptr store_to_cache; if(last_cpu_fft_data.length == params.length && last_cpu_fft_data.transform_type == params.transform_type && last_cpu_fft_data.run_callbacks == params.run_callbacks) { if(last_cpu_fft_data.nbatch >= params.nbatch) { // use the cached input/output cpu_input.swap(last_cpu_fft_data.cpu_input); cpu_output.swap(last_cpu_fft_data.cpu_output); run_fftw = false; store_to_cache = std::make_unique(cpu_input, cpu_output); if(params.precision != last_cpu_fft_data.precision) { // Tests should be ordered so we do wider first, then narrower. switch(params.precision) { case fft_precision_double: std::cerr << "test ordering is incorrect: double precision follows a narrower one" << std::endl; abort(); break; case fft_precision_single: if(last_cpu_fft_data.precision != fft_precision_double) { std::cerr << "test ordering is incorrect: float precision follows a narrower one" << std::endl; abort(); } // convert the input/output to single-precision convert_cpu_output_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_output.front()); }); convert_cpu_input_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_input.front()); }); break; case fft_precision_half: // convert to half precision if(last_cpu_fft_data.precision == fft_precision_double) { convert_cpu_output_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_output.front()); }); convert_cpu_input_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_input.front()); }); } else if(last_cpu_fft_data.precision == fft_precision_single) { convert_cpu_output_precision = std::async(std::launch::async, [&]() { narrow_precision_inplace(cpu_output.front()); }); convert_cpu_input_precision = std::async(std::launch::async, [&]() { 
narrow_precision_inplace(cpu_input.front()); }); } else { std::cerr << "unhandled previous precision, cannot convert to half" << std::endl; abort(); } break; } last_cpu_fft_data.precision = params.precision; } } // If the last result has a smaller batch than the new // params, that might be a developer error - tests should be // ordered to generate the bigger batch first. But if tests // got filtered or skipped due to insufficient memory, we // might never have tried to generate the bigger batch first. // So just fall through and redo the CPU FFT. } else { // Clear cache explicitly so that even if we didn't get a hit, // we're not uselessly holding on to cached cpu input/output last_cpu_fft_data = last_cpu_fft_cache(); } // Allocate CPU input if(run_fftw) { cpu_input = allocate_cpu_fft_buffer( contiguous_params.precision, contiguous_params.itype, contiguous_params.isize); } // Create FFTW plan - this may write to input, but that's fine // since there's nothing in there right now typename fftw_trait::fftw_plan_type cpu_plan = nullptr; if(run_fftw) { // Normally, we would want to defer allocation of CPU output // buffer until when we actually do the CPU FFT. But if we're // using FFTW wisdom, FFTW needs an output buffer at plan // creation time. if(use_fftw_wisdom) { cpu_output = allocate_cpu_fft_buffer( contiguous_params.precision, contiguous_params.otype, contiguous_params.osize); } cpu_plan = fftw_plan_via_rocfft(contiguous_params.length, contiguous_params.istride, contiguous_params.ostride, contiguous_params.nbatch, contiguous_params.idist, contiguous_params.odist, contiguous_params.transform_type, cpu_input, cpu_output); needed_ram += needed_ram_fftw(contiguous_params, cpu_plan, verbose); if(ramgb > 0 && needed_ram > ramgb * ONE_GiB) { if(verbose) { std::cout << "Problem exceeds memory limit; skipped [rocfft_transform]." 
<< std::endl; } GTEST_SKIP(); return; } } std::vector gpu_input_data = allocate_host_buffer(params.precision, params.itype, ibuffer_sizes_elems); // allocate and populate the input buffer (cpu/gpu) if(run_fftw) { //generate the input directly on the gpu params.compute_input(ibuffer); // Copy the input to CPU if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride || params.idist != contiguous_params.idist || params.isize != contiguous_params.isize) { // Copy input to CPU for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { auto hip_status = hipMemcpy(gpu_input_data.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; } else { GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; } } } copy_buffers(gpu_input_data, cpu_input, params.ilength(), params.nbatch, params.precision, params.itype, params.istride, params.idist, contiguous_params.itype, contiguous_params.istride, contiguous_params.idist, params.ioffset, contiguous_params.ioffset); } else { // Copy input to CPU for(unsigned int idx = 0; idx < ibuffer.size(); ++idx) { auto hip_status = hipMemcpy(cpu_input.at(idx).data(), ibuffer[idx].data(), ibuffer_sizes[idx], hipMemcpyDeviceToHost); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; } else { GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; } } } } } else { // In case the cached cpu input needed conversion, wait for it if(convert_cpu_input_precision.valid()) convert_cpu_input_precision.get(); // gets a pre-computed gpu input buffer from the cpu cache std::vector* gpu_input = &cpu_input; if(params.itype != contiguous_params.itype || params.istride != contiguous_params.istride || params.idist != contiguous_params.idist || params.isize != 
contiguous_params.isize) { copy_buffers(cpu_input, gpu_input_data, params.ilength(), params.nbatch, params.precision, contiguous_params.itype, contiguous_params.istride, contiguous_params.idist, params.itype, params.istride, params.idist, {0}, params.ioffset); gpu_input = &gpu_input_data; } // Copy input to GPU for(unsigned int idx = 0; idx < gpu_input->size(); ++idx) { auto hip_status = hipMemcpy(ibuffer[idx].data(), gpu_input->at(idx).data(), ibuffer_sizes[idx], hipMemcpyHostToDevice); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemcpy failure with error " << hip_status; } else { GTEST_FAIL() << "hipMemcpy failure with error " << hip_status; } } } } if(verbose > 3) { std::cout << "CPU input:\n"; contiguous_params.print_ibuffer(cpu_input); } // compute input norm std::shared_future cpu_input_norm = std::async(std::launch::async, [&]() { // in case the cached cpu input needed conversion, wait for it if(convert_cpu_input_precision.valid()) convert_cpu_input_precision.get(); auto input_norm = norm(cpu_input, contiguous_params.ilength(), contiguous_params.nbatch, contiguous_params.precision, contiguous_params.itype, contiguous_params.istride, contiguous_params.idist, contiguous_params.ioffset); if(verbose > 2) { std::cout << "CPU Input Linf norm: " << input_norm.l_inf << "\n"; std::cout << "CPU Input L2 norm: " << input_norm.l_2 << "\n"; } return input_norm; }); std::vector obuffer_data; std::vector* obuffer = &obuffer_data; std::vector pobuffer; // allocate the output buffer if(params.placement == fft_placement_inplace) { obuffer = &ibuffer; } else { auto obuffer_sizes = params.obuffer_sizes(); obuffer_data.resize(obuffer_sizes.size()); for(unsigned int i = 0; i < obuffer_data.size(); ++i) { auto hip_status = obuffer_data[i].alloc(obuffer_sizes[i]); if(hip_status != hipSuccess) { ++n_hip_failures; std::stringstream ss; ss << "hipMalloc failure for output buffer " << i << " size " << obuffer_sizes[i] << "(" << 
bytes_to_GiB(obuffer_sizes[i]) << " GiB)" << " with code " << hipError_to_string(hip_status); if(skip_runtime_fails) { GTEST_SKIP() << ss.str(); } else { GTEST_FAIL() << ss.str(); } } // If we're validating output strides, init the // output buffer to a known pattern and we can check // that the pattern is untouched in places that // shouldn't have been touched. if(params.check_output_strides) { hip_status = hipMemset(obuffer_data[i].data(), OUTPUT_INIT_PATTERN, obuffer_sizes[i]); if(hip_status != hipSuccess) { ++n_hip_failures; if(skip_runtime_fails) { GTEST_SKIP() << "hipMemset failure with error " << hip_status; } else { GTEST_FAIL() << "hipMemset failure with error " << hip_status; } } } } } pobuffer.resize(obuffer->size()); for(unsigned int i = 0; i < obuffer->size(); ++i) { pobuffer[i] = obuffer->at(i).data(); } // Run CPU transform // // NOTE: This must happen after input is copied to GPU and input // norm is computed, since the CPU FFT may overwrite the input. VectorNorms cpu_output_norm; std::shared_future cpu_fft = std::async(std::launch::async, [&]() { // wait for input norm to finish, since we might overwrite input cpu_input_norm.get(); if(run_fftw) execute_cpu_fft(params, contiguous_params, cpu_plan, cpu_input, cpu_output); // in case the cached cpu output needed conversion, wait for it else if(convert_cpu_output_precision.valid()) convert_cpu_output_precision.get(); if(verbose > 3) { std::cout << "CPU output:\n"; contiguous_params.print_obuffer(cpu_output); } cpu_output_norm = norm(cpu_output, params.olength(), params.nbatch, params.precision, contiguous_params.otype, contiguous_params.ostride, contiguous_params.odist, contiguous_params.ooffset); if(verbose > 2) { std::cout << "CPU Output Linf norm: " << cpu_output_norm.l_inf << "\n"; std::cout << "CPU Output L2 norm: " << cpu_output_norm.l_2 << "\n"; } }); // execute GPU transform // // limited scope for local variables std::vector gpu_output = allocate_host_buffer(params.precision, params.otype, 
params.osize); execute_gpu_fft(params, pibuffer, pobuffer, gpu_output); params.free(); if(params.check_output_strides) { check_output_strides(gpu_output, params); } // compute GPU output norm std::shared_future gpu_norm = std::async(std::launch::async, [&]() { return norm(gpu_output, params.olength(), params.nbatch, params.precision, params.otype, params.ostride, params.odist, params.ooffset); }); // compare output // // Compute the l-infinity and l-2 distance between the CPU and GPU output: // wait for cpu FFT so we can compute cutoff const auto total_length = std::accumulate(params.length.begin(), params.length.end(), static_cast(1), std::multiplies()); std::unique_ptr>> linf_failures; if(verbose > 1) linf_failures = std::make_unique>>(); double linf_cutoff; VectorNorms diff; std::shared_future compare_output = std::async(std::launch::async, [&]() { cpu_fft.get(); linf_cutoff = type_epsilon(params.precision) * cpu_output_norm.l_inf * log(total_length); diff = distance(cpu_output, gpu_output, params.olength(), params.nbatch, params.precision, contiguous_params.otype, contiguous_params.ostride, contiguous_params.odist, params.otype, params.ostride, params.odist, linf_failures.get(), linf_cutoff, {0}, params.ooffset); }); // Update the cache if this current transform is different from // what's stored. But if this transform only has a smaller batch // than what's cached, we can still keep the cache around since // the input/output we already have is still valid. 
const bool update_last_cpu_fft_data = last_cpu_fft_data.length != params.length || last_cpu_fft_data.transform_type != params.transform_type || last_cpu_fft_data.run_callbacks != params.run_callbacks || last_cpu_fft_data.precision != params.precision || params.nbatch > last_cpu_fft_data.nbatch; // store cpu output in cache if(update_last_cpu_fft_data) { last_cpu_fft_data.length = params.length; last_cpu_fft_data.nbatch = params.nbatch; last_cpu_fft_data.transform_type = params.transform_type; last_cpu_fft_data.run_callbacks = params.run_callbacks; last_cpu_fft_data.precision = params.precision; } compare_output.get(); if(!store_to_cache) store_to_cache = std::make_unique(cpu_input, cpu_output); Tparams params_inverse; if(round_trip) { params_inverse.inverse_from_forward(params); run_round_trip_inverse( params_inverse, ibuffer, pobuffer, pibuffer, gpu_input_data); } ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_2)); ASSERT_TRUE(std::isfinite(cpu_input_norm.get().l_inf)); ASSERT_TRUE(std::isfinite(cpu_output_norm.l_2)); ASSERT_TRUE(std::isfinite(cpu_output_norm.l_inf)); if(verbose > 1) { std::cout << "GPU output Linf norm: " << gpu_norm.get().l_inf << "\n"; std::cout << "GPU output L2 norm: " << gpu_norm.get().l_2 << "\n"; std::cout << "GPU linf norm failures:"; std::sort(linf_failures->begin(), linf_failures->end()); for(const auto& i : *linf_failures) { std::cout << " (" << i.first << "," << i.second << ")"; } std::cout << std::endl; } EXPECT_TRUE(std::isfinite(gpu_norm.get().l_inf)) << params.str(); EXPECT_TRUE(std::isfinite(gpu_norm.get().l_2)) << params.str(); switch(params.precision) { case fft_precision_half: max_linf_eps_half = std::max(max_linf_eps_half, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); max_l2_eps_half = std::max(max_l2_eps_half, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); break; case fft_precision_single: max_linf_eps_single = std::max(max_linf_eps_single, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); 
max_l2_eps_single = std::max(max_l2_eps_single, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); break; case fft_precision_double: max_linf_eps_double = std::max(max_linf_eps_double, diff.l_inf / cpu_output_norm.l_inf / log(total_length)); max_l2_eps_double = std::max(max_l2_eps_double, diff.l_2 / cpu_output_norm.l_2 * sqrt(log2(total_length))); break; } if(verbose > 1) { std::cout << "L2 diff: " << diff.l_2 << "\n"; std::cout << "Linf diff: " << diff.l_inf << "\n"; } EXPECT_TRUE(diff.l_inf <= linf_cutoff) << "Linf test failed. Linf:" << diff.l_inf << "\tnormalized Linf: " << diff.l_inf / cpu_output_norm.l_inf << "\tcutoff: " << linf_cutoff << params.str(); EXPECT_TRUE(diff.l_2 / cpu_output_norm.l_2 < sqrt(log2(total_length)) * type_epsilon(params.precision)) << "L2 test failed. L2: " << diff.l_2 << "\tnormalized L2: " << diff.l_2 / cpu_output_norm.l_2 << "\tepsilon: " << sqrt(log2(total_length)) * type_epsilon(params.precision) << params.str(); if(round_trip) { compare_round_trip_inverse(params_inverse, contiguous_params, gpu_input_data, cpu_input, cpu_input_norm.get(), total_length); } } #endif rocFFT-rocm-5.7.1/clients/tests/accuracy_test_1D.cpp000066400000000000000000000660711446473624700223240ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include "accuracy_test.h" #include "fftw_transform.h" #include "rocfft_against_fftw.h" using ::testing::ValuesIn; // TODO: handle special case where length=2 for real/complex transforms. const static std::vector pow2_range = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536, 131072, 262144, 524288, 1048576, 2097152, 4194304, 8388608, 16777216, 33554432, 67108864, 134217728, 268435456, 536870912, 1073741824}; const static std::vector pow2_range_half = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192, 16384, 32768, 65536}; const static std::vector pow3_range = {3, 9, 27, 81, 243, 729, 2187, 6561, 19683, 59049, 177147, 531441, 1594323, 4782969, 14348907, 43046721, 129140163, 387420489}; const static std::vector pow5_range = {5, 25, 125, 625, 3125, 15625, 78125, 390625, 1953125, 9765625, 48828125, 244140625}; // radix 7, 11, 13 sizes that are either pure powers or sizes people have wanted in the wild const static std::vector radX_range = {7, 49, 84, 112, 11, 13, 52, 104, 208, 343, 2401, 16807}; const static std::vector mix_range = {6, 10, 12, 15, 20, 30, 56, 120, 150, 225, 240, 300, 336, 486, 600, 900, 1250, 1500, 1875, 2160, 2187, 2250, 2500, 3000, 4000, 12000, 24000, 72000}; const static std::vector prime_range = {17, 19, 23, 29, 31, 37, 41, 43, 47, 53, 59, 61, 67, 71, 73, 79, 83, 89, 97}; static std::vector small_1D_sizes() { static const size_t SMALL_1D_MAX = 8192; // generate a list of sizes from 2 and up, 
skipping any sizes that are already covered std::vector covered_sizes; std::copy(pow2_range.begin(), pow2_range.end(), std::back_inserter(covered_sizes)); std::copy(pow3_range.begin(), pow3_range.end(), std::back_inserter(covered_sizes)); std::copy(pow5_range.begin(), pow5_range.end(), std::back_inserter(covered_sizes)); std::copy(radX_range.begin(), radX_range.end(), std::back_inserter(covered_sizes)); std::copy(mix_range.begin(), mix_range.end(), std::back_inserter(covered_sizes)); std::copy(prime_range.begin(), prime_range.end(), std::back_inserter(covered_sizes)); std::sort(covered_sizes.begin(), covered_sizes.end()); std::vector output; for(size_t i = 2; i < SMALL_1D_MAX; ++i) { if(!std::binary_search(covered_sizes.begin(), covered_sizes.end(), i)) { output.push_back(i); } } return output; } const static std::vector> stride_range = {{1}}; const static std::vector batch_range_1D = {4, 2, 1}; const static std::vector> stride_range_for_prime = {{1}, {2}, {3}, {64}, {65}}; //TODO: this will be merged back to stride_range const static std::vector> ioffset_range_zero = {{0, 0}}; const static std::vector> ooffset_range_zero = {{0, 0}}; const static std::vector> ioffset_range = {{0, 0}, {1, 1}}; const static std::vector> ooffset_range = {{0, 0}, {1, 1}}; static std::vector generate_random(size_t number_run) { std::vector output(number_run); const size_t RAND_MAX_NUMBER = 6; for(size_t r = 0; r < number_run; r++) { // Generate a integer number between [0, RAND_MAX - 1] size_t i, j, k; do { i = (size_t)(rand() % RAND_MAX_NUMBER); j = (size_t)(rand() % RAND_MAX_NUMBER); k = (size_t)(rand() % RAND_MAX_NUMBER); } while(i + j + k == 0); output[i] = pow(2, i) * pow(3, j) * pow(5, k); } return output; } INSTANTIATE_TEST_SUITE_P(pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); 
INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range_half}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_1D_half, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range_half}), {fft_precision_half}, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(radX_1D, accuracy_test, 
::testing::ValuesIn(param_generator(generate_lengths({radX_range}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_radX_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({radX_range}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range}), precision_range_sp_dp, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({mix_range}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_1D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({mix_range}), precision_range_full, batch_range_1D, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); // small 1D sizes just need to make sure our factorization isn't // completely broken, so we just check simple C2C outplace interleaved INSTANTIATE_TEST_SUITE_P(small_1D, accuracy_test, ::testing::ValuesIn(param_generator_base( {fft_transform_type_complex_forward}, generate_lengths({small_1D_sizes()}), {fft_precision_single}, {1}, [](fft_transform_type t, const std::vector& place_range, const bool planar) 
{ return std::vector{ std::make_tuple(t, place_range[0], fft_array_type_complex_interleaved, fft_array_type_complex_interleaved)}; }, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, {fft_placement_notinplace}, true)), accuracy_test::TestName); // NB: // We have known non-unit strides issues for 1D: // - C2C middle size(for instance, single precision, 8192) // - C2C large size(for instance, single precision, 524288) // We need to fix non-unit strides first, and then address non-unit strides + batch tests. // Then check these problems of R2C and C2R. After that, we could open arbitrary permutations in the // main tests. // // The below test covers non-unit strides, pow of 2, middle sizes, which has SBCC/SBRC kernels // invloved. const static std::vector pow2_range_for_stride = {4096, 8192, 524288}; const static std::vector pow2_range_for_stride_half = {4096, 8192}; const static std::vector> stride_range_for_pow2 = {{2}, {3}}; const static std::vector batch_range_for_stride = {2, 1}; INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex, accuracy_test, ::testing::ValuesIn(param_generator_complex(generate_lengths({pow2_range_for_stride}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_complex_half, accuracy_test, ::testing::ValuesIn(param_generator_complex(generate_lengths({pow2_range_for_stride_half}), {fft_precision_half}, batch_range_1D, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real, accuracy_test, ::testing::ValuesIn(param_generator_real(generate_lengths({pow2_range_for_stride}), precision_range_sp_dp, batch_range_1D, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); 
INSTANTIATE_TEST_SUITE_P( pow2_1D_stride_real_half, accuracy_test, ::testing::ValuesIn(param_generator_real(generate_lengths({pow2_range_for_stride_half}), {fft_precision_half}, batch_range_1D, stride_range_for_pow2, stride_range_for_pow2, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); // Create an array parameters for strided 2D batched transforms. inline auto param_generator_complex_1d_batched_2d(const std::vector>& v_lengths, const std::vector& precision_range, const std::vector>& ioffset_range, const std::vector>& ooffset_range, const std::vector& place_range) { std::vector params; for(auto& transform_type : trans_type_range_complex) { for(const auto& lengths : v_lengths) { // try to ensure that we are given literal lengths, not // something to be passed to generate_lengths if(lengths.empty() || lengths.size() > 3) { assert(false); continue; } for(const auto precision : precision_range) { for(const auto& types : generate_types(transform_type, place_range, true)) { for(const auto& ioffset : ioffset_range) { for(const auto& ooffset : ooffset_range) { fft_params param; param.length = lengths; param.istride = lengths; param.ostride = lengths; param.nbatch = lengths[0]; param.precision = precision; param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.idist = 1; param.odist = 1; param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.ioffset = ioffset; param.ooffset = ooffset; params.push_back(param); } } } } } } return params; } const static std::vector pow2_range_2D = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}; INSTANTIATE_TEST_SUITE_P( pow2_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({pow2_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); const static std::vector pow3_range_2D = {3, 27, 81, 243, 729, 2187, 6561}; 
INSTANTIATE_TEST_SUITE_P( pow3_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({pow3_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); const static std::vector pow5_range_2D = {5, 25, 125, 625, 3125, 15625}; INSTANTIATE_TEST_SUITE_P( pow5_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({pow5_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); const static std::vector prime_range_2D = {7, 11, 13, 17, 19, 23, 29, 263, 269, 271, 277}; INSTANTIATE_TEST_SUITE_P( prime_1D_complex_batched_2D_strided, accuracy_test, ::testing::ValuesIn(param_generator_complex_1d_batched_2d(generate_lengths({prime_range_2D}), precision_range_sp_dp, ioffset_range_zero, ooffset_range_zero, place_range)), accuracy_test::TestName); rocFFT-rocm-5.7.1/clients/tests/accuracy_test_2D.cpp000066400000000000000000000333241446473624700223200ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. 
// // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include "accuracy_test.h" #include "fftw_transform.h" #include "rocfft_against_fftw.h" using ::testing::ValuesIn; // Set parameters // TODO: enable 16384, 32768 when omp support is available (takes too // long!) const static std::vector pow2_range = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048, 4096, 8192}; // For the current configuration, half-precision has a fft size limit of 65536 const static std::vector pow2_range_half = {2, 4, 8, 16, 32, 64, 128, 256, 512, 1024, 2048}; const static std::vector pow3_range = {3, 9, 27, 81, 243, 729, 2187, 6561}; const static std::vector pow5_range = {5, 25, 125, 625, 3125, 15625}; const static std::vector prime_range = {7, 11, 13, 17, 19, 23, 29, 263, 269, 271, 277}; const static std::vector mix_range = {56, 120, 336, 2160, 5000, 6000, 8000}; const static std::vector> stride_range = {{1}}; static std::vector> ioffset_range_zero = {{0, 0}}; static std::vector> ooffset_range_zero = {{0, 0}}; static std::vector> ioffset_range = {{0, 0}, {1, 1}}; static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P(pow2_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_2D_half, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range_half, {2, 4, 8, 16, 
32}}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow2_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow3_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow3_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow5_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_pow5_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(prime_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range, prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_prime_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range, 
prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(mix_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({mix_range, mix_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_mix_2D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({mix_range, mix_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); // test length-1 on one dimension against a variety of non-1 lengths INSTANTIATE_TEST_SUITE_P(len1_2D, accuracy_test, ::testing::ValuesIn(param_generator( generate_lengths({{1}, {4, 8, 8192, 3, 27, 7, 11, 5000, 8000}}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); // length-1 on the other dimension INSTANTIATE_TEST_SUITE_P(len1_swap_2D, accuracy_test, ::testing::ValuesIn(param_generator( generate_lengths({{4, 8, 8192, 3, 27, 7, 11, 5000, 8000}, {1}}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); rocFFT-rocm-5.7.1/clients/tests/accuracy_test_3D.cpp000066400000000000000000000271631446473624700223250ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include "accuracy_test.h" #include "fftw_transform.h" #include "rocfft_against_fftw.h" using ::testing::ValuesIn; // Set parameters // TODO: 512, 1024, 2048 make the tests take too long; re-enable when // test speed is improved. 
static std::vector pow2_range = {4, 8, 16, 32, 128, 256}; // For the current configuration, half-precision has a fft size limit of 65536 static std::vector pow2_range_half = {4, 8, 16, 32}; // SBCC+SBRC as a sub-node of a 3D TRTRTR std::vector> pow2_adhoc = {{4, 4, 8192}}; static std::vector pow3_range = {3, 9, 27, 81, 243}; static std::vector pow5_range = {5, 25, 125}; static std::vector prime_range = {7, 11, 13, 17, 19, 23, 29}; static std::vector> stride_range = {{1}}; static std::vector> ioffset_range_zero = {{0, 0}}; static std::vector> ooffset_range_zero = {{0, 0}}; static std::vector> ioffset_range = {{0, 0}, {1, 1}}; static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P( pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range, pow2_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(pow2_3D_half, accuracy_test, ::testing::ValuesIn(param_generator( generate_lengths({pow2_range_half, pow2_range_half, pow2_range_half}), {fft_precision_half}, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow2_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow2_range, pow2_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow3_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range, pow3_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow3_3D, accuracy_test, 
::testing::ValuesIn(param_generator(generate_lengths({pow3_range, pow3_range, pow3_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range, pow5_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_pow5_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow5_range, pow5_range, pow5_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( prime_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range, prime_range, prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_prime_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({prime_range, prime_range, prime_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( mix_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow3_range, prime_range}), precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( DISABLED_offset_mix_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({pow2_range, pow3_range, prime_range}), precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), 
accuracy_test::TestName); // Test combinations of SBRC sizes, plus a non-SBRC size (10) to // exercise fused SBRC+transpose kernels. static std::vector sbrc_range = {50, 64, 81, 100, 200, 10, 128, 256}; static std::vector sbrc_batch_range = {2, 1}; INSTANTIATE_TEST_SUITE_P( sbrc_3D, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({sbrc_range, sbrc_range, sbrc_range}), precision_range_sp_dp, sbrc_batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); // pick small sizes that will exercise 2D_SINGLE and a couple of sizes that won't static std::vector inner_batch_3D_range = {4, 8, 16, 32, 20, 24, 64}; static std::vector inner_batch_3D_range_half = {4, 8, 16, 32, 20, 24}; static std::vector inner_batch_3D_batch_range = {3, 2, 1}; INSTANTIATE_TEST_SUITE_P( inner_batch_3D, accuracy_test, ::testing::ValuesIn(param_generator( generate_lengths({inner_batch_3D_range, inner_batch_3D_range, inner_batch_3D_range}), precision_range_sp_dp, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P( inner_batch_3D_half, accuracy_test, ::testing::ValuesIn(param_generator(generate_lengths({inner_batch_3D_range_half, inner_batch_3D_range_half, inner_batch_3D_range_half}), {fft_precision_half}, inner_batch_3D_batch_range, stride_generator_3D_inner_batch(stride_range), stride_generator_3D_inner_batch(stride_range), ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName);rocFFT-rocm-5.7.1/clients/tests/accuracy_test_adhoc.cpp000066400000000000000000000264321446473624700231330ustar00rootroot00000000000000 // Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "accuracy_test.h" std::vector> adhoc_sizes = { // sizes that exercise L1D_TRTRT subplan of 2D_RTRT or 3D_TRTRTR {1, 220}, {1, 330}, {81, 220, 36}, // L1D_CC subplan of 3D_TRTRTR {4, 4, 8192}, // SBRC 192 with special param {192, 192, 192}, {192, 84, 84}, // Failure with build_CS_3D_BLOCK_RC {680, 128, 128}, // Large 1D primes that fall above the block threshold (length 262144). // Bluestein requires two forwards and one inverse FFTs, and the plan // for these sizes breakdown these FFTs either as: // L1D_TRTRT (T + STOCKHAM + T + STOCKHAM + T) for lengthBlue <= 4096^2 // or // L1D_TRTRT (T + L1D_CC + STOCKHAM_BL_CC + STOCHMAM_BL_RC + T + STOCKHAM + T) // for lengthBlue > 4096^2. 
{196597}, {25165813}, // 2D single-kernel bluestein size combined with multi-kernel bluestein {19, 2053}, // TILE_UNALIGNED type of SBRC 3D ERC {98, 98, 98}, // 3D_BLOCK_CR {336, 336, 56}, }; const static std::vector> stride_range = {{1}}; static std::vector> ioffset_range_zero = {{0, 0}}; static std::vector> ooffset_range_zero = {{0, 0}}; static std::vector> ioffset_range = {{0, 0}, {1, 1}}; static std::vector> ooffset_range = {{0, 0}, {1, 1}}; INSTANTIATE_TEST_SUITE_P(adhoc, accuracy_test, ::testing::ValuesIn(param_generator(adhoc_sizes, precision_range_sp_dp, batch_range, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, place_range, true)), accuracy_test::TestName); INSTANTIATE_TEST_SUITE_P(DISABLED_offset_adhoc, accuracy_test, ::testing::ValuesIn(param_generator(adhoc_sizes, precision_range_full, batch_range, stride_range, stride_range, ioffset_range, ooffset_range, place_range, true)), accuracy_test::TestName); // Test that dist is ignored for batch-1 transforms. Normally, // in-place transforms require same dist, but for batch-1 dist isn't // used for anything and differing dist should be allowed. 
inline auto param_permissive_iodist() { std::vector> lengths = adhoc_sizes; lengths.push_back({4}); std::vector params; for(const auto precision : precision_range_sp_dp) { for(const auto trans_type : trans_type_range) { for(const auto& types : generate_types(trans_type, place_range, true)) { if(std::get<1>(types) != fft_placement_inplace) continue; for(const auto& len : lengths) { fft_params param; param.length = len; param.precision = precision; param.idist = 2; param.odist = 3; param.transform_type = std::get<0>(types); param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); params.push_back(param); } } } } return params; } INSTANTIATE_TEST_SUITE_P(adhoc_dist, accuracy_test, ::testing::ValuesIn(param_permissive_iodist()), accuracy_test::TestName); inline auto param_adhoc_colmajor() { // generate basic FFTs of adhoc sizes auto params = param_generator(adhoc_sizes, {fft_precision_single}, {2}, stride_range, stride_range, ioffset_range_zero, ooffset_range_zero, {fft_placement_notinplace}, false); // remove any params that are: // - 1D (not enough dims to swap) // - real-complex 2D (we only get to play with higher dims, so // again not enough dims to swap) params.erase(std::remove_if(params.begin(), params.end(), [](const fft_params& param) { if(param.length.size() == 1) return true; if(param.length.size() == 2) { if(param.transform_type == fft_transform_type_real_forward || param.transform_type == fft_transform_type_real_inverse) return true; } return false; }), params.end()); // reverse length/stride order on remaining params to make them // col-major std::for_each(params.begin(), params.end(), [](fft_params& param) { size_t start_dim = 0; // for real-complex we can't touch the fastest dim if(param.transform_type == fft_transform_type_real_forward || param.transform_type == fft_transform_type_real_inverse) ++start_dim; std::reverse(param.length.rbegin() + start_dim, param.length.rend()); 
std::reverse(param.istride.rbegin() + start_dim, param.istride.rend()); std::reverse(param.ostride.rbegin() + start_dim, param.ostride.rend()); }); return params; } INSTANTIATE_TEST_SUITE_P(adhoc_colmajor, accuracy_test, ::testing::ValuesIn(param_adhoc_colmajor()), accuracy_test::TestName); inline auto param_adhoc_stride() { std::vector params; for(const auto precision : precision_range_full) { for(const auto& types : generate_types(fft_transform_type_complex_forward, {fft_placement_inplace, fft_placement_notinplace}, true)) { // 2D with non-contiguous strides and dist fft_params param; param.length = {2, 35}; param.precision = precision; param.idist = 200; param.odist = 200; param.transform_type = fft_transform_type_complex_forward; param.nbatch = 2; param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.istride = {90, 2}; param.ostride = {90, 2}; params.push_back(param); } // test C2R/R2C with non-contiguous higher strides and dist - we // want unit stride for length0 so we do the even-length optimization for(const auto& types : generate_types(fft_transform_type_real_forward, {fft_placement_notinplace}, true)) { fft_params param; param.length = {4, 4, 4}; param.precision = precision; param.idist = 0; param.odist = 0; param.transform_type = fft_transform_type_real_forward; param.nbatch = 2; param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.istride = {16, 4, 1}; param.ostride = {16, 4, 1}; params.push_back(param); param.length = {2, 2, 2}; param.precision = precision; param.idist = 0; param.odist = 0; param.transform_type = fft_transform_type_real_forward; param.nbatch = 2; param.placement = std::get<1>(types); param.itype = std::get<2>(types); param.otype = std::get<3>(types); param.istride = {20, 6, 1}; param.ostride = {20, 6, 1}; params.push_back(param); } } return params; } INSTANTIATE_TEST_SUITE_P(adhoc_stride, accuracy_test, 
::testing::ValuesIn(param_adhoc_stride()), accuracy_test::TestName); auto adhoc_tokens = { "complex_forward_len_512_64_single_ip_batch_3_istride_192_3_CI_ostride_192_3_CI_idist_1_odist_" "1_ioffset_0_0_ooffset_0_0", "real_forward_len_1024_1024_1024_single_op_batch_1_istride_1048576_1024_1_R_ostride_525312_513_" "1_HI_idist_1073741824_odist_537919488_ioffset_0_0_ooffset_0_0", "complex_forward_len_6144_single_ip_batch_34_istride_35_CI_ostride_35_CI_idist_1_odist_1_" "ioffset_0_0_ooffset_0_0", }; INSTANTIATE_TEST_SUITE_P(adhoc_token, accuracy_test, ::testing::ValuesIn(param_generator_token(adhoc_tokens)), accuracy_test::TestName); rocFFT-rocm-5.7.1/clients/tests/accuracy_test_callback.cpp000066400000000000000000000135101446473624700236020ustar00rootroot00000000000000// Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "accuracy_test.h"

// NOTE(review): container element types in this file (size_t, fft_params,
// the std::tuple in param_checkstride) were reconstructed after the template
// arguments were lost in extraction - confirm against accuracy_test.h.
std::vector<std::vector<size_t>> callback_sizes = {
    // some single kernel sizes
    {4},
    {16},
    {81},
    {100},

    // L1D_TRTRT sizes
    {220},
    {330},
    {1344},

    // L1D_CC sizes
    {8192},
    {10000},

    // prime
    {23},
    {29},

    // 2D_SINGLE sizes, small and big
    {16, 8},
    {32, 32},
    {9, 81},
    {27, 81},
    {81, 27},
    {256, 9},
    {9, 256},
    {125, 32},
    {32, 125},

    // 2D_RTRT
    {20, 40},
    {81, 81},

    // 2D_RC
    {128, 64},
    {128, 256},

    // more complicated children of 2D_RTRT (L1D_TRTRT, L1D_CC, prime)
    {4, 220},
    {220, 4},
    {4, 8192},
    {8192, 4},
    {4, 23},
    {23, 4},

    // 3D_TRTRTR, with complicated children
    {63, 5, 6},
    {6, 5, 63},
    {23, 5, 6},
    {6, 5, 23},
    {70, 5, 6},
    {6, 5, 70},
    {8192, 5, 6},
    {6, 5, 8192},

    // 3D_RTRT, with complicated children
    {23, 4, 4},
    {4, 4, 23},
    {70, 4, 4},
    {4, 4, 70},
    {8192, 4, 4},
    {4, 4, 8192},

    // 3D odd lengths
    {27, 27, 27},

    // 3D_BLOCK_RC
    {64, 64, 64},
};

const static std::vector<std::vector<size_t>> stride_range = {{1}};

const static std::vector<std::vector<size_t>> ioffset_range_zero = {{0, 0}};
const static std::vector<std::vector<size_t>> ooffset_range_zero = {{0, 0}};

const static std::vector<std::vector<size_t>> ioffset_range = {{0, 0}, {1, 1}};
const static std::vector<std::vector<size_t>> ooffset_range = {{0, 0}, {1, 1}};

// Only forward transforms are generated for the callback suites.
auto forward_transform_types
    = {fft_transform_type_complex_forward, fft_transform_type_real_forward};

INSTANTIATE_TEST_SUITE_P(callback,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_base(forward_transform_types,
                                                                  callback_sizes,
                                                                  precision_range_sp_dp,
                                                                  batch_range,
                                                                  generate_types,
                                                                  stride_range,
                                                                  stride_range,
                                                                  ioffset_range_zero,
                                                                  ooffset_range_zero,
                                                                  place_range,
                                                                  false,
                                                                  true)),
                         accuracy_test::TestName);

// Same as the "callback" suite, but also sweeping non-zero offsets.
INSTANTIATE_TEST_SUITE_P(DISABLED_callback,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_base(forward_transform_types,
                                                                  callback_sizes,
                                                                  precision_range_sp_dp,
                                                                  batch_range,
                                                                  generate_types,
                                                                  stride_range,
                                                                  stride_range,
                                                                  ioffset_range,
                                                                  ooffset_range,
                                                                  place_range,
                                                                  false,
                                                                  true)),
                         accuracy_test::TestName);

// one of the obvious use cases for callbacks is to implement result
// scaling manually, so use the same sizes to test rocFFT's own
// result scaling feature.
//
// Generates the standard parameter sets for the given lengths and sets a
// non-trivial scale factor on each of them.
//
// v_lengths: problem lengths to generate parameters for.  These are now
// actually used; the sizes were previously hard-coded to callback_sizes and
// the argument was silently ignored.
inline auto param_generator_scaling(const std::vector<std::vector<size_t>>& v_lengths)
{
    auto params = param_generator(v_lengths,
                                  precision_range_sp_dp,
                                  batch_range,
                                  stride_range,
                                  stride_range,
                                  ioffset_range_zero,
                                  ooffset_range_zero,
                                  place_range,
                                  true);

    for(auto& param : params)
        param.scale_factor = 7.23;
    return params;
}

INSTANTIATE_TEST_SUITE_P(scaling,
                         accuracy_test,
                         ::testing::ValuesIn(param_generator_scaling(callback_sizes)),
                         accuracy_test::TestName);
rocFFT-rocm-5.7.1/clients/tests/accuracy_test_checkstride.cpp000066400000000000000000000107271446473624700243450ustar00rootroot00000000000000
// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

#include "accuracy_test.h"

// Builds out-of-place parameter sets with check_output_strides enabled, for
// every transform type, each layout listed below, single/double precision,
// and with callbacks both on and off (callbacks are skipped for planar
// layouts).
inline auto param_checkstride()
{
    // tuples of length,stride,nbatch,dist to test.  strides are arranged so
    // there's space either between elements on the fastest dim, or
    // between dims, or both.
    std::vector<std::tuple<std::vector<size_t>, std::vector<size_t>, size_t, size_t>> sizes = {
        // 1D single kernel non-unit stride
        {{64}, {2}, 2, 140},
        // 1D single kernel unit stride but non-contiguous batch
        {{64}, {1}, 2, 80},
        // 1D odd length (to test odd-length R2C/C2R)
        {{15}, {2}, 2, 40},
        // 1D SBCC+SBRC
        {{8192}, {2}, 2, 17000},
        // 1D TRTRT
        {{24000}, {2}, 2, 50000},
        // 2D_RTRT
        {{20, 30}, {80, 2}, 2, 1700},
        {{40, 30}, {80, 2}, 2, 3600},
        // 2D_RTRT unit stride along fast dim
        {{20, 30}, {40, 1}, 2, 1000},
        {{40, 30}, {40, 1}, 2, 2000},
        // 2D_RC
        {{64, 64}, {130, 2}, 2, 8400},
        // 3D_RC
        {{64, 64, 64}, {8400, 130, 2}, 2, 540000},
        // 3D_RTRTRT
        {{2, 3, 4}, {40, 10, 2}, 2, 100},
        // bigger 3D_RTRTRT
        {{30, 40, 50}, {3000, 60, 1}, 2, 100000},
    };

    std::vector<fft_params> params;
    for(const auto trans_type : trans_type_range)
    {
        for(const auto& s : sizes)
        {
            for(const auto precision : precision_range_sp_dp)
            {
                for(const auto& types :
                    generate_types(trans_type, {fft_placement_notinplace}, true))
                {
                    for(bool callback : {true, false})
                    {
                        // callbacks don't work for planar
                        bool is_planar
                            = std::get<2>(types) == fft_array_type_complex_planar
                              || std::get<2>(types) == fft_array_type_hermitian_planar
                              || std::get<3>(types) == fft_array_type_complex_planar
                              || std::get<3>(types) == fft_array_type_hermitian_planar;
                        if(callback && is_planar)
                            continue;

                        fft_params param;
                        param.length               = std::get<0>(s);
                        param.istride              = std::get<1>(s);
                        param.ostride              = std::get<1>(s);
                        param.nbatch               = std::get<2>(s);
                        param.precision            = precision;
                        param.idist                = std::get<3>(s);
                        param.odist                = std::get<3>(s);
                        param.transform_type       = std::get<0>(types);
                        param.placement            = std::get<1>(types);
                        param.itype                = std::get<2>(types);
                        param.otype                = std::get<3>(types);
                        param.run_callbacks        = callback;
                        param.check_output_strides = true;
                        params.push_back(param);
                    }
                }
            }
        }
    }

    return params;
}

INSTANTIATE_TEST_SUITE_P(checkstride,
                         accuracy_test,
                         ::testing::ValuesIn(param_checkstride()),
                         accuracy_test::TestName);
rocFFT-rocm-5.7.1/clients/tests/cmake/000077500000000000000000000000001446473624700175115ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/tests/cmake/FindFFTW.cmake000066400000000000000000000114341446473624700220650ustar00rootroot00000000000000# ############################################################################# # Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# #############################################################################

# FindFFTW: locates the FFTW3 header and the single/double precision
# libraries requested through the FLOAT/SINGLE/DOUBLE components.
#
# Variables set here:
#   FFTW_INCLUDE_DIRS      - directory containing fftw3.h
#   FFTW_LIBRARIES         - libraries found for the requested components,
#                            plus the omp or threads helper library if present
#   FFTW_MULTITHREAD       - TRUE when an omp/threads library was found
#                            (always TRUE on Windows once FFTW is found)
#   FFTW_HAVE_SPRINT_PLAN  - result of probing for fftw(f)_sprint_plan

#if( FFTW_FIND_VERSION VERSION_LESS "3" )
#    message( FFTW_FIND_VERSION is ${FFTW_FIND_VERSION})
#    message( FATAL_ERROR "FindFFTW can not configure versions less than FFTW 3.0.0" )
#endif( )

find_path(FFTW_INCLUDE_DIRS
    NAMES fftw3.h
    HINTS
        ${FFTW_ROOT}/include
        $ENV{FFTW_ROOT}/include
    PATHS
        /usr/include
        /usr/local/include
)
mark_as_advanced( FFTW_INCLUDE_DIRS )

# message( STATUS "FFTW_FIND_COMPONENTS: ${FFTW_FIND_COMPONENTS}" )
# message( STATUS "FFTW_FIND_REQUIRED_FLOAT: ${FFTW_FIND_REQUIRED_FLOAT}" )
# message( STATUS "FFTW_FIND_REQUIRED_DOUBLE: ${FFTW_FIND_REQUIRED_DOUBLE}" )

include( CheckSymbolExists )

# check_symbol_exists() compiles a small program that includes "fftw3.h".
# Without the FFTW include directory on CMAKE_REQUIRED_INCLUDES that probe
# fails whenever FFTW lives outside the compiler's default search path
# (e.g. only under FFTW_ROOT), and FFTW_HAVE_SPRINT_PLAN is silently left off.
if( FFTW_INCLUDE_DIRS )
    list( APPEND CMAKE_REQUIRED_INCLUDES ${FFTW_INCLUDE_DIRS} )
endif( )

set( FFTW_LIBRARIES "" )
if( FFTW_FIND_REQUIRED_FLOAT OR FFTW_FIND_REQUIRED_SINGLE )
  find_library( FFTW_LIBRARIES_SINGLE
      NAMES fftw3f fftw3f-3 fftw3 fftw3-3
      HINTS
          ${FFTW_ROOT}/lib
          $ENV{FFTW_ROOT}/lib
      PATHS
          /usr/lib
          /usr/local/lib
      PATH_SUFFIXES
          x86_64-linux-gnu
      DOC "FFTW dynamic library single"
  )
  mark_as_advanced( FFTW_LIBRARIES_SINGLE )
  list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_SINGLE} )

  # Look for omp (preferred) or thread libraries.  These are not a
  # hard requirement, but are nice to have to make FFTW run faster.
  find_library( FFTWF_OMP_LIBRARY fftw3f_omp )
  find_library( FFTWF_THREADS_LIBRARY fftw3f_threads )
  if( FFTWF_OMP_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTWF_OMP_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  elseif( FFTWF_THREADS_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTWF_THREADS_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  endif()

  list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_SINGLE} )
  check_symbol_exists( fftwf_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN )
endif( )

if( FFTW_FIND_REQUIRED_DOUBLE )
  find_library( FFTW_LIBRARIES_DOUBLE
      NAMES fftw3
      HINTS
          ${FFTW_ROOT}/lib
          $ENV{FFTW_ROOT}/lib
      PATHS
          /usr/lib
          /usr/local/lib
      PATH_SUFFIXES
          x86_64-linux-gnu
      DOC "FFTW dynamic library double"
  )
  mark_as_advanced( FFTW_LIBRARIES_DOUBLE )
  list( APPEND FFTW_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} )

  # Look for omp (preferred) or thread libraries.  These are not a
  # hard requirement, but are nice to have to make FFTW run faster.
  find_library( FFTW_OMP_LIBRARY fftw3_omp )
  find_library( FFTW_THREADS_LIBRARY fftw3_threads )
  if( FFTW_OMP_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTW_OMP_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  elseif( FFTW_THREADS_LIBRARY )
    list( APPEND FFTW_LIBRARIES ${FFTW_THREADS_LIBRARY} )
    set( FFTW_MULTITHREAD TRUE )
  endif()

  list( APPEND CMAKE_REQUIRED_LIBRARIES ${FFTW_LIBRARIES_DOUBLE} )
  check_symbol_exists( fftw_sprint_plan "fftw3.h" FFTW_HAVE_SPRINT_PLAN )
endif( )

# A from-source FFTW build (BUILD_FFTW) is assumed to provide sprint_plan.
if( BUILD_FFTW OR FFTW_HAVE_SPRINT_PLAN )
  target_compile_definitions( rocfft-test PUBLIC FFTW_HAVE_SPRINT_PLAN )
endif()

include( FindPackageHandleStandardArgs )
FIND_PACKAGE_HANDLE_STANDARD_ARGS( FFTW
    REQUIRED_VARS FFTW_INCLUDE_DIRS FFTW_LIBRARIES )

# assume the threads feature is always enabled on Windows, since it's
# not a separate library there
if( FFTW_FOUND AND WIN32 )
  set( FFTW_MULTITHREAD TRUE )
endif()

if( NOT FFTW_FOUND )
    message( STATUS "FindFFTW could not find all of the following fftw libraries" )
    message( STATUS "${FFTW_FIND_COMPONENTS}" )
else( )
    message(STATUS "FindFFTW configured variables:" )
    message(STATUS "FFTW_INCLUDE_DIRS: ${FFTW_INCLUDE_DIRS}" )
    message(STATUS "FFTW_LIBRARIES: ${FFTW_LIBRARIES}" )
endif()
rocFFT-rocm-5.7.1/clients/tests/default_callbacks_test.cpp000066400000000000000000000373411446473624700236270ustar00rootroot00000000000000
// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved.
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.
#include #include #include #include #include #include "../../shared/rocfft_params.h" #include "fftw_transform.h" #include "rocfft.h" // ------------------------------------- // default load callback definitions // ------------------------------------- template __device__ T load_cb(T* data, size_t offset, void* cbdata, void* sharedMem) { return data[offset]; } __device__ auto load_cb_complex_double = load_cb>; __device__ auto load_cb_double = load_cb; __device__ auto load_cb_complex_float = load_cb>; __device__ auto load_cb_float = load_cb; // ------------------------------------- // default store callback definitions // ------------------------------------- template __device__ void store_cb(T* data, size_t offset, T element, void* cbdata, void* sharedMem) { data[offset] = element; } __device__ auto store_cb_complex_double = store_cb>; __device__ auto store_cb_double = store_cb; __device__ auto store_cb_complex_float = store_cb>; __device__ auto store_cb_float = store_cb; // ------------------------------------- // type traits definitions // ------------------------------------- template struct is_hip_complex { static const bool value = false; }; template <> struct is_hip_complex> { static const bool value = true; }; template <> struct is_hip_complex> { static const bool value = true; }; // ------------------------------------- // test callbacks struct // ------------------------------------- enum struct DefaultCallbackType { LOAD, STORE, }; struct Test_Callback { Test_Callback(size_t _N, size_t _dim, rocfft_transform_type_e _frwd_transf_type, rocfft_precision_e _frwd_transf_precision, DefaultCallbackType _cb_type, uint32_t _seed) : N(_N) , dim(_dim) , fwrd_transf_type(_frwd_transf_type) , frwd_transf_precision(_frwd_transf_precision) , cb_type(_cb_type) , seed(_seed) { float low_bound_f = -1.0f, up_bound_f = 1.0f; double low_bound_d = -1.0, up_bound_d = 1.0; std::vector> h_mem_out_f2, h_mem_out_no_cb_f2; std::vector> h_mem_out_d2, h_mem_out_no_cb_d2; 
switch(fwrd_transf_type) { case rocfft_transform_type_complex_forward: { std::vector> h_mem_in_f2; std::vector> h_mem_in_d2; (frwd_transf_precision == rocfft_precision_single) ? run(low_bound_f, up_bound_f, h_mem_in_f2, h_mem_out_f2, h_mem_out_no_cb_f2) : run(low_bound_d, up_bound_d, h_mem_in_d2, h_mem_out_d2, h_mem_out_no_cb_d2); break; } case rocfft_transform_type_real_forward: { std::vector h_mem_in_f; std::vector h_mem_in_d; (frwd_transf_precision == rocfft_precision_single) ? run(low_bound_f, up_bound_f, h_mem_in_f, h_mem_out_f2, h_mem_out_no_cb_f2) : run(low_bound_d, up_bound_d, h_mem_in_d, h_mem_out_d2, h_mem_out_no_cb_d2); break; } default: break; } } size_t get_data_size() { // compute total data size size_t data_size = 1; for(size_t i = 0; i < dim; ++i) { data_size *= N; } return data_size; } template void run(Tbound low_bound, Tbound up_bound, std::vector& host_mem_in, std::vector& host_mem_out, std::vector& host_mem_out_no_cb) { auto data_sz = get_data_size(); if(cb_type == DefaultCallbackType::LOAD) set_load_callback(); else if(cb_type == DefaultCallbackType::STORE) set_store_callback(); host_mem_in.resize(data_sz); if constexpr(!is_hip_complex::value) init_data(low_bound, up_bound, host_mem_in); else init_data_complex(low_bound, up_bound, host_mem_in); if constexpr(!is_hip_complex::value) data_sz = (data_sz / 2) + 1; host_mem_out.resize(data_sz); forward_transform(true, host_mem_in, host_mem_out); host_mem_out_no_cb.resize(data_sz); forward_transform(false, host_mem_in, host_mem_out_no_cb); validate_test(host_mem_out, host_mem_out_no_cb); } template void init_data(const Tbound low_bound, const Tbound up_bound, std::vector& host_mem) { std::minstd_rand gen(seed); std::uniform_real_distribution dist(low_bound, up_bound); for(size_t i = 0; i < host_mem.size(); i++) { host_mem[i] = dist(gen); } } template void init_data_complex(const Tbound low_bound, const Tbound up_bound, std::vector& host_mem) { std::minstd_rand gen(seed); 
std::uniform_real_distribution dist(low_bound, up_bound); for(size_t i = 0; i < host_mem.size(); i++) { host_mem[i].x = dist(gen); host_mem[i].y = dist(gen); } } template void forward_transform(bool apply_callback, const std::vector& host_mem_in, std::vector& host_mem_out) { rocfft_plan plan = nullptr; std::vector lengths(dim, N); ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_notinplace, fwrd_transf_type, frwd_transf_precision, dim, lengths.data(), 1, nullptr), rocfft_status_success); size_t work_buffer_size = 0; void* work_buffer = nullptr; ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &work_buffer_size), rocfft_status_success); if(work_buffer_size) { ASSERT_EQ(hipMalloc(&work_buffer, work_buffer_size), hipSuccess); } hipStream_t stream = nullptr; ASSERT_EQ(hipStreamCreate(&stream), hipSuccess); rocfft_execution_info info; ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); ASSERT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success); if(apply_callback) { if(cb_type == DefaultCallbackType::LOAD) { ASSERT_EQ(rocfft_execution_info_set_load_callback(info, &load_cb_host, nullptr, 0), rocfft_status_success); } else if(cb_type == DefaultCallbackType::STORE) { ASSERT_EQ( rocfft_execution_info_set_store_callback(info, &store_cb_host, nullptr, 0), rocfft_status_success); } } gpubuf device_mem_in; size_t NbytesIn = host_mem_in.size() * sizeof(Tin); ASSERT_EQ(device_mem_in.alloc(NbytesIn), hipSuccess); EXPECT_EQ( hipMemcpy(device_mem_in.data(), host_mem_in.data(), NbytesIn, hipMemcpyHostToDevice), hipSuccess); gpubuf device_mem_out; size_t NbytesOut = host_mem_out.size() * sizeof(Tout); ASSERT_EQ(device_mem_out.alloc(NbytesOut), hipSuccess); void* in_ptr = device_mem_in.data(); void* out_ptr = device_mem_out.data(); ASSERT_EQ(rocfft_execute(plan, &in_ptr, &out_ptr, info), rocfft_status_success); ASSERT_EQ(hipMemcpy(host_mem_out.data(), out_ptr, NbytesOut, hipMemcpyDeviceToHost), hipSuccess); 
ASSERT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success); ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success); ASSERT_EQ(hipFree(work_buffer), hipSuccess); } template void validate_test(const std::vector& host_mem_out, const std::vector& host_mem_out_no_cb) { auto diff = distance_1to1_complex( reinterpret_cast*>(host_mem_out.data()), reinterpret_cast*>(host_mem_out_no_cb.data()), host_mem_out.size(), 1, 1, host_mem_out.size(), 1, host_mem_out_no_cb.size(), nullptr, type_epsilon(), {0}, {0}); EXPECT_LT(diff.l_inf, type_epsilon()); } // ------------------------------------------------ // set_load_callback template specializations // ------------------------------------------------ template void set_load_callback(){}; template <> void set_load_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_double), sizeof(void*)), hipSuccess); }; template <> void set_load_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_double), sizeof(void*)), hipSuccess); }; template <> void set_load_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_complex_float), sizeof(void*)), hipSuccess); }; template <> void set_load_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&load_cb_host, HIP_SYMBOL(load_cb_float), sizeof(void*)), hipSuccess); }; // ------------------------------------------------ // set_store_callback template specializations // ------------------------------------------------ template void set_store_callback(){}; template <> void set_store_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_double), sizeof(void*)), hipSuccess); }; template <> void set_store_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_double), sizeof(void*)), hipSuccess); }; template <> void set_store_callback>() { EXPECT_EQ( hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_complex_float), sizeof(void*)), hipSuccess); }; template 
<> void set_store_callback() { EXPECT_EQ(hipMemcpyFromSymbol(&store_cb_host, HIP_SYMBOL(store_cb_float), sizeof(void*)), hipSuccess); }; size_t N = 0; size_t dim = 0; rocfft_transform_type_e fwrd_transf_type; rocfft_precision_e frwd_transf_precision; DefaultCallbackType cb_type; uint32_t seed = 0; void* store_cb_host = nullptr; void* load_cb_host = nullptr; }; // ------------------------------------------------------------------- // Test forward transforms in single/double precision with real and // complex data inputs and having only a load callback set. // ------------------------------------------------------------------- TEST(rocfft_UnitTest, default_load_callback_complex_single) { Test_Callback test(256, 1, rocfft_transform_type_complex_forward, rocfft_precision_single, DefaultCallbackType::LOAD, 1); } TEST(rocfft_UnitTest, default_load_callback_complex_double) { Test_Callback test(512, 1, rocfft_transform_type_complex_forward, rocfft_precision_double, DefaultCallbackType::LOAD, 2); } TEST(rocfft_UnitTest, default_load_callback_real_single) { Test_Callback test(1024, 1, rocfft_transform_type_real_forward, rocfft_precision_single, DefaultCallbackType::LOAD, 3); } TEST(rocfft_UnitTest, default_load_callback_real_double) { Test_Callback test(2048, 1, rocfft_transform_type_real_forward, rocfft_precision_double, DefaultCallbackType::LOAD, 4); } // ------------------------------------------------------------------- // Test forward transforms in single/double precision with real and // complex data inputs and having only a store callback set. 
// ------------------------------------------------------------------- TEST(rocfft_UnitTest, default_store_callback_complex_single) { Test_Callback test(256, 1, rocfft_transform_type_complex_forward, rocfft_precision_single, DefaultCallbackType::STORE, 5); } TEST(rocfft_UnitTest, default_store_callback_complex_double) { Test_Callback test(512, 1, rocfft_transform_type_complex_forward, rocfft_precision_double, DefaultCallbackType::STORE, 6); } TEST(rocfft_UnitTest, default_store_callback_real_single) { Test_Callback test(1024, 1, rocfft_transform_type_real_forward, rocfft_precision_single, DefaultCallbackType::STORE, 7); } TEST(rocfft_UnitTest, default_store_callback_real_double) { Test_Callback test(2048, 1, rocfft_transform_type_real_forward, rocfft_precision_double, DefaultCallbackType::STORE, 8); } rocFFT-rocm-5.7.1/clients/tests/fftw_transform.h000066400000000000000000000541541446473624700216540ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef FFTWTRANSFORM_H #define FFTWTRANSFORM_H #include "test_params.h" #include #include // Function to return maximum error for float and double types. // // Following Schatzman (1996; Accuracy of the Discrete Fourier // Transform and the Fast Fourier Transform), the shape of relative // l_2 error vs length should look like // // epsilon * sqrt(log2(length)). // // The magic epsilon constants below were chosen so that we get a // reasonable upper bound for (all of) our tests. // // For rocFFT, prime lengths result in the highest error. As such, // the epsilons below are perhaps too loose for pow2 lengths; but they // are appropriate for prime lengths. template inline double type_epsilon(); template <> inline double type_epsilon<_Float16>() { return half_epsilon; } template <> inline double type_epsilon() { return single_epsilon; } template <> inline double type_epsilon() { return double_epsilon; } // C++ traits to translate float->fftwf_complex and // double->fftw_complex. // The correct FFTW complex type can be accessed via, for example, // using complex_t = typename fftw_complex_trait::complex_t; template struct fftw_trait; template <> struct fftw_trait<_Float16> { // fftw does not support half precision, so use single precision and convert using fftw_complex_type = fftwf_complex; using fftw_plan_type = fftwf_plan; }; template <> struct fftw_trait { using fftw_complex_type = fftwf_complex; using fftw_plan_type = fftwf_plan; }; template <> struct fftw_trait { using fftw_complex_type = fftw_complex; using fftw_plan_type = fftw_plan; }; // Copies the half-precision input buffer to a single-precision // buffer. 
Note that the input buffer is already sized like it's a // single-precision buffer (but only half of it is filled), because // we allocate a single-precision buffer for FFTW to plan with. static hostbuf half_to_single_copy(const hostbuf& in) { auto out = in.copy(); auto in_begin = reinterpret_cast(in.data()); std::copy_n(in_begin, in.size() / sizeof(_Float16) / 2, reinterpret_cast(out.data())); return out; } // converts a wider precision buffer to a narrower precision, in-place template void narrow_precision_inplace(hostbuf& in) { // ensure we're actually shrinking the data static_assert(sizeof(TfloatIn) > sizeof(TfloatOut)); auto readPtr = reinterpret_cast(in.data()); auto writePtr = reinterpret_cast(in.data()); std::copy_n(readPtr, in.size() / sizeof(TfloatIn), writePtr); in.shrink(in.size() / (sizeof(TfloatIn) / sizeof(TfloatOut))); } static void single_to_half_inplace(hostbuf& in) { narrow_precision_inplace(in); } // Template wrappers for real-valued FFTW allocators: template inline Tfloat* fftw_alloc_real_type(size_t n); template <> inline float* fftw_alloc_real_type(size_t n) { return fftwf_alloc_real(n); } template <> inline double* fftw_alloc_real_type(size_t n) { return fftw_alloc_real(n); } // Template wrappers for complex-valued FFTW allocators: template inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n); template <> inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n) { return fftwf_alloc_complex(n); } template <> inline typename fftw_trait::fftw_complex_type* fftw_alloc_complex_type(size_t n) { return fftw_alloc_complex(n); } template inline fftw_type* fftw_alloc_type(size_t n); template <> inline float* fftw_alloc_type(size_t n) { return fftw_alloc_real_type(n); } template <> inline double* fftw_alloc_type(size_t n) { return fftw_alloc_real_type(n); } template <> inline fftwf_complex* fftw_alloc_type(size_t n) { return fftw_alloc_complex_type(n); } template <> inline fftw_complex* 
fftw_alloc_type(size_t n) { return fftw_alloc_complex_type(n); } template <> inline rocfft_complex* fftw_alloc_type>(size_t n) { return (rocfft_complex*)fftw_alloc_complex_type(n); } template <> inline rocfft_complex* fftw_alloc_type>(size_t n) { return (rocfft_complex*)fftw_alloc_complex_type(n); } // Template wrappers for FFTW plan executors: template inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan); template <> inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan) { return fftwf_execute(plan); } template <> inline void fftw_execute_type(typename fftw_trait::fftw_plan_type plan) { return fftw_execute(plan); } // Template wrappers for FFTW plan destroyers: template inline void fftw_destroy_plan_type(Tfftw_plan plan); template <> inline void fftw_destroy_plan_type(fftwf_plan plan) { return fftwf_destroy_plan(plan); } template <> inline void fftw_destroy_plan_type(fftw_plan plan) { return fftw_destroy_plan(plan); } // Template wrappers for FFTW c2c planners: template inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, typename fftw_trait::fftw_complex_type* out, int sign, unsigned flags); template <> inline typename fftw_trait<_Float16>::fftw_plan_type fftw_plan_guru64_dft<_Float16>(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait<_Float16>::fftw_complex_type* in, typename fftw_trait<_Float16>::fftw_complex_type* out, int sign, unsigned flags) { return fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, typename fftw_trait::fftw_complex_type* out, int sign, unsigned flags) { return 
fftwf_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_dft(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, typename fftw_trait::fftw_complex_type* out, int sign, unsigned flags) { return fftw_plan_guru64_dft(rank, dims, howmany_rank, howmany_dims, in, out, sign, flags); } // Template wrappers for FFTW c2c executors: template inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out); template <> inline void fftw_plan_execute_c2c<_Float16>(typename fftw_trait<_Float16>::fftw_plan_type plan, std::vector& in, std::vector& out) { // since FFTW does not natively support half precision, convert // input to single, execute, then convert output back to half auto in_single = half_to_single_copy(in.front()); fftwf_execute_dft(plan, reinterpret_cast(in_single.data()), reinterpret_cast(out.front().data())); single_to_half_inplace(out.front()); } template <> inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftwf_execute_dft(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } template <> inline void fftw_plan_execute_c2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftw_execute_dft(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } // Template wrappers for FFTW r2c planners: template inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_r2c(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, Tfloat* in, typename fftw_trait::fftw_complex_type* out, unsigned flags); template <> inline typename fftw_trait<_Float16>::fftw_plan_type fftw_plan_guru64_r2c<_Float16>(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* 
howmany_dims, _Float16* in, typename fftw_trait<_Float16>::fftw_complex_type* out, unsigned flags) { return fftwf_plan_guru64_dft_r2c( rank, dims, howmany_rank, howmany_dims, reinterpret_cast(in), out, flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_r2c(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, float* in, typename fftw_trait::fftw_complex_type* out, unsigned flags) { return fftwf_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_r2c(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, double* in, typename fftw_trait::fftw_complex_type* out, unsigned flags) { return fftw_plan_guru64_dft_r2c(rank, dims, howmany_rank, howmany_dims, in, out, flags); } // Template wrappers for FFTW r2c executors: template inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out); template <> inline void fftw_plan_execute_r2c<_Float16>(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { // since FFTW does not natively support half precision, convert // input to single, execute, then convert output back to half auto in_single = half_to_single_copy(in.front()); fftwf_execute_dft_r2c(plan, reinterpret_cast(in_single.data()), reinterpret_cast(out.front().data())); single_to_half_inplace(out.front()); } template <> inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftwf_execute_dft_r2c(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } template <> inline void fftw_plan_execute_r2c(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftw_execute_dft_r2c(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } // Template wrappers for FFTW c2r planners: 
template inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_c2r(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, Tfloat* out, unsigned flags); template <> inline typename fftw_trait<_Float16>::fftw_plan_type fftw_plan_guru64_c2r<_Float16>(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait<_Float16>::fftw_complex_type* in, _Float16* out, unsigned flags) { return fftwf_plan_guru64_dft_c2r( rank, dims, howmany_rank, howmany_dims, in, reinterpret_cast(out), flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_c2r(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, float* out, unsigned flags) { return fftwf_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); } template <> inline typename fftw_trait::fftw_plan_type fftw_plan_guru64_c2r(int rank, const fftw_iodim64* dims, int howmany_rank, const fftw_iodim64* howmany_dims, typename fftw_trait::fftw_complex_type* in, double* out, unsigned flags) { return fftw_plan_guru64_dft_c2r(rank, dims, howmany_rank, howmany_dims, in, out, flags); } // Template wrappers for FFTW c2r executors: template inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out); template <> inline void fftw_plan_execute_c2r<_Float16>(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { // since FFTW does not natively support half precision, convert // input to single, execute, then convert output back to half auto in_single = half_to_single_copy(in.front()); fftwf_execute_dft_c2r(plan, reinterpret_cast(in_single.data()), reinterpret_cast(out.front().data())); single_to_half_inplace(out.front()); } template <> inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, std::vector& 
in, std::vector& out) { fftwf_execute_dft_c2r(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } template <> inline void fftw_plan_execute_c2r(typename fftw_trait::fftw_plan_type plan, std::vector& in, std::vector& out) { fftw_execute_dft_c2r(plan, reinterpret_cast(in.front().data()), reinterpret_cast(out.front().data())); } #ifdef FFTW_HAVE_SPRINT_PLAN // Template wrappers for FFTW print plan: template inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan); template <> inline char* fftw_sprint_plan<_Float16>(const typename fftw_trait<_Float16>::fftw_plan_type plan) { return fftwf_sprint_plan(plan); } template <> inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) { return fftwf_sprint_plan(plan); } template <> inline char* fftw_sprint_plan(const typename fftw_trait::fftw_plan_type plan) { return fftw_sprint_plan(plan); } #endif #endif rocFFT-rocm-5.7.1/clients/tests/gtest_main.cpp000066400000000000000000000454561446473624700213050ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. /// @file /// @brief googletest based unit tester for rocfft /// #include #include #include #include #include #include #include #include #include "../../shared/concurrency.h" #include "../../shared/environment.h" #include "../../shared/work_queue.h" #include "rocfft.h" #include "rocfft_accuracy_test.h" #include "test_params.h" #ifdef WIN32 #include #else #include #endif #include namespace po = boost::program_options; // Control output verbosity: int verbose; // User-defined random seed size_t random_seed; // Probability of running individual planar FFTs double planar_prob; // Probability of running individual callback FFTs double callback_prob; // Transform parameters for manual test: fft_params manual_params; // Host memory limitation for tests (GiB): size_t ramgb; // Device memory limitation for tests (GiB): size_t vramgb; // Allow skipping tests if there is a runtime error bool skip_runtime_fails; // But count the number of failures int n_hip_failures = 0; // Manually specified precision cutoffs: double half_epsilon; double single_epsilon; double double_epsilon; // Measured precision cutoffs: double max_linf_eps_double = 0.0; double max_l2_eps_double = 0.0; double max_linf_eps_single = 0.0; double max_l2_eps_single = 0.0; double max_linf_eps_half = 0.0; double max_l2_eps_half = 0.0; // Control whether we use FFTW's wisdom (which we use to imply FFTW_MEASURE). 
bool use_fftw_wisdom = false; // Cache the last cpu fft that was requested last_cpu_fft_cache last_cpu_fft_data; system_memory get_system_memory() { system_memory memory_data; #ifdef WIN32 MEMORYSTATUSEX info; info.dwLength = sizeof(info); if(!GlobalMemoryStatusEx(&info)) return memory_data; memory_data.total_bytes = info.ullTotalPhys; memory_data.free_bytes = info.ullAvailPhys; #else struct sysinfo info; if(sysinfo(&info) != 0) return memory_data; memory_data.total_bytes = info.totalram * info.mem_unit; memory_data.free_bytes = info.freeram * info.mem_unit; #endif return memory_data; } system_memory start_memory = get_system_memory(); void precompile_test_kernels(const std::string& precompile_file) { std::cout << "precompiling test kernels...\n"; WorkQueue tokenQueue; std::vector tokens; auto ut = testing::UnitTest::GetInstance(); for(int ts_index = 0; ts_index < ut->total_test_suite_count(); ++ts_index) { const auto ts = ut->GetTestSuite(ts_index); // skip disabled suites if(strncmp(ts->name(), "DISABLED", 8) == 0) continue; for(int ti_index = 0; ti_index < ts->total_test_count(); ++ti_index) { const auto ti = ts->GetTestInfo(ti_index); std::string name = ti->name(); // only care about accuracy tests if(name.find("vs_fftw/") != std::string::npos) { name.erase(0, 8); // change batch to 1, so we don't waste time creating // multiple plans that differ only by batch auto idx = name.find("_batch_"); if(idx == std::string::npos) continue; // advance idx to batch number idx += 7; auto end = name.find('_', idx); if(end == std::string::npos) continue; name.replace(idx, end - idx, "1"); tokens.emplace_back(std::move(name)); } } } std::random_device dev; std::mt19937 dist(dev()); std::shuffle(tokens.begin(), tokens.end(), dist); auto precompile_begin = std::chrono::steady_clock::now(); std::cout << "precompiling " << tokens.size() << " FFT plans...\n"; for(auto&& t : tokens) tokenQueue.push(std::move(t)); EnvironmentSetTemp env_compile_only{"ROCFFT_INTERNAL_COMPILE_ONLY", 
"1"}; const size_t NUM_THREADS = rocfft_concurrency(); std::vector threads; for(size_t i = 0; i < NUM_THREADS; ++i) { threads.emplace_back([&tokenQueue]() { for(;;) { std::string token{tokenQueue.pop()}; if(token.empty()) break; try { rocfft_params params_forward; params_forward.from_token(token); params_forward.validate(); params_forward.setup_structs(); params_forward.free(); rocfft_params params_inverse; params_inverse.inverse_from_forward(params_forward); params_inverse.validate(); params_inverse.setup_structs(); } catch(std::exception& e) { // failed to create a plan, abort // // we could continue on, but the test should just // fail later anyway in the same way. so report // which token failed early and get out throw std::runtime_error(token + " plan creation failure: " + e.what()); } } }); // insert empty tokens to tell threads to stop tokenQueue.push({}); } for(auto& t : threads) t.join(); auto precompile_end = std::chrono::steady_clock::now(); std::chrono::duration precompile_ms = precompile_end - precompile_begin; std::cout << "done precompiling FFT plans in " << static_cast(precompile_ms.count()) << " ms\n"; } int main(int argc, char* argv[]) { // We would like to parse a few arguments before initiating gtest. 
po::options_description opdesc( "\n" "rocFFT Runtime Test command line options\n" "NB: input parameters are row-major.\n" "\n" "FFTW accuracy test cases are named using these identifiers:\n" "\n" " len_: problem dimensions, row-major\n" " single,double: precision\n" " ip,op: in-place or out-of-place\n" " batch_: batch size\n" " istride__: input stride (ostride for output stride), format may be:\n" " CI - complex interleaved\n" " CP - complex planar\n" " R - real\n" " HI - hermitian interleaved\n" " HP - hermitian planar\n" "\n" "Usage"); // clang-format off opdesc.add_options() ("verbose,v", po::value()->default_value(0), "print out detailed information for the tests.") ("seed", po::value(&random_seed), "Random seed; if unset, use an actual random seed.") ("planar_prob", po::value(&planar_prob)->default_value(0.1), "Probability of running individual planar transforms") ("callback_prob", po::value(&callback_prob)->default_value(0.1), "Probability of running individual callback transforms"); // clang-format on po::variables_map vm; po::store(po::command_line_parser(argc, argv).options(opdesc).allow_unregistered().run(), vm); po::notify(vm); verbose = vm["verbose"].as(); // NB: If we initialize gtest first, then it removes all of its own command-line // arguments and sets argc and argv correctly; no need to jump through hoops for // boost::program_options. ::testing::InitGoogleTest(&argc, argv); // Filename for fftw and fftwf wisdom. std::string fftw_wisdom_filename; // Token string to fully specify fft params for the manual test. std::string test_token; // Filename for precompiled kernels to be written to std::string precompile_file; // Declare the supported options. 
// clang-format doesn't handle boost program options very well: // clang-format off opdesc.add_options() ("help,h", "produces this help message") ("skip_runtime_fails", po::value(&skip_runtime_fails)->default_value(true), "Skip the test if there is a runtime failure.") ("version", "Print queryable version information from the rocfft library and exit") ("transformType,t", po::value(&manual_params.transform_type) ->default_value(fft_transform_type_complex_forward), "Type of transform:\n0) complex forward\n1) complex inverse\n2) real " "forward\n3) real inverse") ("notInPlace,o", "Not in-place FFT transform (default: in-place)") ("callback", "Inject load/store callbacks") ("checkstride", "Check that data is not written outside of output strides") ("double", "Double precision transform (deprecated: use --precision double)") ("precision", po::value(&manual_params.precision), "Transform precision: single (default), double, half") ( "itype", po::value(&manual_params.itype) ->default_value(fft_array_type_unset), "Array type of input data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ( "otype", po::value(&manual_params.otype) ->default_value(fft_array_type_unset), "Array type of output data:\n0) interleaved\n1) planar\n2) real\n3) " "hermitian interleaved\n4) hermitian planar") ("length", po::value>(&manual_params.length)->multitoken(), "Lengths.") ( "batchSize,b", po::value(&manual_params.nbatch)->default_value(1), "If this value is greater than one, arrays will be used ") ("istride", po::value>(&manual_params.istride)->multitoken(), "Input stride.") ("ostride", po::value>(&manual_params.ostride)->multitoken(), "Output stride.") ("idist", po::value(&manual_params.idist)->default_value(0), "Logical distance between input batches.") ("odist", po::value(&manual_params.odist)->default_value(0), "Logical distance between output batches.") ("ioffset", po::value>(&manual_params.ioffset)->multitoken(), "Input offset.") ("ooffset", 
po::value>(&manual_params.ooffset)->multitoken(), "Output offset.") ("isize", po::value>(&manual_params.isize)->multitoken(), "Logical size of input buffer.") ("osize", po::value>(&manual_params.osize)->multitoken(), "Logical size of output.") ("R", po::value(&ramgb)->default_value((start_memory.total_bytes + ONE_GiB - 1) / ONE_GiB), "Ram limit in GiB for tests.") ("V", po::value(&vramgb)->default_value(0), "vram limit in GiB for tests.") ("half_epsilon", po::value(&half_epsilon)->default_value(9.77e-4)) ("single_epsilon", po::value(&single_epsilon)->default_value(3.75e-5)) ("double_epsilon", po::value(&double_epsilon)->default_value(1e-15)) ("wise,w", "use FFTW wisdom") ("wisdomfile,W", po::value(&fftw_wisdom_filename)->default_value("wisdom3.txt"), "FFTW3 wisdom filename") ("scalefactor", po::value(&manual_params.scale_factor), "Scale factor to apply to output.") ("token", po::value(&test_token)->default_value(""), "Test token name for manual test") ("precompile", po::value(&precompile_file), "Precompile kernels to a file for all test cases before running tests"); // clang-format on po::store(po::parse_command_line(argc, argv, opdesc), vm); po::notify(vm); if(vm.count("help")) { std::cout << opdesc << "\n"; return 0; } if(vm.count("version")) { char v[256]; rocfft_get_version_string(v, 256); std::cout << "version " << v << "\n"; return EXIT_SUCCESS; } std::cout << "half epsilon: " << half_epsilon << "\tsingle epsilon: " << single_epsilon << "\tdouble epsilon: " << double_epsilon << "\n"; if(!vm.count("seed")) { std::random_device dev; random_seed = dev(); } std::cout << "Random seed: " << random_seed << "\n"; if(vm.count("wise")) { use_fftw_wisdom = true; } // if precompiling, tell rocFFT to use the specified cache file // to write kernels to // // but if our environment already has a cache file for RTC, then // we should just use that std::unique_ptr env_precompile; if(!precompile_file.empty() && rocfft_getenv("ROCFFT_RTC_CACHE_PATH").empty()) { env_precompile = 
std::make_unique("ROCFFT_RTC_CACHE_PATH", precompile_file.c_str()); } rocfft_setup(); char v[256]; rocfft_get_version_string(v, 256); std::cout << "rocFFT version: " << v << "\n"; #ifdef FFTW_MULTITHREAD fftw_init_threads(); fftwf_init_threads(); fftw_plan_with_nthreads(rocfft_concurrency()); fftwf_plan_with_nthreads(rocfft_concurrency()); #endif if(use_fftw_wisdom) { if(verbose) { std::cout << "Using " << fftw_wisdom_filename << " wisdom file\n"; } std::ifstream fftw_wisdom_file(fftw_wisdom_filename); std::string allwisdom = std::string(std::istreambuf_iterator(fftw_wisdom_file), std::istreambuf_iterator()); std::string fftw_wisdom; std::string fftwf_wisdom; bool load_wisdom = false; bool load_fwisdom = false; std::istringstream input; input.str(allwisdom); // Separate the single-precision and double-precision wisdom: for(std::string line; std::getline(input, line);) { if(line.rfind("(fftw", 0) == 0 && line.find("fftw_wisdom") != std::string::npos) { load_wisdom = true; } if(line.rfind("(fftw", 0) == 0 && line.find("fftwf_wisdom") != std::string::npos) { load_fwisdom = true; } if(load_wisdom) { fftw_wisdom.append(line + "\n"); } if(load_fwisdom) { fftwf_wisdom.append(line + "\n"); } if(line.rfind(")", 0) == 0) { load_wisdom = false; load_fwisdom = false; } } fftw_import_wisdom_from_string(fftw_wisdom.c_str()); fftwf_import_wisdom_from_string(fftwf_wisdom.c_str()); } if(test_token != "") { std::cout << "Reading fft params from token:\n" << test_token << "\n"; try { manual_params.from_token(test_token); } catch(...) { std::cout << "Unable to parse token.\n"; return 1; } } else { if(manual_params.length.empty()) { manual_params.length.push_back(8); // TODO: add random size? } manual_params.placement = vm.count("notInPlace") ? 
fft_placement_notinplace : fft_placement_inplace; if(vm.count("double")) manual_params.precision = fft_precision_double; if(vm.count("callback")) { manual_params.run_callbacks = true; } if(vm.count("checkstride")) { manual_params.check_output_strides = true; } if(manual_params.istride.empty()) { manual_params.istride.push_back(1); // TODO: add random size? } if(manual_params.ostride.empty()) { manual_params.ostride.push_back(1); // TODO: add random size? } } if(vm.count("precompile")) precompile_test_kernels(precompile_file); auto retval = RUN_ALL_TESTS(); if(use_fftw_wisdom) { std::string fftw_wisdom = std::string(fftw_export_wisdom_to_string()); std::string fftwf_wisdom = std::string(fftwf_export_wisdom_to_string()); fftw_wisdom.append(std::string(fftwf_export_wisdom_to_string())); std::ofstream fftw_wisdom_file(fftw_wisdom_filename); fftw_wisdom_file << fftw_wisdom; fftw_wisdom_file << fftwf_wisdom; fftw_wisdom_file.close(); } rocfft_cleanup(); std::cout << "Random seed: " << random_seed << "\n"; std::cout << "half precision max l-inf epsilon: " << max_linf_eps_half << "\n"; std::cout << "half precision max l2 epsilon: " << max_l2_eps_half << "\n"; std::cout << "single precision max l-inf epsilon: " << max_linf_eps_single << "\n"; std::cout << "single precision max l2 epsilon: " << max_l2_eps_single << "\n"; std::cout << "double precision max l-inf epsilon: " << max_linf_eps_double << "\n"; std::cout << "double precision max l2 epsilon: " << max_l2_eps_double << "\n"; std::cout << "Number of runtime issues: " << n_hip_failures << "\n"; return retval; } TEST(manual, vs_fftw) // MANUAL TESTS HERE { // Run an individual test using the provided command-line parameters. 
manual_params.validate(); std::cout << "Manual test:" << "\n\t" << manual_params.str("\n\t") << "\n"; std::cout << "Token: " << manual_params.token() << "\n"; if(!manual_params.valid(verbose + 2)) { std::cout << "manual params are not valid\n"; } rocfft_params params(manual_params); fft_vs_reference(params); } rocFFT-rocm-5.7.1/clients/tests/hermitian_test.cpp000066400000000000000000000263361446473624700221660ustar00rootroot00000000000000// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/gpubuf.h"
#include "../../shared/rocfft_params.h"
#include "../samples/rocfft/examplekernels.h"
#include "../samples/rocfft/exampleutils.h"
#include "accuracy_test.h"
#include "rocfft.h"
#include <algorithm>
#include <cmath>
#include <gtest/gtest.h>
#include <hip/hip_runtime_api.h>
#include <iostream>
#include <random>
#include <sstream>
#include <string>
#include <vector>

// Run two 1D double-precision C2R transforms of the given length with the same plan:
//  * once on random complex input, and
//  * once on the same input with the imaginary part of the DC mode (and of the
//    Nyquist mode, if the length is even) forced to zero.
// The transform should tolerate a non-zero imaginary part in those modes, so the
// two outputs are expected to be bit-for-bit identical.
void run_1D_hermitian_test(size_t length)
{
    rocfft_params p;
    p.length         = {length};
    p.precision      = fft_precision_double;
    p.transform_type = fft_transform_type_real_inverse;
    p.placement      = fft_placement_notinplace;
    p.validate();

    if(verbose)
    {
        std::cout << p.str("\n\t") << std::endl;
    }

    ASSERT_TRUE(p.valid(verbose));

    // NOTE(review): element type assumed to be rocfft_complex<double> (the code
    // only needs .x/.y members and a two-double layout) -- confirm.
    std::vector<rocfft_complex<double>> h_input(p.isize[0]);

    std::random_device                     rd;
    std::mt19937                           gen(rd());
    std::uniform_real_distribution<double> dis(0.0, 1.0);

    for(auto& val : h_input)
    {
        val.x = dis(gen);
        val.y = dis(gen);
    }

    if(verbose > 2)
    {
        std::cout << "non-Hermitian input:";
        for(const auto& val : h_input)
        {
            std::cout << " "
                      << "(" << val.x << ", " << val.y << ")";
        }
        std::cout << std::endl;
    }

    gpubuf ibuf;
    ASSERT_TRUE(ibuf.alloc(p.ibuffer_sizes()[0]) == hipSuccess);
    ASSERT_TRUE(hipMemcpy(ibuf.data(), h_input.data(), ibuf.size(), hipMemcpyHostToDevice)
                == hipSuccess);

    gpubuf obuf;
    ASSERT_TRUE(obuf.alloc(p.obuffer_sizes()[0]) == hipSuccess);

    ASSERT_TRUE(p.create_plan() == fft_status_success);

    std::vector<void*> pibuf = {ibuf.data()};
    std::vector<void*> pobuf = {obuf.data()};

    // First pass: raw random (non-Hermitian) input.
    ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success);

    std::vector<double> h_output(p.osize[0]);
    ASSERT_TRUE(hipMemcpy(h_output.data(), obuf.data(), obuf.size(), hipMemcpyDeviceToHost)
                == hipSuccess);
    ASSERT_TRUE(hipDeviceSynchronize() == hipSuccess);

    if(verbose > 2)
    {
        std::cout << "output:";
        for(const auto& val : h_output)
        {
            std::cout << " " << val;
        }
        std::cout << std::endl;
    }

    std::vector<rocfft_complex<double>> h_input1(p.isize[0]);
    std::copy(h_input.begin(), h_input.end(), h_input1.begin());

    // Impose Hermitian symmetry on the input:
    h_input1[0].y = 0.0;
    if(p.length[0] % 2 == 0)
    {
        h_input1.back().y = 0.0;
    }

    if(verbose > 2)
    {
        std::cout << "Hermitian input:";
        for(const auto& val : h_input1)
        {
            std::cout << " "
                      << "(" << val.x << ", " << val.y << ")";
        }
        std::cout << std::endl;
    }

    // Sanity check: the two inputs must actually differ, otherwise the
    // comparison of the outputs below proves nothing.
    double maxdiff = 0.0;
    for(size_t i = 0; i < h_input.size(); ++i)
    {
        auto val = std::abs(rocfft_complex<double>(h_input[i].x - h_input1[i].x,
                                                   h_input[i].y - h_input1[i].y));
        if(val > maxdiff)
            maxdiff = val;
    }
    ASSERT_TRUE(maxdiff > 0.0);

    // Second pass: symmetrized input, same plan and buffers.
    ASSERT_TRUE(hipMemcpy(ibuf.data(), h_input1.data(), ibuf.size(), hipMemcpyHostToDevice)
                == hipSuccess);
    ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success);

    std::vector<double> h_output1(p.osize[0]);
    ASSERT_TRUE(hipMemcpy(h_output1.data(), obuf.data(), obuf.size(), hipMemcpyDeviceToHost)
                == hipSuccess);

    if(verbose > 2)
    {
        std::cout << "output:";
        for(const auto& val : h_output1)
        {
            std::cout << " " << val;
        }
        std::cout << std::endl;
    }

    double maxerr = 0;
    for(size_t i = 0; i < h_output.size(); ++i)
    {
        auto val = std::abs(h_output[i] - h_output1[i]);
        if(val > maxerr)
            maxerr = val;
    }

    if(verbose)
        std::cout << maxerr << std::endl;

    EXPECT_TRUE(maxerr == 0.0);
}

// test a case that's small enough that it only needs one kernel
TEST(rocfft_UnitTest, 1D_hermitian_single_small)
{
    run_1D_hermitian_test(8);
}

// test a case that's big enough that it needs multiple kernels
TEST(rocfft_UnitTest, 1D_hermitian_single_large)
{
    run_1D_hermitian_test(8192);
}

// Join the elements of the range [begin, end) into a ", "-separated string.
// Returns an empty string for an empty range.
template <typename T>
std::string str(T begin, T end)
{
    std::stringstream ss;
    bool              first = true;
    for(; begin != end; ++begin)
    {
        if(!first)
            ss << ", ";
        ss << *begin;
        first = false;
    }
    return ss.str();
}

// Test that the GPU Hermitian symmetrizer code produces the correct results.
TEST(rocfft_UnitTest, gpu_symmetrizer)
{
    // NOTE(review): the sample helpers (init_hermitiancomplex_cm, initreal_cm,
    // impose_hermitian_symmetry_cm, ...) are called exactly as they appear in this
    // file; confirm whether they need explicit template arguments or pointer casts.
    const std::vector<std::vector<size_t>> lengths = {{4, 4, 3},
                                                      {5},
                                                      {8},
                                                      {5, 5},
                                                      {5, 8},
                                                      {8, 5},
                                                      {8, 8},
                                                      {5, 5, 5},
                                                      {8, 5, 5},
                                                      {5, 8, 5},
                                                      {5, 5, 8},
                                                      {5, 8, 8},
                                                      {8, 5, 8},
                                                      {8, 8, 5},
                                                      {8, 8, 8}};

    for(const auto& length : lengths)
    {
        // Symmetrize complex data and ensure that the checker sees that it's symmetric.

        // Use the params class to set up strides and lengths:
        rocfft_params p;
        p.length         = length;
        p.precision      = fft_precision_double;
        p.transform_type = fft_transform_type_real_inverse;
        p.placement      = fft_placement_notinplace;
        p.validate();

        if(verbose)
        {
            std::cout << "\t" << p.str("\n\t") << std::endl;
        }

        ASSERT_TRUE(p.valid(verbose));

        // Data buffers:
        gpubuf buf;
        ASSERT_TRUE(buf.alloc(sizeof(hipDoubleComplex) * p.isize[0]) == hipSuccess);
        std::vector<hipDoubleComplex> hbuf(p.isize[0]);

        // Initialize a Hermitian-symmetric array; it should be symmetric.
        init_hermitiancomplex_cm(p.length_cm(), p.ilength_cm(), p.istride_cm(), buf.data());

        ASSERT_TRUE(hipMemcpy(hbuf.data(), buf.data(), buf.size(), hipMemcpyDeviceToHost)
                    == hipSuccess);

        if(verbose > 1)
        {
            printbuffer_cm(hbuf, p.ilength_cm(), p.istride_cm(), p.nbatch, p.idist);
        }

        EXPECT_TRUE(
            check_symmetry_cm(hbuf, p.length_cm(), p.istride_cm(), p.nbatch, p.idist, verbose > 0))
            << "length: " << str(length.begin(), length.end());

        // This should not be symmetric: overwrite the host copy with random
        // values drawn from a fixed seed and make sure the checker rejects it.
        std::mt19937_64 rng;
        std::seed_seq   ss{uint32_t(10)};
        rng.seed(ss);
        std::uniform_real_distribution<double> unif(0, 1);
        for(auto& v : hbuf)
        {
            v.x = unif(rng);
            v.y = unif(rng);
        }

        if(verbose > 2)
        {
            printbuffer_cm(hbuf, p.ilength_cm(), p.istride_cm(), p.nbatch, p.idist);
        }

        EXPECT_TRUE(
            !check_symmetry_cm(hbuf, p.length_cm(), p.istride_cm(), p.nbatch, p.idist, false))
            << "length: " << str(length.begin(), length.end());
    }

    for(const auto& length : lengths)
    {
        // Generate Hermitian-symmetric data and ensure that applying the symmetrizer has no effect.
        // The symmetric data is produced by a real-to-complex forward transform.
        rocfft_params p;
        p.length         = length;
        p.precision      = fft_precision_double;
        p.transform_type = fft_transform_type_real_forward;
        p.placement      = fft_placement_notinplace;
        p.validate();

        if(verbose)
        {
            std::cout << "\t" << p.str("\n\t") << std::endl;
        }

        ASSERT_TRUE(p.valid(verbose));

        ASSERT_TRUE(p.create_plan() == fft_status_success);

        gpubuf ibuf, obuf;
        ASSERT_TRUE(ibuf.alloc(p.ibuffer_sizes()[0]) == hipSuccess);
        ASSERT_TRUE(obuf.alloc(p.obuffer_sizes()[0]) == hipSuccess);

        initreal_cm(p.length_cm(), p.istride_cm(), ibuf.data());

        std::vector<void*> pibuf = {ibuf.data()};
        std::vector<void*> pobuf = {obuf.data()};

        ASSERT_TRUE(p.execute(pibuf.data(), pobuf.data()) == fft_status_success);

        // Snapshot of the transform output before re-symmetrization.
        std::vector<hipDoubleComplex> h_output(p.osize[0]);
        std::fill(h_output.begin(), h_output.end(), hipDoubleComplex{0.0, 0.0});
        ASSERT_TRUE(
            hipMemcpy(h_output.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost)
            == hipSuccess);

        impose_hermitian_symmetry_cm(p.length_cm(), p.olength_cm(), p.ostride_cm(), obuf.data());

        // Snapshot of the same device buffer after re-symmetrization.
        std::vector<hipDoubleComplex> h_output_resym(p.osize[0]);
        std::fill(h_output_resym.begin(), h_output_resym.end(), hipDoubleComplex{0.0, 0.0});
        ASSERT_TRUE(
            hipMemcpy(
                h_output_resym.data(), obuf.data(), p.obuffer_sizes()[0], hipMemcpyDeviceToHost)
            == hipSuccess);

        double maxdiff = 0;
        for(size_t i = 0; i < h_output.size(); ++i)
        {
            auto rdiff = std::abs(h_output[i].x - h_output_resym[i].x);
            auto idiff = std::abs(h_output[i].y - h_output_resym[i].y);
            maxdiff    = std::max({maxdiff, rdiff, idiff});
        }

        if(verbose)
        {
            std::cout << "maxdiff: " << maxdiff << std::endl;
        }

        if(verbose > 2)
        {
            std::cout << "before symmetrization:\n";
            printbuffer_cm(h_output, p.olength_cm(), p.ostride_cm(), p.nbatch, p.odist);
            std::cout << "after symmetrization:\n";
            printbuffer_cm(h_output_resym, p.olength_cm(), p.ostride_cm(), p.nbatch, p.odist);
        }

        EXPECT_TRUE(maxdiff < 1e-13) << maxdiff << "\n" << p.str() << "\n";
    }
}
rocFFT-rocm-5.7.1/clients/tests/hipGraph_test.cpp000066400000000000000000000333001446473624700217350ustar00rootroot00000000000000// 
Copyright (C) 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/arithmetic.h"
#include "../../shared/gpubuf.h"
#include "../../shared/rocfft_params.h"
#include "accuracy_test.h"
#include "rocfft.h"
#include "rocfft_against_fftw.h"
#include <algorithm>
#include <cmath>
#include <gtest/gtest.h>
#include <hip/hip_runtime.h>
#include <new>
#include <random>
#include <vector>

// Number of threads per block used by every helper kernel in this file.
static const unsigned int KERNEL_THREADS = 64;

// Multiply both components of each of the first `length` elements by `scale`.
__global__ void scale_data_kernel(rocfft_complex<float>* data, size_t length, float scale)
{
    const auto idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx < length)
    {
        data[idx].x *= scale;
        data[idx].y *= scale;
    }
}

// Add `offset` component-wise to each of the first `length` complex elements.
template <typename T>
__global__ void offset_data_kernel_complex(T* data, size_t length, T offset)
{
    const auto idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx < length)
    {
        data[idx].x += offset.x;
        data[idx].y += offset.y;
    }
}

// Add `offset` to each of the first `length` scalar elements.
template <typename T>
__global__ void offset_data_kernel_real(T* data, size_t length, T offset)
{
    const auto idx = blockIdx.x * blockDim.x + threadIdx.x;

    if(idx < length)
    {
        data[idx] += offset;
    }
}

// Fill host_data with N pseudo-random complex values in [-1, 1) generated from
// `seed`, allocate device_data and copy the values to it.
// Throws std::bad_alloc if the device allocation fails.
static void init_input_data(size_t                              N,
                            size_t                              seed,
                            std::vector<rocfft_complex<float>>& host_data,
                            gpubuf_t<rocfft_complex<float>>&    device_data)
{
    std::minstd_rand                      gen(seed);
    std::uniform_real_distribution<float> dist(-1.0f, 1.0f);

    host_data.resize(N);
    for(size_t i = 0; i < N; i++)
    {
        host_data[i].x = dist(gen);
        host_data[i].y = dist(gen);
    }

    const size_t Nbytes = N * sizeof(rocfft_complex<float>);

    if(device_data.alloc(Nbytes) != hipSuccess)
        throw std::bad_alloc();

    EXPECT_EQ(hipMemcpy(device_data.data(), host_data.data(), Nbytes, hipMemcpyHostToDevice),
              hipSuccess);
}

// Fill host_data with N copies of init_val, allocate device_data and copy the
// values to it. Throws std::bad_alloc if the device allocation fails.
template <typename T>
static void init_data(size_t N, T init_val, std::vector<T>& host_data, gpubuf_t<T>& device_data)
{
    host_data.resize(N);
    std::fill(host_data.begin(), host_data.end(), init_val);

    const size_t Nbytes = N * sizeof(T);

    if(device_data.alloc(Nbytes) != hipSuccess)
        throw std::bad_alloc();

    EXPECT_EQ(hipMemcpy(device_data.data(), host_data.data(), Nbytes, hipMemcpyHostToDevice),
              hipSuccess);
}

// Create a 1D, length-N, single-precision, out-of-place complex forward plan.
static void create_forward_fft_plan(size_t N, rocfft_plan& plan)
{
    auto dim = 1;

    std::vector<size_t> lengths(dim, N);

    ASSERT_EQ(rocfft_plan_create(&plan,
                                 rocfft_placement_notinplace,
                                 rocfft_transform_type_complex_forward,
                                 rocfft_precision_single,
                                 dim,
                                 lengths.data(),
                                 1,
                                 nullptr),
              rocfft_status_success);
}

// Create a 1D, length-N, single-precision, in-place complex inverse plan.
static void create_inverse_fft_plan(size_t N, rocfft_plan& plan_inv)
{
    auto dim = 1;

    std::vector<size_t> lengths(dim, N);

    ASSERT_EQ(rocfft_plan_create(&plan_inv,
                                 rocfft_placement_inplace,
                                 rocfft_transform_type_complex_inverse,
                                 rocfft_precision_single,
                                 dim,
                                 lengths.data(),
                                 1,
                                 nullptr),
              rocfft_status_success);
}

// Create an execution info object and bind it to `stream`.
// The caller owns `info` and must destroy it.
static void set_fft_info(hipStream_t stream, rocfft_execution_info& info)
{
    EXPECT_EQ(rocfft_execution_info_create(&info), rocfft_status_success);
    EXPECT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success);
}

// Execute the forward plan from in_ptr to out_ptr.
static void run_forward_fft(rocfft_execution_info info,
                            const rocfft_plan     plan,
                            void*                 in_ptr,
                            void*                 out_ptr)
{
    ASSERT_EQ(rocfft_execute(plan, &in_ptr, &out_ptr, info), rocfft_status_success);
}

// Execute the inverse plan; the test calls this with out_ptr == nullptr because
// the inverse plan is in-place on in_ptr.
static void run_inverse_fft(rocfft_execution_info info,
                            const rocfft_plan     plan_inv,
                            void*                 in_ptr,
                            void*                 out_ptr)
{
    // Execute inverse plan in-place
    ASSERT_EQ(rocfft_execute(plan_inv, &in_ptr, &out_ptr, info), rocfft_status_success);
}

// Enqueue scale_data_kernel on `stream` over N elements of `data`.
static void scale_device_data(hipStream_t stream, float scale, size_t N, rocfft_complex<float>* data)
{
    auto blockSize = KERNEL_THREADS;
    auto numBlocks = DivRoundingUp<size_t>(N, blockSize);
    hipLaunchKernelGGL(scale_data_kernel,
                       dim3(numBlocks),
                       dim3(blockSize),
                       0, // sharedMemBytes
                       stream, // stream
                       data,
                       N,
                       scale);
}

// Enqueue offset_data_kernel_real<T> on `stream` over N elements of `data`.
template <typename T>
static void offset_device_data_real(hipStream_t stream, T offset, size_t N, T* data)
{
    auto blockSize = KERNEL_THREADS;
    auto numBlocks = DivRoundingUp<size_t>(N, blockSize);
    hipLaunchKernelGGL(offset_data_kernel_real<T>,
                       dim3(numBlocks),
                       dim3(blockSize),
                       0, // sharedMemBytes
                       stream, // stream
                       data,
                       N,
                       offset);
}

// Enqueue offset_data_kernel_complex<T> on `stream` over N elements of `data`.
template <typename T>
static void offset_device_data_complex(hipStream_t stream, T offset, size_t N, T* data)
{
    auto blockSize = KERNEL_THREADS;
    auto numBlocks = DivRoundingUp<size_t>(N, blockSize);
    hipLaunchKernelGGL(offset_data_kernel_complex<T>,
                       dim3(numBlocks),
                       dim3(blockSize),
                       0, // sharedMemBytes
                       stream, // stream
                       data,
                       N,
                       offset);
}

// Copy device_data back to the host on other_stream and require that it is
// element-for-element equal to host_data.
template <typename T>
static void compare_data_exact_match(hipStream_t           other_stream,
                                     const std::vector<T>& host_data,
                                     const gpubuf_t<T>&    device_data)
{
    std::vector<T> host_data_compare(host_data.size());

    // Copy result back to host
    ASSERT_EQ(hipMemcpyAsync(host_data_compare.data(),
                             device_data.data(),
                             host_data_compare.size() * sizeof(T),
                             hipMemcpyDeviceToHost,
                             other_stream),
              hipSuccess);

    ASSERT_EQ(hipStreamSynchronize(other_stream), hipSuccess);

    ASSERT_EQ(host_data == host_data_compare, true);
}

// Copy modified_device_data back to the host and check that it matches
// original_host_data to within the tolerance expected of a forward + inverse
// single-precision transform pair.
static void compare_data(const std::vector<rocfft_complex<float>>& original_host_data,
                         const gpubuf_t<rocfft_complex<float>>&    modified_device_data)
{
    std::vector<rocfft_complex<float>> modified_host_data(original_host_data.size());

    // Copy result back to host
    ASSERT_EQ(hipMemcpy(modified_host_data.data(),
                        modified_device_data.data(),
                        modified_host_data.size() * sizeof(rocfft_complex<float>),
                        hipMemcpyDeviceToHost),
              hipSuccess);

    // Compare data we got to the original.
    // We're running 2 transforms (forward+inverse), so we
    // should tolerate 2x the error of a single transform.
    const double MAX_TRANSFORM_ERROR = 2 * type_epsilon<float>();

    auto input_norm
        = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(original_host_data.data()),
                       original_host_data.size(),
                       1,
                       1,
                       original_host_data.size(),
                       {0});
    auto diff = distance_1to1_complex(
        reinterpret_cast<const rocfft_complex<float>*>(original_host_data.data()),
        reinterpret_cast<const rocfft_complex<float>*>(modified_host_data.data()),
        // data is all contiguous, we can treat it as 1d
        original_host_data.size(),
        1,
        1,
        original_host_data.size(),
        1,
        modified_host_data.size(),
        nullptr,
        MAX_TRANSFORM_ERROR,
        {0},
        {0});

    EXPECT_LT(diff.l_2 / input_norm.l_2,
              sqrt(log2(original_host_data.size())) * MAX_TRANSFORM_ERROR);
    EXPECT_LT(diff.l_inf / input_norm.l_inf,
              log2(original_host_data.size()) * MAX_TRANSFORM_ERROR);
}

// Capture a sequence of helper kernels and a forward + inverse rocFFT transform
// pair into a HIP graph, check that capturing did not execute anything, then
// launch the graph several times and verify the data and the launch counter.
TEST(rocfft_UnitTest, DISABLED_hipGraph_execution)
{
    hipStream_t    stream       = nullptr;
    hipStream_t    other_stream = nullptr;
    hipGraph_t     graph        = nullptr;
    hipGraphExec_t graph_exec   = nullptr;

    size_t N    = 256;
    size_t seed = 100;

    auto  offset_1        = rocfft_complex<float>{.1, .1};
    auto  offset_2        = rocfft_complex<float>{-.1, -.1};
    float scale           = 2.2;
    float inv_scale       = 1. / scale;
    auto  output_init_val = rocfft_complex<float>(0., 0.);

    size_t num_kernel_launches = 100;
    size_t num_graph_launches  = 10;

    gpubuf_t<rocfft_complex<float>>    device_mem_in;
    std::vector<rocfft_complex<float>> host_mem_in;
    init_input_data(N, seed, host_mem_in, device_mem_in);
    rocfft_complex<float>* in_ptr = static_cast<rocfft_complex<float>*>(device_mem_in.data());

    gpubuf_t<rocfft_complex<float>>    device_mem_out;
    std::vector<rocfft_complex<float>> host_mem_out;
    init_data<rocfft_complex<float>>(N, output_init_val, host_mem_out, device_mem_out);
    rocfft_complex<float>* out_ptr = static_cast<rocfft_complex<float>*>(device_mem_out.data());

    gpubuf_t<size_t>    device_mem_counter;
    std::vector<size_t> host_mem_counter;
    init_data<size_t>(N, 0, host_mem_counter, device_mem_counter);
    size_t* counter_ptr = static_cast<size_t*>(device_mem_counter.data());

    // Handles start out null so that a failed creation inside a helper does
    // not leave them uninitialized.
    rocfft_plan plan = nullptr;
    create_forward_fft_plan(N, plan);

    rocfft_plan plan_inv = nullptr;
    create_inverse_fft_plan(N, plan_inv);

    EXPECT_EQ(hipDeviceSynchronize(), hipSuccess);

    ASSERT_EQ(hipStreamCreate(&stream), hipSuccess);
    ASSERT_EQ(hipStreamCreate(&other_stream), hipSuccess);

    ASSERT_EQ(hipStreamBeginCapture(stream, hipStreamCaptureModeGlobal), hipSuccess);

    rocfft_execution_info info = nullptr;
    set_fft_info(stream, info);

    // add offset to device input data
    for(size_t i = 0; i < num_kernel_launches; ++i)
        offset_device_data_complex<rocfft_complex<float>>(stream, offset_1, N, in_ptr);

    // back out the offsets
    for(size_t i = 0; i < num_kernel_launches; ++i)
        offset_device_data_complex<rocfft_complex<float>>(stream, offset_2, N, in_ptr);

    // scale the device input data
    scale_device_data(stream, scale, N, in_ptr);

    // backout the scale
    scale_device_data(stream, inv_scale, N, in_ptr);

    // run forward transform on input data
    run_forward_fft(info, plan, in_ptr, out_ptr);

    // scale the device output data
    scale_device_data(stream, scale, N, out_ptr);

    // backout the scale
    scale_device_data(stream, inv_scale, N, out_ptr);

    // run (in-place) inverse transform on output data
    run_inverse_fft(info, plan_inv, out_ptr, nullptr);

    // normalize results of an inverse transform, so it can be directly
    // compared to the original data before the forward transform
    auto inv_scale_N = 1. / static_cast<float>(N);
    scale_device_data(stream, inv_scale_N, N, out_ptr);

    // add offset to device output data
    for(size_t i = 0; i < num_kernel_launches; ++i)
        offset_device_data_complex<rocfft_complex<float>>(stream, offset_1, N, out_ptr);

    // back out the offsets
    for(size_t i = 0; i < num_kernel_launches; ++i)
        offset_device_data_complex<rocfft_complex<float>>(stream, offset_2, N, out_ptr);

    // increment counter
    offset_device_data_real<size_t>(stream, 1, N, counter_ptr);

    ASSERT_EQ(hipStreamEndCapture(stream, &graph), hipSuccess);

    // make sure no actual work has been done for
    // the captured stream before graph execution
    compare_data_exact_match<rocfft_complex<float>>(other_stream, host_mem_out, device_mem_out);

    ASSERT_EQ(hipGraphInstantiate(&graph_exec, graph, NULL, NULL, 0), hipSuccess);
    ASSERT_EQ(hipGraphDestroy(graph), hipSuccess);

    for(size_t i = 0; i < num_graph_launches; ++i)
        ASSERT_EQ(hipGraphLaunch(graph_exec, stream), hipSuccess);

    ASSERT_EQ(hipStreamSynchronize(stream), hipSuccess);
    ASSERT_EQ(hipStreamDestroy(stream), hipSuccess);

    // check for correctness of the output data
    compare_data(host_mem_in, device_mem_out);

    // check for correctness of the counter
    // incremented with multiple graph executions
    std::vector<size_t> host_mem_counter_modified(N);
    std::fill(
        host_mem_counter_modified.begin(), host_mem_counter_modified.end(), num_graph_launches);
    compare_data_exact_match<size_t>(other_stream, host_mem_counter_modified, device_mem_counter);

    ASSERT_EQ(hipStreamDestroy(other_stream), hipSuccess);

    // All graph launches have completed (stream was synchronized above), so
    // the executable graph and the rocFFT objects can be released.
    EXPECT_EQ(hipGraphExecDestroy(graph_exec), hipSuccess);
    EXPECT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success);
    EXPECT_EQ(rocfft_plan_destroy(plan), rocfft_status_success);
    EXPECT_EQ(rocfft_plan_destroy(plan_inv), rocfft_status_success);
}
rocFFT-rocm-5.7.1/clients/tests/misc/000077500000000000000000000000001446473624700173645ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/tests/misc/include/000077500000000000000000000000001446473624700210075ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/tests/misc/include/test_exception.h000066400000000000000000000006071446473624700242200ustar00rootroot00000000000000/******************************************************************************* * Copyright (C) 2016-2022 Advanced Micro Devices, Inc. All rights reserved. 
******************************************************************************/ #pragma once #ifndef __TEST_EXCEPTION__ #define __TEST_EXCEPTION__ #include void handle_exception(const std::exception& except); #endif rocFFT-rocm-5.7.1/clients/tests/misc/source/000077500000000000000000000000001446473624700206645ustar00rootroot00000000000000rocFFT-rocm-5.7.1/clients/tests/misc/source/test_exception.cpp000066400000000000000000000031101446473624700244200ustar00rootroot00000000000000/****************************************************************************** * Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. * * Permission is hereby granted, free of charge, to any person obtaining a copy * of this software and associated documentation files (the "Software"), to deal * in the Software without restriction, including without limitation the rights * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell * copies of the Software, and to permit persons to whom the Software is * furnished to do so, subject to the following conditions: * * The above copyright notice and this permission notice shall be included in * all copies or substantial portions of the Software. * * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN * THE SOFTWARE. 
*******************************************************************************/ #include "test_exception.h" #include #include #include void handle_exception(const std::exception& except) { std::cout << "--- EXCEPTION CAUGHT ---" << std::endl; std::string error_message = except.what(); std::cout << error_message << std::endl; FAIL(); } rocFFT-rocm-5.7.1/clients/tests/multithread_test.cpp000066400000000000000000000313171446473624700225230ustar00rootroot00000000000000// Copyright (C) 2020 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "../../shared/gpubuf.h"
#include "../../shared/rocfft_params.h"
#include "accuracy_test.h"
#include "rocfft.h"
#include "rocfft_against_fftw.h"
#include <cmath>
#include <gtest/gtest.h>
#include <hip/hip_runtime.h>
#include <memory>
#include <new>
#include <random>
#include <thread>
#include <utility>
#include <vector>

// normalize results of an inverse transform, so it can be directly
// compared to the original data before the forward transform
//
// There is no bounds check: the launch in Test_Transform::run_transform uses
// exactly one single-thread block per element.
__global__ void normalize_inverse_results(rocfft_complex<float>* array, float N)
{
    const int idx = blockIdx.x * blockDim.x + threadIdx.x;
    array[idx].x /= N;
    array[idx].y /= N;
}

// Run a transform of specified dimensions, size N on each dimension.
// Data is randomly generated based on the seed value, and we do a
// forward + inverse transform and compare against what we started
// with.
struct Test_Transform
{
    // real constructor sets up the host/device data; the plans are created
    // later, in run_transform(). Throws std::bad_alloc if a device buffer
    // cannot be allocated.
    Test_Transform(size_t _N, size_t _dim, uint32_t _seed)
        : N(_N)
        , dim(_dim)
        , seed(_seed)
    {
        // compute total data size
        size_t datasize = 1;
        for(size_t i = 0; i < dim; ++i)
        {
            datasize *= N;
        }
        size_t Nbytes = datasize * sizeof(rocfft_complex<float>);

        // Create HIP device buffers
        if(device_mem_in.alloc(Nbytes) != hipSuccess)
            throw std::bad_alloc();
        if(device_mem_out.alloc(Nbytes) != hipSuccess)
            throw std::bad_alloc();

        // Initialize data
        std::minstd_rand                      gen(seed);
        std::uniform_real_distribution<float> dist(-1.0f, 1.0f);
        host_mem_in.resize(datasize);
        host_mem_out.resize(datasize);
        for(size_t i = 0; i < datasize; i++)
        {
            host_mem_in[i].x = dist(gen);
            host_mem_in[i].y = dist(gen);
        }

        // Copy data to device
        // NB: Cannot use ASSERT_EQ because constructor does not return void.
        EXPECT_EQ(
            hipMemcpy(device_mem_in.data(), host_mem_in.data(), Nbytes, hipMemcpyHostToDevice),
            hipSuccess);
    }

    Test_Transform(const Test_Transform&) = delete;
    void operator=(const Test_Transform&) = delete;

    // Take over every resource and all bookkeeping state from `other`, leaving
    // it empty so that its cleanup neither releases the transferred handles a
    // second time nor re-verifies the results.
    Test_Transform(Test_Transform&& other)
        : N(other.N)
        , dim(other.dim)
        , seed(other.seed)
        , stream(other.stream)
        , plan(other.plan)
        , plan_inv(other.plan_inv)
        , work_buffer_size(other.work_buffer_size)
        , work_buffer(other.work_buffer)
        , device_mem_in(std::move(other.device_mem_in))
        , device_mem_out(std::move(other.device_mem_out))
        , host_mem_in(std::move(other.host_mem_in))
        , host_mem_out(std::move(other.host_mem_out))
        , ran_transform(other.ran_transform)
    {
        other.stream           = nullptr;
        other.plan             = nullptr;
        other.plan_inv         = nullptr;
        other.work_buffer_size = 0;
        other.work_buffer      = nullptr;
        other.ran_transform    = false;
        // a moved-from vector is only guaranteed to be valid, not empty
        other.host_mem_in.clear();
        other.host_mem_out.clear();
    }

    // Create the plans, stream and work buffer, then enqueue the forward
    // transform, the in-place inverse transform and the normalization kernel.
    // Does not wait for completion; do_cleanup() synchronizes and verifies.
    void run_transform()
    {
        // Create rocFFT plans (forward + inverse)
        std::vector<size_t> lengths(dim, N);
        ASSERT_EQ(rocfft_plan_create(&plan,
                                     rocfft_placement_notinplace,
                                     rocfft_transform_type_complex_forward,
                                     rocfft_precision_single,
                                     dim,
                                     lengths.data(),
                                     1,
                                     nullptr),
                  rocfft_status_success);

        ASSERT_EQ(rocfft_plan_create(&plan_inv,
                                     rocfft_placement_inplace,
                                     rocfft_transform_type_complex_inverse,
                                     rocfft_precision_single,
                                     dim,
                                     lengths.data(),
                                     1,
                                     nullptr),
                  rocfft_status_success);

        // allocate work buffer if necessary
        ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &work_buffer_size), rocfft_status_success);
        // NOTE: assuming that same-sized work buffer is ok for both
        // forward and inverse transforms
        if(work_buffer_size)
        {
            ASSERT_EQ(hipMalloc(&work_buffer, work_buffer_size), hipSuccess);
        }

        ASSERT_EQ(hipStreamCreate(&stream), hipSuccess);
        rocfft_execution_info info;
        ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success);
        ASSERT_EQ(rocfft_execution_info_set_stream(info, stream), rocfft_status_success);

        // NOTE: This multithread test is intended to test the cases having work_buffer_size
        //       If the assert fails, this means we should change the problem.
        //       But that rarely happens (maybe when the opt_strategy is minimal_buffer)
        //       So we don't put this one inside the if(work_buffer_size){ ... }
        ASSERT_EQ(rocfft_execution_info_set_work_buffer(info, work_buffer, work_buffer_size),
                  rocfft_status_success);

        // Execute forward plan out-of-place
        void* in_ptr  = device_mem_in.data();
        void* out_ptr = device_mem_out.data();
        ASSERT_EQ(rocfft_execute(plan, &in_ptr, &out_ptr, info), rocfft_status_success);
        // Execute inverse plan in-place
        ASSERT_EQ(rocfft_execute(plan_inv, &out_ptr, nullptr, info), rocfft_status_success);

        ASSERT_EQ(rocfft_execution_info_destroy(info), rocfft_status_success);

        // Apply normalization so the values really are comparable
        hipLaunchKernelGGL(normalize_inverse_results,
                           host_mem_out.size(),
                           1,
                           0, // sharedMemBytes
                           stream, // stream
                           static_cast<rocfft_complex<float>*>(device_mem_out.data()),
                           static_cast<float>(host_mem_out.size()));
        ran_transform = true;
    }

    // Wait for the stream, release the stream/work buffer/plans, then copy the
    // result back and compare it with the original input. Safe to call more
    // than once: the handles are nulled and the host buffers cleared, so later
    // calls skip the verification.
    void do_cleanup()
    {
        // complain loudly if we set up for a transform but did not
        // actually run it
        if(plan && !ran_transform)
            ADD_FAILURE();

        // wait for execution to finish
        if(stream)
        {
            ASSERT_EQ(hipStreamSynchronize(stream), hipSuccess);
            ASSERT_EQ(hipStreamDestroy(stream), hipSuccess);
            stream = nullptr;
        }

        ASSERT_EQ(hipFree(work_buffer), hipSuccess);
        work_buffer = nullptr;
        ASSERT_EQ(rocfft_plan_destroy(plan), rocfft_status_success);
        plan = nullptr;
        ASSERT_EQ(rocfft_plan_destroy(plan_inv), rocfft_status_success);
        plan_inv = nullptr;

        // Copy result back to host
        if(device_mem_out.data() && !host_mem_out.empty())
        {
            ASSERT_EQ(hipMemcpy(host_mem_out.data(),
                                device_mem_out.data(),
                                host_mem_out.size() * sizeof(rocfft_complex<float>),
                                hipMemcpyDeviceToHost),
                      hipSuccess);

            // Compare data we got to the original.
            // We're running 2 transforms (forward+inverse), so we
            // should tolerate 2x the error of a single transform.
            const double MAX_TRANSFORM_ERROR = 2 * type_epsilon<float>();

            auto input_norm
                = norm_complex(reinterpret_cast<const rocfft_complex<float>*>(host_mem_in.data()),
                               host_mem_in.size(),
                               1,
                               1,
                               host_mem_in.size(),
                               {0});
            auto diff = distance_1to1_complex(
                reinterpret_cast<const rocfft_complex<float>*>(host_mem_in.data()),
                reinterpret_cast<const rocfft_complex<float>*>(host_mem_out.data()),
                // data is all contiguous, we can treat it as 1d
                host_mem_in.size(),
                1,
                1,
                host_mem_in.size(),
                1,
                host_mem_out.size(),
                nullptr,
                MAX_TRANSFORM_ERROR,
                {0},
                {0});

            EXPECT_LT(diff.l_2 / input_norm.l_2,
                      sqrt(log2(host_mem_in.size())) * MAX_TRANSFORM_ERROR);
            EXPECT_LT(diff.l_inf / input_norm.l_inf,
                      log2(host_mem_in.size()) * MAX_TRANSFORM_ERROR);

            // Free buffers
            host_mem_in.clear();
            host_mem_out.clear();
        }
    }

    ~Test_Transform()
    {
        do_cleanup();
    }

    size_t                             N                = 0;
    size_t                             dim              = 0;
    uint32_t                           seed             = 0;
    hipStream_t                        stream           = nullptr;
    rocfft_plan                        plan             = nullptr;
    rocfft_plan                        plan_inv         = nullptr;
    size_t                             work_buffer_size = 0;
    void*                              work_buffer      = nullptr;
    gpubuf                             device_mem_in;
    gpubuf                             device_mem_out;
    std::vector<rocfft_complex<float>> host_mem_in;
    std::vector<rocfft_complex<float>> host_mem_out;

    // ensure that we don't forget to actually run the transform
    bool ran_transform = false;
};

// run concurrent transforms, one per thread, size N on each dimension
static void multithread_transform(size_t N, size_t dim, size_t num_threads)
{
    std::vector<std::thread> threads;
    threads.reserve(num_threads);

    for(size_t j = 0; j < num_threads; ++j)
    {
        threads.emplace_back([=]() {
            try
            {
                // results are verified when `t` is destroyed at end of scope
                Test_Transform t(N, dim, j);
                t.run_transform();
            }
            catch(const std::bad_alloc&)
            {
                ADD_FAILURE() << "memory allocation failure";
            }
        });
    }

    for(auto& t : threads)
        t.join();
}

// for multi-stream tests, set up a bunch of streams, then execute
// all of those transforms from a single thread.  afterwards,
// wait/verify/cleanup in parallel to save wall time during the test.
static void multistream_transform(size_t N, size_t dim, size_t num_streams)
{
    std::vector<std::unique_ptr<Test_Transform>> transforms;
    transforms.resize(num_streams);
    std::vector<std::thread> threads;
    threads.reserve(num_streams);

    // get all data ready in parallel
    for(size_t i = 0; i < num_streams; ++i)
        threads.emplace_back([=, &transforms]() {
            try
            {
                transforms[i] = std::make_unique<Test_Transform>(N, dim, i);
            }
            catch(const std::bad_alloc&)
            {
                ADD_FAILURE() << "memory allocation failure";
            }
        });
    for(auto& t : threads)
        t.join();
    threads.clear();

    // now start the actual transforms serially, but in separate
    // streams
    for(auto& t : transforms)
    {
        if(!t)
            // must have failed to allocate memory, abort the test
            return;
        t->run_transform();
    }

    // clean up
    for(size_t i = 0; i < transforms.size(); ++i)
        threads.emplace_back([=, &transforms]() { transforms[i]->do_cleanup(); });
    for(auto& t : threads)
        t.join();
}

// pick arbitrary sizes here to get some parallelism while still
// fitting into e.g. 8 GB of GPU memory
TEST(rocfft_UnitTest, simple_multithread_1D)
{
    multithread_transform(1048576, 1, 64);
}

TEST(rocfft_UnitTest, simple_multithread_2D)
{
    multithread_transform(1024, 2, 64);
}

TEST(rocfft_UnitTest, simple_multithread_3D)
{
    multithread_transform(128, 3, 40);
}

TEST(rocfft_UnitTest, simple_multistream_1D)
{
    multistream_transform(1048576, 1, 32);
}

TEST(rocfft_UnitTest, simple_multistream_2D)
{
    multistream_transform(1024, 2, 32);
}

TEST(rocfft_UnitTest, simple_multistream_3D)
{
    multistream_transform(128, 3, 32);
}
rocFFT-rocm-5.7.1/clients/tests/random.cpp000066400000000000000000000136701446473624700204240ustar00rootroot00000000000000// Copyright (C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include #include #include #include "accuracy_test.h" #include "rocfft_accuracy_test.h" static const int n_random_tests = 10; class random_params : public ::testing::TestWithParam< std::tuple> { }; // TODO: Add batch and stride TEST_P(random_params, vs_fftw) { const int random_seed_salt = std::get<0>(GetParam()); const int dimension = std::get<1>(GetParam()); const auto precision = std::get<2>(GetParam()); const auto placement = std::get<3>(GetParam()); const auto transform_type = std::get<4>(GetParam()); rocfft_params params; params.transform_type = fft_transform_type_complex_forward; params.placement = placement; params.precision = precision; params.transform_type = transform_type; int maxlen = 0; switch(dimension) { case 1: maxlen = 1 << 15; break; case 2: maxlen = 1 << 10; break; case 3: maxlen = 1 << 6; break; default: ASSERT_TRUE(false); } std::mt19937 rgen(random_seed + random_seed_salt); // Mean value of the exponential distribution is maxlen: std::exponential_distribution distribution(1.0 / maxlen); for(int idim = 0; idim < dimension; ++idim) { // NB: the distribution can return 0, so add 1 to avoid this issue. 
params.length.push_back(1 + (size_t)distribution(rgen)); } params.validate(); if(verbose > 1) { std::cout << "Random test params:" << "\n\t" << params.str("\n\t") << std::endl; } if(verbose) { std::cout << "Token: " << params.token() << std::endl; } if(!params.valid(verbose)) { std::cout << "Params are not valid\n"; } fft_vs_reference(params, true); } INSTANTIATE_TEST_SUITE_P(random_complex_1d, random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({1}), ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn(place_range), ::testing::ValuesIn(trans_type_range_complex))); INSTANTIATE_TEST_SUITE_P(random_complex_2d, random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({2}), ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn(place_range), ::testing::ValuesIn(trans_type_range_complex))); INSTANTIATE_TEST_SUITE_P(random_complex_3d, random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({3}), ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn(place_range), ::testing::ValuesIn(trans_type_range_complex))); INSTANTIATE_TEST_SUITE_P(random_real_1d, random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({1}), ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn({fft_placement_notinplace}), ::testing::ValuesIn(trans_type_range_real))); INSTANTIATE_TEST_SUITE_P(random_real_2d, random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({2}), ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn({fft_placement_notinplace}), ::testing::ValuesIn(trans_type_range_real))); INSTANTIATE_TEST_SUITE_P(random_real_3d, random_params, ::testing::Combine(::testing::Range(0, n_random_tests), ::testing::ValuesIn({3}), ::testing::ValuesIn(precision_range_sp_dp), ::testing::ValuesIn({fft_placement_notinplace}), ::testing::ValuesIn(trans_type_range_real))); 
rocFFT-rocm-5.7.1/clients/tests/rocfft_accuracy_test.cpp000066400000000000000000000047121446473624700233350ustar00rootroot00000000000000// Copyright (C) 2022 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include #include #include #include #include #include #include "rocfft_accuracy_test.h" #include "../../shared/gpubuf.h" #include "fftw_transform.h" #include "rocfft.h" #include "rocfft_against_fftw.h" void fft_vs_reference(rocfft_params& params, bool round_trip) { switch(params.precision) { case fft_precision_half: fft_vs_reference_impl<_Float16, rocfft_params>(params, round_trip); break; case fft_precision_single: fft_vs_reference_impl(params, round_trip); break; case fft_precision_double: fft_vs_reference_impl(params, round_trip); break; } } // Test for comparison between FFTW and rocFFT. 
TEST_P(accuracy_test, vs_fftw) { rocfft_params params(GetParam()); params.validate(); // Test that the tokenization works as expected. auto token = params.token(); fft_params tokentest; tokentest.from_token(token); auto token1 = tokentest.token(); EXPECT_EQ(token, token1); if(!params.valid(verbose)) { if(verbose) { std::cout << "Invalid parameters, skip this test." << std::endl; } GTEST_SKIP(); } fft_vs_reference(params, true); SUCCEED(); } rocFFT-rocm-5.7.1/clients/tests/rocfft_accuracy_test.h000066400000000000000000000025221446473624700227770ustar00rootroot00000000000000// Copyright (C) 2022 - 2022 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#ifndef ROCFFT_ACCURACY_TEST #define ROCFFT_ACCURACY_TEST #include "../../shared/rocfft_params.h" #include "accuracy_test.h" void fft_vs_reference(rocfft_params& params, bool round_trip = false); #endif rocFFT-rocm-5.7.1/clients/tests/rocfft_against_fftw.h000066400000000000000000000221431446473624700226230ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. // // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef ROCFFT_AGAINST_FFTW #define ROCFFT_AGAINST_FFTW #include #include #include #include #include "fftw_transform.h" // Return the precision enum for rocFFT based upon the type. 
template inline fft_precision precision_selector(); template <> inline fft_precision precision_selector() { return fft_precision_single; } template <> inline fft_precision precision_selector() { return fft_precision_double; } extern bool use_fftw_wisdom; // construct and return an FFTW plan with the specified type, // precision, and dimensions. cpu_out is required if we're using // wisdom, which runs actual FFTs to work out the best plan. template static typename fftw_trait::fftw_plan_type fftw_plan_with_precision(const std::vector& dims, const std::vector& howmany_dims, const fft_transform_type transformType, const size_t isize, void* cpu_in, void* cpu_out) { using fftw_complex_type = typename fftw_trait::fftw_complex_type; // NB: Using FFTW_MEASURE implies that the input buffer's data // may be destroyed during plan creation. But if we're wanting // to run FFTW in the first place, we must have just created an // uninitialized input buffer anyway. switch(transformType) { case fft_transform_type_complex_forward: return fftw_plan_guru64_dft(dims.size(), dims.data(), howmany_dims.size(), howmany_dims.data(), reinterpret_cast(cpu_in), reinterpret_cast(cpu_out), -1, use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); case fft_transform_type_complex_inverse: return fftw_plan_guru64_dft(dims.size(), dims.data(), howmany_dims.size(), howmany_dims.data(), reinterpret_cast(cpu_in), reinterpret_cast(cpu_out), 1, use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); case fft_transform_type_real_forward: return fftw_plan_guru64_r2c(dims.size(), dims.data(), howmany_dims.size(), howmany_dims.data(), reinterpret_cast(cpu_in), reinterpret_cast(cpu_out), use_fftw_wisdom ? FFTW_MEASURE : FFTW_ESTIMATE); case fft_transform_type_real_inverse: return fftw_plan_guru64_c2r(dims.size(), dims.data(), howmany_dims.size(), howmany_dims.data(), reinterpret_cast(cpu_in), reinterpret_cast(cpu_out), use_fftw_wisdom ? 
FFTW_MEASURE : FFTW_ESTIMATE); default: throw std::runtime_error("Invalid transform type"); } } // construct an FFTW plan, given rocFFT parameters. output is // required if planning with wisdom. template static typename fftw_trait::fftw_plan_type fftw_plan_via_rocfft(const std::vector& length, const std::vector& istride, const std::vector& ostride, const size_t nbatch, const size_t idist, const size_t odist, const fft_transform_type transformType, std::vector& input, std::vector& output) { // Dimension configuration: std::vector dims(length.size()); for(unsigned int idx = 0; idx < length.size(); ++idx) { dims[idx].n = length[idx]; dims[idx].is = istride[idx]; dims[idx].os = ostride[idx]; } // Batch configuration: std::vector howmany_dims(1); howmany_dims[0].n = nbatch; howmany_dims[0].is = idist; howmany_dims[0].os = odist; return fftw_plan_with_precision(dims, howmany_dims, transformType, idist * nbatch, input.front().data(), output.empty() ? nullptr : output.front().data()); } template void fftw_run(fft_transform_type transformType, typename fftw_trait::fftw_plan_type cpu_plan, std::vector& cpu_in, std::vector& cpu_out) { switch(transformType) { case fft_transform_type_complex_forward: { fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); break; } case fft_transform_type_complex_inverse: { fftw_plan_execute_c2c(cpu_plan, cpu_in, cpu_out); break; } case fft_transform_type_real_forward: { fftw_plan_execute_r2c(cpu_plan, cpu_in, cpu_out); break; } case fft_transform_type_real_inverse: { fftw_plan_execute_c2r(cpu_plan, cpu_in, cpu_out); break; } } } // Given a transform type, return the contiguous input type. 
inline fft_array_type contiguous_itype(const fft_transform_type transformType) { switch(transformType) { case fft_transform_type_complex_forward: case fft_transform_type_complex_inverse: return fft_array_type_complex_interleaved; case fft_transform_type_real_forward: return fft_array_type_real; case fft_transform_type_real_inverse: return fft_array_type_hermitian_interleaved; default: throw std::runtime_error("Invalid transform type"); } return fft_array_type_complex_interleaved; } // Given a transform type, return the contiguous output type. inline fft_array_type contiguous_otype(const fft_transform_type transformType) { switch(transformType) { case fft_transform_type_complex_forward: case fft_transform_type_complex_inverse: return fft_array_type_complex_interleaved; case fft_transform_type_real_forward: return fft_array_type_hermitian_interleaved; case fft_transform_type_real_inverse: return fft_array_type_real; default: throw std::runtime_error("Invalid transform type"); } return fft_array_type_complex_interleaved; } // Given a precision, return the acceptable tolerance. inline double type_epsilon(const fft_precision precision) { switch(precision) { case fft_precision_half: return type_epsilon<_Float16>(); break; case fft_precision_single: return type_epsilon(); break; case fft_precision_double: return type_epsilon(); break; default: throw std::runtime_error("Invalid precision"); } } #endif rocFFT-rocm-5.7.1/clients/tests/rtc_helper_crash.cpp000066400000000000000000000025471446473624700224540ustar00rootroot00000000000000// Copyright (C) 2021 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. // just crash int main() { char* a = 0; // NOTE: this is supposed to crash, since it's used in a test // that checks crashing child processes. // // cppcheck-suppress nullPointer *a = 0; return 0; } rocFFT-rocm-5.7.1/clients/tests/test_params.h000066400000000000000000000033531446473624700211300ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #pragma once #ifndef TESTCONSTANTS_H #define TESTCONSTANTS_H #include "rocfft.h" #include extern int verbose; extern size_t ramgb; extern size_t vramgb; extern size_t random_seed; extern double planar_prob; extern double callback_prob; extern double half_epsilon; extern double single_epsilon; extern double double_epsilon; extern bool skip_runtime_fails; extern double max_linf_eps_double; extern double max_l2_eps_double; extern double max_linf_eps_single; extern double max_l2_eps_single; extern double max_linf_eps_half; extern double max_l2_eps_half; extern int n_hip_failures; #endif rocFFT-rocm-5.7.1/clients/tests/unit_test.cpp000066400000000000000000000430121446473624700211530ustar00rootroot00000000000000// Copyright (C) 2016 - 2023 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. 
#include "rocfft.h" #include "../../shared/environment.h" #include "../../shared/gpubuf.h" #include "../../shared/rocfft_complex.h" #include "hip/hip_runtime_api.h" #include #include #include #include #include #include #include #include #if __has_include() #include #else #include namespace std { namespace filesystem = experimental::filesystem; } #endif namespace fs = std::filesystem; #ifndef WIN32 // get program_invocation_name #include #endif TEST(rocfft_UnitTest, plan_description) { rocfft_plan_description desc = nullptr; ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_create(&desc)); rocfft_array_type in_array_type = rocfft_array_type_complex_interleaved; rocfft_array_type out_array_type = rocfft_array_type_complex_interleaved; size_t rank = 1; size_t i_strides[3] = {1, 1, 1}; size_t o_strides[3] = {1, 1, 1}; size_t idist = 0; size_t odist = 0; rocfft_plan plan = NULL; size_t length = 8; ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_set_data_layout(desc, in_array_type, out_array_type, 0, 0, rank, i_strides, idist, rank, o_strides, odist)); ASSERT_TRUE(rocfft_status_success == rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, rank, &length, 1, desc)); ASSERT_TRUE(rocfft_status_success == rocfft_plan_description_destroy(desc)); ASSERT_TRUE(rocfft_status_success == rocfft_plan_destroy(plan)); } // Check whether logs can be emitted from multiple threads properly TEST(rocfft_UnitTest, log_multithreading) { static const int NUM_THREADS = 10; static const int NUM_ITERS_PER_THREAD = 50; static const char* TRACE_FILE = "trace.log"; // clean up environment and temporary file when we exit BOOST_SCOPE_EXIT_ALL(=) { rocfft_cleanup(); remove(TRACE_FILE); // re-init logs with default logging rocfft_setup(); }; // ask for trace logging, since that's the easiest to trigger rocfft_cleanup(); EnvironmentSetTemp layer("ROCFFT_LAYER", "1"); EnvironmentSetTemp 
tracepath("ROCFFT_LOG_TRACE_PATH", TRACE_FILE); rocfft_setup(); // run a whole bunch of threads in parallel, each one doing // something small that will write to the trace log std::vector threads; threads.reserve(NUM_THREADS); for(int i = 0; i < NUM_THREADS; ++i) { threads.emplace_back([]() { for(int j = 0; j < NUM_ITERS_PER_THREAD; ++j) { rocfft_plan_description desc; rocfft_plan_description_create(&desc); rocfft_plan_description_destroy(desc); } }); } for(auto& t : threads) { t.join(); } rocfft_cleanup(); // now verify that the trace log has one message per line, with nothing garbled std::ifstream trace_log(TRACE_FILE); std::string line; std::regex validator("^rocfft_(setup|cleanup|plan_description_(create|destroy)," "description,[x0-9a-fA-F]+)$"); while(std::getline(trace_log, line)) { bool res = std::regex_match(line, validator); ASSERT_TRUE(res) << "line contains invalid content: " << line; } } // a function that accepts a plan's requested size on input, and // returns the size to actually allocate for the test typedef std::function workmem_sizer; void workmem_test(workmem_sizer sizer, rocfft_status exec_status_expected, bool give_null_work_buf = false) { // Prime size requires Bluestein, which guarantees work memory. 
size_t length = 8191; rocfft_plan plan = NULL; ASSERT_EQ(rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &length, 1, nullptr), rocfft_status_success); size_t requested_work_size = 0; ASSERT_EQ(rocfft_plan_get_work_buffer_size(plan, &requested_work_size), rocfft_status_success); ASSERT_GT(requested_work_size, 0U); rocfft_execution_info info; ASSERT_EQ(rocfft_execution_info_create(&info), rocfft_status_success); size_t alloc_work_size = sizer(requested_work_size); gpubuf work_buffer; if(alloc_work_size) { ASSERT_EQ(work_buffer.alloc(alloc_work_size), hipSuccess); void* work_buffer_ptr; rocfft_status set_work_expected_status; if(give_null_work_buf) { work_buffer_ptr = nullptr; set_work_expected_status = rocfft_status_invalid_work_buffer; } else { work_buffer_ptr = work_buffer.data(); set_work_expected_status = rocfft_status_success; } ASSERT_EQ(rocfft_execution_info_set_work_buffer(info, work_buffer_ptr, alloc_work_size), set_work_expected_status); } // allocate 2x length for complex std::vector data_host(length * 2, 1.0f); gpubuf data_device; auto data_size_bytes = data_host.size() * sizeof(float); ASSERT_EQ(data_device.alloc(data_size_bytes), hipSuccess); ASSERT_EQ( hipMemcpy(data_device.data(), data_host.data(), data_size_bytes, hipMemcpyHostToDevice), hipSuccess); std::vector ibuffers(1, static_cast(data_device.data())); ASSERT_EQ(rocfft_execute(plan, ibuffers.data(), nullptr, info), exec_status_expected); rocfft_execution_info_destroy(info); rocfft_plan_destroy(plan); } // check what happens if work memory is required but is not provided // - library should allocate TEST(rocfft_UnitTest, workmem_missing) { workmem_test([](size_t) { return 0; }, rocfft_status_success); } // check what happens if work memory is required but not enough is provided TEST(rocfft_UnitTest, workmem_small) { workmem_test([](size_t requested) { return requested / 2; }, rocfft_status_invalid_work_buffer); } // hard to 
imagine this being a problem, but try giving too much as well TEST(rocfft_UnitTest, workmem_big) { workmem_test([](size_t requested) { return requested * 2; }, rocfft_status_success); } // check if a user explicitly gives a null pointer - set work buffer // should fail, but transform should succeed because library // allocates TEST(rocfft_UnitTest, workmem_null) { workmem_test([](size_t requested) { return requested; }, rocfft_status_success, true); } #ifdef ROCFFT_RUNTIME_COMPILE static const size_t RTC_PROBLEM_SIZE = 2304; // runtime compilation cache tests TEST(rocfft_UnitTest, rtc_cache) { // PRECONDITIONS // - set cache location to custom path, requires uninitializing // the lib and reinitializing with some env vars // - also enable RTC logging so we can tell when something was // actually compiled const std::string rtc_cache_path = std::tmpnam(nullptr); const std::string rtc_log_path = std::tmpnam(nullptr); void* empty_cache = nullptr; size_t empty_cache_bytes = 0; void* onekernel_cache = nullptr; size_t onekernel_cache_bytes = 0; // cleanup BOOST_SCOPE_EXIT_ALL(=) { // close log file handles rocfft_cleanup(); remove(rtc_cache_path.c_str()); remove(rtc_log_path.c_str()); // re-init lib now that the env vars are gone rocfft_setup(); if(empty_cache) rocfft_cache_buffer_free(empty_cache); if(onekernel_cache) rocfft_cache_buffer_free(onekernel_cache); }; rocfft_cleanup(); EnvironmentSetTemp cache_env("ROCFFT_RTC_CACHE_PATH", rtc_cache_path.c_str()); EnvironmentSetTemp layer_env("ROCFFT_LAYER", "32"); EnvironmentSetTemp log_env("ROCFFT_LOG_RTC_PATH", rtc_log_path.c_str()); rocfft_setup(); // - serialize empty cache as baseline ASSERT_EQ(rocfft_cache_serialize(&empty_cache, &empty_cache_bytes), rocfft_status_success); // END PRECONDITIONS // pick a length that's runtime compiled auto build_plan = [&]() { rocfft_plan plan = nullptr; ASSERT_TRUE(rocfft_status_success == rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, 
rocfft_precision_single, 1, &RTC_PROBLEM_SIZE, 1, nullptr)); // we don't need to actually execute the plan, so we can // destroy it right away. this ensures that we don't hold on // to a plan after we cleanup the library rocfft_plan_destroy(plan); plan = nullptr; }; // check the RTC log to see if an FFT kernel got compiled auto fft_kernel_was_compiled = [&]() { // HACK: logging is done in a worker thread, so sleep for a // bit to give it a chance to actually write. It at least // should flush after writing. std::this_thread::sleep_for(std::chrono::milliseconds(100)); // look for a ROCFFT_RTC_BEGIN line that indicates RTC happened std::ifstream logfile(rtc_log_path); std::string line; while(std::getline(logfile, line)) { if(line.find("ROCFFT_RTC_BEGIN") != std::string::npos && line.find("fft_") != std::string::npos) return true; } return false; }; // build a plan that requires runtime compilation, // close logs and ensure a kernel was built build_plan(); ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes), rocfft_status_success); rocfft_cleanup(); ASSERT_TRUE(fft_kernel_was_compiled()); // serialized cache should be bigger than empty cache ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes); // blow away the cache, reinit the library, // retry building the plan again and ensure the kernel was rebuilt remove(rtc_cache_path.c_str()); rocfft_setup(); build_plan(); rocfft_cache_buffer_free(onekernel_cache); onekernel_cache = nullptr; ASSERT_EQ(rocfft_cache_serialize(&onekernel_cache, &onekernel_cache_bytes), rocfft_status_success); rocfft_cleanup(); ASSERT_TRUE(fft_kernel_was_compiled()); ASSERT_GT(onekernel_cache_bytes, empty_cache_bytes); // re-init library without blowing away cache. rebuild plan and // check that the kernel was not recompiled. rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_FALSE(fft_kernel_was_compiled()); // blow away cache again, deserialize one-kernel cache. 
re-init // library and rebuild plan - kernel should again not be // recompiled remove(rtc_cache_path.c_str()); rocfft_setup(); ASSERT_EQ(rocfft_cache_deserialize(onekernel_cache, onekernel_cache_bytes), rocfft_status_success); rocfft_cleanup(); ASSERT_FALSE(fft_kernel_was_compiled()); rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_FALSE(fft_kernel_was_compiled()); // use the cache as a system cache and make the user one an empty // in-memory cache. kernel should still not be recompiled. EnvironmentSetTemp cache_sys_env("ROCFFT_RTC_SYS_CACHE_PATH", rtc_cache_path.c_str()); EnvironmentSetTemp cache_empty_env("ROCFFT_RTC_CACHE_PATH", ":memory:"); rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_FALSE(fft_kernel_was_compiled()); // check that the system cache is not written to, even if it's // writable by the current user. after removing the cache, the // kernel should always be recompiled since rocFFT has no durable // place to write it to. remove(rtc_cache_path.c_str()); rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_TRUE(fft_kernel_was_compiled()); rocfft_setup(); build_plan(); rocfft_cleanup(); ASSERT_TRUE(fft_kernel_was_compiled()); } // make sure cache API functions tolerate null pointers without crashing TEST(rocfft_UnitTest, rtc_cache_null) { void* buf = nullptr; size_t buf_len = 0; ASSERT_EQ(rocfft_cache_serialize(nullptr, &buf_len), rocfft_status_invalid_arg_value); ASSERT_EQ(rocfft_cache_serialize(&buf, nullptr), rocfft_status_invalid_arg_value); ASSERT_EQ(rocfft_cache_buffer_free(nullptr), rocfft_status_success); ASSERT_EQ(rocfft_cache_deserialize(nullptr, 12345), rocfft_status_invalid_arg_value); ASSERT_EQ(rocfft_cache_deserialize(&buf_len, 0), rocfft_status_invalid_arg_value); } // make sure RTC gracefully handles a helper process that crashes TEST(rocfft_UnitTest, rtc_helper_crash) { #ifdef WIN32 char filename[MAX_PATH]; GetModuleFileNameA(NULL, filename, MAX_PATH); fs::path test_exe = filename; fs::path crasher_exe = 
test_exe.replace_filename("rtc_helper_crash.exe"); #else fs::path test_exe = program_invocation_name; fs::path crasher_exe = test_exe.replace_filename("rtc_helper_crash"); #endif // use the crashing helper EnvironmentSetTemp env_helper("ROCFFT_RTC_PROCESS_HELPER", crasher_exe.string().c_str()); // don't touch the cache, to force compilation EnvironmentSetTemp env_read("ROCFFT_RTC_CACHE_READ_DISABLE", "1"); EnvironmentSetTemp env_write("ROCFFT_RTC_CACHE_WRITE_DISABLE", "1"); // force out-of-process compile EnvironmentSetTemp env_process("ROCFFT_RTC_PROCESS", "2"); rocfft_plan plan = nullptr; ASSERT_TRUE(rocfft_status_success == rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &RTC_PROBLEM_SIZE, 1, nullptr)); // alloc a complex buffer gpubuf_t> data; ASSERT_EQ(data.alloc(RTC_PROBLEM_SIZE * sizeof(rocfft_complex)), hipSuccess); std::vector ibuffers(1, static_cast(data.data())); ASSERT_EQ(rocfft_execute(plan, ibuffers.data(), nullptr, nullptr), rocfft_status_success); rocfft_plan_destroy(plan); plan = nullptr; rocfft_cleanup(); rocfft_setup(); // also try with forcing use of the subprocess, which is a // different code path from the default "try in-process, then // fall back to out-of-process" EnvironmentSetTemp env_force("ROCFFT_RTC_PROCESS", "1"); ASSERT_TRUE(rocfft_status_success == rocfft_plan_create(&plan, rocfft_placement_inplace, rocfft_transform_type_complex_forward, rocfft_precision_single, 1, &RTC_PROBLEM_SIZE, 1, nullptr)); ASSERT_EQ(rocfft_execute(plan, ibuffers.data(), nullptr, nullptr), rocfft_status_success); rocfft_plan_destroy(plan); plan = nullptr; } #endif rocFFT-rocm-5.7.1/clients/tests/validate_length_stride.cpp000066400000000000000000000075431446473624700236520ustar00rootroot00000000000000// Copyright (C) 2022 Advanced Micro Devices, Inc. All rights reserved. 
// // Permission is hereby granted, free of charge, to any person obtaining a copy // of this software and associated documentation files (the "Software"), to deal // in the Software without restriction, including without limitation the rights // to use, copy, modify, merge, publish, distribute, sublicense, and/or sell // copies of the Software, and to permit persons to whom the Software is // furnished to do so, subject to the following conditions: // // The above copyright notice and this permission notice shall be included in // all copies or substantial portions of the Software. // // THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR // IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, // FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE // AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER // LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, // OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN // THE SOFTWARE. #include "../../shared/array_validator.h" #include "accuracy_test.h" #include #include #include inline auto generate_valid_length_stride() { // Array of tuples of length, stride. 
std::vector, std::vector>> vals = { {{8}, {1}}, {{8, 2}, {1, 0}}, {{8, 8}, {8, 1}}, {{8, 8, 8}, {64, 8, 1}}, {{8, 8, 8}, {64, 7, 1}}, {{8, 8, 8, 8}, {512, 64, 7, 1}}, {{8, 8, 8, 8}, {512, 64, 8, 1}}, {{8, 8, 8, 8, 8}, {4096, 512, 64, 8, 1}}, {{8, 8, 8, 8, 8}, {4096, 512, 64, 7, 1}}, {{8, 8, 8, 8, 8, 8}, {32768, 4096, 512, 64, 8, 1}}, {{299, 307, 495}, {1006, 50, 674}}, }; return vals; } class valid_length_stride : public ::testing::TestWithParam, std::vector>> { protected: void SetUp() override {} void TearDown() override {} public: static std::string TestName(const testing::TestParamInfo& info) { return info.param.token(); } }; auto direct_validity_test(const std::vector& length, const std::vector& stride, const int verbose) { std::unordered_set vals{}; std::vector index(length.size()); std::fill(index.begin(), index.end(), 0); do { const int i = std::inner_product(index.begin(), index.end(), stride.begin(), (size_t)0); if(vals.find(i) == vals.end()) { vals.insert(i); } else { return false; } } while(increment_rowmajor(index, length)); return true; } TEST_P(valid_length_stride, direct_comparison) { const std::vector length = std::get<0>(GetParam()); const std::vector stride = std::get<1>(GetParam()); if(verbose) { std::cout << "length:"; for(const auto i : length) std::cout << " " << i; std::cout << "\n"; std::cout << "stride:"; for(const auto i : stride) std::cout << " " << i; std::cout << "\n"; } auto test_val = array_valid(length, stride, verbose); if(verbose) { std::cout << "test value is: " << (test_val ? "valid" : "invalid") << "\n"; } auto ref_val = direct_validity_test(length, stride, verbose); if(verbose) { std::cout << "reference value is: " << (ref_val ? 
"valid" : "invalid") << "\n"; } EXPECT_EQ(test_val, ref_val); SUCCEED(); } INSTANTIATE_TEST_SUITE_P(reference_test, valid_length_stride, ::testing::ValuesIn(generate_valid_length_stride())); rocFFT-rocm-5.7.1/cmake/000077500000000000000000000000001446473624700147065ustar00rootroot00000000000000rocFFT-rocm-5.7.1/cmake/get-cli-arguments.cmake000066400000000000000000000041601446473624700212400ustar00rootroot00000000000000# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. 
# Attempt (best effort) to return a list of user specified parameters cmake was invoked with # NOTE: Even if the user specifies CMAKE_INSTALL_PREFIX on the command line, the parameter is # not returned because it does not have the matching helpstring function( append_cmake_cli_arguments initial_cli_args return_cli_args ) # Retrieves the contents of CMakeCache.txt get_cmake_property( cmake_properties CACHE_VARIABLES ) foreach( property ${cmake_properties} ) get_property(help_string CACHE ${property} PROPERTY HELPSTRING ) # Properties specified on the command line have boilerplate text if( help_string MATCHES "variable specified on the command line" ) # message( STATUS "property: ${property}") # message( STATUS "value: ${${property}}") list( APPEND cli_args "-D${property}=${${property}}") endif( ) endforeach( ) # message( STATUS "get_command_line_arguments: ${cli_args}") set( ${return_cli_args} ${${initial_cli_args}} ${cli_args} PARENT_SCOPE ) endfunction( )rocFFT-rocm-5.7.1/cmake/package-functions.cmake000066400000000000000000000040651446473624700213160ustar00rootroot00000000000000# Copyright (C) 2021 - 2022 Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # ######################################################################## # A helper function to generate packaging scripts to register libraries with system # ######################################################################## function( write_rocm_package_script_files scripts_write_dir library_name library_link_name ) set( ld_conf_file "/etc/ld.so.conf.d/${library_name}-dev.conf" ) file( WRITE ${scripts_write_dir}/postinst "#!/bin/bash set -e do_ldconfig() { echo ${CPACK_PACKAGING_INSTALL_PREFIX}/${LIB_INSTALL_DIR} > ${ld_conf_file} && ldconfig } case \"\$1\" in configure) do_ldconfig ;; abort-upgrade|abort-remove|abort-deconfigure) echo \"\$1\" ;; *) exit 0 ;; esac " ) file( WRITE ${scripts_write_dir}/prerm "#!/bin/bash set -e rm_ldconfig() { rm -f ${ld_conf_file} && ldconfig } case \"\$1\" in remove|purge) rm_ldconfig ;; *) exit 0 ;; esac " ) endfunction( ) rocFFT-rocm-5.7.1/custom.properties000066400000000000000000000001351446473624700172550ustar00rootroot00000000000000booktitle=rocFFT API Guide spreadsheet.xml=docs/classification-map.xml document.locale=enusrocFFT-rocm-5.7.1/deps/000077500000000000000000000000001446473624700145615ustar00rootroot00000000000000rocFFT-rocm-5.7.1/deps/CMakeLists.txt000066400000000000000000000074671446473624700173370ustar00rootroot00000000000000# Copyright(C) 2016 - 2022 Advanced Micro Devices, Inc. All rights reserved. 
# # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. # Helper cmake script to automate building dependencies for rocfft # This script can be invoked manually by the user with 'cmake -P' # The ROCm platform requires Ubuntu 16.04 or Fedora 24, which has cmake 3.5 cmake_minimum_required( VERSION 3.5 ) list( APPEND CMAKE_MODULE_PATH ${CMAKE_CURRENT_SOURCE_DIR} ${CMAKE_CURRENT_SOURCE_DIR}/../cmake ) # Consider removing this in the future # It can be annoying for visual studio developers to build a project that tries to install into 'program files' if( WIN32 AND CMAKE_INSTALL_PREFIX_INITIALIZED_TO_DEFAULT ) set( CMAKE_INSTALL_PREFIX "${PROJECT_BINARY_DIR}/package" CACHE PATH "Install path prefix, prepended onto install directories" FORCE ) endif( ) # This has to be initialized before the project() command appears # Set the default of CMAKE_BUILD_TYPE to be release, unless user specifies with -D. 
MSVC_IDE does not use CMAKE_BUILD_TYPE if( NOT DEFINED CMAKE_CONFIGURATION_TYPES AND NOT DEFINED CMAKE_BUILD_TYPE ) set( CMAKE_BUILD_TYPE Release CACHE STRING "Choose the type of build, options are: None Debug Release RelWithDebInfo MinSizeRel." ) endif() # The superbuild does not build anything itself; all compiling is done in external projects project( rocfft-dependencies NONE ) option( BUILD_BOOST "Download and build boost library" ON ) # option( BUILD_VERBOSE "Print helpful build debug information" OFF ) # if( BUILD_VERBOSE ) # message( STATUS "CMAKE_MODULE_PATH: ${CMAKE_MODULE_PATH}" ) # message( STATUS "CMAKE_BINARY_DIR: ${CMAKE_BINARY_DIR}" ) # message( STATUS "CMAKE_SOURCE_DIR: ${CMAKE_SOURCE_DIR}" ) # message( STATUS "CMAKE_CURRENT_SOURCE_DIR: ${CMAKE_CURRENT_SOURCE_DIR}" ) # message( STATUS "CMAKE_CURRENT_BINARY_DIR: ${CMAKE_CURRENT_BINARY_DIR}" ) # message( STATUS "CMAKE_CURRENT_LIST_DIR: ${CMAKE_CURRENT_LIST_DIR}" ) # message( STATUS "CMAKE_CURRENT_LIST_FILE: ${CMAKE_CURRENT_LIST_FILE}" ) # endif( ) # This module scrapes the CMakeCache.txt file and attempts to get all the cli options the user specified to cmake invocation include( get-cli-arguments ) # The following is a series of super-build projects; this cmake project will download and build if( BUILD_BOOST ) set(ext.BUILD_BOOST "static") include( external-boost ) list( APPEND rocfft_dependencies boost ) set( boost_custom_target COMMAND cd ${BOOST_BINARY_ROOT}$ ${Boost.Command} install ) endif( ) # POLICY CMP0037 - "Target names should not be reserved and should match a validity pattern" # Familiar target names like 'install' should be OK at the super-build level if( POLICY CMP0037 ) cmake_policy( SET CMP0037 OLD ) endif( ) add_custom_target( install ${boost_custom_target} ${gtest_custom_target} ${lapack_custom_target} DEPENDS ${rocfft_dependencies} ) rocFFT-rocm-5.7.1/deps/external-boost.cmake000066400000000000000000000166571446473624700205500ustar00rootroot00000000000000# Copyright (C) 2016 - 2022 
Advanced Micro Devices, Inc. All rights reserved. # # Permission is hereby granted, free of charge, to any person obtaining a copy # of this software and associated documentation files (the "Software"), to deal # in the Software without restriction, including without limitation the rights # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell # copies of the Software, and to permit persons to whom the Software is # furnished to do so, subject to the following conditions: # # The above copyright notice and this permission notice shall be included in # all copies or substantial portions of the Software. # # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN # THE SOFTWARE. message( STATUS "Configuring boost external dependency" ) include( ExternalProject ) set( PREFIX_BOOST ${CMAKE_INSTALL_PREFIX} CACHE PATH "Location where boost should install, defaults to /usr/local" ) # We need to detect the compiler the user is attempting to invoke with CMake, # we do our best to translate cmake parameters into bjam parameters enable_language( CXX ) include( build-bitness ) # TODO: Options should be added to allow downloading Boost straight from github # This file is used to add Boost as a library dependency to another project # This sets up boost to download from sourceforge, and builds it as a cmake # ExternalProject # Change this one line to upgrade to newer versions of boost set( ext.Boost_VERSION "1.64.0" CACHE STRING "Boost version to download/use" ) mark_as_advanced( ext.Boost_VERSION ) string( REPLACE "." 
"_" ext.Boost_Version_Underscore ${ext.Boost_VERSION} ) message( STATUS "ext.Boost_VERSION: " ${ext.Boost_VERSION} ) if( WIN32 ) # For newer cmake versions, 7z archives are much smaller to download if( CMAKE_VERSION VERSION_LESS "3.1.0" ) set( Boost_Ext "zip" ) else( ) set( Boost_Ext "7z" ) endif( ) else( ) set( Boost_Ext "tar.bz2" ) endif( ) if( WIN32 ) set( Boost.Command b2 --prefix=${PREFIX_BOOST} ) else( ) set( Boost.Command ./b2 --prefix=${PREFIX_BOOST} ) endif( ) if( CMAKE_COMPILER_IS_GNUCXX ) list( APPEND Boost.Command cxxflags=-fPIC -std=c++11 ) elseif( XCODE_VERSION OR (CMAKE_CXX_COMPILER_ID MATCHES "Clang") ) list( APPEND Boost.Command cxxflags=-std=c++11 -stdlib=libc++ linkflags=-stdlib=libc++ ) endif( ) include( ProcessorCount ) ProcessorCount( Cores ) if( NOT Cores EQUAL 0 ) # Travis can fail to build Boost sporadically; uses 32 cores, reduce stress on VM if( DEFINED ENV{TRAVIS} ) if( Cores GREATER 8 ) set( Cores 8 ) endif( ) endif( ) # Add build thread in addition to the number of cores that we have math( EXPR Cores "${Cores} + 1 " ) else( ) # If we could not detect # of cores, assume 1 core and add an additional build thread set( Cores "2" ) endif( ) message( STATUS "ExternalBoost using ( " ${Cores} " ) cores to build with" ) message( STATUS "ExternalBoost building [ program_options, serialization, filesystem, system, regex ] components" ) list( APPEND Boost.Command -j ${Cores} --with-program_options --with-serialization --with-filesystem --with-system --with-regex ) if( BUILD_64 ) list( APPEND Boost.Command address-model=64 ) else( ) list( APPEND Boost.Command address-model=32 ) endif( ) if( MSVC10 ) list( APPEND Boost.Command toolset=msvc-10.0 ) elseif( MSVC11 ) list( APPEND Boost.Command toolset=msvc-11.0 ) elseif( MSVC12 ) list( APPEND Boost.Command toolset=msvc-12.0 ) elseif( MSVC14 ) list( APPEND Boost.Command toolset=msvc-14.0 ) elseif( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) ) list( APPEND Boost.Command toolset=clang ) 
elseif( CMAKE_COMPILER_IS_GNUCXX ) list( APPEND Boost.Command toolset=gcc ) endif( ) if( WIN32 AND (ext.Boost_VERSION VERSION_LESS "1.60.0") ) list( APPEND Boost.Command define=BOOST_LOG_USE_WINNT6_API ) endif( ) if( NOT DEFINED ext.Boost_LINK ) if( ${BUILD_SHARED_LIBS} MATCHES "ON" ) set( ext.Boost_LINK "shared" CACHE STRING "Which boost link method? static | shared | static,shared" ) else( ) set( ext.Boost_LINK "static" CACHE STRING "Which boost link method? static | shared | static,shared" ) endif( ) endif() mark_as_advanced( ext.Boost_LINK ) if( WIN32 ) # Versioned is the default on windows set( ext.Boost_LAYOUT "versioned" CACHE STRING "Which boost layout method? versioned | tagged | system" ) # For windows, default to build both variants to support the VS IDE set( ext.Boost_VARIANT "debug,release" CACHE STRING "Which boost variant? debug | release | debug,release" ) else( ) # Tagged builds provide unique enough names to be able to build both variants set( ext.Boost_LAYOUT "tagged" CACHE STRING "Which boost layout method? versioned | tagged | system" ) # For Linux, typically a build tree only needs one variant if( ${CMAKE_BUILD_TYPE} MATCHES "Debug") set( ext.Boost_VARIANT "debug" CACHE STRING "Which boost variant? debug | release | debug,release" ) else( ) set( ext.Boost_VARIANT "release" CACHE STRING "Which boost variant? 
debug | release | debug,release" ) endif( ) endif( ) mark_as_advanced( ext.Boost_LAYOUT ) mark_as_advanced( ext.Boost_VARIANT ) list( APPEND Boost.Command --layout=${ext.Boost_LAYOUT} link=${ext.Boost_LINK} variant=${ext.Boost_VARIANT} ) message( STATUS "Boost.Command: ${Boost.Command}" ) # If the user has a cached local copy stored somewhere, they can define the full path to the package in a BOOST_URL environment variable if( DEFINED ENV{BOOST_URL} ) set( ext.Boost_URL "$ENV{BOOST_URL}" CACHE STRING "URL to download Boost from" ) else( ) set( ext.Boost_URL "http://sourceforge.net/projects/boost/files/boost/${ext.Boost_VERSION}/boost_${ext.Boost_Version_Underscore}.${Boost_Ext}/download" CACHE STRING "URL to download Boost from" ) endif( ) mark_as_advanced( ext.Boost_URL ) set( Boost.Bootstrap "" ) set( ext.HASH "" ) if( WIN32 ) set( Boost.Bootstrap "bootstrap.bat" ) if( CMAKE_VERSION VERSION_LESS "3.1.0" ) # .zip file set( ext.HASH "b99973c805f38b549dbeaf88701c0abeff8b0e8eaa4066df47cac10a32097523" ) else( ) # .7z file set( ext.HASH "49c6abfeb5b480f6a86119c0d57235966b4690ee6ff9e6401ee868244808d155" ) endif( ) else( ) set( Boost.Bootstrap "./bootstrap.sh" ) # .tar.bz2 set( ext.HASH "7bcc5caace97baa948931d712ea5f37038dbb1c5d89b43ad4def4ed7cb683332" ) if( XCODE_VERSION OR ( CMAKE_CXX_COMPILER_ID MATCHES "Clang" ) ) list( APPEND Boost.Bootstrap --with-toolset=clang ) endif( ) endif( ) # Below is a fancy CMake command to download, build and install Boost on the users computer ExternalProject_Add( boost PREFIX ${CMAKE_BINARY_DIR}/boost URL ${ext.Boost_URL} URL_HASH SHA256=${ext.HASH} UPDATE_COMMAND ${Boost.Bootstrap} LOG_UPDATE 1 CONFIGURE_COMMAND "" BUILD_COMMAND ${Boost.Command} stage BUILD_IN_SOURCE 1 LOG_BUILD 1 INSTALL_COMMAND "" ) set_property( TARGET boost PROPERTY FOLDER "extern" ) ExternalProject_Get_Property( boost install_dir ) ExternalProject_Get_Property( boost binary_dir ) # For use by the user of ExternalGtest.cmake set( BOOST_INSTALL_ROOT ${install_dir} ) 
set( BOOST_BINARY_ROOT ${binary_dir} ) rocFFT-rocm-5.7.1/docs/000077500000000000000000000000001446473624700145565ustar00rootroot00000000000000rocFFT-rocm-5.7.1/docs/.doxygen/000077500000000000000000000000001446473624700163115ustar00rootroot00000000000000rocFFT-rocm-5.7.1/docs/.doxygen/Doxyfile000066400000000000000000003212141446473624700200220ustar00rootroot00000000000000# Doxyfile 1.8.10 # This file describes the settings to be used by the documentation system # doxygen (www.doxygen.org) for a project. # # All text after a double hash (##) is considered a comment and is placed in # front of the TAG it is preceding. # # All text after a single hash (#) is considered a comment and will be ignored. # The format is: # TAG = value [value, ...] # For lists, items can also be appended using: # TAG += value [value, ...] # Values that contain spaces should be placed between quotes (\" \"). #--------------------------------------------------------------------------- # Project related configuration options #--------------------------------------------------------------------------- # This tag specifies the encoding used for all characters in the config file # that follow. The default is UTF-8 which is also the encoding used for all text # before the first occurrence of this tag. Doxygen uses libiconv (or the iconv # built into libc) for the transcoding. See http://www.gnu.org/software/libiconv # for the list of possible encodings. # The default value is: UTF-8. DOXYFILE_ENCODING = UTF-8 # The PROJECT_NAME tag is a single word (or a sequence of words surrounded by # double-quotes, unless you are using Doxywizard) that should identify the # project for which the documentation is generated. This name is used in the # title of most generated pages and in a few other places. # The default value is: My Project. PROJECT_NAME = "rocFFT" # The PROJECT_NUMBER tag can be used to enter a project or revision number. 
This # could be handy for archiving the generated documentation or if some version # control system is used. PROJECT_NUMBER = v1.0.23 # Using the PROJECT_BRIEF tag one can provide an optional one line description # for a project that appears at the top of each page and should give the viewer a # quick idea about the purpose of the project. Keep the description short. PROJECT_BRIEF = "prototype interfaces compatible with ROCm platform and HiP" # With the PROJECT_LOGO tag one can specify a logo or an icon that is included # in the documentation. The maximum height of the logo should not exceed 55 # pixels and the maximum width should not exceed 200 pixels. Doxygen will copy # the logo to the output directory. PROJECT_LOGO = # The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) path # into which the generated documentation will be written. If a relative path is # entered, it will be relative to the location where doxygen was started. If # left blank the current directory will be used. OUTPUT_DIRECTORY = docBin # If the CREATE_SUBDIRS tag is set to YES then doxygen will create 4096 sub- # directories (in 2 levels) under the output directory of each output format and # will distribute the generated files over these directories. Enabling this # option can be useful when feeding doxygen a huge amount of source files, where # putting all generated files in the same directory would otherwise cause # performance problems for the file system. # The default value is: NO. CREATE_SUBDIRS = NO # If the ALLOW_UNICODE_NAMES tag is set to YES, doxygen will allow non-ASCII # characters to appear in the names of generated files. If set to NO, non-ASCII # characters will be escaped, for example _xE3_x81_x84 will be used for Unicode # U+3044. # The default value is: NO. ALLOW_UNICODE_NAMES = NO # The OUTPUT_LANGUAGE tag is used to specify the language in which all # documentation generated by doxygen is written.
Doxygen will use this # information to generate all constant output in the proper language. # Possible values are: Afrikaans, Arabic, Armenian, Brazilian, Catalan, Chinese, # Chinese-Traditional, Croatian, Czech, Danish, Dutch, English (United States), # Esperanto, Farsi (Persian), Finnish, French, German, Greek, Hungarian, # Indonesian, Italian, Japanese, Japanese-en (Japanese with English messages), # Korean, Korean-en (Korean with English messages), Latvian, Lithuanian, # Macedonian, Norwegian, Persian (Farsi), Polish, Portuguese, Romanian, Russian, # Serbian, Serbian-Cyrillic, Slovak, Slovene, Spanish, Swedish, Turkish, # Ukrainian and Vietnamese. # The default value is: English. OUTPUT_LANGUAGE = English # If the BRIEF_MEMBER_DESC tag is set to YES, doxygen will include brief member # descriptions after the members that are listed in the file and class # documentation (similar to Javadoc). Set to NO to disable this. # The default value is: YES. BRIEF_MEMBER_DESC = YES # If the REPEAT_BRIEF tag is set to YES, doxygen will prepend the brief # description of a member or function before the detailed description # # Note: If both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the # brief descriptions will be completely suppressed. # The default value is: YES. REPEAT_BRIEF = YES # This tag implements a quasi-intelligent brief description abbreviator that is # used to form the text in various listings. Each string in this list, if found # as the leading text of the brief description, will be stripped from the text # and the result, after processing the whole list, is used as the annotated # text. Otherwise, the brief description is used as-is. If left blank, the # following values are used ($name is automatically replaced with the name of # the entity):The $name class, The $name widget, The $name file, is, provides, # specifies, contains, represents, a, an and the. 
ABBREVIATE_BRIEF = "The $name class" \ "The $name widget" \ "The $name file" \ is \ provides \ specifies \ contains \ represents \ a \ an \ the # If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then # doxygen will generate a detailed section even if there is only a brief # description. # The default value is: NO. ALWAYS_DETAILED_SEC = NO # If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all # inherited members of a class in the documentation of that class as if those # members were ordinary class members. Constructors, destructors and assignment # operators of the base classes will not be shown. # The default value is: NO. INLINE_INHERITED_MEMB = NO # If the FULL_PATH_NAMES tag is set to YES, doxygen will prepend the full path # before files name in the file list and in the header files. If set to NO the # shortest path that makes the file name unique will be used # The default value is: YES. FULL_PATH_NAMES = YES # The STRIP_FROM_PATH tag can be used to strip a user-defined part of the path. # Stripping is only done if one of the specified strings matches the left-hand # part of the path. The tag can be used to show relative paths in the file list. # If left blank the directory from which doxygen is run is used as the path to # strip. # # Note that you can specify absolute paths here, but also relative paths, which # will be relative from the directory where doxygen is started. # This tag requires that the tag FULL_PATH_NAMES is set to YES. STRIP_FROM_PATH = ../../library/include # The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of the # path mentioned in the documentation of a class, which tells the reader which # header file to include in order to use a class. If left blank only the name of # the header file containing the class definition is used. Otherwise one should # specify the list of include paths that are normally passed to the compiler # using the -I flag. 
STRIP_FROM_INC_PATH = # If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter (but # less readable) file names. This can be useful if your file system doesn't # support long names like on DOS, Mac, or CD-ROM. # The default value is: NO. SHORT_NAMES = NO # If the JAVADOC_AUTOBRIEF tag is set to YES then doxygen will interpret the # first line (until the first dot) of a Javadoc-style comment as the brief # description. If set to NO, the Javadoc-style will behave just like regular Qt- # style comments (thus requiring an explicit @brief command for a brief # description.) # The default value is: NO. JAVADOC_AUTOBRIEF = NO # If the QT_AUTOBRIEF tag is set to YES then doxygen will interpret the first # line (until the first dot) of a Qt-style comment as the brief description. If # set to NO, the Qt-style will behave just like regular Qt-style comments (thus # requiring an explicit \brief command for a brief description.) # The default value is: NO. QT_AUTOBRIEF = NO # The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make doxygen treat a # multi-line C++ special comment block (i.e. a block of //! or /// comments) as # a brief description. This used to be the default behavior. The new default is # to treat a multi-line C++ comment block as a detailed description. Set this # tag to YES if you prefer the old behavior instead. # # Note that setting this tag to YES also means that rational rose comments are # not recognized any more. # The default value is: NO. MULTILINE_CPP_IS_BRIEF = NO # If the INHERIT_DOCS tag is set to YES then an undocumented member inherits the # documentation from any documented member that it re-implements. # The default value is: YES. INHERIT_DOCS = YES # If the SEPARATE_MEMBER_PAGES tag is set to YES then doxygen will produce a new # page for each member. If set to NO, the documentation of a member will be part # of the file/class/namespace that contains it. # The default value is: NO.
SEPARATE_MEMBER_PAGES = NO # The TAB_SIZE tag can be used to set the number of spaces in a tab. Doxygen # uses this value to replace tabs by spaces in code fragments. # Minimum value: 1, maximum value: 16, default value: 4. TAB_SIZE = 4 # This tag can be used to specify a number of aliases that act as commands in # the documentation. An alias has the form: # name=value # For example adding # "sideeffect=@par Side Effects:\n" # will allow you to put the command \sideeffect (or @sideeffect) in the # documentation, which will result in a user-defined paragraph with heading # "Side Effects:". You can put \n's in the value part of an alias to insert # newlines. ALIASES = # This tag can be used to specify a number of word-keyword mappings (TCL only). # A mapping has the form "name=value". For example adding "class=itcl::class" # will allow you to use the command class in the itcl::class meaning. TCL_SUBST = # Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C sources # only. Doxygen will then generate output that is more tailored for C. For # instance, some of the names that are used will be different. The list of all # members will be omitted, etc. # The default value is: NO. OPTIMIZE_OUTPUT_FOR_C = NO # Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java or # Python sources only. Doxygen will then generate output that is more tailored # for that language. For instance, namespaces will be presented as packages, # qualified scopes will look different, etc. # The default value is: NO. OPTIMIZE_OUTPUT_JAVA = NO # Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran # sources. Doxygen will then generate output that is tailored for Fortran. # The default value is: NO. OPTIMIZE_FOR_FORTRAN = NO # Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL # sources. Doxygen will then generate output that is tailored for VHDL. # The default value is: NO. 
OPTIMIZE_OUTPUT_VHDL = NO # Doxygen selects the parser to use depending on the extension of the files it # parses. With this tag you can assign which parser to use for a given # extension. Doxygen has a built-in mapping, but you can override or extend it # using this tag. The format is ext=language, where ext is a file extension, and # language is one of the parsers supported by doxygen: IDL, Java, Javascript, # C#, C, C++, D, PHP, Objective-C, Python, Fortran (fixed format Fortran: # FortranFixed, free formatted Fortran: FortranFree, unknown formatted Fortran: # Fortran. In the latter case the parser tries to guess whether the code is fixed # or free formatted code, this is the default for Fortran type files), VHDL. For # instance to make doxygen treat .inc files as Fortran files (default is PHP), # and .f files as C (default is Fortran), use: inc=Fortran f=C. # # Note: For files without extension you can use no_extension as a placeholder. # # Note that for custom extensions you also need to set FILE_PATTERNS otherwise # the files are not read by doxygen. EXTENSION_MAPPING = # If the MARKDOWN_SUPPORT tag is enabled then doxygen pre-processes all comments # according to the Markdown format, which allows for more readable # documentation. See http://daringfireball.net/projects/markdown/ for details. # The output of markdown processing is further processed by doxygen, so you can # mix doxygen, HTML, and XML commands with Markdown formatting. Disable only in # case of backward compatibility issues. # The default value is: YES. MARKDOWN_SUPPORT = YES # When enabled doxygen tries to link words that correspond to documented # classes, or namespaces to their corresponding documentation. Such a link can # be prevented in individual cases by putting a % sign in front of the word or # globally by setting AUTOLINK_SUPPORT to NO. # The default value is: YES. AUTOLINK_SUPPORT = YES # If you use STL classes (i.e. std::string, std::vector, etc.)
but do not want # to include (a tag file for) the STL sources as input, then you should set this # tag to YES in order to let doxygen match function declarations and # definitions whose arguments contain STL classes (e.g. func(std::string); # versus func(std::string) {}). This also makes the inheritance and collaboration # diagrams that involve STL classes more complete and accurate. # The default value is: NO. BUILTIN_STL_SUPPORT = NO # If you use Microsoft's C++/CLI language, you should set this option to YES to # enable parsing support. # The default value is: NO. CPP_CLI_SUPPORT = NO # Set the SIP_SUPPORT tag to YES if your project consists of sip (see: # http://www.riverbankcomputing.co.uk/software/sip/intro) sources only. Doxygen # will parse them like normal C++ but will assume all classes use public instead # of private inheritance when no explicit protection keyword is present. # The default value is: NO. SIP_SUPPORT = NO # For Microsoft's IDL there are propget and propput attributes to indicate # getter and setter methods for a property. Setting this option to YES will make # doxygen replace the get and set methods by a property in the documentation. # This will only work if the methods are indeed getting or setting a simple # type. If this is not the case, or you want to show the methods anyway, you # should set this option to NO. # The default value is: YES. IDL_PROPERTY_SUPPORT = YES # If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC # tag is set to YES then doxygen will reuse the documentation of the first # member in the group (if any) for the other members of the group. By default # all members of a group must be documented explicitly. # The default value is: NO. DISTRIBUTE_GROUP_DOC = YES # If one adds a struct or class to a group and this option is enabled, then also # any nested class or struct is added to the same group.
By default this option # is disabled and one has to add nested compounds explicitly via \ingroup. # The default value is: NO. GROUP_NESTED_COMPOUNDS = NO # Set the SUBGROUPING tag to YES to allow class member groups of the same type # (for instance a group of public functions) to be put as a subgroup of that # type (e.g. under the Public Functions section). Set it to NO to prevent # subgrouping. Alternatively, this can be done per class using the # \nosubgrouping command. # The default value is: YES. SUBGROUPING = YES # When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and unions # are shown inside the group in which they are included (e.g. using \ingroup) # instead of on a separate page (for HTML and Man pages) or section (for LaTeX # and RTF). # # Note that this feature does not work in combination with # SEPARATE_MEMBER_PAGES. # The default value is: NO. INLINE_GROUPED_CLASSES = NO # When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and unions # with only public data fields or simple typedef fields will be shown inline in # the documentation of the scope in which they are defined (i.e. file, # namespace, or group documentation), provided this scope is documented. If set # to NO, structs, classes, and unions are shown on a separate page (for HTML and # Man pages) or section (for LaTeX and RTF). # The default value is: NO. INLINE_SIMPLE_STRUCTS = NO # When TYPEDEF_HIDES_STRUCT tag is enabled, a typedef of a struct, union, or # enum is documented as struct, union, or enum with the name of the typedef. So # typedef struct TypeS {} TypeT, will appear in the documentation as a struct # with name TypeT. When disabled the typedef will appear as a member of a file, # namespace, or class. And the struct will be named TypeS. This can typically be # useful for C code in case the coding convention dictates that all compound # types are typedef'ed and only the typedef is referenced, never the tag name. # The default value is: NO. 
TYPEDEF_HIDES_STRUCT = YES # The size of the symbol lookup cache can be set using LOOKUP_CACHE_SIZE. This # cache is used to resolve symbols given their name and scope. Since this can be # an expensive process and often the same symbol appears multiple times in the # code, doxygen keeps a cache of pre-resolved symbols. If the cache is too small # doxygen will become slower. If the cache is too large, memory is wasted. The # cache size is given by this formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range # is 0..9, the default is 0, corresponding to a cache size of 2^16=65536 # symbols. At the end of a run doxygen will report the cache usage and suggest # the optimal cache size from a speed point of view. # Minimum value: 0, maximum value: 9, default value: 0. LOOKUP_CACHE_SIZE = 0 #--------------------------------------------------------------------------- # Build related configuration options #--------------------------------------------------------------------------- SHOW_NAMESPACES = NO # If the EXTRACT_ALL tag is set to YES, doxygen will assume all entities in # documentation are documented, even if no documentation was available. Private # class members and static file members will be hidden unless the # EXTRACT_PRIVATE respectively EXTRACT_STATIC tags are set to YES. # Note: This will also disable the warnings about undocumented members that are # normally produced when WARNINGS is set to YES. # The default value is: NO. EXTRACT_ALL = NO # If the EXTRACT_PRIVATE tag is set to YES, all private members of a class will # be included in the documentation. # The default value is: NO. EXTRACT_PRIVATE = NO # If the EXTRACT_PACKAGE tag is set to YES, all members with package or internal # scope will be included in the documentation. # The default value is: NO. EXTRACT_PACKAGE = NO # If the EXTRACT_STATIC tag is set to YES, all static members of a file will be # included in the documentation. # The default value is: NO. 
EXTRACT_STATIC = NO # If the EXTRACT_LOCAL_CLASSES tag is set to YES, classes (and structs) defined # locally in source files will be included in the documentation. If set to NO, # only classes defined in header files are included. Does not have any effect # for Java sources. # The default value is: YES. EXTRACT_LOCAL_CLASSES = YES # This flag is only useful for Objective-C code. If set to YES, local methods, # which are defined in the implementation section but not in the interface are # included in the documentation. If set to NO, only methods in the interface are # included. # The default value is: NO. EXTRACT_LOCAL_METHODS = NO # If this flag is set to YES, the members of anonymous namespaces will be # extracted and appear in the documentation as a namespace called # 'anonymous_namespace{file}', where file will be replaced with the base name of # the file that contains the anonymous namespace. By default anonymous namespace # are hidden. # The default value is: NO. EXTRACT_ANON_NSPACES = NO # If the HIDE_UNDOC_MEMBERS tag is set to YES, doxygen will hide all # undocumented members inside documented classes or files. If set to NO these # members will be included in the various overviews, but no documentation # section is generated. This option has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_MEMBERS = NO # If the HIDE_UNDOC_CLASSES tag is set to YES, doxygen will hide all # undocumented classes that are normally visible in the class hierarchy. If set # to NO, these classes will be included in the various overviews. This option # has no effect if EXTRACT_ALL is enabled. # The default value is: NO. HIDE_UNDOC_CLASSES = NO # If the HIDE_FRIEND_COMPOUNDS tag is set to YES, doxygen will hide all friend # (class|struct|union) declarations. If set to NO, these declarations will be # included in the documentation. # The default value is: NO. 
HIDE_FRIEND_COMPOUNDS = NO # If the HIDE_IN_BODY_DOCS tag is set to YES, doxygen will hide any # documentation blocks found inside the body of a function. If set to NO, these # blocks will be appended to the function's detailed documentation block. # The default value is: NO. HIDE_IN_BODY_DOCS = NO # The INTERNAL_DOCS tag determines if documentation that is typed after a # \internal command is included. If the tag is set to NO then the documentation # will be excluded. Set it to YES to include the internal documentation. # The default value is: NO. INTERNAL_DOCS = NO # If the CASE_SENSE_NAMES tag is set to NO then doxygen will only generate file # names in lower-case letters. If set to YES, upper-case letters are also # allowed. This is useful if you have classes or files whose names only differ # in case and if your file system supports case sensitive file names. Windows # and Mac users are advised to set this option to NO. # The default value is: system dependent. CASE_SENSE_NAMES = NO # If the HIDE_SCOPE_NAMES tag is set to NO then doxygen will show members with # their full class and namespace scopes in the documentation. If set to YES, the # scope will be hidden. # The default value is: NO. HIDE_SCOPE_NAMES = NO # If the HIDE_COMPOUND_REFERENCE tag is set to NO (default) then doxygen will # append additional text to a page's title, such as Class Reference. If set to # YES the compound reference will be hidden. # The default value is: NO. HIDE_COMPOUND_REFERENCE= NO # If the SHOW_INCLUDE_FILES tag is set to YES then doxygen will put a list of # the files that are included by a file in the documentation of that file. # The default value is: YES. SHOW_INCLUDE_FILES = YES # If the SHOW_GROUPED_MEMB_INC tag is set to YES then Doxygen will add for each # grouped member an include statement to the documentation, telling the reader # which file to include in order to use the member. # The default value is: NO. 
SHOW_GROUPED_MEMB_INC = NO # If the FORCE_LOCAL_INCLUDES tag is set to YES then doxygen will list include # files with double quotes in the documentation rather than with sharp brackets. # The default value is: NO. FORCE_LOCAL_INCLUDES = NO # If the INLINE_INFO tag is set to YES then a tag [inline] is inserted in the # documentation for inline members. # The default value is: YES. INLINE_INFO = YES # If the SORT_MEMBER_DOCS tag is set to YES then doxygen will sort the # (detailed) documentation of file and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. # The default value is: YES. SORT_MEMBER_DOCS = YES # If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the brief # descriptions of file, namespace and class members alphabetically by member # name. If set to NO, the members will appear in declaration order. Note that # this will also influence the order of the classes in the class list. # The default value is: NO. SORT_BRIEF_DOCS = NO # If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen will sort the # (brief and detailed) documentation of class members so that constructors and # destructors are listed first. If set to NO the constructors will appear in the # respective orders defined by SORT_BRIEF_DOCS and SORT_MEMBER_DOCS. # Note: If SORT_BRIEF_DOCS is set to NO this option is ignored for sorting brief # member documentation. # Note: If SORT_MEMBER_DOCS is set to NO this option is ignored for sorting # detailed member documentation. # The default value is: NO. SORT_MEMBERS_CTORS_1ST = NO # If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the hierarchy # of group names into alphabetical order. If set to NO the group names will # appear in their defined order. # The default value is: NO. SORT_GROUP_NAMES = NO # If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be sorted by # fully-qualified names, including namespaces. 
If set to NO, the class list will # be sorted only by class name, not including the namespace part. # Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. # Note: This option applies only to the class list, not to the alphabetical # list. # The default value is: NO. SORT_BY_SCOPE_NAME = NO # If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to do proper # type resolution of all parameters of a function it will reject a match between # the prototype and the implementation of a member function even if there is # only one candidate or it is obvious which candidate to choose by doing a # simple string match. By disabling STRICT_PROTO_MATCHING doxygen will still # accept a match between prototype and implementation in such cases. # The default value is: NO. STRICT_PROTO_MATCHING = NO # The GENERATE_TODOLIST tag can be used to enable (YES) or disable (NO) the todo # list. This list is created by putting \todo commands in the documentation. # The default value is: YES. GENERATE_TODOLIST = YES # The GENERATE_TESTLIST tag can be used to enable (YES) or disable (NO) the test # list. This list is created by putting \test commands in the documentation. # The default value is: YES. GENERATE_TESTLIST = YES # The GENERATE_BUGLIST tag can be used to enable (YES) or disable (NO) the bug # list. This list is created by putting \bug commands in the documentation. # The default value is: YES. GENERATE_BUGLIST = YES # The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or disable (NO) # the deprecated list. This list is created by putting \deprecated commands in # the documentation. # The default value is: YES. GENERATE_DEPRECATEDLIST= YES # The ENABLED_SECTIONS tag can be used to enable conditional documentation # sections, marked by \if <section_label> ... \endif and \cond <section_label> # ... \endcond blocks.
ENABLED_SECTIONS = # The MAX_INITIALIZER_LINES tag determines the maximum number of lines that the # initial value of a variable or macro / define can have for it to appear in the # documentation. If the initializer consists of more lines than specified here # it will be hidden. Use a value of 0 to hide initializers completely. The # appearance of the value of individual variables and macros / defines can be # controlled using \showinitializer or \hideinitializer command in the # documentation regardless of this setting. # Minimum value: 0, maximum value: 10000, default value: 30. MAX_INITIALIZER_LINES = 30 # Set the SHOW_USED_FILES tag to NO to disable the list of files generated at # the bottom of the documentation of classes and structs. If set to YES, the # list will mention the files that were used to generate the documentation. # The default value is: YES. SHOW_USED_FILES = YES # Set the SHOW_FILES tag to NO to disable the generation of the Files page. This # will remove the Files entry from the Quick Index and from the Folder Tree View # (if specified). # The default value is: YES. SHOW_FILES = YES # Set the SHOW_NAMESPACES tag to NO to disable the generation of the Namespaces # page. This will remove the Namespaces entry from the Quick Index and from the # Folder Tree View (if specified). # The default value is: YES. SHOW_NAMESPACES = YES # The FILE_VERSION_FILTER tag can be used to specify a program or script that # doxygen should invoke to get the current version for each file (typically from # the version control system). Doxygen will invoke the program by executing (via # popen()) the command command input-file, where command is the value of the # FILE_VERSION_FILTER tag, and input-file is the name of an input file provided # by doxygen. Whatever the program writes to standard output is used as the file # version. For an example see the documentation. 
FILE_VERSION_FILTER = # The LAYOUT_FILE tag can be used to specify a layout file which will be parsed # by doxygen. The layout file controls the global structure of the generated # output files in an output format independent way. To create the layout file # that represents doxygen's defaults, run doxygen with the -l option. You can # optionally specify a file name after the option, if omitted DoxygenLayout.xml # will be used as the name of the layout file. # # Note that if you run doxygen from a directory containing a file called # DoxygenLayout.xml, doxygen will parse it automatically even if the LAYOUT_FILE # tag is left empty. LAYOUT_FILE = # The CITE_BIB_FILES tag can be used to specify one or more bib files containing # the reference definitions. This must be a list of .bib files. The .bib # extension is automatically appended if omitted. This requires the bibtex tool # to be installed. See also http://en.wikipedia.org/wiki/BibTeX for more info. # For LaTeX the style of the bibliography can be controlled using # LATEX_BIB_STYLE. To use this feature you need bibtex and perl available in the # search path. See also \cite for info how to create references. CITE_BIB_FILES = #--------------------------------------------------------------------------- # Configuration options related to warning and progress messages #--------------------------------------------------------------------------- # The QUIET tag can be used to turn on/off the messages that are generated to # standard output by doxygen. If QUIET is set to YES this implies that the # messages are off. # The default value is: NO. QUIET = NO # The WARNINGS tag can be used to turn on/off the warning messages that are # generated to standard error (stderr) by doxygen. If WARNINGS is set to YES # this implies that the warnings are on. # # Tip: Turn warnings on while writing the documentation. # The default value is: YES. 
WARNINGS = YES # If the WARN_IF_UNDOCUMENTED tag is set to YES then doxygen will generate # warnings for undocumented members. If EXTRACT_ALL is set to YES then this flag # will automatically be disabled. # The default value is: YES. WARN_IF_UNDOCUMENTED = YES # If the WARN_IF_DOC_ERROR tag is set to YES, doxygen will generate warnings for # potential errors in the documentation, such as not documenting some parameters # in a documented function, or documenting parameters that don't exist or using # markup commands wrongly. # The default value is: YES. WARN_IF_DOC_ERROR = YES # This WARN_NO_PARAMDOC option can be enabled to get warnings for functions that # are documented, but have no documentation for their parameters or return # value. If set to NO, doxygen will only warn about wrong or incomplete # parameter documentation, but not about the absence of documentation. # The default value is: NO. WARN_NO_PARAMDOC = NO # The WARN_FORMAT tag determines the format of the warning messages that doxygen # can produce. The string should contain the $file, $line, and $text tags, which # will be replaced by the file and line number from which the warning originated # and the warning text. Optionally the format may contain $version, which will # be replaced by the version of the file (if it could be obtained via # FILE_VERSION_FILTER) # The default value is: $file:$line: $text. WARN_FORMAT = "$file:$line: $text" # The WARN_LOGFILE tag can be used to specify a file to which warning and error # messages should be written. If left blank the output is written to standard # error (stderr). WARN_LOGFILE = #--------------------------------------------------------------------------- # Configuration options related to the input files #--------------------------------------------------------------------------- # The INPUT tag is used to specify the files and/or directories that contain # documented source files. 
You may enter file names like myfile.cpp or # directories like /usr/src/myproject. Separate the files or directories with # spaces. See also FILE_PATTERNS and EXTENSION_MAPPING # Note: If this tag is empty the current directory is searched. INPUT = ../../library/include/ # This tag can be used to specify the character encoding of the source files # that doxygen parses. Internally doxygen uses the UTF-8 encoding. Doxygen uses # libiconv (or the iconv built into libc) for the transcoding. See the libiconv # documentation (see: http://www.gnu.org/software/libiconv) for the list of # possible encodings. # The default value is: UTF-8. INPUT_ENCODING = UTF-8 # If the value of the INPUT tag contains directories, you can use the # FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp and # *.h) to filter out the source-files in the directories. # # Note that for custom extensions or not directly supported extensions you also # need to set EXTENSION_MAPPING for the extension otherwise the files are not # read by doxygen. # # If left blank the following patterns are tested:*.c, *.cc, *.cxx, *.cpp, # *.c++, *.java, *.ii, *.ixx, *.ipp, *.i++, *.inl, *.idl, *.ddl, *.odl, *.h, # *.hh, *.hxx, *.hpp, *.h++, *.cs, *.d, *.php, *.php4, *.php5, *.phtml, *.inc, # *.m, *.markdown, *.md, *.mm, *.dox, *.py, *.f90, *.f, *.for, *.tcl, *.vhd, # *.vhdl, *.ucf, *.qsf, *.as and *.js. FILE_PATTERNS = *.c \ *.cc \ *.cxx \ *.cpp \ *.c++ \ *.java \ *.ii \ *.ixx \ *.ipp \ *.i++ \ *.inl \ *.idl \ *.ddl \ *.odl \ *.h \ *.hh \ *.hxx \ *.hpp \ *.h++ \ *.cs \ *.d \ *.php \ *.php4 \ *.php5 \ *.phtml \ *.inc \ *.m \ *.markdown \ *.md \ *.mm \ *.dox \ *.py \ *.f90 \ *.f \ *.for \ *.tcl \ *.vhd \ *.vhdl \ *.ucf \ *.qsf \ *.as \ *.js # The RECURSIVE tag can be used to specify whether or not subdirectories should # be searched for input files as well. # The default value is: NO. 
RECURSIVE = NO # The EXCLUDE tag can be used to specify files and/or directories that should be # excluded from the INPUT source files. This way you can easily exclude a # subdirectory from a directory tree whose root is specified with the INPUT tag. # # Note that relative paths are relative to the directory from which doxygen is # run. EXCLUDE = # The EXCLUDE_SYMLINKS tag can be used to select whether or not files or # directories that are symbolic links (a Unix file system feature) are excluded # from the input. # The default value is: NO. EXCLUDE_SYMLINKS = NO # If the value of the INPUT tag contains directories, you can use the # EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude # certain files from those directories. # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories for example use the pattern */test/* EXCLUDE_PATTERNS = # The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names # (namespaces, classes, functions, etc.) that should be excluded from the # output. The symbol name can be a fully qualified name, a word, or if the # wildcard * is used, a substring. Examples: ANamespace, AClass, # AClass::ANamespace, ANamespace::*Test # # Note that the wildcards are matched against the file with absolute path, so to # exclude all test directories use the pattern */test/* EXCLUDE_SYMBOLS = # The EXAMPLE_PATH tag can be used to specify one or more files or directories # that contain example code fragments that are included (see the \include # command). EXAMPLE_PATH = # If the value of the EXAMPLE_PATH tag contains directories, you can use the # EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp and # *.h) to filter out the source-files in the directories. If left blank all # files are included. 
EXAMPLE_PATTERNS = * # If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be # searched for input files to be used with the \include or \dontinclude commands # irrespective of the value of the RECURSIVE tag. # The default value is: NO. EXAMPLE_RECURSIVE = NO # The IMAGE_PATH tag can be used to specify one or more files or directories # that contain images that are to be included in the documentation (see the # \image command). IMAGE_PATH = # The INPUT_FILTER tag can be used to specify a program that doxygen should # invoke to filter for each input file. Doxygen will invoke the filter program # by executing (via popen()) the command: # # <filter> <input-file> # # where <filter> is the value of the INPUT_FILTER tag, and <input-file> is the # name of an input file. Doxygen will then use the output that the filter # program writes to standard output. If FILTER_PATTERNS is specified, this tag # will be ignored. # # Note that the filter must not add or remove lines; it is applied before the # code is scanned, but not when the output code is generated. If lines are added # or removed, the anchors will not be placed correctly. INPUT_FILTER = # The FILTER_PATTERNS tag can be used to specify filters on a per file pattern # basis. Doxygen will compare the file name with each pattern and apply the # filter if there is a match. The filters are a list of the form: pattern=filter # (like *.cpp=my_cpp_filter). See INPUT_FILTER for further information on how # filters are used. If the FILTER_PATTERNS tag is empty or if none of the # patterns match the file name, INPUT_FILTER is applied. FILTER_PATTERNS = # If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using # INPUT_FILTER) will also be used to filter the input files that are used for # producing the source files to browse (i.e. when SOURCE_BROWSER is set to YES). # The default value is: NO. FILTER_SOURCE_FILES = NO # The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file # pattern.
A pattern will override the setting for FILTER_PATTERN (if any) and # it is also possible to disable source filtering for a specific pattern using # *.ext= (so without naming a filter). # This tag requires that the tag FILTER_SOURCE_FILES is set to YES. FILTER_SOURCE_PATTERNS = # If the USE_MDFILE_AS_MAINPAGE tag refers to the name of a markdown file that # is part of the input, its contents will be placed on the main page # (index.html). This can be useful if you have a project on for instance GitHub # and want to reuse the introduction page also for the doxygen output. USE_MDFILE_AS_MAINPAGE = ../README.md #--------------------------------------------------------------------------- # Configuration options related to source browsing #--------------------------------------------------------------------------- # If the SOURCE_BROWSER tag is set to YES then a list of source files will be # generated. Documented entities will be cross-referenced with these sources. # # Note: To get rid of all source code in the generated output, make sure that # also VERBATIM_HEADERS is set to NO. # The default value is: NO. SOURCE_BROWSER = NO # Setting the INLINE_SOURCES tag to YES will include the body of functions, # classes and enums directly into the documentation. # The default value is: NO. INLINE_SOURCES = NO # Setting the STRIP_CODE_COMMENTS tag to YES will instruct doxygen to hide any # special comment blocks from generated source code fragments. Normal C, C++ and # Fortran comments will always remain visible. # The default value is: YES. STRIP_CODE_COMMENTS = YES # If the REFERENCED_BY_RELATION tag is set to YES then for each documented # function all documented functions referencing it will be listed. # The default value is: NO. REFERENCED_BY_RELATION = NO # If the REFERENCES_RELATION tag is set to YES then for each documented function # all documented entities called/used by that function will be listed. # The default value is: NO. 
REFERENCES_RELATION = NO # If the REFERENCES_LINK_SOURCE tag is set to YES and SOURCE_BROWSER tag is set # to YES then the hyperlinks from functions in REFERENCES_RELATION and # REFERENCED_BY_RELATION lists will link to the source code. Otherwise they will # link to the documentation. # The default value is: YES. REFERENCES_LINK_SOURCE = YES # If SOURCE_TOOLTIPS is enabled (the default) then hovering a hyperlink in the # source code will show a tooltip with additional information such as prototype, # brief description and links to the definition and documentation. Since this # will make the HTML file larger and loading of large files a bit slower, you # can opt to disable this feature. # The default value is: YES. # This tag requires that the tag SOURCE_BROWSER is set to YES. SOURCE_TOOLTIPS = YES # If the USE_HTAGS tag is set to YES then the references to source code will # point to the HTML generated by the htags(1) tool instead of doxygen built-in # source browser. The htags tool is part of GNU's global source tagging system # (see http://www.gnu.org/software/global/global.html). You will need version # 4.8.6 or higher. # # To use it do the following: # - Install the latest version of global # - Enable SOURCE_BROWSER and USE_HTAGS in the config file # - Make sure the INPUT points to the root of the source tree # - Run doxygen as normal # # Doxygen will invoke htags (and that will in turn invoke gtags), so these # tools must be available from the command line (i.e. in the search path). # # The result: instead of the source browser generated by doxygen, the links to # source code will now point to the output of htags. # The default value is: NO. # This tag requires that the tag SOURCE_BROWSER is set to YES. USE_HTAGS = NO # If the VERBATIM_HEADERS tag is set to YES then doxygen will generate a # verbatim copy of the header file for each class for which an include is # specified. Set to NO to disable this. # See also: Section \class. # The default value is: YES.
VERBATIM_HEADERS = YES # If the CLANG_ASSISTED_PARSING tag is set to YES then doxygen will use the # clang parser (see: http://clang.llvm.org/) for more accurate parsing at the # cost of reduced performance. This can be particularly helpful with template # rich C++ code for which doxygen's built-in parser lacks the necessary type # information. # Note: The availability of this option depends on whether or not doxygen was # compiled with the --with-libclang option. # The default value is: NO. CLANG_ASSISTED_PARSING = NO # If clang assisted parsing is enabled you can provide the compiler with command # line options that you would normally use when invoking the compiler. Note that # the include paths will already be set by doxygen for the files and directories # specified with INPUT and INCLUDE_PATH. # This tag requires that the tag CLANG_ASSISTED_PARSING is set to YES. CLANG_OPTIONS = #--------------------------------------------------------------------------- # Configuration options related to the alphabetical class index #--------------------------------------------------------------------------- # If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index of all # compounds will be generated. Enable this if the project contains a lot of # classes, structs, unions or interfaces. # The default value is: YES. ALPHABETICAL_INDEX = YES # The COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns in # which the alphabetical index list will be split. # Minimum value: 1, maximum value: 20, default value: 5. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. COLS_IN_ALPHA_INDEX = 5 # In case all classes in a project start with a common prefix, all classes will # be put under the same header in the alphabetical index. The IGNORE_PREFIX tag # can be used to specify a prefix (or a list of prefixes) that should be ignored # while generating the index headers. # This tag requires that the tag ALPHABETICAL_INDEX is set to YES. 
IGNORE_PREFIX = #--------------------------------------------------------------------------- # Configuration options related to the HTML output #--------------------------------------------------------------------------- # If the GENERATE_HTML tag is set to YES, doxygen will generate HTML output # The default value is: YES. GENERATE_HTML = YES # The HTML_OUTPUT tag is used to specify where the HTML docs will be put. If a # relative path is entered the value of OUTPUT_DIRECTORY will be put in front of # it. # The default directory is: html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_OUTPUT = html # The HTML_FILE_EXTENSION tag can be used to specify the file extension for each # generated HTML page (for example: .htm, .php, .asp). # The default value is: .html. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FILE_EXTENSION = .html # The HTML_HEADER tag can be used to specify a user-defined HTML header file for # each generated HTML page. If the tag is left blank doxygen will generate a # standard header. # # To get valid HTML the header file that includes any scripts and style sheets # that doxygen needs, which is dependent on the configuration options used (e.g. # the setting GENERATE_TREEVIEW). It is highly recommended to start with a # default header using # doxygen -w html new_header.html new_footer.html new_stylesheet.css # YourConfigFile # and then modify the file new_header.html. See also section "Doxygen usage" # for information on how to generate the default header that doxygen normally # uses. # Note: The header is subject to change so you typically have to regenerate the # default header when upgrading to a newer version of doxygen. For a description # of the possible markers and block names see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_HEADER = # The HTML_FOOTER tag can be used to specify a user-defined HTML footer for each # generated HTML page. 
If the tag is left blank doxygen will generate a standard # footer. See HTML_HEADER for more information on how to generate a default # footer and what special commands can be used inside the footer. See also # section "Doxygen usage" for information on how to generate the default footer # that doxygen normally uses. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_FOOTER = # The HTML_STYLESHEET tag can be used to specify a user-defined cascading style # sheet that is used by each HTML page. It can be used to fine-tune the look of # the HTML output. If left blank doxygen will generate a default style sheet. # See also section "Doxygen usage" for information on how to generate the style # sheet that doxygen normally uses. # Note: It is recommended to use HTML_EXTRA_STYLESHEET instead of this tag, as # it is more robust and this tag (HTML_STYLESHEET) will in the future become # obsolete. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_STYLESHEET = # The HTML_EXTRA_STYLESHEET tag can be used to specify additional user-defined # cascading style sheets that are included after the standard style sheets # created by doxygen. Using this option one can overrule certain style aspects. # This is preferred over using HTML_STYLESHEET since it does not replace the # standard style sheet and is therefore more robust against future updates. # Doxygen will copy the style sheet files to the output directory. # Note: The order of the extra style sheet files is of importance (e.g. the last # style sheet in the list overrules the setting of the previous ones in the # list). For an example see the documentation. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_STYLESHEET = # The HTML_EXTRA_FILES tag can be used to specify one or more extra images or # other source files which should be copied to the HTML output directory. Note # that these files will be copied to the base HTML output directory. 
Use the # $relpath^ marker in the HTML_HEADER and/or HTML_FOOTER files to load these # files. In the HTML_STYLESHEET file, use the file name only. Also note that the # files will be copied as-is; there are no commands or markers available. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_EXTRA_FILES = # The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. Doxygen # will adjust the colors in the style sheet and background images according to # this color. Hue is specified as an angle on a colorwheel, see # http://en.wikipedia.org/wiki/Hue for more information. For instance the value # 0 represents red, 60 is yellow, 120 is green, 180 is cyan, 240 is blue, 300 is # purple, and 360 is red again. # Minimum value: 0, maximum value: 359, default value: 220. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_HUE = 220 # The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of the colors # in the HTML output. For a value of 0 the output will use grayscales only. A # value of 255 will produce the most vivid colors. # Minimum value: 0, maximum value: 255, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_SAT = 100 # The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to the # luminance component of the colors in the HTML output. Values below 100 # gradually make the output lighter, whereas values above 100 make the output # darker. The value divided by 100 is the actual gamma applied, so 80 represents # a gamma of 0.8, the value 220 represents a gamma of 2.2, and 100 does not # change the gamma. # Minimum value: 40, maximum value: 240, default value: 80. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_COLORSTYLE_GAMMA = 80 # If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML # page will contain the date and time when the page was generated.
Setting this # to YES can help to show when doxygen was last run and thus if the # documentation is up to date. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_TIMESTAMP = NO # If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML # documentation will contain sections that can be hidden and shown after the # page has loaded. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_DYNAMIC_SECTIONS = NO # With HTML_INDEX_NUM_ENTRIES one can control the preferred number of entries # shown in the various tree structured indices initially; the user can expand # and collapse entries dynamically later on. Doxygen will expand the tree to # such a level that at most the specified number of entries are visible (unless # a fully collapsed tree already exceeds this amount). So setting the number of # entries 1 will produce a full collapsed tree by default. 0 is a special value # representing an infinite number of entries and will result in a full expanded # tree by default. # Minimum value: 0, maximum value: 9999, default value: 100. # This tag requires that the tag GENERATE_HTML is set to YES. HTML_INDEX_NUM_ENTRIES = 100 # If the GENERATE_DOCSET tag is set to YES, additional index files will be # generated that can be used as input for Apple's Xcode 3 integrated development # environment (see: http://developer.apple.com/tools/xcode/), introduced with # OSX 10.5 (Leopard). To create a documentation set, doxygen will generate a # Makefile in the HTML output directory. Running make will produce the docset in # that directory and running make install will install the docset in # ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find it at # startup. See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html # for more information. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. 
GENERATE_DOCSET = NO # This tag determines the name of the docset feed. A documentation feed provides # an umbrella under which multiple documentation sets from a single provider # (such as a company or product suite) can be grouped. # The default value is: Doxygen generated docs. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_FEEDNAME = "Doxygen generated docs" # This tag specifies a string that should uniquely identify the documentation # set bundle. This should be a reverse domain-name style string, e.g. # com.mycompany.MyDocSet. Doxygen will append .docset to the name. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_BUNDLE_ID = org.doxygen.Project # The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely identify # the documentation publisher. This should be a reverse domain-name style # string, e.g. com.mycompany.MyDocSet.documentation. # The default value is: org.doxygen.Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_ID = org.doxygen.Publisher # The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher. # The default value is: Publisher. # This tag requires that the tag GENERATE_DOCSET is set to YES. DOCSET_PUBLISHER_NAME = Publisher # If the GENERATE_HTMLHELP tag is set to YES then doxygen generates three # additional HTML index files: index.hhp, index.hhc, and index.hhk. The # index.hhp is a project file that can be read by Microsoft's HTML Help Workshop # (see: http://www.microsoft.com/en-us/download/details.aspx?id=21138) on # Windows. # # The HTML Help Workshop contains a compiler that can convert all HTML output # generated by doxygen into a single compiled HTML file (.chm). Compiled HTML # files are now used as the Windows 98 help format, and will replace the old # Windows help format (.hlp) on all Windows platforms in the future. 
Compressed # HTML files also contain an index, a table of contents, and you can search for # words in the documentation. The HTML workshop also contains a viewer for # compressed HTML files. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_HTMLHELP = NO # The CHM_FILE tag can be used to specify the file name of the resulting .chm # file. You can add a path in front of the file if the result should not be # written to the html output directory. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_FILE = # The HHC_LOCATION tag can be used to specify the location (absolute path # including file name) of the HTML help compiler (hhc.exe). If non-empty, # doxygen will try to run the HTML help compiler on the generated index.hhp. # The file has to be specified with full path. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. HHC_LOCATION = # The GENERATE_CHI flag controls if a separate .chi index file is generated # (YES) or that it should be included in the master .chm file (NO). # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. GENERATE_CHI = NO # The CHM_INDEX_ENCODING is used to encode HtmlHelp index (hhk), content (hhc) # and project file content. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. CHM_INDEX_ENCODING = # The BINARY_TOC flag controls whether a binary table of contents is generated # (YES) or a normal table of contents (NO) in the .chm file. Furthermore it # enables the Previous and Next buttons. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. BINARY_TOC = NO # The TOC_EXPAND flag can be set to YES to add extra items for group members to # the table of contents of the HTML help documentation and to the tree view. # The default value is: NO. # This tag requires that the tag GENERATE_HTMLHELP is set to YES. 
TOC_EXPAND = NO # If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and # QHP_VIRTUAL_FOLDER are set, an additional index file will be generated that # can be used as input for Qt's qhelpgenerator to generate a Qt Compressed Help # (.qch) of the generated HTML documentation. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_QHP = NO # If the QHG_LOCATION tag is specified, the QCH_FILE tag can be used to specify # the file name of the resulting .qch file. The path specified is relative to # the HTML output folder. # This tag requires that the tag GENERATE_QHP is set to YES. QCH_FILE = # The QHP_NAMESPACE tag specifies the namespace to use when generating Qt Help # Project output. For more information please see Qt Help Project / Namespace # (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#namespace). # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_NAMESPACE = org.doxygen.Project # The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating Qt # Help Project output. For more information please see Qt Help Project / Virtual # Folders (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#virtual- # folders). # The default value is: doc. # This tag requires that the tag GENERATE_QHP is set to YES. QHP_VIRTUAL_FOLDER = doc # If the QHP_CUST_FILTER_NAME tag is set, it specifies the name of a custom # filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_NAME = # The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the # custom filter to add. For more information please see Qt Help Project / Custom # Filters (see: http://qt-project.org/doc/qt-4.8/qthelpproject.html#custom- # filters). 
# This tag requires that the tag GENERATE_QHP is set to YES. QHP_CUST_FILTER_ATTRS = # The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this # project's filter section matches. Qt Help Project / Filter Attributes (see: # http://qt-project.org/doc/qt-4.8/qthelpproject.html#filter-attributes). # This tag requires that the tag GENERATE_QHP is set to YES. QHP_SECT_FILTER_ATTRS = # The QHG_LOCATION tag can be used to specify the location of Qt's # qhelpgenerator. If non-empty doxygen will try to run qhelpgenerator on the # generated .qhp file. # This tag requires that the tag GENERATE_QHP is set to YES. QHG_LOCATION = # If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files will be # generated, together with the HTML files, they form an Eclipse help plugin. To # install this plugin and make it available under the help contents menu in # Eclipse, the contents of the directory containing the HTML and XML files needs # to be copied into the plugins directory of eclipse. The name of the directory # within the plugins directory should be the same as the ECLIPSE_DOC_ID value. # After copying Eclipse needs to be restarted before the help appears. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_ECLIPSEHELP = NO # A unique identifier for the Eclipse help plugin. When installing the plugin # the directory name containing the HTML and XML files should also have this # name. Each documentation set should have its own identifier. # The default value is: org.doxygen.Project. # This tag requires that the tag GENERATE_ECLIPSEHELP is set to YES. ECLIPSE_DOC_ID = org.doxygen.Project # If you want full control over the layout of the generated HTML pages it might # be necessary to disable the index and replace it with your own. The # DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) at top # of each HTML page. A value of NO enables the index and the value YES disables # it. 
Since the tabs in the index contain the same information as the navigation # tree, you can set this option to YES if you also set GENERATE_TREEVIEW to YES. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. DISABLE_INDEX = NO # The GENERATE_TREEVIEW tag is used to specify whether a tree-like index # structure should be generated to display hierarchical information. If the tag # value is set to YES, a side panel will be generated containing a tree-like # index structure (just like the one that is generated for HTML Help). For this # to work a browser that supports JavaScript, DHTML, CSS and frames is required # (i.e. any modern browser). Windows users are probably better off using the # HTML help feature. Via custom style sheets (see HTML_EXTRA_STYLESHEET) one can # further fine-tune the look of the index. As an example, the default style # sheet generated by doxygen has an example that shows how to put an image at # the root of the tree instead of the PROJECT_NAME. Since the tree basically has # the same information as the tab index, you could consider setting # DISABLE_INDEX to YES when enabling this option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. GENERATE_TREEVIEW = NO # The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values that # doxygen will group on one line in the generated HTML documentation. # # Note that a value of 0 will completely suppress the enum values from appearing # in the overview section. # Minimum value: 0, maximum value: 20, default value: 4. # This tag requires that the tag GENERATE_HTML is set to YES. ENUM_VALUES_PER_LINE = 1 # If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be used # to set the initial width (in pixels) of the frame in which the tree is shown. # Minimum value: 0, maximum value: 1500, default value: 250. # This tag requires that the tag GENERATE_HTML is set to YES. 
TREEVIEW_WIDTH = 250 # If the EXT_LINKS_IN_WINDOW option is set to YES, doxygen will open links to # external symbols imported via tag files in a separate window. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. EXT_LINKS_IN_WINDOW = NO # Use this tag to change the font size of LaTeX formulas included as images in # the HTML documentation. When you change the font size after a successful # doxygen run you need to manually remove any form_*.png images from the HTML # output directory to force them to be regenerated. # Minimum value: 8, maximum value: 50, default value: 10. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_FONTSIZE = 10 # Use the FORMULA_TRANSPARENT tag to determine whether or not the images # generated for formulas are transparent PNGs. Transparent PNGs are not # supported properly for IE 6.0, but are supported on all modern browsers. # # Note that when changing this option you need to delete any form_*.png files in # the HTML output directory before the changes take effect. # The default value is: YES. # This tag requires that the tag GENERATE_HTML is set to YES. FORMULA_TRANSPARENT = YES # Enable the USE_MATHJAX option to render LaTeX formulas using MathJax (see # http://www.mathjax.org) which uses client side Javascript for the rendering # instead of using pre-rendered bitmaps. Use this if you do not have LaTeX # installed or if you want formulas to look prettier in the HTML output. When # enabled you may also need to install MathJax separately and configure the path # to it using the MATHJAX_RELPATH option. # The default value is: NO. # This tag requires that the tag GENERATE_HTML is set to YES. USE_MATHJAX = YES # When MathJax is enabled you can set the default output format to be used for # the MathJax output. See the MathJax site (see: # http://docs.mathjax.org/en/latest/output.html) for more details.
# Possible values are: HTML-CSS (which is slower, but has the best # compatibility), NativeMML (i.e. MathML) and SVG. # The default value is: HTML-CSS. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_FORMAT = HTML-CSS # When MathJax is enabled you need to specify the location relative to the HTML # output directory using the MATHJAX_RELPATH option. The destination directory # should contain the MathJax.js script. For instance, if the mathjax directory # is located at the same level as the HTML output directory, then # MATHJAX_RELPATH should be ../mathjax. The default value points to the MathJax # Content Delivery Network so you can quickly see the result without installing # MathJax. However, it is strongly recommended to install a local copy of # MathJax from http://www.mathjax.org before deployment. # The default value is: http://cdn.mathjax.org/mathjax/latest. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest # The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax # extension names that should be enabled during MathJax rendering. For example # MATHJAX_EXTENSIONS = TeX/AMSmath TeX/AMSsymbols # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_EXTENSIONS = # The MATHJAX_CODEFILE tag can be used to specify a file with javascript pieces # of code that will be used on startup of the MathJax code. See the MathJax site # (see: http://docs.mathjax.org/en/latest/output.html) for more details. For an # example see the documentation. # This tag requires that the tag USE_MATHJAX is set to YES. MATHJAX_CODEFILE = # When the SEARCHENGINE tag is enabled doxygen will generate a search box for # the HTML output. The underlying search engine uses javascript and DHTML and # should work on any modern browser. 
Note that when using HTML help # (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets (GENERATE_DOCSET) # there is already a search function so this one should typically be disabled. # For large projects the javascript based search engine can be slow, then # enabling SERVER_BASED_SEARCH may provide a better solution. It is possible to # search using the keyboard; to jump to the search box use + S # (what the is depends on the OS and browser, but it is typically # , /